diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz
index 36e9fb15b..141e80920 100644
Binary files a/examples/server/public/index.html.gz and b/examples/server/public/index.html.gz differ
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index f76e72274..0718806c8 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -334,24 +334,24 @@ struct server_task {
         if (data.contains("json_schema") && !data.contains("grammar")) {
             try {
                 auto schema = json_value(data, "json_schema", json::object());
-                LOG_DBG("JSON schema: %s\n", schema.dump(2).c_str());
+                SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str());
                 params.sampling.grammar = json_schema_to_grammar(schema);
-                LOG_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
+                SRV_DBG("Converted grammar: %s\n", params.sampling.grammar.c_str());
             } catch (const std::exception & e) {
                 throw std::runtime_error(std::string("\"json_schema\": ") + e.what());
             }
         } else {
             params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
-            LOG_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
+            SRV_DBG("Grammar: %s\n", params.sampling.grammar.c_str());
             params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy);
-            LOG_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
+            SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false");
         }
 
         {
             auto it = data.find("chat_format");
             if (it != data.end()) {
                 params.oaicompat_chat_format = static_cast<common_chat_format>(it->get<int>());
-                LOG_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
+                SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_format).c_str());
             } else {
                 params.oaicompat_chat_format = defaults.oaicompat_chat_format;
             }
@@ -367,12 +367,12 @@ struct server_task {
 
                 auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
                 if (ids.size() == 1) {
-                    LOG_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
+                    SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
                     params.sampling.grammar_trigger_tokens.push_back(ids[0]);
                     params.sampling.preserved_tokens.insert(ids[0]);
                     continue;
                 }
-                LOG_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
+                SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
                 params.sampling.grammar_trigger_words.push_back(trigger);
             }
         }
@@ -381,11 +381,11 @@ struct server_task {
             for (const auto & t : *preserved_tokens) {
                 auto ids = common_tokenize(vocab, t.get<std::string>(), /* add_special= */ false, /* parse_special= */ true);
                 if (ids.size() == 1) {
-                    LOG_DBG("Preserved token: %d\n", ids[0]);
+                    SRV_DBG("Preserved token: %d\n", ids[0]);
                     params.sampling.preserved_tokens.insert(ids[0]);
                 } else {
                     // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
- LOG_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get().c_str()); + SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get().c_str()); } } } @@ -717,7 +717,7 @@ struct server_task_result_cmpl_final : server_task_result { std::string finish_reason = "length"; common_chat_msg msg; if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - LOG_DBG("Parsing chat message: %s\n", content.c_str()); + SRV_DBG("Parsing chat message: %s\n", content.c_str()); msg = common_chat_parse(content, oaicompat_chat_format); finish_reason = msg.tool_calls.empty() ? "stop" : "tool_calls"; } else { @@ -1885,7 +1885,7 @@ struct server_context { } if (params_base.chat_template.empty() && !validate_builtin_chat_template(params.use_jinja)) { - LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); + SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__); chat_templates = common_chat_templates_from_model(model, "chatml"); } else { chat_templates = common_chat_templates_from_model(model, params_base.chat_template); @@ -3355,10 +3355,10 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch - LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); + SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); - LOG_DBG("request: %s\n", req.body.c_str()); - LOG_DBG("response: %s\n", res.body.c_str()); + SRV_DBG("request: %s\n", req.body.c_str()); + SRV_DBG("response: %s\n", res.body.c_str()); } std::function shutdown_handler; @@ -3860,7 +3860,9 @@ int main(int argc, char ** argv) { try { const auto & prompt = data.at("prompt"); - LOG_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get().c_str() : prompt.dump(2).c_str()); + // TODO: this log can become very long, put it behind a flag or think about a more compact format + //SRV_DBG("Prompt: %s\n", prompt.is_string() ? 
prompt.get().c_str() : prompt.dump(2).c_str()); + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); tasks.reserve(tokenized_prompts.size()); for (size_t i = 0; i < tokenized_prompts.size(); i++) { diff --git a/examples/server/webui/src/components/ChatMessage.tsx b/examples/server/webui/src/components/ChatMessage.tsx index 7fae73492..ec72196ba 100644 --- a/examples/server/webui/src/components/ChatMessage.tsx +++ b/examples/server/webui/src/components/ChatMessage.tsx @@ -92,7 +92,7 @@ export default function ChatMessage({ <> diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 59efaeb71..fdb430a43 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -13856,9 +13856,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { tp->ec = GGML_STATUS_ABORTED; } - ggml_barrier(state->threadpool); + if (node_n + 1 < cgraph->n_nodes) { + ggml_barrier(state->threadpool); + } } + ggml_barrier(state->threadpool); + return 0; } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e30db66e9..0f4b62c43 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1275,7 +1275,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const bool use_mmap_buffer = true; - LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, use_mmap_buffer ? "true" : "false"); + LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false"); // build a list of buffer types for the CPU and GPU devices pimpl->cpu_buft_list = make_cpu_buft_list(devices); diff --git a/src/llama.cpp b/src/llama.cpp index 3b6a21d81..607f27861 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9430,7 +9430,6 @@ static struct llama_model * llama_model_load_from_file_impl( struct llama_model_params params) { ggml_time_init(); - unsigned cur_percentage = 0; if (params.progress_callback == NULL) { params.progress_callback_user_data = &cur_percentage;