From 04b5a92478a3040758e458cdec4d2aa913e5ccc9 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Sat, 24 Feb 2024 13:12:47 +0100
Subject: [PATCH] server: logs - always use JSON logger, add thread_id in
 message, log task_id and slot_id

---
 examples/server/server.cpp | 136 +++++++++++++++++++++++++++++--------
 examples/server/utils.hpp  |  26 ++++---
 2 files changed, 125 insertions(+), 37 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 9fb436c2a..a5c10f445 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -301,12 +301,43 @@ struct llama_client_slot
     }

     void print_timings() const {
-        LOG_TEE("\n");
-        LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-                __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
-        LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
-        LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
+        char buffer[512];
+        double t_token = t_prompt_processing / num_prompt_tokens_processed;
+        double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
+        snprintf(buffer, sizeof(buffer), "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
+                 t_prompt_processing, num_prompt_tokens_processed,
+                 t_token, n_tokens_second);
+        LOG_INFO(buffer, {
+            {"slot_id", id},
+            {"task_id", task_id},
+            {"t_prompt_processing", t_prompt_processing},
+            {"num_prompt_tokens_processed", num_prompt_tokens_processed},
+            {"t_token", t_token},
+            {"n_tokens_second", n_tokens_second},
+        });
+
+        t_token = t_token_generation / n_decoded;
+        n_tokens_second = 1e3 / t_token_generation * n_decoded;
+        snprintf(buffer, sizeof(buffer), "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
+                 t_token_generation, n_decoded,
+                 t_token, n_tokens_second);
+        LOG_INFO(buffer, {
+            {"slot_id", id},
+            {"task_id", task_id},
+            {"t_token_generation", t_token_generation},
+            {"n_decoded", n_decoded},
+            {"t_token", t_token},
+            {"n_tokens_second", n_tokens_second},
+        });
+
+        snprintf(buffer, sizeof(buffer), "total time = %10.2f ms", t_prompt_processing + t_token_generation);
+        LOG_INFO(buffer, {
+            {"slot_id", id},
+            {"task_id", task_id},
+            {"t_prompt_processing", t_prompt_processing},
+            {"t_token_generation", t_token_generation},
+            {"t_total", t_prompt_processing + t_token_generation},
+        });
     }
 };

@@ -363,7 +394,7 @@ struct llama_server_context
         params = params_;
         if (!params.mmproj.empty()) {
             multimodal = true;
-            LOG_TEE("Multi Modal Mode Enabled");
+            LOG_INFO("Multi Modal Mode Enabled", {});
             clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
             if(clp_ctx == nullptr) {
                 LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
@@ -729,10 +760,16 @@ struct llama_server_context
                     img_sl.img_data = clip_image_u8_init();
                     if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
                     {
-                        LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
+                        LOG_WARNING("failed to load image", {
+                            {"slot_id", slot->id},
+                            {"img_sl_id", img_sl.id}
+                        });
                         return false;
                     }
-                    LOG_TEE("slot %i - loaded image\n", slot->id);
+                    LOG_VERBOSE("image loaded", {
+                        {"slot_id", slot->id},
+                        {"img_sl_id", img_sl.id}
+                    });
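// [Editor's note, illustrative only - not part of the patch] Each LOG_* call above
// now emits a single JSON record instead of a printf-style line. Judging from the
// server_log() signature and the utils.hpp hunk further down (which adds
// "thread_id" and merge-patches the extra object), the "image loaded" event would
// print roughly:
//
//   {"level":"VERBOSE","message":"image loaded","slot_id":0,"img_sl_id":0,"thread_id":"140243472277248"}
//
// The concrete values, and any fields beyond "message", the extra keys, and
// "thread_id", are assumptions rather than output captured from the server.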
img_sl.request_encode_image = true; slot->images.push_back(img_sl); } @@ -792,7 +829,10 @@ struct llama_server_context all_slots_are_idle = false; - LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); + LOG_INFO("slot is processing task", { + {"slot_id", slot->id}, + {"task_id", slot->task_id}, + }); return true; } @@ -1355,7 +1395,7 @@ struct llama_server_context if (slot == nullptr) { // if no slot is available, we defer this task for processing later - LOG_VERBOSE("no slot is available", {}); + LOG_VERBOSE("no slot is available", {{"task_id", task.id}}); queue_tasks.defer(task); break; } @@ -1431,7 +1471,17 @@ struct llama_server_context } slots_data.push_back(slot_data); } - LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots); + LOG_INFO("slot data", { + {"task_id", task.id}, + {"n_idle_slots", n_idle_slots}, + {"n_processing_slots", n_processing_slots} + }); + LOG_VERBOSE("slot data", { + {"task_id", task.id}, + {"n_idle_slots", n_idle_slots}, + {"n_processing_slots", n_processing_slots}, + {"slots", slots_data} + }); task_result res; res.id = task.id; res.multitask_id = task.multitask_id; @@ -1469,7 +1519,7 @@ struct llama_server_context bool update_slots() { if (system_need_update) { - LOG_TEE("updating system prompt\n"); + LOG_INFO("updating system prompt", {}); update_system_prompt(); } @@ -1479,12 +1529,13 @@ struct llama_server_context { if (system_prompt.empty() && clean_kv_cache) { - LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n"); + LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {}); kv_cache_clear(); } return true; } + LOG_VERBOSE("posting NEXT_RESPONSE", {}); task_server task; task.type = TASK_TYPE_NEXT_RESPONSE; task.target_id = -1; @@ -1498,10 +1549,20 @@ struct llama_server_context { // Shift context const int n_keep = slot.params.n_keep + add_bos_token; - const int n_left = system_tokens.size() + slot.n_past - n_keep; + const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; const int n_discard = n_left / 2; - LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard); + LOG_INFO("slot context shift", { + {"slot_id", slot.id}, + {"task_id", slot.task_id}, + {"n_keep", n_keep}, + {"n_left", n_left}, + {"n_discard", n_discard}, + {"n_ctx", n_ctx}, + {"n_past", slot.n_past}, + {"n_system_tokens", system_tokens.size()}, + {"n_cache_tokens", slot.cache_tokens.size()} + }); llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); @@ -1515,17 +1576,12 @@ struct llama_server_context slot.n_past -= n_discard; slot.truncated = true; - - LOG_VERBOSE("context shift", { - { "n_ctx", n_ctx }, - { "n_keep", n_keep }, - { "n_left", n_left }, - }); } } } // decode any currently ongoing sequences + LOG_VERBOSE("decoding ongoing sequences", {}); for (auto & slot : slots) { // release the slot @@ -1535,7 +1591,15 @@ struct llama_server_context slot.command = NONE; slot.t_last_used = ggml_time_us(); - LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); + LOG_INFO("slot released", { + {"slot_id", slot.id}, + {"task_id", slot.task_id}, + {"n_ctx", n_ctx}, + {"n_past", slot.n_past}, + {"n_system_tokens", system_tokens.size()}, + {"n_cache_tokens", slot.cache_tokens.size()}, + {"truncated", slot.truncated} + }); 
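// [Editor's note, worked example with made-up values - not part of the patch]
// For the "slot context shift" hunk above: with no system tokens, n_keep = 32 and
// slot.n_past = 512, we get n_left = 512 - 32 = 480 and n_discard = 480 / 2 = 240,
// so the calls become llama_kv_cache_seq_rm(ctx, slot.id, 32, 272) and
// llama_kv_cache_seq_shift(ctx, slot.id, 272, 512, -240): tokens [32, 272) are
// dropped, the remaining tokens slide down by 240 positions, and half of the
// non-kept context is freed. The new LOG_INFO record carries exactly these inputs
// (n_keep, n_left, n_discard, n_past, ...), so the arithmetic can be audited from
// the logs alone.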
                    queue_tasks.notify_slot_changed();

                    continue;
@@ -1683,7 +1747,12 @@ struct llama_server_context
                        slot.ga_i = ga_i;
                    }

-                    LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
+                    LOG_INFO("slot progression", {
+                        { "slot_id", slot.id },
+                        { "task_id", slot.task_id },
+                        { "n_past", slot.n_past },
+                        { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
+                    });
                }

                slot.cache_tokens = prompt_tokens;
@@ -1691,7 +1760,10 @@ struct llama_server_context
                if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
                {
                    // we have to evaluate at least 1 token to generate logits.
-                    LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
+                    LOG_INFO("we have to evaluate at least 1 token to generate logits", {
+                        { "slot_id", slot.id },
+                        { "task_id", slot.task_id }
+                    });
                    slot.n_past--;
                    if (slot.ga_i > 0)
                    {
@@ -1699,9 +1771,13 @@ struct llama_server_context
                    }
                }

-                LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
-
-                llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
+                int p0 = (int) system_tokens.size() + slot.n_past;
+                LOG_INFO("kv cache rm [p0, end)", {
+                    { "slot_id", slot.id },
+                    { "task_id", slot.task_id },
+                    { "p0", p0 }
+                });
+                llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);

                LOG_VERBOSE("prompt ingested", {
                    {"n_past", slot.n_past},
@@ -1736,7 +1812,7 @@ struct llama_server_context

                if (has_images && !ingest_images(slot, n_batch))
                {
-                    LOG_TEE("failed processing images\n");
+                    LOG_WARNING("failed processing images", {});
                    return false;
                }

@@ -1876,6 +1952,8 @@ struct llama_server_context
                slot.i_batch = -1;
            }
        }
+
+        LOG_VERBOSE("slots updated", {});
        return true;
    }

diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 88545eb69..955732794 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -145,6 +145,11 @@ static inline void server_log(const char *level, const char *function, int line,
        {"message", message},
    };

+    auto thread_id = std::this_thread::get_id();
+    std::stringstream ss_thread_id;
+    ss_thread_id << thread_id;
+    log.push_back({"thread_id", ss_thread_id.str()});
+
    if (!extra.empty())
    {
        log.merge_patch(extra);
@@ -234,6 +239,7 @@ struct llama_server_queue {
        std::unique_lock lock(mutex_tasks);
        if (task.id == -1) {
            task.id = id++;
+            LOG_VERBOSE("new task id", {{"new_id", task.id}});
        }
        queue_tasks.push_back(std::move(task));
        condition_tasks.notify_one();
@@ -249,7 +255,9 @@ struct llama_server_queue {
    // Get the next id for creating a new task
    int get_new_id() {
        std::unique_lock lock(mutex_tasks);
-        return id++;
+        int new_id = id++;
+        LOG_VERBOSE("new task id", {{"new_id", new_id}});
+        return new_id;
    }

    // Register function to process a new task
@@ -290,8 +298,8 @@ struct llama_server_queue {
    void start_loop() {
        running = true;
        while (true) {
-            // new task arrived
-            LOG_VERBOSE("have new task", {});
+            // new task will arrive
+            LOG_VERBOSE("new task may arrive", {});
            {
                while (true)
                {
@@ -303,7 +311,7 @@ struct llama_server_queue {
                    task_server task = queue_tasks.front();
                    queue_tasks.erase(queue_tasks.begin());
                    lock.unlock();
-                    LOG_VERBOSE("callback_new_task", {});
+                    LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
                    callback_new_task(task);
                }
                LOG_VERBOSE("callback_all_task_finished", {});
@@ -384,11 +392,13 @@ struct llama_server_response {
    std::condition_variable condition_results;

    void add_waiting_task_id(int task_id) {
+        LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
std::unique_lock lock(mutex_results); waiting_task_ids.insert(task_id); } void remove_waiting_task_id(int task_id) { + LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}}); std::unique_lock lock(mutex_results); waiting_task_ids.erase(task_id); } @@ -401,7 +411,7 @@ struct llama_server_response { condition_results.wait(lock, [&]{ return !queue_results.empty(); }); - LOG_VERBOSE("condition_results unblock", {}); + LOG_VERBOSE("condition_results unblock", {{"task_id", task_id}}); for (int i = 0; i < (int) queue_results.size(); i++) { @@ -426,20 +436,20 @@ struct llama_server_response { // Send a new result to a waiting task_id void send(task_result result) { std::unique_lock lock(mutex_results); - LOG_VERBOSE("send new result", {}); + LOG_VERBOSE("send new result", {{"task_id", result.id}}); for (auto& task_id : waiting_task_ids) { // LOG_TEE("waiting task id %i \n", task_id); // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result if (result.multitask_id == task_id) { - LOG_VERBOSE("callback_update_multitask", {}); + LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}}); callback_update_multitask(task_id, result.id, result); continue; } if (result.id == task_id) { - LOG_VERBOSE("queue_results.push_back", {}); + LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}}); queue_results.push_back(result); condition_results.notify_one(); return;
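// [Editor's note, illustrative sketch - not part of the patch] std::thread::id is
// deliberately opaque, which is why the server_log() hunk above goes through a
// std::stringstream to obtain a JSON-friendly string. A self-contained equivalent,
// assuming <sstream> and <thread> are reachable from utils.hpp (the hunk itself
// adds no includes); the helper name is hypothetical:

#include <sstream>
#include <string>
#include <thread>

static std::string current_thread_id() {
    std::stringstream ss;
    // operator<< is the only portable way to render a std::thread::id
    ss << std::this_thread::get_id();
    return ss.str();
}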