server: logs - always use JSON logger, add add thread_id in message, log task_id and slot_id

This commit is contained in:
Pierrick HYMBERT 2024-02-24 13:12:47 +01:00
parent 525213d2f5
commit 04b5a92478
2 changed files with 125 additions and 37 deletions

View file

@ -301,12 +301,43 @@ struct llama_client_slot
}
void print_timings() const {
LOG_TEE("\n");
LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded);
LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation);
static char buffer[512];
double t_token = t_prompt_processing / num_prompt_tokens_processed;
double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
t_prompt_processing, num_prompt_tokens_processed,
t_token, n_tokens_second);
LOG_INFO(buffer, {
{"slot_id", id},
{"task_id", task_id},
{"t_prompt_processing", t_prompt_processing},
{"num_prompt_tokens_processed", num_prompt_tokens_processed},
{"t_token", t_token},
{"n_tokens_second", n_tokens_second},
});
t_token = t_token_generation / n_decoded;
n_tokens_second = 1e3 / t_token_generation * n_decoded;
sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
t_token_generation, n_decoded,
t_token, n_tokens_second);
LOG_INFO(buffer, {
{"slot_id", id},
{"task_id", task_id},
{"t_token_generation", t_token_generation},
{"n_decoded", n_decoded},
{"t_token", t_token},
{"n_tokens_second", n_tokens_second},
});
sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
LOG_INFO(buffer, {
{"slot_id", id},
{"task_id", task_id},
{"t_prompt_processing", t_prompt_processing},
{"t_token_generation", t_token_generation},
{"t_total", t_prompt_processing + t_token_generation},
});
}
};
@ -363,7 +394,7 @@ struct llama_server_context
params = params_;
if (!params.mmproj.empty()) {
multimodal = true;
LOG_TEE("Multi Modal Mode Enabled");
LOG_INFO("Multi Modal Mode Enabled", {});
clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
if(clp_ctx == nullptr) {
LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
@ -729,10 +760,16 @@ struct llama_server_context
img_sl.img_data = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data))
{
LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id);
LOG_WARNING("failed to load image", {
{"slot_id", slot->id},
{"img_sl_id", img_sl.id}
});
return false;
}
LOG_TEE("slot %i - loaded image\n", slot->id);
LOG_VERBOSE("image loaded", {
{"slot_id", slot->id},
{"img_sl_id", img_sl.id}
});
img_sl.request_encode_image = true;
slot->images.push_back(img_sl);
}
@ -792,7 +829,10 @@ struct llama_server_context
all_slots_are_idle = false;
LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id);
LOG_INFO("slot is processing task", {
{"slot_id", slot->id},
{"task_id", slot->task_id},
});
return true;
}
@ -1355,7 +1395,7 @@ struct llama_server_context
if (slot == nullptr)
{
// if no slot is available, we defer this task for processing later
LOG_VERBOSE("no slot is available", {});
LOG_VERBOSE("no slot is available", {{"task_id", task.id}});
queue_tasks.defer(task);
break;
}
@ -1431,7 +1471,17 @@ struct llama_server_context
}
slots_data.push_back(slot_data);
}
LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
LOG_INFO("slot data", {
{"task_id", task.id},
{"n_idle_slots", n_idle_slots},
{"n_processing_slots", n_processing_slots}
});
LOG_VERBOSE("slot data", {
{"task_id", task.id},
{"n_idle_slots", n_idle_slots},
{"n_processing_slots", n_processing_slots},
{"slots", slots_data}
});
task_result res;
res.id = task.id;
res.multitask_id = task.multitask_id;
@ -1469,7 +1519,7 @@ struct llama_server_context
bool update_slots() {
if (system_need_update)
{
LOG_TEE("updating system prompt\n");
LOG_INFO("updating system prompt", {});
update_system_prompt();
}
@ -1479,12 +1529,13 @@ struct llama_server_context
{
if (system_prompt.empty() && clean_kv_cache)
{
LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
kv_cache_clear();
}
return true;
}
LOG_VERBOSE("posting NEXT_RESPONSE", {});
task_server task;
task.type = TASK_TYPE_NEXT_RESPONSE;
task.target_id = -1;
@ -1498,10 +1549,20 @@ struct llama_server_context
{
// Shift context
const int n_keep = slot.params.n_keep + add_bos_token;
const int n_left = system_tokens.size() + slot.n_past - n_keep;
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
const int n_discard = n_left / 2;
LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard);
LOG_INFO("slot context shift", {
{"slot_id", slot.id},
{"task_id", slot.task_id},
{"n_keep", n_keep},
{"n_left", n_left},
{"n_discard", n_discard},
{"n_ctx", n_ctx},
{"n_past", slot.n_past},
{"n_system_tokens", system_tokens.size()},
{"n_cache_tokens", slot.cache_tokens.size()}
});
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
@ -1515,17 +1576,12 @@ struct llama_server_context
slot.n_past -= n_discard;
slot.truncated = true;
LOG_VERBOSE("context shift", {
{ "n_ctx", n_ctx },
{ "n_keep", n_keep },
{ "n_left", n_left },
});
}
}
}
// decode any currently ongoing sequences
LOG_VERBOSE("decoding ongoing sequences", {});
for (auto & slot : slots)
{
// release the slot
@ -1535,7 +1591,15 @@ struct llama_server_context
slot.command = NONE;
slot.t_last_used = ggml_time_us();
LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
LOG_INFO("slot released", {
{"slot_id", slot.id},
{"task_id", slot.task_id},
{"n_ctx", n_ctx},
{"n_past", slot.n_past},
{"n_system_tokens", system_tokens.size()},
{"n_cache_tokens", slot.cache_tokens.size()},
{"truncated", slot.truncated}
});
queue_tasks.notify_slot_changed();
continue;
@ -1683,7 +1747,12 @@ struct llama_server_context
slot.ga_i = ga_i;
}
LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
LOG_INFO("slot progression", {
{ "slot_id", slot.id },
{ "task_id", slot.task_id },
{ "n_past", slot.n_past },
{ "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
});
}
slot.cache_tokens = prompt_tokens;
@ -1691,7 +1760,10 @@ struct llama_server_context
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
{
// we have to evaluate at least 1 token to generate logits.
LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
LOG_INFO("we have to evaluate at least 1 token to generate logits", {
{ "slot_id", slot.id },
{ "task_id", slot.task_id }
});
slot.n_past--;
if (slot.ga_i > 0)
{
@ -1699,9 +1771,13 @@ struct llama_server_context
}
}
LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past);
llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1);
int p0 = (int) system_tokens.size() + slot.n_past;
LOG_INFO("kv cache rm [p0, end)", {
{ "slot_id", slot.id },
{ "task_id", slot.task_id },
{ "p0", p0 }
});
llama_kv_cache_seq_rm(ctx, slot.id, p0, -1);
LOG_VERBOSE("prompt ingested", {
{"n_past", slot.n_past},
@ -1736,7 +1812,7 @@ struct llama_server_context
if (has_images && !ingest_images(slot, n_batch))
{
LOG_TEE("failed processing images\n");
LOG_WARNING("failed processing images", {});
return false;
}
@ -1876,6 +1952,8 @@ struct llama_server_context
slot.i_batch = -1;
}
}
LOG_VERBOSE("Slots updated", {});
return true;
}

View file

@ -145,6 +145,11 @@ static inline void server_log(const char *level, const char *function, int line,
{"message", message},
};
auto thread_id = std::this_thread::get_id();
std::stringstream ss_thread_id;
ss_thread_id << thread_id;
log.push_back({"thread_id", ss_thread_id.str()});
if (!extra.empty())
{
log.merge_patch(extra);
@ -234,6 +239,7 @@ struct llama_server_queue {
std::unique_lock<std::mutex> lock(mutex_tasks);
if (task.id == -1) {
task.id = id++;
LOG_VERBOSE("new task id", {{"new_id", task.id}});
}
queue_tasks.push_back(std::move(task));
condition_tasks.notify_one();
@ -249,7 +255,9 @@ struct llama_server_queue {
// Get the next id for creating anew task
int get_new_id() {
std::unique_lock<std::mutex> lock(mutex_tasks);
return id++;
int new_id = id++;
LOG_VERBOSE("new task id", {{"new_id", new_id}});
return new_id;
}
// Register function to process a new task
@ -290,8 +298,8 @@ struct llama_server_queue {
void start_loop() {
running = true;
while (true) {
// new task arrived
LOG_VERBOSE("have new task", {});
// new task will arrive
LOG_VERBOSE("new task may arrive", {});
{
while (true)
{
@ -303,7 +311,7 @@ struct llama_server_queue {
task_server task = queue_tasks.front();
queue_tasks.erase(queue_tasks.begin());
lock.unlock();
LOG_VERBOSE("callback_new_task", {});
LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
callback_new_task(task);
}
LOG_VERBOSE("callback_all_task_finished", {});
@ -384,11 +392,13 @@ struct llama_server_response {
std::condition_variable condition_results;
void add_waiting_task_id(int task_id) {
LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.insert(task_id);
}
void remove_waiting_task_id(int task_id) {
LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
std::unique_lock<std::mutex> lock(mutex_results);
waiting_task_ids.erase(task_id);
}
@ -401,7 +411,7 @@ struct llama_server_response {
condition_results.wait(lock, [&]{
return !queue_results.empty();
});
LOG_VERBOSE("condition_results unblock", {});
LOG_VERBOSE("condition_results unblock", {{"task_id", task_id}});
for (int i = 0; i < (int) queue_results.size(); i++)
{
@ -426,20 +436,20 @@ struct llama_server_response {
// Send a new result to a waiting task_id
void send(task_result result) {
std::unique_lock<std::mutex> lock(mutex_results);
LOG_VERBOSE("send new result", {});
LOG_VERBOSE("send new result", {{"task_id", result.id}});
for (auto& task_id : waiting_task_ids) {
// LOG_TEE("waiting task id %i \n", task_id);
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
if (result.multitask_id == task_id)
{
LOG_VERBOSE("callback_update_multitask", {});
LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}});
callback_update_multitask(task_id, result.id, result);
continue;
}
if (result.id == task_id)
{
LOG_VERBOSE("queue_results.push_back", {});
LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}});
queue_results.push_back(result);
condition_results.notify_one();
return;