server: metrics - move to a dedicated struct
This commit is contained in:
parent 7b29648da5
commit 542f42a604

1 changed file with 43 additions and 36 deletions
@@ -311,6 +311,39 @@ struct llama_client_slot
     }
 };

+struct llama_metrics {
+    uint64_t n_prompt_tokens_processed_total = 0;
+    uint64_t n_tokens_predicted_total        = 0;
+
+    uint64_t n_prompt_tokens_processed = 0;
+    uint64_t t_prompt_processing       = 0;
+
+    uint64_t n_tokens_predicted  = 0;
+    uint64_t t_tokens_generation = 0;
+
+    void on_prompt_eval(const llama_client_slot &slot) {
+        n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed;
+
+        n_prompt_tokens_processed += slot.num_prompt_tokens_processed;
+        t_prompt_processing       += slot.t_prompt_processing;
+    }
+
+    void on_prediction(const llama_client_slot &slot) {
+        n_tokens_predicted_total += slot.n_decoded;
+
+        n_tokens_predicted  += slot.n_decoded;
+        t_tokens_generation += slot.t_token_generation;
+    }
+
+    void reset_bucket() {
+        n_prompt_tokens_processed = 0;
+        t_prompt_processing       = 0;
+        n_tokens_predicted        = 0;
+        t_tokens_generation       = 0;
+    }
+};
+
 struct llama_server_context
 {
     llama_model *model = nullptr;
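Note (not part of the diff): the new struct keeps two groups of counters — the `*_total` fields grow monotonically for the lifetime of the server, while the unsuffixed fields form a bucket that `reset_bucket()` clears after each report. A minimal standalone sketch of that behaviour, using a stub slot with only the fields the struct reads and only the prediction path:

```cpp
// Standalone sketch (not from the diff): reset_bucket() clears the per-bucket
// counters while the *_total counters keep their lifetime values.
#include <cstdint>
#include <cstdio>

struct llama_client_slot {       // stub: the real struct has many more fields
    int32_t n_decoded          = 0;
    double  t_token_generation = 0; // ms
};

struct llama_metrics {           // prediction path only, same logic as above
    uint64_t n_tokens_predicted_total = 0;
    uint64_t n_tokens_predicted       = 0;
    uint64_t t_tokens_generation      = 0;

    void on_prediction(const llama_client_slot &slot) {
        n_tokens_predicted_total += slot.n_decoded;
        n_tokens_predicted       += slot.n_decoded;
        t_tokens_generation      += slot.t_token_generation;
    }

    void reset_bucket() {
        n_tokens_predicted  = 0;
        t_tokens_generation = 0;
    }
};

int main() {
    llama_metrics m;
    llama_client_slot s;
    s.n_decoded = 32;
    s.t_token_generation = 800.0;

    m.on_prediction(s);
    m.reset_bucket();            // bucket cleared, total survives

    std::printf("total=%llu bucket=%llu\n",
                (unsigned long long) m.n_tokens_predicted_total,
                (unsigned long long) m.n_tokens_predicted); // total=32 bucket=0
    return 0;
}
```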
@@ -345,15 +378,7 @@ struct llama_server_context
     llama_server_queue    queue_tasks;
     llama_server_response queue_results;

-    uint64_t n_prompt_tokens_processed_total = 0;
-    uint64_t n_tokens_predicted_total        = 0;
-
-    uint64_t n_prompt_tokens_processed = 0;
-    uint64_t t_prompt_processing       = 0;
-
-    uint64_t n_tokens_predicted  = 0;
-    uint64_t t_tokens_generation = 0;
-
+    // metrics
+    llama_metrics metrics;

     ~llama_server_context()
     {
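Reviewer note: after this hunk all metric state on the server context lives behind one member, so call sites read e.g. `metrics.n_tokens_predicted_total` instead of a loose field. A tiny compile-checkable sketch of the new ownership (stub types; `server_ctx` is an illustrative name, not from the diff):

```cpp
#include <cstdint>

struct llama_metrics {                // abbreviated stub of the new struct
    uint64_t n_tokens_predicted_total = 0;
};

struct llama_server_context_sketch {  // abbreviated stub of the server context
    llama_metrics metrics;            // the single remaining metrics member
};

int main() {
    llama_server_context_sketch server_ctx;
    // before this commit: server_ctx.n_tokens_predicted_total
    // after this commit:
    return (int) server_ctx.metrics.n_tokens_predicted_total;
}
```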
@@ -1453,24 +1478,20 @@ struct llama_server_context
                     { "processing", n_processing_slots },
                     { "deferred", queue_tasks.queue_tasks_deferred.size() },

-                    { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total},
-                    { "n_tokens_predicted_total", n_tokens_predicted_total},
+                    { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total},
+                    { "n_tokens_predicted_total", metrics.n_tokens_predicted_total},

-                    { "n_prompt_tokens_processed", n_prompt_tokens_processed},
-                    { "t_prompt_processing", t_prompt_processing},
-                    { "n_tokens_predicted", n_tokens_predicted},
-                    { "t_tokens_generation", t_tokens_generation},
+                    { "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed},
+                    { "t_prompt_processing", metrics.t_prompt_processing},
+                    { "n_tokens_predicted", metrics.n_tokens_predicted},
+                    { "t_tokens_generation", metrics.t_tokens_generation},

                     { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
                     { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},

                     { "slots", slots_data },
                 };

-                n_prompt_tokens_processed = 0;
-                t_prompt_processing       = 0;
-                n_tokens_predicted        = 0;
-                t_tokens_generation       = 0;
+                // reset metrics for the next bucket
+                metrics.reset_bucket();

                 queue_results.send(res);
             } break;
         }
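Side note: because `reset_bucket()` runs right after the response is built, the unsuffixed counters are per-report deltas, so a consumer can turn one bucket directly into a rate. A small sketch (values invented; the `t_*` counters are milliseconds, per the `/1e3` conversion in the next hunk):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Example bucket values as they might appear in one metrics response.
    uint64_t n_tokens_predicted  = 128;  // tokens generated in this bucket
    uint64_t t_tokens_generation = 4000; // ms spent generating in this bucket

    // Milliseconds -> seconds: scale by 1e3 to get tokens per second.
    double tok_per_sec = t_tokens_generation > 0
        ? 1e3 * (double) n_tokens_predicted / (double) t_tokens_generation
        : 0.0;
    std::printf("%.1f tok/s\n", tok_per_sec); // prints: 32.0 tok/s
    return 0;
}
```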
@@ -1878,7 +1899,7 @@
                 {
                     slot.t_start_genereration = ggml_time_us();
                     slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3;
-                    update_metrics_prompt_eval(slot);
+                    metrics.on_prompt_eval(slot);
                 }

                 llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
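Timing note: `ggml_time_us()` returns microseconds, so the `/ 1e3` stores the prompt-processing span in milliseconds before `on_prompt_eval` accumulates it (the `t_start_genereration` spelling is the identifier as it already exists in the file). A tiny sketch of the arithmetic with made-up timestamps:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Made-up timestamps in microseconds, as returned by ggml_time_us().
    int64_t t_start_process_prompt = 1000000;
    int64_t t_start_generation     = 1250000;

    // Same arithmetic as the diff: microseconds / 1e3 -> milliseconds.
    double t_prompt_processing = (t_start_generation - t_start_process_prompt) / 1e3;
    std::printf("%.0f ms\n", t_prompt_processing); // prints: 250 ms
    return 0;
}
```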
@@ -1901,7 +1922,7 @@
                     slot.release();
                     slot.print_timings();
                     send_final_response(slot);
-                    update_metrics_prediction(slot);
+                    metrics.on_prediction(slot);
                 }

                 slot.i_batch = -1;
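This hook fires once per finished completion (right after the final response is sent), so each call adds a whole completion's `n_decoded` to the bucket at once. A minimal sketch with stub types:

```cpp
#include <cstdint>
#include <cstdio>

// Stub types: just enough to show the accumulation behaviour.
struct slot_stub {
    int32_t n_decoded;
    double  t_token_generation; // ms
};

struct metrics_stub {
    uint64_t n_tokens_predicted = 0;
    void on_prediction(const slot_stub &s) { n_tokens_predicted += s.n_decoded; }
};

int main() {
    metrics_stub m;
    m.on_prediction({ 16,  500.0 }); // first completion finishes
    m.on_prediction({ 48, 1500.0 }); // second completion finishes
    std::printf("bucket=%llu\n",
                (unsigned long long) m.n_tokens_predicted); // bucket=64
    return 0;
}
```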
@@ -1913,20 +1934,6 @@
     void run_on_all_tasks_finished() {
         update_slots();
     }

-    void update_metrics_prompt_eval(const llama_client_slot &slot) {
-        n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed;
-
-        n_prompt_tokens_processed += slot.num_prompt_tokens_processed;
-        t_prompt_processing       += slot.t_prompt_processing;
-    }
-
-    void update_metrics_prediction(const llama_client_slot &slot) {
-        n_tokens_predicted_total += slot.n_decoded;
-
-        n_tokens_predicted  += slot.n_decoded;
-        t_tokens_generation += slot.t_token_generation;
-    }
-
 };

 static void server_print_usage(const char *argv0, const gpt_params &params,
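Overall the change is behaviour-preserving: the removed `update_metrics_*` members had the same bodies that now live in `llama_metrics`. As a closing illustration, a hypothetical consumer-side sketch averaging throughput across several scraped buckets (all values invented):

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical consumer: average prompt throughput across several scrapes
// of the bucketed counters (not part of the commit).
int main() {
    // (n_prompt_tokens_processed, t_prompt_processing in ms) per scrape
    uint64_t buckets[3][2] = { { 24, 120 }, { 48, 300 }, { 0, 0 } };
    uint64_t n_sum = 0, t_sum = 0;
    for (const auto &b : buckets) { n_sum += b[0]; t_sum += b[1]; }
    if (t_sum > 0) {
        std::printf("avg prompt speed: %.1f tok/s\n",
                    1e3 * (double) n_sum / (double) t_sum); // 171.4 tok/s
    }
    return 0;
}
```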