export llama_timings as struct and expose them in server

2023-07-04 21:52:04 -04:00 · 2023-07-04 21:52:04 -04:00 · 30d973dc42
commit 30d973dc42
parent a76ce02a6c
7 changed files with 1631 additions and 1309 deletions
--- a/llama.h
+++ b/llama.h
@@ -134,6 +134,20 @@ extern "C" {
        bool quantize_output_tensor; // quantize output.weight
    } llama_model_quantize_params;

+    // performance timing information; returned by value from llama_get_timings().
+    struct llama_timings { // no typedef is declared here, so C callers must write `struct llama_timings`
+        double t_start_ms;          // NOTE(review): "_ms" suffix suggests milliseconds; clock/epoch not shown here -- confirm
+        double t_end_ms;            // NOTE(review): presumably the end of the measured interval -- confirm
+        double load_time_ms;        // presumably model load time -- confirm against the implementation
+        double sample_time_ms;      // presumably cumulative sampling time; likely pairs with n_sample -- confirm
+        double prompt_eval_time_ms; // presumably cumulative prompt evaluation time; likely pairs with n_p_eval -- confirm
+        double eval_time_ms;        // presumably cumulative evaluation time; likely pairs with n_eval -- confirm
+
+        int32_t n_sample;           // NOTE(review): looks like an event count for sample_time_ms -- confirm
+        int32_t n_p_eval;           // NOTE(review): looks like an event count for prompt_eval_time_ms -- confirm
+        int32_t n_eval;             // NOTE(review): looks like an event count for eval_time_ms -- confirm
+    };
+
    LLAMA_API struct llama_context_params llama_context_default_params();
    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

@@ -331,6 +345,7 @@ extern "C" {
    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);

    // Performance information
+    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); // returns the timings struct by value; `struct` tag is required in C (llama_timings has no typedef)
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
    LLAMA_API void llama_reset_timings(struct llama_context * ctx);