server: Add "tokens per second" information in the backend (#10548)

* add cmake rvv support * add timings * remove space * update readme * fix * fix code * remove empty line * add test --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2024-12-02 21:45:54 +08:00 · 2024-12-02 21:45:54 +08:00 · 64ed2091b2
commit 64ed2091b2
parent 991f8aabee
5 changed files with 44 additions and 1 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -177,6 +177,8 @@ struct server_slot {
    bool stopped_word   = false;
    bool stopped_limit  = false;

+    bool timings_per_token = false;
+
    bool oaicompat = false;

    std::string oaicompat_model;
@ -882,6 +884,8 @@ struct server_context {
            slot.oaicompat_model = "";
        }

+        slot.timings_per_token       = json_value(data, "timings_per_token",  false);
+
        slot.params.stream           = json_value(data, "stream",             false);
        slot.params.cache_prompt     = json_value(data, "cache_prompt",       true);
        slot.params.n_predict        = json_value(data, "n_predict",          json_value(data, "max_tokens", defaults.n_predict));
@ -1279,6 +1283,7 @@ struct server_context {
            {"speculative.n_max",         slot.params.speculative.n_max},
            {"speculative.n_min",         slot.params.speculative.n_min},
            {"speculative.p_min",         slot.params.speculative.p_min},
+            {"timings_per_token",         slot.timings_per_token},
        };
    }

@ -1336,6 +1341,10 @@ struct server_context {
            res.data["model"] = slot.oaicompat_model;
        }

+        if (slot.timings_per_token) {
+            res.data["timings"] = slot.get_formated_timings();
+        }
+
        queue_results.send(res);
    }

@ -2274,12 +2283,17 @@ struct server_context {
                common_sampler_accept(slot.smpl, id, true);

                slot.n_decoded += 1;
+
+                const int64_t t_current = ggml_time_us();
+
                if (slot.n_decoded == 1) {
-                    slot.t_start_generation = ggml_time_us();
+                    slot.t_start_generation = t_current;
                    slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
                    metrics.on_prompt_eval(slot);
                }

+                slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3;
+
                completion_token_output result;
                result.tok = id;