server: Add "tokens per second" information in the backend (#10548)

* add cmake rvv support
* add timings
* remove space
* update readme
* fix
* fix code
* remove empty line
* add test

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
2024-12-02 21:45:54 +08:00 · 2024-12-02 21:45:54 +08:00 · 64ed2091b2
commit 64ed2091b2
parent 991f8aabee
5 changed files with 44 additions and 1 deletions
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -650,6 +650,10 @@ static json format_final_response_oaicompat(const json & request, const json & r
        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
    }

+    if (result.contains("timings")) {
+        res.push_back({"timings", json_value(result, "timings", json::object())});
+    }
+
    return res;
 }

@@ -740,6 +744,11 @@ static std::vector<json> format_partial_response_oaicompat(const json & result,
        {"model",   modelname},
        {"object",  "chat.completion.chunk"}
    };
+
+    if (result.contains("timings")) {
+        ret.push_back({"timings", json_value(result, "timings", json::object())});
+    }
+
    if (!finish_reason.empty()) {
        int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
        int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);