server: Add "tokens per second" information in the backend (#10548)

* add cmake rvv support

* add timings

* remove space

* update readme

* fix

* fix code

* remove empty line

* add test

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
haopeng 2024-12-02 21:45:54 +08:00 committed by GitHub
parent 991f8aabee
commit 64ed2091b2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 44 additions and 1 deletion

View file

@@ -650,6 +650,10 @@ static json format_final_response_oaicompat(const json & request, const json & r
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
}
if (result.contains("timings")) {
res.push_back({"timings", json_value(result, "timings", json::object())});
}
return res;
}
@@ -740,6 +744,11 @@ static std::vector<json> format_partial_response_oaicompat(const json & result,
{"model", modelname},
{"object", "chat.completion.chunk"}
};
if (result.contains("timings")) {
ret.push_back({"timings", json_value(result, "timings", json::object())});
}
if (!finish_reason.empty()) {
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);