export llama_timings as struct and expose them in server

Tobias Lütke 2023-07-04 21:52:04 -04:00
parent c19daa4eb5
commit efa86bf2a6
7 changed files with 1170 additions and 1001 deletions

examples/server/server.cpp

@@ -726,6 +726,8 @@ static json format_generation_settings(llama_server_context & llama) {
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
return json {
{ "n_ctx", llama.params.n_ctx },
{ "model", llama.params.model_alias },
{ "seed", llama.params.seed },
{ "temp", llama.params.temp },
{ "top_k", llama.params.top_k },
@@ -756,13 +758,29 @@ static json format_embedding_response(llama_server_context & llama) {
};
}
static json format_timings(llama_server_context & llama) {
const auto timings = llama_get_timings(llama.ctx);
assert(timings.n_eval == llama.num_tokens_predicted);
return json {
{ "prompt_n", timings.n_eval },
{ "prompt_ms", timings.prompt_eval_time_ms },
{ "prompt_per_token_ms", timings.prompt_eval_time_ms / timings.n_p_eval },
{ "prompt_per_second", 1e3 / timings.prompt_eval_time_ms * timings.n_p_eval },
{ "predicted_n", timings.n_eval },
{ "predicted_ms", timings.eval_time_ms },
{ "predicted_per_token_ms", timings.eval_time_ms / timings.n_eval },
{ "predicted_per_second", 1e3 / timings.eval_time_ms * timings.n_eval },
};
}
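
For reference, the llama_timings struct that this commit exports from llama.h (and that format_timings reads above) looks like the sketch below; the explanatory comments are mine:

struct llama_timings {
    double t_start_ms;
    double t_end_ms;
    double t_load_ms;    // model load time
    double t_sample_ms;  // total time spent sampling
    double t_p_eval_ms;  // time spent evaluating the prompt
    double t_eval_ms;    // time spent generating (predicting) tokens

    int32_t n_sample;    // number of sampled tokens
    int32_t n_p_eval;    // number of prompt tokens evaluated
    int32_t n_eval;      // number of generated tokens evaluated
};

// accessor introduced alongside the struct
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);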
static json format_final_response(llama_server_context & llama, const std::string & content, const std::vector<completion_token_output> & probs) {
json res = json {
{ "content", content },
{ "stop", true },
{ "model", llama.params.model_alias },
{ "tokens_predicted", llama.num_tokens_predicted },
{ "generation_settings", format_generation_settings(llama) },
{ "prompt", llama.params.prompt },
{ "truncated", llama.truncated },
@@ -770,6 +788,9 @@ static json format_final_response(llama_server_context & llama, const std::string & content, const std::vector<completion_token_output> & probs) {
{ "stopped_word", llama.stopped_word },
{ "stopped_limit", llama.stopped_limit },
{ "stopping_word", llama.stopping_word },
{ "tokens_cached", llama.n_past },
{ "tokens_predicted", llama.num_tokens_predicted },
{ "timings", format_timings(llama) },
};
if (llama.params.n_probs > 0) {
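
Taken together, the final response now carries tokens_cached and a nested timings object. A sketch of the resulting payload, with illustrative values (the keys come from the hunks above; the numbers are made up but internally consistent):

{
  "content": "...",
  "stop": true,
  "model": "example-alias",
  "tokens_cached": 128,
  "tokens_predicted": 16,
  "timings": {
    "prompt_n": 112,
    "prompt_ms": 901.3,
    "prompt_per_token_ms": 8.05,
    "prompt_per_second": 124.26,
    "predicted_n": 16,
    "predicted_ms": 842.7,
    "predicted_per_token_ms": 52.67,
    "predicted_per_second": 18.99
  }
}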
@@ -913,18 +934,18 @@ int main(int argc, char ** argv) {
{ "Access-Control-Allow-Headers", "content-type" }
});
// this is only called if no index.html is found in the public --path
svr.Get("/", [](const Request &, Response & res) {
res.set_content(reinterpret_cast<const char*>(&index_html), index_html_len, "text/html");
return false;
});
// this is only called if no index.js is found in the public --path
svr.Get("/index.js", [](const Request &, Response & res) {
res.set_content(reinterpret_cast<const char *>(&index_js), index_js_len, "text/javascript");
return false;
});
// this is only called if no completion.js is found in the public --path
svr.Get("/completion.js", [](const Request &, Response & res) {
res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
@@ -1039,6 +1060,11 @@ int main(int argc, char ** argv) {
}
});
svr.Get("/model.json", [&llama](const Request &, Response & res) {
const json data = format_generation_settings(llama);
return res.set_content(data.dump(), "application/json");
});
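
The new /model.json route returns the output of format_generation_settings, so a client can inspect the active sampling configuration without issuing a completion. A minimal client sketch, assuming the server listens on its default localhost:8080 and reusing the cpp-httplib dependency the server is already built on:

#include "httplib.h"

#include <iostream>

int main() {
    httplib::Client cli("localhost", 8080); // assumed default host/port

    // GET /model.json; the body is the JSON built by format_generation_settings
    if (auto res = cli.Get("/model.json")) {
        std::cout << res->body << "\n";
    } else {
        std::cerr << "request failed\n";
    }
    return 0;
}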
svr.Options(R"(/.*)", [](const Request &, Response & res) {
return res.set_content("", "application/json");
});