From 12610861639c30201bf6071fc951fd5954bb2b2e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 4 Dec 2024 19:36:37 +0100
Subject: [PATCH] minor style fix

---
 examples/server/server.cpp | 25 ++++++++++++++++---------
 examples/server/server.hpp |  9 +++++----
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 60947a17f..469663b2e 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1170,12 +1170,15 @@ struct server_context {
         server_task_result_cmpl_partial res;
         res.id    = slot.id_task;
         res.index = slot.index;
+        res.content = tkn.text_to_send;
+
+        res.truncated       = slot.truncated;
         res.n_decoded       = slot.n_decoded;
         res.n_prompt_tokens = slot.n_prompt_tokens;
 
-        res.content = tkn.text_to_send;
-        res.stop = slot.stop;
-        res.truncated = slot.truncated;
+        res.stop = slot.stop;
+
+        // populate res.probs_output
         if (slot.params.sampling.n_probs > 0) {
             const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
             const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
@@ -1206,20 +1209,22 @@ struct server_context {
         server_task_result_cmpl_final res;
         res.id      = slot.id_task;
         res.id_slot = slot.id;
 
+        res.index   = slot.index;
         res.content = slot.generated_text;
+        res.timings = slot.get_timings();
+        res.model_alias = slot.oaicompat_model;
+        res.prompt      = common_detokenize(ctx, slot.prompt_tokens, true);
 
+        res.truncated       = slot.truncated;
         res.n_decoded       = slot.n_decoded;
         res.n_prompt_tokens = slot.n_prompt_tokens;
-        res.has_new_line = slot.has_new_line;
         res.n_tokens_cached = slot.n_past;
-        res.content = slot.generated_text;
+        res.has_new_line  = slot.has_new_line;
+        res.stopping_word = slot.stopping_word;
         res.stop = slot.stop;
-        res.truncated = slot.truncated;
-        res.timings = slot.get_timings();
-
-        res.generation_params = slot.params; // copy the parameters
 
+        // populate res.probs_output
         if (slot.params.sampling.n_probs > 0) {
             if (!slot.params.stream && slot.stop == STOP_TYPE_WORD) {
                 const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
@@ -1235,6 +1240,8 @@
             }
         }
 
+        res.generation_params = slot.params; // copy the parameters
+
         queue_results.send(res);
     }
 
diff --git a/examples/server/server.hpp b/examples/server/server.hpp
index e9c94fa56..1e65614f6 100644
--- a/examples/server/server.hpp
+++ b/examples/server/server.hpp
@@ -237,7 +237,6 @@ struct server_task_result_cmpl_final : server_task_result {
     int index = 0;
     std::string content;
     bool stream;
-    bool timings_per_token;
    result_timings timings;
     std::string model_alias;
     std::string prompt;
@@ -245,10 +244,11 @@
     bool truncated;
     int32_t n_decoded;
     int32_t n_prompt_tokens;
-    int32_t has_new_line;
-    int32_t stopping_word;
     int32_t n_tokens_cached;
+    int32_t has_new_line;
+    std::string stopping_word;
     stop_type stop = STOP_TYPE_NONE;
+    std::vector<completion_token_output> probs_output;
 
     slot_params generation_params;
 
@@ -291,6 +291,7 @@ struct server_task_result_cmpl_partial : server_task_result {
     int32_t n_prompt_tokens;
 
     stop_type stop = STOP_TYPE_NONE;
+    std::vector<completion_token_output> probs_output;
 
     result_timings timings;
 
@@ -346,7 +347,7 @@ struct server_task_result_embd : server_task_result {
 struct server_task_result_rerank : server_task_result {
     server_task_result_rerank() : server_task_result(RESULT_TYPE_RERANK) {}
     int index = 0;
-    float score;
+    float score = -1e6;
 
     json to_json() {
         return json {