minor style fix

Xuan Son Nguyen 2024-12-04 19:36:37 +01:00
parent 3b41ad53a3
commit 1261086163
2 changed files with 21 additions and 13 deletions


@@ -1170,12 +1170,15 @@ struct server_context {
         server_task_result_cmpl_partial res;
         res.id = slot.id_task;
         res.index = slot.index;
-        res.content = tkn.text_to_send;
-        res.truncated = slot.truncated;
+
         res.n_decoded = slot.n_decoded;
         res.n_prompt_tokens = slot.n_prompt_tokens;
-        res.stop = slot.stop;
+
+        res.content = tkn.text_to_send;
+        res.truncated = slot.truncated;
+        res.stop = slot.stop;
+
         // populate res.probs_output
         if (slot.params.sampling.n_probs > 0) {
             const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
             const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
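
Editorial aside: the probs_pos clamp above is what keeps streamed token probabilities from being sent twice. A minimal self-contained sketch of that windowing, under assumed types (token_prob and take_unsent are illustrative names, not llama.cpp API):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    struct token_prob { int tok; float prob; };

    // Return the slice of `generated` that has not been sent yet, capped by how
    // many probability entries actually exist; advance the n_sent cursor past it.
    static std::vector<token_prob> take_unsent(const std::vector<token_prob> & generated,
                                               size_t & n_sent, size_t n_new) {
        const size_t pos      = std::min(n_sent, generated.size());
        const size_t stop_pos = std::min(n_sent + n_new, generated.size());
        std::vector<token_prob> out(generated.begin() + pos, generated.begin() + stop_pos);
        n_sent = stop_pos; // next partial response starts where this one stopped
        return out;
    }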
@@ -1206,20 +1209,22 @@ struct server_context {
         server_task_result_cmpl_final res;
         res.id = slot.id_task;
         res.id_slot = slot.id;
         res.index = slot.index;
-        res.content = slot.generated_text;
-        res.timings = slot.get_timings();
         res.model_alias = slot.oaicompat_model;
         res.prompt = common_detokenize(ctx, slot.prompt_tokens, true);
-        res.truncated = slot.truncated;
+
         res.n_decoded = slot.n_decoded;
         res.n_prompt_tokens = slot.n_prompt_tokens;
-        res.has_new_line = slot.has_new_line;
         res.n_tokens_cached = slot.n_past;
+        res.content = slot.generated_text;
+        res.has_new_line = slot.has_new_line;
         res.stopping_word = slot.stopping_word;
         res.stop = slot.stop;
+        res.truncated = slot.truncated;
+        res.timings = slot.get_timings();
+        res.generation_params = slot.params; // copy the parameters

         // populate res.probs_output
         if (slot.params.sampling.n_probs > 0) {
             if (!slot.params.stream && slot.stop == STOP_TYPE_WORD) {
                 const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
@@ -1235,6 +1240,8 @@ struct server_context {
             }
         }

-        res.generation_params = slot.params; // copy the parameters
         queue_results.send(res);
     }
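
The pattern in both response paths is the same: fill a plain result struct, then hand it to queue_results so the thread handling the HTTP request can pick it up and serialize it. A minimal sketch of such a send/recv queue (result_queue is a hypothetical stand-in, not the actual server_response class in llama.cpp):

    #include <condition_variable>
    #include <deque>
    #include <mutex>

    template <typename T>
    struct result_queue {
        std::mutex              mtx;
        std::condition_variable cv;
        std::deque<T>           results;

        // called by the worker after populating the result struct
        void send(T res) {
            std::lock_guard<std::mutex> lk(mtx);
            results.push_back(std::move(res));
            cv.notify_all();
        }

        // called by the request handler; blocks until a result arrives
        T recv() {
            std::unique_lock<std::mutex> lk(mtx);
            cv.wait(lk, [&] { return !results.empty(); });
            T res = std::move(results.front());
            results.pop_front();
            return res;
        }
    };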


@@ -237,7 +237,6 @@ struct server_task_result_cmpl_final : server_task_result {
     int index = 0;
     std::string content;
     bool stream;
-    bool timings_per_token;
     result_timings timings;
     std::string model_alias;
     std::string prompt;
@@ -245,10 +244,11 @@ struct server_task_result_cmpl_final : server_task_result {
     bool truncated;
     int32_t n_decoded;
     int32_t n_prompt_tokens;
-    int32_t has_new_line;
-    int32_t stopping_word;
     int32_t n_tokens_cached;
+    int32_t has_new_line;
+    std::string stopping_word;
     stop_type stop = STOP_TYPE_NONE;

     std::vector<completion_token_output> probs_output;
+
     slot_params generation_params;
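
The substantive fix in this hunk is the type of stopping_word: send_final_response assigns slot.stopping_word into it and re-tokenizes it via common_tokenize (see the server_context hunk above), so it must be a string; the old int32_t declaration could not have held that assignment. Illustrated with stand-in types (slot_like and result_like are not real llama.cpp names):

    #include <string>

    struct slot_like   { std::string stopping_word; }; // what the slot stores
    struct result_like { std::string stopping_word; }; // after this fix

    void copy_stop(result_like & res, const slot_like & slot) {
        res.stopping_word = slot.stopping_word; // would not compile against int32_t
    }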
@@ -291,6 +291,7 @@ struct server_task_result_cmpl_partial : server_task_result {
     int32_t n_prompt_tokens;
     stop_type stop = STOP_TYPE_NONE;
+
     std::vector<completion_token_output> probs_output;
     result_timings timings;
@@ -346,7 +347,7 @@ struct server_task_result_embd : server_task_result {
 struct server_task_result_rerank : server_task_result {
     server_task_result_rerank() : server_task_result(RESULT_TYPE_RERANK) {}
     int index = 0;
-    float score;
+    float score = -1e6;

     json to_json() {
         return json {
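
The -1e6 default closes an uninitialized-read hole: to_json() serializes score unconditionally, so a result whose score was never computed would otherwise emit an indeterminate value. A sketch with a stand-in type (rerank_result_like is illustrative, not the real struct):

    // With no in-class initializer, reading `score` before reranking ran would be
    // undefined behavior; -1e6 acts as an effectively minus-infinite sentinel so
    // unscored results rank last instead of randomly.
    struct rerank_result_like {
        int   index = 0;
        float score = -1e6;
    };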