Always send a partial response to get the correct probs of the last to_send

2023-08-21 13:26:23 +08:00 · 2023-08-21 13:26:23 +08:00 · 1e9fe8a954
commit 1e9fe8a954
parent 371cc14815
1 changed files with 37 additions and 16 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1031,7 +1031,7 @@ static json format_final_response(llama_server_context &llama, const std::string
 {

    json res = json{
-        {"content", content},
+        {"content", ""},
        {"stop", true},
        {"model", llama.params.model_alias},
        {"tokens_predicted", llama.num_tokens_predicted},
@ -1312,24 +1312,45 @@ int main(int argc, char **argv)
                        sent_token_probs_index = probs_stop_pos;
                    }

-                    const json data = llama.has_next_token
-                                          ? format_partial_response(llama, to_send, probs_output)
-                                          // Generation is done, send extra information.
-                                          : format_final_response(llama, to_send, llama.generated_token_probs);
+                    {
+                        // Always send partial response
+                        // so we can get the correct partial response of the last to_send in the client
+                        const json data = format_partial_response(llama, to_send, probs_output);

-                    const std::string str =
-                        "data: " +
-                        data.dump(-1, ' ', false, json::error_handler_t::replace) +
-                        "\n\n";
+                        const std::string str =
+                            "data: " +
+                            data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                            "\n\n";

-                    LOG_VERBOSE("data stream", {
-                        { "to_send", str }
-                    });
+                        LOG_VERBOSE("data stream", {
+                            { "to_send", str }
+                        });

-                    if (!sink.write(str.data(), str.size())) {
-                        LOG_VERBOSE("stream closed", {});
-                        llama_print_timings(llama.ctx);
-                        return false;
+                        if (!sink.write(str.data(), str.size())) {
+                            LOG_VERBOSE("stream closed", {});
+                            llama_print_timings(llama.ctx);
+                            return false;
+                        }
+                    }
+                    
+                    if (!llama.has_next_token) {
+                        // Generation is done, send extra information.
+                        const json data = format_final_response(llama, to_send, llama.generated_token_probs);
+
+                        const std::string str =
+                            "data: " +
+                            data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                            "\n\n";
+
+                        LOG_VERBOSE("data stream", {
+                            { "to_send", str }
+                        });
+
+                        if (!sink.write(str.data(), str.size())) {
+                            LOG_VERBOSE("stream closed", {});
+                            llama_print_timings(llama.ctx);
+                            return false;
+                        }
                    }
                }