diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index c9fe92428..c8ee52206 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1031,7 +1031,7 @@ static json format_final_response(llama_server_context &llama, const std::string
 {
     json res = json{
-        {"content", content},
+        {"content", ""},
         {"stop", true},
         {"model", llama.params.model_alias},
         {"tokens_predicted", llama.num_tokens_predicted},
         {"tokens_evaluated", llama.num_prompt_tokens},
@@ -1312,24 +1312,45 @@ int main(int argc, char **argv)
                             sent_token_probs_index = probs_stop_pos;
                         }
 
-                        const json data = llama.has_next_token
-                                              ? format_partial_response(llama, to_send, probs_output)
-                                              // Generation is done, send extra information.
-                                              : format_final_response(llama, to_send, llama.generated_token_probs);
+                        {
+                            // Always send a partial response
+                            // so the client can get the correct partial response of the last to_send
+                            const json data = format_partial_response(llama, to_send, probs_output);
 
-                        const std::string str =
-                            "data: " +
-                            data.dump(-1, ' ', false, json::error_handler_t::replace) +
-                            "\n\n";
+                            const std::string str =
+                                "data: " +
+                                data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                                "\n\n";
 
-                        LOG_VERBOSE("data stream", {
-                            { "to_send", str }
-                        });
+                            LOG_VERBOSE("data stream", {
+                                { "to_send", str }
+                            });
 
-                        if (!sink.write(str.data(), str.size())) {
-                            LOG_VERBOSE("stream closed", {});
-                            llama_print_timings(llama.ctx);
-                            return false;
+                            if (!sink.write(str.data(), str.size())) {
+                                LOG_VERBOSE("stream closed", {});
+                                llama_print_timings(llama.ctx);
+                                return false;
+                            }
+                        }
+
+                        if (!llama.has_next_token) {
+                            // Generation is done, send extra information.
+                            const json data = format_final_response(llama, to_send, llama.generated_token_probs);
+
+                            const std::string str =
+                                "data: " +
+                                data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                                "\n\n";
+
+                            LOG_VERBOSE("data stream", {
+                                { "to_send", str }
+                            });
+
+                            if (!sink.write(str.data(), str.size())) {
+                                LOG_VERBOSE("stream closed", {});
+                                llama_print_timings(llama.ctx);
+                                return false;
+                            }
                         }
                     }
                 }
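
A minimal client-side sketch of what this change enables (not part of the patch; it assumes nlohmann::json via the `json.hpp` header that server.cpp already uses, and the event payloads are illustrative). Because the final `"stop": true` event now always carries an empty `content`, a streaming client can append every `content` field it sees and treat the stop event as metadata only; the last piece of generated text arrives in its own partial event instead of riding on the final response.

```cpp
// Hypothetical consumer of the patched stream. The payload strings below are
// hand-written examples of the "data: ..." events the server would emit.
#include <iostream>
#include <string>
#include <vector>

#include "json.hpp"  // nlohmann::json, same header server.cpp uses

using json = nlohmann::json;

int main() {
    const std::vector<std::string> events = {
        R"({"content": "Hello", "stop": false})",
        R"({"content": " world", "stop": false})",                 // last to_send, sent as a partial
        R"({"content": "", "stop": true, "tokens_predicted": 2})", // final: metadata only
    };

    std::string full;
    for (const auto & e : events) {
        const json data = json::parse(e);
        full += data.value("content", "");  // safe: the final event's content is now ""
        if (data.value("stop", false)) {
            break;  // extra information (timings, token counts) lives here
        }
    }
    std::cout << full << "\n";  // prints "Hello world"
}
```

Previously the last chunk of text was folded into the final response object, so a client that ignored the stop event's `content` would drop the tail of the generation; sending it as a normal partial first keeps the stop event purely informational.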