diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index b78992a13..acccbc9d7 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,7 +1,9 @@
-#include <httplib.h>
-#include <json.hpp>
 #include "common.h"
 #include "llama.h"
+#include "build-info.h"
+
+#include <httplib.h>
+#include <json.hpp>
 
 struct server_params
 {
@@ -30,7 +32,7 @@ struct llama_server_context
     std::vector<llama_token> embd_inp;
     std::vector<llama_token> last_prompt_tokens;
 
-    llama_context *ctx;
+    llama_context *ctx = nullptr;
     gpt_params params;
 
     std::string stopping_word;
@@ -38,6 +40,14 @@ struct llama_server_context
     bool verbose = false;
     int json_indent = -1;
 
+    ~llama_server_context()
+    {
+        if (ctx) {
+            llama_free(ctx);
+            ctx = nullptr;
+        }
+    }
+
    void rewind() {
        params.antiprompt.clear();
        num_tokens_predicted = 0;
@@ -765,6 +775,8 @@ std::string log(const Request &req, const Response &res)
 
 int main(int argc, char **argv)
 {
+    llama_init_backend();
+
     // own arguments required by this example
     gpt_params params;
     server_params sparams;
@@ -785,6 +797,10 @@ int main(int argc, char **argv)
         params.model_alias = params.model;
     }
 
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+    fprintf(stderr, "system_info: n_threads = %d / %d | %s\n\n", params.n_threads,
+            std::thread::hardware_concurrency(), llama_print_system_info());
+
     // load the model
     if (!llama.loadModel(params))
     {
@@ -809,6 +825,7 @@ int main(int argc, char **argv)
         }
 
         llama.rewind();
+        llama_reset_timings(llama.ctx);
         if (parse_options_completion(json::parse(req.body), llama, res) == false) {
             return;
         }
@@ -837,6 +854,11 @@ int main(int argc, char **argv)
             {"generation_settings", format_generation_settings(llama)},
             {"prompt", llama.params.prompt},
             {"stopping_word", llama.stopping_word}};
+
+        if (llama.verbose) {
+            llama_print_timings(llama.ctx);
+        }
+
         return res.set_content(
             data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace),
             "application/json");
@@ -894,18 +916,29 @@ int main(int argc, char **argv)
                     {"generated_text", llama.generated_text}};
             }
 
-            std::string str = "data: " +
-                              data.dump(llama.json_indent, ' ', false,
-                                        json::error_handler_t::replace) +
-                              "\n\n";
+            std::string str =
+                "data: " +
+                data.dump(llama.has_next_token ? -1 : llama.json_indent, ' ', false,
+                          json::error_handler_t::replace) +
+                "\n\n";
+
+            if (llama.verbose) {
+                fprintf(stderr, "to_send=%s", str.c_str());
+            }
+
             if (!sink.write(str.data(), str.size())) {
                 if (llama.verbose) {
                     fprintf(stderr, "stream closed\n");
+                    llama_print_timings(llama.ctx);
                 }
                 return false;
             }
         }
 
+        if (llama.verbose) {
+            llama_print_timings(llama.ctx);
+        }
+
         sink.done();
         return true;
     };
@@ -978,4 +1011,6 @@ int main(int argc, char **argv)
     if (!svr.listen_after_bind()) {
         return 1;
     }
+
+    return 0;
 }
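
Note on the streaming change above: each chunk is now framed as a server-sent event, i.e. a "data: " prefix, the JSON payload, and a blank line, and the payload is dumped compact (indent -1) while has_next_token is true. A minimal sketch of that framing from the consumer side, using only the standard library; the function name parse_sse and the sample payloads are hypothetical and not part of this patch:

    #include <cstdio>
    #include <string>
    #include <vector>

    // Split a server-sent-event byte stream into its "data:" payloads.
    // Events are delimited by a blank line ("\n\n"), matching the
    // "data: " + json + "\n\n" framing the server emits above.
    static std::vector<std::string> parse_sse(const std::string & stream) {
        std::vector<std::string> payloads;
        size_t pos = 0;
        while (pos < stream.size()) {
            size_t end = stream.find("\n\n", pos);
            if (end == std::string::npos) {
                break; // incomplete event; wait for more bytes
            }
            std::string event = stream.substr(pos, end - pos);
            const std::string prefix = "data: ";
            if (event.compare(0, prefix.size(), prefix) == 0) {
                payloads.push_back(event.substr(prefix.size()));
            }
            pos = end + 2; // skip the "\n\n" delimiter
        }
        return payloads;
    }

    int main() {
        // hypothetical bytes, shaped like what the server above would send
        std::string stream =
            "data: {\"content\":\"Hel\"}\n\n"
            "data: {\"content\":\"lo\"}\n\n";
        for (const auto & p : parse_sse(stream)) {
            printf("payload: %s\n", p.c_str());
        }
        return 0;
    }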