From c1cbde82a12d59a0ee8ae2ae6025c99f18c1e526 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 00:00:56 -0300 Subject: [PATCH 01/10] print error when server can't bind to the interface --- examples/server/server.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index ad46f56e9..5c1662865 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -901,8 +901,6 @@ int main(int argc, char **argv) return res.set_content(data.dump(llama.json_indent), "application/json"); }); - fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port); - if(params.embedding) { fprintf(stderr, "NOTE: Mode embedding enabled. Completion function doesn't work in this mode.\n"); } @@ -930,5 +928,16 @@ int main(int argc, char **argv) // set timeouts and change hostname and port svr.set_read_timeout(sparams.read_timeout); svr.set_write_timeout(sparams.write_timeout); - svr.listen(sparams.hostname, sparams.port); + + if (!svr.bind_to_port(sparams.hostname, sparams.port)) { + fprintf(stderr, "%s: ERROR: couldn't bind server to %s:%i\n", __func__, + sparams.hostname.c_str(), sparams.port); + return 1; + } + + fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, + sparams.hostname.c_str(), sparams.port); + if (!svr.listen_after_bind()) { + return 1; + } } From 2c08f29691d6a69bb1c26db2a239e8a8124c313d Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 09:02:32 -0300 Subject: [PATCH 02/10] make api server use only a single thread --- examples/server/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index b38fa864a..67b086754 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -2,6 +2,9 @@ set(TARGET server) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp json.hpp httplib.h) target_compile_definitions(${TARGET} PRIVATE + # single thread + CPPHTTPLIB_THREAD_POOL_COUNT=1 + # crash the server in the debug mode, otherwise send http 500 error $<$: CPPHTTPLIB_NO_EXCEPTIONS=1 > From 284bc293b1e003659416e776d1b9528ebca38d10 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:46:06 -0300 Subject: [PATCH 03/10] reserve memory for generated_text --- examples/server/server.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5c1662865..b42333228 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -42,6 +42,7 @@ struct llama_server_context params.antiprompt.clear(); num_tokens_predicted = 0; generated_text = ""; + generated_text.reserve(params.n_ctx); stopping_word = ""; //processed_tokens.clear(); From f1710b90dcd4fb47a170e8e05faceb26ed594580 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:35:25 -0300 Subject: [PATCH 04/10] add infinite generation when n_predict is -1 --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b42333228..b0f0486b7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -250,7 +250,7 @@ struct llama_server_context return result; } - has_next_token = n_remain != 0; + has_next_token = params.n_predict == -1 ? 
true : n_remain != 0; return result; } From aa2bbb2d357617907278b5102abbae49bab2236a Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:36:51 -0300 Subject: [PATCH 05/10] fix parameter type --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b0f0486b7..37b5b78d3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -52,7 +52,7 @@ struct llama_server_context n_consumed = 0; } - bool loadModel(gpt_params params_) + bool loadModel(const gpt_params ¶ms_) { params = params_; ctx = llama_init_from_gpt_params(params); From 27911d6d68d465dc944af508aeb284288019eb3b Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:37:52 -0300 Subject: [PATCH 06/10] fix default model alias --- examples/server/server.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 37b5b78d3..fbfcc6b7f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -781,6 +781,10 @@ int main(int argc, char **argv) llama.verbose = sparams.verbose; llama.json_indent = sparams.verbose ? 4 : -1; + if (params.model_alias == "unknown") { + params.model_alias = params.model; + } + // load the model if (!llama.loadModel(params)) { From dd3021933232573bfdde2cb249c22ab332d353f3 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:40:42 -0300 Subject: [PATCH 07/10] buffer incomplete multi-byte characters --- examples/server/server.cpp | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index fbfcc6b7f..b78992a13 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -842,16 +842,49 @@ int main(int argc, char **argv) "application/json"); } else { const auto chunked_content_provider = [&](size_t, DataSink &sink) { + size_t sent_count = 0; + int32_t multibyte_pending = 0; + while (llama.has_next_token) { std::string token_text = llama.doCompletion(); + if (multibyte_pending > 0) { + multibyte_pending -= token_text.size(); + } else if (token_text.size() == 1) { + const char c = token_text[0]; + // 2-byte characters: 110xxxxx 10xxxxxx + if ((c & 0xE0) == 0xC0) { + multibyte_pending = 1; + // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF0) == 0xE0) { + multibyte_pending = 2; + // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + } else if ((c & 0xF8) == 0xF0) { + multibyte_pending = 3; + } else { + multibyte_pending = 0; + } + } + + if (multibyte_pending > 0) { + if (!llama.has_next_token) { + llama.has_next_token = true; + llama.n_remain++; + } + continue; + } + + const size_t pos = std::min(sent_count, llama.generated_text.size()); + std::string to_send = llama.generated_text.substr(pos); + sent_count += to_send.size(); + json data; if (llama.has_next_token) { - data = {{"content", token_text}, {"stop", false}}; + data = {{"content", to_send}, {"stop", false}}; } else { // Generation is done, send extra information. data = { - {"content", token_text}, + {"content", to_send}, {"stop", true}, {"model", llama.params.model_alias}, {"tokens_predicted", llama.num_tokens_predicted}, From 40e13805d983c93598249c2673ba9fc4e8f1dc0d Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 10:41:47 -0300 Subject: [PATCH 08/10] print timings + build info I don't know if llama_free is needed but it was used in main.cpp. 
--- examples/server/server.cpp | 49 ++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b78992a13..acccbc9d7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,7 +1,9 @@ -#include -#include #include "common.h" #include "llama.h" +#include "build-info.h" + +#include +#include struct server_params { @@ -30,7 +32,7 @@ struct llama_server_context std::vector embd_inp; std::vector last_prompt_tokens; - llama_context *ctx; + llama_context *ctx = nullptr; gpt_params params; std::string stopping_word; @@ -38,6 +40,14 @@ struct llama_server_context bool verbose = false; int json_indent = -1; + ~llama_server_context() + { + if (ctx) { + llama_free(ctx); + ctx = nullptr; + } + } + void rewind() { params.antiprompt.clear(); num_tokens_predicted = 0; @@ -765,6 +775,8 @@ std::string log(const Request &req, const Response &res) int main(int argc, char **argv) { + llama_init_backend(); + // own arguments required by this example gpt_params params; server_params sparams; @@ -785,6 +797,10 @@ int main(int argc, char **argv) params.model_alias = params.model; } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n\n", params.n_threads, + std::thread::hardware_concurrency(), llama_print_system_info()); + // load the model if (!llama.loadModel(params)) { @@ -809,6 +825,7 @@ int main(int argc, char **argv) } llama.rewind(); + llama_reset_timings(llama.ctx); if (parse_options_completion(json::parse(req.body), llama, res) == false) { return; @@ -837,6 +854,11 @@ int main(int argc, char **argv) {"generation_settings", format_generation_settings(llama)}, {"prompt", llama.params.prompt}, {"stopping_word", llama.stopping_word}}; + + if (llama.verbose) { + llama_print_timings(llama.ctx); + } + return res.set_content( data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), "application/json"); @@ -894,18 +916,29 @@ int main(int argc, char **argv) {"generated_text", llama.generated_text}}; } - std::string str = "data: " + - data.dump(llama.json_indent, ' ', false, - json::error_handler_t::replace) + - "\n\n"; + std::string str = + "data: " + + data.dump(llama.has_next_token ? 
-1 : llama.json_indent, ' ', false, + json::error_handler_t::replace) + + "\n\n"; + + if (llama.verbose) { + fprintf(stderr, "to_send=%s", str.c_str()); + } + if (!sink.write(str.data(), str.size())) { if (llama.verbose) { fprintf(stderr, "stream closed\n"); + llama_print_timings(llama.ctx); } return false; } } + if (llama.verbose) { + llama_print_timings(llama.ctx); + } + sink.done(); return true; }; @@ -978,4 +1011,6 @@ int main(int argc, char **argv) if (!svr.listen_after_bind()) { return 1; } + + return 0; } From d58e48663d119d439abbd388390f7101dec3bbe5 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 11:56:12 -0300 Subject: [PATCH 09/10] default penalize_nl to false + format --- examples/server/server.cpp | 287 +++++++++++++++---------------------- 1 file changed, 114 insertions(+), 173 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index acccbc9d7..eb75ab1de 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -507,210 +507,151 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para return true; } -bool parse_options_completion(json body, llama_server_context& llama, Response &res) { +bool parse_options_completion(json body, llama_server_context& llama, Response &res) +{ gpt_params default_params; - if (!body["stream"].is_null()) - { - llama.stream = body["stream"].get(); + if (!body["stream"].is_null()) { + llama.stream = body["stream"].get(); + } else { + llama.stream = false; } - else - { - llama.stream = false; + if (!body["n_predict"].is_null()) { + llama.params.n_predict = body["n_predict"].get(); + } else { + llama.params.n_predict = default_params.n_predict; } - if (!body["n_predict"].is_null()) - { - llama.params.n_predict = body["n_predict"].get(); + if (!body["top_k"].is_null()) { + llama.params.top_k = body["top_k"].get(); + } else { + llama.params.top_k = default_params.top_k; } - else - { - llama.params.n_predict = default_params.n_predict; + if (!body["top_p"].is_null()) { + llama.params.top_p = body["top_p"].get(); + } else { + llama.params.top_p = default_params.top_p; } - if (!body["top_k"].is_null()) - { - llama.params.top_k = body["top_k"].get(); + if (!body["tfs_z"].is_null()) { + llama.params.tfs_z = body["tfs_z"].get(); + } else { + llama.params.tfs_z = default_params.tfs_z; } - else - { - llama.params.top_k = default_params.top_k; + if (!body["typical_p"].is_null()) { + llama.params.typical_p = body["typical_p"].get(); + } else { + llama.params.typical_p = default_params.typical_p; } - if (!body["top_p"].is_null()) - { - llama.params.top_p = body["top_p"].get(); + if (!body["repeat_last_n"].is_null()) { + llama.params.repeat_last_n = body["repeat_last_n"].get(); + } else { + llama.params.repeat_last_n = default_params.repeat_last_n; } - else - { - llama.params.top_p = default_params.top_p; + if (!body["temperature"].is_null()) { + llama.params.temp = body["temperature"].get(); + } else { + llama.params.temp = default_params.temp; } - if (!body["tfs_z"].is_null()) - { - llama.params.tfs_z = body["tfs_z"].get(); + if (!body["repeat_penalty"].is_null()) { + llama.params.repeat_penalty = body["repeat_penalty"].get(); + } else { + llama.params.repeat_penalty = default_params.repeat_penalty; } - else - { - llama.params.tfs_z = default_params.tfs_z; + if (!body["presence_penalty"].is_null()) { + llama.params.presence_penalty = body["presence_penalty"].get(); + } else { + llama.params.presence_penalty = default_params.presence_penalty; } - if (!body["typical_p"].is_null()) 
- { - llama.params.typical_p = body["typical_p"].get(); + if (!body["frequency_penalty"].is_null()) { + llama.params.frequency_penalty = body["frequency_penalty"].get(); + } else { + llama.params.frequency_penalty = default_params.frequency_penalty; } - else - { - llama.params.typical_p = default_params.typical_p; + if (!body["mirostat"].is_null()) { + llama.params.mirostat = body["mirostat"].get(); + } else { + llama.params.mirostat = default_params.mirostat; } - if (!body["repeat_last_n"].is_null()) - { - llama.params.repeat_last_n = body["repeat_last_n"].get(); + if (!body["mirostat_tau"].is_null()) { + llama.params.mirostat_tau = body["mirostat_tau"].get(); + } else { + llama.params.mirostat_tau = default_params.mirostat_tau; } - else - { - llama.params.repeat_last_n = default_params.repeat_last_n; + if (!body["mirostat_eta"].is_null()) { + llama.params.mirostat_eta = body["mirostat_eta"].get(); + } else { + llama.params.mirostat_eta = default_params.mirostat_eta; } - if (!body["temperature"].is_null()) - { - llama.params.temp = body["temperature"].get(); + if (!body["penalize_nl"].is_null()) { + llama.params.penalize_nl = body["penalize_nl"].get(); + } else { + llama.params.penalize_nl = false; } - else - { - llama.params.temp = default_params.temp; + if (!body["n_keep"].is_null()) { + llama.params.n_keep = body["n_keep"].get(); + } else { + llama.params.n_keep = default_params.n_keep; } - if (!body["repeat_penalty"].is_null()) - { - llama.params.repeat_penalty = body["repeat_penalty"].get(); - } - else - { - llama.params.repeat_penalty = default_params.repeat_penalty; - } - if (!body["presence_penalty"].is_null()) - { - llama.params.presence_penalty = body["presence_penalty"].get(); - } - else - { - llama.params.presence_penalty = default_params.presence_penalty; - } - if (!body["frequency_penalty"].is_null()) - { - llama.params.frequency_penalty = body["frequency_penalty"].get(); - } - else - { - llama.params.frequency_penalty = default_params.frequency_penalty; - } - if (!body["mirostat"].is_null()) - { - llama.params.mirostat = body["mirostat"].get(); - } - else - { - llama.params.mirostat = default_params.mirostat; - } - if (!body["mirostat_tau"].is_null()) - { - llama.params.mirostat_tau = body["mirostat_tau"].get(); - } - else - { - llama.params.mirostat_tau = default_params.mirostat_tau; - } - if (!body["mirostat_eta"].is_null()) - { - llama.params.mirostat_eta = body["mirostat_eta"].get(); - } - else - { - llama.params.mirostat_eta = default_params.mirostat_eta; - } - if (!body["penalize_nl"].is_null()) - { - llama.params.penalize_nl = body["penalize_nl"].get(); - } - else - { - llama.params.penalize_nl = default_params.penalize_nl; - } - if (!body["n_keep"].is_null()) - { - llama.params.n_keep = body["n_keep"].get(); - } - else - { - llama.params.n_keep = default_params.n_keep; - } - if (!body["seed"].is_null()) - { + if (!body["seed"].is_null()) { llama.params.seed = body["seed"].get(); - } - else - { + } else { llama.params.seed = time(NULL); } - if (!body["ignore_eos"].is_null() && body["ignore_eos"].get()) - { - llama.params.logit_bias[llama_token_eos()] = -INFINITY; + if (!body["ignore_eos"].is_null() && body["ignore_eos"].get()) { + llama.params.logit_bias[llama_token_eos()] = -INFINITY; + } else { + llama.params.logit_bias.erase(llama_token_eos()); } - else - { - llama.params.logit_bias.erase(llama_token_eos()); - } - if (!body["prompt"].is_null()) - { + if (!body["prompt"].is_null()) { llama.params.prompt = body["prompt"].get(); - } - else - { - json data = { - 
{"status", "error"}, - {"reason", "You need to pass the prompt"}}; + } else { + json data = {{"status", "error"}, {"reason", "You need to pass the prompt"}}; res.set_content(data.dump(llama.json_indent), "application/json"); res.status = 400; return false; } - if (!body["stop"].is_null()) - { + if (!body["stop"].is_null()) { llama.params.antiprompt = body["stop"].get>(); - } - else - { - llama.params.antiprompt.clear(); + } else { + llama.params.antiprompt.clear(); } if (llama.verbose) { - std::string tmp_stop = - std::accumulate(llama.params.antiprompt.begin(), llama.params.antiprompt.end(), - std::string{}, [](std::string a, std::string b) { - return a + (a != "" ? ", \"" : "\"") + b + "\""; - }); + std::string tmp_stop = + std::accumulate(llama.params.antiprompt.begin(), llama.params.antiprompt.end(), + std::string{}, [](std::string a, std::string b) { + return a + (a != "" ? ", \"" : "\"") + b + "\""; + }); - fprintf(stderr, - "-------------------------\n" - "/completion parameters: {\n" - " stream: %d,\n" - " frequency_penalty: %f,\n" - " mirostat: %d,\n" - " mirostat_eta: %f,\n" - " mirostat_tau: %f,\n" - " n_keep: %d,\n" - " n_predict: %d,\n" - " penalize_nl: %d,\n" - " presence_penalty: %f,\n" - " repeat_last_n: %d,\n" - " repeat_penalty: %f,\n" - " seed: %d,\n" - " stop: [%s],\n" - " temperature: %f,\n" - " tfs_z: %f,\n" - " top_k: %d,\n" - " top_p: %f,\n" - " typical_p: %f,\n" - "}\nPROMPT[%s]\n", - llama.stream, llama.params.frequency_penalty, llama.params.mirostat, - llama.params.mirostat_eta, llama.params.mirostat_tau, llama.params.n_keep, - llama.params.n_predict, llama.params.penalize_nl, - llama.params.presence_penalty, llama.params.repeat_last_n, - llama.params.repeat_penalty, llama.params.seed, tmp_stop.c_str(), - llama.params.temp, llama.params.tfs_z, llama.params.top_k, - llama.params.top_p, llama.params.typical_p, llama.params.prompt.c_str()); + fprintf(stderr, + "-------------------------\n" + "/completion parameters: {\n" + " stream: %d,\n" + " ignore_eos: %d,\n" + " frequency_penalty: %f,\n" + " mirostat: %d,\n" + " mirostat_eta: %f,\n" + " mirostat_tau: %f,\n" + " n_keep: %d,\n" + " n_predict: %d,\n" + " penalize_nl: %d,\n" + " presence_penalty: %f,\n" + " repeat_last_n: %d,\n" + " repeat_penalty: %f,\n" + " seed: %d,\n" + " stop: [%s],\n" + " temperature: %f,\n" + " tfs_z: %f,\n" + " top_k: %d,\n" + " top_p: %f,\n" + " typical_p: %f,\n" + "}\nPROMPT[%s]\n", + llama.stream, -INFINITY == llama.params.logit_bias[llama_token_eos()], + llama.params.frequency_penalty, llama.params.mirostat, + llama.params.mirostat_eta, llama.params.mirostat_tau, llama.params.n_keep, + llama.params.n_predict, llama.params.penalize_nl, + llama.params.presence_penalty, llama.params.repeat_last_n, + llama.params.repeat_penalty, llama.params.seed, tmp_stop.c_str(), + llama.params.temp, llama.params.tfs_z, llama.params.top_k, llama.params.top_p, + llama.params.typical_p, llama.params.prompt.c_str()); } return true; From 3edaf6bd8bdc853f7f0a10f9e397bd01d0e99238 Mon Sep 17 00:00:00 2001 From: anon Date: Wed, 31 May 2023 12:55:19 -0300 Subject: [PATCH 10/10] print timings by default --- examples/server/server.cpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index eb75ab1de..d6fb84cd9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -796,9 +796,7 @@ int main(int argc, char **argv) {"prompt", llama.params.prompt}, {"stopping_word", llama.stopping_word}}; - if (llama.verbose) { - 
llama_print_timings(llama.ctx); - } + llama_print_timings(llama.ctx); return res.set_content( data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace), @@ -870,16 +868,13 @@ int main(int argc, char **argv) if (!sink.write(str.data(), str.size())) { if (llama.verbose) { fprintf(stderr, "stream closed\n"); - llama_print_timings(llama.ctx); } + llama_print_timings(llama.ctx); return false; } } - if (llama.verbose) { - llama_print_timings(llama.ctx); - } - + llama_print_timings(llama.ctx); sink.done(); return true; };
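
For reference, below is a minimal standalone sketch (not part of any patch above; the
helper name utf8_pending_bytes and the sample bytes are illustrative) of the UTF-8
lead-byte masking that PATCH 07/10 relies on when it withholds an incomplete
multi-byte character from the streamed response:

#include <cstdio>

// Returns how many UTF-8 continuation bytes must still arrive after lead byte c.
// Same masks as the chunked content provider in PATCH 07/10; assumes c starts a
// new sequence (it is not itself a continuation byte of an earlier one).
static int utf8_pending_bytes(unsigned char c) {
    if ((c & 0xE0) == 0xC0) return 1; // 110xxxxx -> 2-byte sequence
    if ((c & 0xF0) == 0xE0) return 2; // 1110xxxx -> 3-byte sequence
    if ((c & 0xF8) == 0xF0) return 3; // 11110xxx -> 4-byte sequence
    return 0;                         // ASCII (0xxxxxxx) or a continuation byte
}

int main() {
    // 'a', lead byte of U+00E9, lead byte of U+20AC, lead byte of a 4-byte emoji
    const unsigned char leads[] = {0x61, 0xC3, 0xE2, 0xF0};
    for (unsigned char c : leads) {
        std::printf("0x%02X -> %d pending continuation byte(s)\n",
                    (unsigned) c, utf8_pending_bytes(c));
    }
    return 0;
}

When a generated token is a single byte and this classification reports a non-zero
count, the loop in PATCH 07/10 skips sending, forcing has_next_token back to true
if the model had just stopped, so clients never receive a split code point.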