From aa0788b65056151957ed14c0d2fce0687a0ba1b2 Mon Sep 17 00:00:00 2001
From: anon
Date: Tue, 30 May 2023 21:41:55 -0300
Subject: [PATCH] add --verbose flag and request logging

---
 examples/server/server.cpp | 156 +++++++++++++++++++++++++++++++++----
 1 file changed, 142 insertions(+), 14 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index bcb7cd4b9..a4bb38244 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -9,6 +9,7 @@ struct server_params
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
+    bool verbose = false;
 };
 
 struct llama_server_context
@@ -34,6 +35,9 @@ struct llama_server_context
 
     std::string stopping_word;
 
+    bool verbose = false;
+    int json_indent = -1;
+
     void rewind() {
         params.antiprompt.clear();
         num_tokens_predicted = 0;
@@ -239,6 +243,9 @@ struct llama_server_context
         if (!embd.empty() && embd.back() == llama_token_eos()) {
             stopping_word = llama_token_to_str(ctx, embd.back());
             has_next_token = false;
+            if (verbose) {
+                fprintf(stderr, "eos token found!\n");
+            }
             return result;
         }
 
@@ -256,6 +263,20 @@ struct llama_server_context
         std::string token_text = llama_token_to_str(ctx, token);
         generated_text += token_text;
 
+        if (verbose) {
+            fprintf(stderr,
+                    "next token: {\n"
+                    " token: %d,\n"
+                    " token_text: \"%s\",\n"
+                    " has_next_token: %d,\n"
+                    " n_remain: %ld,\n"
+                    " num_tokens_predicted: %ld,\n"
+                    " stopping_word: \"%s\",\n"
+                    "}\n",
+                    token, token_text.c_str(), has_next_token, n_remain, num_tokens_predicted,
+                    stopping_word.c_str());
+        }
+
         for (const std::string& word : params.antiprompt) {
             size_t i = generated_text.find(word, generated_text.size() - (word.size() + token_text.size()));
             if (i != std::string::npos) {
@@ -298,6 +319,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params, con
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -v, --verbose         verbose output (default: false)\n");
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -454,6 +476,8 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
                 break;
             }
             params.lora_base = argv[i];
+        } else if (arg == "-v" || arg == "--verbose") {
+            sparams.verbose = true;
         }
         else
         {
@@ -627,7 +651,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &
         json data = {
             {"status", "error"},
             {"reason", "You need to pass the prompt"}};
-        res.set_content(data.dump(), "application/json");
+        res.set_content(data.dump(llama.json_indent), "application/json");
         res.status = 400;
         return false;
     }
@@ -639,6 +663,45 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &
     {
         llama.params.antiprompt.clear();
     }
+
+    if (llama.verbose) {
+        std::string tmp_stop =
+            std::accumulate(llama.params.antiprompt.begin(), llama.params.antiprompt.end(),
+                            std::string{}, [](std::string a, std::string b) {
+                                return a + (a != "" ? ", \"" : "") + b + "\"";
+                            });
+
+        fprintf(stderr,
+                "-------------------------\n"
+                "/completion parameters: {\n"
+                " stream: %d,\n"
+                " frequency_penalty: %f,\n"
+                " mirostat: %d,\n"
+                " mirostat_eta: %f,\n"
+                " mirostat_tau: %f,\n"
+                " n_keep: %d,\n"
+                " n_predict: %d,\n"
+                " penalize_nl: %d,\n"
+                " presence_penalty: %f,\n"
+                " repeat_last_n: %d,\n"
+                " repeat_penalty: %f,\n"
+                " seed: %d,\n"
+                " stop: [%s],\n"
+                " temperature: %f,\n"
+                " tfs_z: %f,\n"
+                " top_k: %d,\n"
+                " top_p: %f,\n"
+                " typical_p: %f,\n"
+                "}\nPROMPT[%s]\n",
+                llama.stream, llama.params.frequency_penalty, llama.params.mirostat,
+                llama.params.mirostat_eta, llama.params.mirostat_tau, llama.params.n_keep,
+                llama.params.n_predict, llama.params.penalize_nl,
+                llama.params.presence_penalty, llama.params.repeat_last_n,
+                llama.params.repeat_penalty, llama.params.seed, tmp_stop.c_str(),
+                llama.params.temp, llama.params.tfs_z, llama.params.top_k,
+                llama.params.top_p, llama.params.typical_p, llama.params.prompt.c_str());
+    }
+
     return true;
 }
 
@@ -661,6 +724,44 @@ json format_generation_settings(const llama_server_context& llama) {
     };
 }
 
+std::string log(const Request &req, const Response &res)
+{
+    std::string s;
+
+    s += "============ REQUEST ===========\n";
+    s += "< ";
+    s += req.method;
+    s += " ";
+    s += req.path;
+    s += " ";
+    s += req.version;
+    s += "\n";
+
+    if (!req.body.empty()) {
+        std::string line;
+        std::istringstream stream(req.body);
+        while (std::getline(stream, line)) {
+            s += "< " + line + "\n";
+        }
+    }
+
+    s += "------------ RESPONSE ------------\n> ";
+    s += res.version;
+    s += " ";
+    s += std::to_string(res.status);
+    s += "\n";
+
+    if (!res.body.empty()) {
+        std::string line;
+        std::istringstream stream(res.body);
+        while (std::getline(stream, line)) {
+            s += "> " + line + "\n";
+        }
+    }
+
+    return s;
+}
+
 int main(int argc, char **argv)
 {
     // own arguments required by this example
@@ -676,6 +777,9 @@ int main(int argc, char **argv)
         return 1;
     }
 
+    llama.verbose = sparams.verbose;
+    llama.json_indent = sparams.verbose ? 4 : -1;
+
     // load the model
     if (!llama.loadModel(params))
     {
@@ -692,8 +796,9 @@ int main(int argc, char **argv)
             json data = {
                 {"status", "error"},
                 {"reason", "To use completion function, disable embedding mode"}};
-            res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
-                            "application/json");
+            res.set_content(
+                data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace),
+                "application/json");
             res.status = 400;
             return;
         }
@@ -706,8 +811,9 @@ int main(int argc, char **argv)
         if (!llama.loadPrompt())
         {
             json data = {{"status", "error"}, {"reason", "Context too long."}};
-            res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
-                            "application/json");
+            res.set_content(
+                data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace),
+                "application/json");
             res.status = 400;
             return;
         }
@@ -721,12 +827,14 @@ int main(int argc, char **argv)
 
             json data = {{"content", llama.generated_text},
                          {"stop", true},
-                         {"model", llama.params.model_alias },
+                         {"model", llama.params.model_alias},
                          {"tokens_predicted", llama.num_tokens_predicted},
                          {"generation_settings", format_generation_settings(llama)},
                          {"prompt", llama.params.prompt},
                          {"stopping_word", llama.stopping_word}};
-            return res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
+            return res.set_content(
+                data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace),
+                "application/json");
         } else {
             const auto chunked_content_provider = [&](size_t, DataSink &sink) {
                 while (llama.has_next_token) {
@@ -748,10 +856,10 @@ int main(int argc, char **argv)
                         {"generated_text", llama.generated_text}};
                 }
 
-                std::string str =
-                    "data: " +
-                    data.dump(-1, ' ', false, json::error_handler_t::replace) +
-                    "\n\n";
+                std::string str = "data: " +
+                                  data.dump(llama.json_indent, ' ', false,
+                                            json::error_handler_t::replace) +
+                                  "\n\n";
                 sink.write(str.data(), str.size());
             }
 
@@ -768,7 +876,7 @@ int main(int argc, char **argv)
     {
         json body = json::parse(req.body);
        json data = { {"tokens", ::llama_tokenize(llama.ctx, body["content"].get<std::string>(), false) } };
-        return res.set_content(data.dump(), "application/json");
+        return res.set_content(data.dump(llama.json_indent), "application/json");
     });
 
     svr.Post("/embedding", [&llama](const Request &req, Response &res)
@@ -778,14 +886,14 @@ int main(int argc, char **argv)
         if (!llama.params.embedding) {
             std::vector<float> empty;
             json data = { {"embedding", empty}};
             fprintf(stderr, "[llama-server] : You need enable embedding mode adding: --embedding option\n");
-            return res.set_content(data.dump(), "application/json");
+            return res.set_content(data.dump(llama.json_indent), "application/json");
         }
         json body = json::parse(req.body);
         std::string content = body["content"].get<std::string>();
         int threads = body["threads"].get<int>();
         json data = { {"embedding", llama.embedding(content, threads) } };
-        return res.set_content(data.dump(), "application/json");
+        return res.set_content(data.dump(llama.json_indent), "application/json");
     });
 
     fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port);
@@ -794,6 +902,26 @@ int main(int argc, char **argv)
         fprintf(stderr, "NOTE: Mode embedding enabled. Completion function doesn't work in this mode.\n");
     }
 
+    if (llama.verbose) {
+        svr.set_logger([](const Request &req, const Response &res) {
+            fprintf(stderr, "%s", log(req, res).c_str());
+        });
+    }
+
+    svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
+        auto fmt = "500 Internal Server Error\n%s";
+        char buf[BUFSIZ];
+        try {
+            std::rethrow_exception(ep);
+        } catch (std::exception &e) {
+            snprintf(buf, sizeof(buf), fmt, e.what());
+        } catch (...) {
+            snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
+        }
+        res.set_content(buf, "text/plain");
+        res.status = 500;
+    });
+
     // set timeouts and change hostname and port
     svr.set_read_timeout(sparams.read_timeout);
     svr.set_write_timeout(sparams.write_timeout);