add --verbose flag and request logging

parent 7a853dc56d
commit aa0788b650

1 changed file with 142 additions and 14 deletions
@@ -9,6 +9,7 @@ struct server_params
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
+    bool verbose = false;
 };

 struct llama_server_context
@@ -34,6 +35,9 @@ struct llama_server_context

     std::string stopping_word;

+    bool verbose = false;
+    int json_indent = -1;
+
     void rewind() {
         params.antiprompt.clear();
         num_tokens_predicted = 0;
@@ -239,6 +243,9 @@ struct llama_server_context
         if (!embd.empty() && embd.back() == llama_token_eos()) {
             stopping_word = llama_token_to_str(ctx, embd.back());
             has_next_token = false;
+            if (verbose) {
+                fprintf(stderr, "eos token found!\n");
+            }
             return result;
         }

@@ -256,6 +263,20 @@ struct llama_server_context
         std::string token_text = llama_token_to_str(ctx, token);
         generated_text += token_text;

+        if (verbose) {
+            fprintf(stderr,
+                    "next token: {\n"
+                    "   token: %d,\n"
+                    "   token_text: \"%s\",\n"
+                    "   has_next_token: %d,\n"
+                    "   n_remain: %ld,\n"
+                    "   num_tokens_predicted: %ld,\n"
+                    "   stopping_word: \"%s\",\n"
+                    "}\n",
+                    token, token_text.c_str(), has_next_token, n_remain, num_tokens_predicted,
+                    stopping_word.c_str());
+        }
+
         for (const std::string& word : params.antiprompt) {
             size_t i = generated_text.find(word, generated_text.size() - (word.size() + token_text.size()));
             if (i != std::string::npos) {
@@ -298,6 +319,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params, con
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -v, --verbose         verbose output (default: false)\n");
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -454,6 +476,8 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
                 break;
             }
             params.lora_base = argv[i];
+        } else if (arg == "-v" || arg == "--verbose") {
+            sparams.verbose = true;
         }
         else
         {
@@ -627,7 +651,7 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &
         json data = {
             {"status", "error"},
             {"reason", "You need to pass the prompt"}};
-        res.set_content(data.dump(), "application/json");
+        res.set_content(data.dump(llama.json_indent), "application/json");
         res.status = 400;
         return false;
     }
@@ -639,6 +663,45 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &
     {
         llama.params.antiprompt.clear();
     }

+    if (llama.verbose) {
+        std::string tmp_stop =
+            std::accumulate(llama.params.antiprompt.begin(), llama.params.antiprompt.end(),
+                            std::string{}, [](std::string a, std::string b) {
+                                return a + (a != "" ? ", \"" : "") + b + "\"";
+                            });
+
+        fprintf(stderr,
+                "-------------------------\n"
+                "/completion parameters: {\n"
+                "   stream: %d,\n"
+                "   frequency_penalty: %f,\n"
+                "   mirostat: %d,\n"
+                "   mirostat_eta: %f,\n"
+                "   mirostat_tau: %f,\n"
+                "   n_keep: %d,\n"
+                "   n_predict: %d,\n"
+                "   penalize_nl: %d,\n"
+                "   presence_penalty: %f,\n"
+                "   repeat_last_n: %d,\n"
+                "   repeat_penalty: %f,\n"
+                "   seed: %d,\n"
+                "   stop: [%s],\n"
+                "   temperature: %f,\n"
+                "   tfs_z: %f,\n"
+                "   top_k: %d,\n"
+                "   top_p: %f,\n"
+                "   typical_p: %f,\n"
+                "}\nPROMPT[%s]\n",
+                llama.stream, llama.params.frequency_penalty, llama.params.mirostat,
+                llama.params.mirostat_eta, llama.params.mirostat_tau, llama.params.n_keep,
+                llama.params.n_predict, llama.params.penalize_nl,
+                llama.params.presence_penalty, llama.params.repeat_last_n,
+                llama.params.repeat_penalty, llama.params.seed, tmp_stop.c_str(),
+                llama.params.temp, llama.params.tfs_z, llama.params.top_k,
+                llama.params.top_p, llama.params.typical_p, llama.params.prompt.c_str());
+    }
+
     return true;
 }
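The std::accumulate fold above joins the antiprompt strings for display. A self-contained sketch of the same fold (variable names here are illustrative, not from the commit); note the committed lambda leaves off the opening quote of the first element:

    #include <cstdio>
    #include <numeric>
    #include <string>
    #include <vector>

    int main() {
        std::vector<std::string> stop = {"User:", "###"};
        // Same fold as the diff: append a closing quote after each element and
        // a `, "` separator once the accumulator is non-empty.
        std::string joined =
            std::accumulate(stop.begin(), stop.end(), std::string{},
                            [](std::string a, std::string b) {
                                return a + (a != "" ? ", \"" : "") + b + "\"";
                            });
        printf("stop: [%s]\n", joined.c_str()); // prints: stop: [User:", "###"]
    }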
@@ -661,6 +724,44 @@ json format_generation_settings(const llama_server_context& llama) {
     };
 }

+std::string log(const Request &req, const Response &res)
+{
+    std::string s;
+
+    s += "============ REQUEST ===========\n";
+    s += "< ";
+    s += req.method;
+    s += " ";
+    s += req.path;
+    s += " ";
+    s += req.version;
+    s += "\n";
+
+    if (!req.body.empty()) {
+        std::string line;
+        std::istringstream stream(req.body);
+        while (std::getline(stream, line)) {
+            s += "< " + line + "\n";
+        }
+    }
+
+    s += "------------ RESPONSE ------------\n> ";
+    s += res.version;
+    s += " ";
+    s += std::to_string(res.status);
+    s += "\n";
+
+    if (!res.body.empty()) {
+        std::string line;
+        std::istringstream stream(res.body);
+        while (std::getline(stream, line)) {
+            s += "> " + line + "\n";
+        }
+    }
+
+    return s;
+}
+
 int main(int argc, char **argv)
 {
     // own arguments required by this example
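The new log() helper renders multi-line bodies by prefixing each line with "< " (request) or "> " (response) via std::istringstream and std::getline. A minimal standalone sketch of that technique (prefix_lines is an illustrative name, not in the commit):

    #include <cstdio>
    #include <sstream>
    #include <string>

    // Prefix every line of a (possibly multi-line) HTTP body, as log() does.
    static std::string prefix_lines(const std::string &body, const char *prefix) {
        std::string out, line;
        std::istringstream stream(body);
        while (std::getline(stream, line)) {
            out += prefix;
            out += line + "\n";
        }
        return out;
    }

    int main() {
        printf("%s", prefix_lines("{\n  \"prompt\": \"hi\"\n}", "< ").c_str());
    }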
@@ -676,6 +777,9 @@ int main(int argc, char **argv)
         return 1;
     }

+    llama.verbose = sparams.verbose;
+    llama.json_indent = sparams.verbose ? 4 : -1;
+
     // load the model
     if (!llama.loadModel(params))
     {
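json_indent is what feeds nlohmann::json's dump() throughout the diff: -1 produces compact output, a non-negative value pretty-prints with that many spaces, and error_handler_t::replace substitutes invalid UTF-8 instead of throwing. A minimal sketch assuming the same library:

    #include <cstdio>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main() {
        json data = {{"status", "error"}, {"reason", "You need to pass the prompt"}};
        // compact, the non-verbose default (json_indent == -1)
        printf("%s\n", data.dump(-1).c_str());
        // pretty-printed, the verbose setting (json_indent == 4)
        printf("%s\n", data.dump(4, ' ', false, json::error_handler_t::replace).c_str());
    }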
@@ -692,8 +796,9 @@ int main(int argc, char **argv)
             json data = {
                 {"status", "error"},
                 {"reason", "To use completion function, disable embedding mode"}};
-            res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
-                            "application/json");
+            res.set_content(
+                data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace),
+                "application/json");
             res.status = 400;
             return;
         }
@@ -706,8 +811,9 @@ int main(int argc, char **argv)

         if (!llama.loadPrompt()) {
             json data = {{"status", "error"}, {"reason", "Context too long."}};
-            res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
-                            "application/json");
+            res.set_content(
+                data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace),
+                "application/json");
             res.status = 400;
             return;
         }
@@ -721,12 +827,14 @@ int main(int argc, char **argv)

             json data = {{"content", llama.generated_text},
                          {"stop", true},
-                         {"model", llama.params.model_alias },
+                         {"model", llama.params.model_alias},
                          {"tokens_predicted", llama.num_tokens_predicted},
                          {"generation_settings", format_generation_settings(llama)},
                          {"prompt", llama.params.prompt},
                          {"stopping_word", llama.stopping_word}};
-            return res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
+            return res.set_content(
+                data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace),
+                "application/json");
         } else {
             const auto chunked_content_provider = [&](size_t, DataSink &sink) {
                 while (llama.has_next_token) {
@@ -748,10 +856,10 @@ int main(int argc, char **argv)
                             {"generated_text", llama.generated_text}};
                     }

-                    std::string str =
-                        "data: " +
-                        data.dump(-1, ' ', false, json::error_handler_t::replace) +
+                    std::string str = "data: " +
+                        data.dump(llama.json_indent, ' ', false,
+                                  json::error_handler_t::replace) +
                         "\n\n";
                     sink.write(str.data(), str.size());
                 }
@@ -768,7 +876,7 @@ int main(int argc, char **argv)
            json body = json::parse(req.body);
            json data = {
                {"tokens", ::llama_tokenize(llama.ctx, body["content"].get<std::string>(), false) } };
-            return res.set_content(data.dump(), "application/json");
+            return res.set_content(data.dump(llama.json_indent), "application/json");
        });

    svr.Post("/embedding", [&llama](const Request &req, Response &res)
@@ -778,14 +886,14 @@ int main(int argc, char **argv)
                json data = {
                    {"embedding", empty}};
                fprintf(stderr, "[llama-server] : You need enable embedding mode adding: --embedding option\n");
-                return res.set_content(data.dump(), "application/json");
+                return res.set_content(data.dump(llama.json_indent), "application/json");
            }
            json body = json::parse(req.body);
            std::string content = body["content"].get<std::string>();
            int threads = body["threads"].get<int>();
            json data = {
                {"embedding", llama.embedding(content, threads) } };
-            return res.set_content(data.dump(), "application/json");
+            return res.set_content(data.dump(llama.json_indent), "application/json");
        });

    fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port);
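For reference, the JSON endpoints above can be exercised with cpp-httplib's client API; a hedged sketch (host, port, and body are placeholders):

    #include <cstdio>
    #include <httplib.h>

    int main() {
        httplib::Client cli("127.0.0.1", 8080); // placeholder host/port
        auto res = cli.Post("/tokenize", R"({"content": "hello world"})", "application/json");
        if (res && res->status == 200) {
            printf("%s\n", res->body.c_str()); // e.g. {"tokens":[...]}
        }
    }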
@@ -794,6 +902,26 @@ int main(int argc, char **argv)
        fprintf(stderr, "NOTE: Mode embedding enabled. Completion function doesn't work in this mode.\n");
    }

+    if (llama.verbose) {
+        svr.set_logger([](const Request &req, const Response &res) {
+            fprintf(stderr, "%s", log(req, res).c_str());
+        });
+    }
+
+    svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
+        auto fmt = "500 Internal Server Error\n%s";
+        char buf[BUFSIZ];
+        try {
+            std::rethrow_exception(ep);
+        } catch (std::exception &e) {
+            snprintf(buf, sizeof(buf), fmt, e.what());
+        } catch (...) {
+            snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
+        }
+        res.set_content(buf, "text/plain");
+        res.status = 500;
+    });
+
    // set timeouts and change hostname and port
    svr.set_read_timeout(sparams.read_timeout);
    svr.set_write_timeout(sparams.write_timeout);
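set_logger() fires once per completed request/response pair, which is why the commit can gate all request logging on --verbose in one place. A minimal standalone sketch assuming cpp-httplib (route and address are illustrative):

    #include <cstdio>
    #include <httplib.h>

    int main() {
        httplib::Server svr;

        svr.Get("/health", [](const httplib::Request &, httplib::Response &res) {
            res.set_content("ok", "text/plain");
        });

        // Runs after each request/response pair, like log() above when verbose.
        svr.set_logger([](const httplib::Request &req, const httplib::Response &res) {
            fprintf(stderr, "%s %s -> %d\n", req.method.c_str(), req.path.c_str(), res.status);
        });

        svr.listen("127.0.0.1", 8080); // illustrative address
    }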