print timings + build info

I don't know if llama_free is needed, but it was used in main.cpp.
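For context: the destructor added below mirrors the cleanup main.cpp does for its own context. A minimal sketch of that pattern, assuming the llama.cpp API of this period (gpt_params_parse, llama_init_from_gpt_params, llama_free); this is an illustration, not code taken from this commit:

    // Sketch: the context lifetime as main.cpp handled it at the time.
    // llama_init_from_gpt_params() allocates a llama_context and
    // llama_free() releases it; the new ~llama_server_context() makes the
    // same llama_free() call automatically when the server context goes away.
    #include "common.h"
    #include "llama.h"

    int main(int argc, char **argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) {
            return 1;
        }

        llama_init_backend();

        llama_context *ctx = llama_init_from_gpt_params(params);
        if (ctx == nullptr) {
            return 1;
        }

        // ... prompt evaluation and generation loop ...

        llama_free(ctx);
        return 0;
    }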
anon 2023-05-31 10:41:47 -03:00
parent dd30219332
commit 40e13805d9


@@ -1,7 +1,9 @@
-#include <httplib.h>
-#include <json.hpp>
 #include "common.h"
 #include "llama.h"
+#include "build-info.h"
+
+#include <httplib.h>
+#include <json.hpp>
 
 struct server_params
 {
@@ -30,7 +32,7 @@ struct llama_server_context
     std::vector<llama_token> embd_inp;
 
     std::vector<llama_token> last_prompt_tokens;
-    llama_context *ctx;
+    llama_context *ctx = nullptr;
     gpt_params params;
 
     std::string stopping_word;
@@ -38,6 +40,14 @@ struct llama_server_context
     bool verbose = false;
     int json_indent = -1;
 
+    ~llama_server_context()
+    {
+        if (ctx) {
+            llama_free(ctx);
+            ctx = nullptr;
+        }
+    }
+
     void rewind() {
         params.antiprompt.clear();
         num_tokens_predicted = 0;
@@ -765,6 +775,8 @@ std::string log(const Request &req, const Response &res)
 int main(int argc, char **argv)
 {
+    llama_init_backend();
+
     // own arguments required by this example
     gpt_params params;
     server_params sparams;
 
@@ -785,6 +797,10 @@ int main(int argc, char **argv)
         params.model_alias = params.model;
     }
 
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+    fprintf(stderr, "system_info: n_threads = %d / %d | %s\n\n", params.n_threads,
+            std::thread::hardware_concurrency(), llama_print_system_info());
+
     // load the model
     if (!llama.loadModel(params))
     {
@@ -809,6 +825,7 @@ int main(int argc, char **argv)
         }
 
         llama.rewind();
+        llama_reset_timings(llama.ctx);
 
         if (parse_options_completion(json::parse(req.body), llama, res) == false) {
             return;
@@ -837,6 +854,11 @@ int main(int argc, char **argv)
                 {"generation_settings", format_generation_settings(llama)},
                 {"prompt", llama.params.prompt},
                 {"stopping_word", llama.stopping_word}};
+
+            if (llama.verbose) {
+                llama_print_timings(llama.ctx);
+            }
+
             return res.set_content(
                 data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace),
                 "application/json");
@@ -894,18 +916,29 @@ int main(int argc, char **argv)
                         {"generated_text", llama.generated_text}};
                 }
 
-                std::string str = "data: " +
-                                  data.dump(llama.json_indent, ' ', false,
-                                            json::error_handler_t::replace) +
-                                  "\n\n";
+                std::string str =
+                    "data: " +
+                    data.dump(llama.has_next_token ? -1 : llama.json_indent, ' ', false,
+                              json::error_handler_t::replace) +
+                    "\n\n";
+
+                if (llama.verbose) {
+                    fprintf(stderr, "to_send=%s", str.c_str());
+                }
 
                 if (!sink.write(str.data(), str.size())) {
                     if (llama.verbose) {
                         fprintf(stderr, "stream closed\n");
+                        llama_print_timings(llama.ctx);
                     }
                     return false;
                 }
             }
+
+            if (llama.verbose) {
+                llama_print_timings(llama.ctx);
+            }
+
             sink.done();
             return true;
         };
@@ -978,4 +1011,6 @@ int main(int argc, char **argv)
     if (!svr.listen_after_bind()) {
         return 1;
     }
+
+    return 0;
 }
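Note on build-info.h: in llama.cpp this header is generated by the build system, and the new startup line only relies on it defining BUILD_NUMBER and BUILD_COMMIT. A rough sketch of its shape (layout and values are placeholders assumed for illustration, not taken from this commit):

    // Hypothetical generated build-info.h; the real file is written by the
    // build scripts with the actual build number and git commit hash.
    #ifndef BUILD_INFO_H
    #define BUILD_INFO_H

    #define BUILD_NUMBER 0            // placeholder
    #define BUILD_COMMIT "unknown"    // placeholder

    #endif // BUILD_INFO_H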