Merge pull request #23 from anon998/fix-linter-warnings
Fix linter warnings + stuff
commit 6075d7862d

1 changed file with 55 additions and 63 deletions
@@ -24,7 +24,6 @@ struct server_params {
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
-    bool verbose = false;
 };
 
 static size_t common_part(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
@@ -38,12 +37,13 @@ enum stop_type {
     STOP_PARTIAL,
 };
 
-bool ends_with(const std::string & str, const std::string & suffix) {
+static bool ends_with(const std::string & str, const std::string & suffix) {
     return str.size() >= suffix.size() &&
            0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
 }
 
-size_t find_partial_stop_string(const std::string & stop, const std::string & text) {
+static size_t find_partial_stop_string(const std::string & stop,
+                                       const std::string & text) {
     if (!text.empty() && !stop.empty()) {
         const char text_last_char = text.back();
         for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
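These two helpers implement stop-sequence handling: `ends_with` is a plain suffix test, and `find_partial_stop_string` detects a stop word that has only partially appeared at the tail of the generated text, so a streaming server can hold those bytes back. A minimal self-contained sketch; the loop body is reconstructed from context, since the hunk only shows the function's first lines:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

// Suffix test, as in the hunk above.
static bool ends_with(const std::string & str, const std::string & suffix) {
    return str.size() >= suffix.size() &&
           0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
}

// Finds where a *prefix* of `stop` begins at the tail of `text`, or
// std::string::npos if no partial match exists.
static size_t find_partial_stop_string(const std::string & stop,
                                       const std::string & text) {
    if (!text.empty() && !stop.empty()) {
        const char text_last_char = text.back();
        for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) {
            if (stop[char_index] == text_last_char) {
                const std::string current_partial = stop.substr(0, char_index + 1);
                if (ends_with(text, current_partial)) {
                    return text.size() - char_index - 1;
                }
            }
        }
    }
    return std::string::npos;
}

int main() {
    // "<" could be the start of "</s>": the first 6 bytes are safe to send,
    // the rest is held back until more tokens arrive.
    printf("%zu\n", find_partial_stop_string("</s>", "Hello <"));  // 6
    printf("%s\n", find_partial_stop_string("</s>", "Hello") == std::string::npos
                       ? "npos" : "match");                        // npos
}
```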
@@ -67,9 +67,10 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
     return ret;
 }
 
-static void server_log(const char * level, const char * function, int line, const char * message, nlohmann::ordered_json extra) {
+static void server_log(const char * level, const char * function, int line,
+                       const char * message, const nlohmann::ordered_json & extra) {
     nlohmann::ordered_json log {
-        { "timestamp", time(NULL) },
+        { "timestamp", time(nullptr) },
         { "level", level },
         { "function", function },
         { "line", line },
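The reworked `server_log` takes `extra` by const reference and emits one JSON object per log line. A self-contained sketch of the same pattern with nlohmann::json; the `message` field and `merge_patch` call come from this and the next hunk, while the example model path is illustrative only:

```cpp
#include <cstdio>
#include <ctime>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

static void server_log(const char * level, const char * function, int line,
                       const char * message, const json & extra) {
    json log {
        { "timestamp", time(nullptr) },
        { "level",     level },
        { "function",  function },
        { "line",      line },
        { "message",   message },
    };
    if (!extra.empty()) {
        log.merge_patch(extra);  // shallow-merge caller-supplied fields into the record
    }
    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
    fprintf(stdout, "%.*s\n", (int)str.size(), str.data());
    fflush(stdout);
}

int main() {
    server_log("ERROR", __func__, __LINE__, "unable to load model",
               { { "model", "models/7B/ggml-model.bin" } });  // hypothetical path
}
```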
@@ -80,19 +81,23 @@ static void server_log(const char * level, const char * function, int line, const char * message, nlohmann::ordered_json extra) {
         log.merge_patch(extra);
     }
 
-    std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
+    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
     fprintf(stdout, "%.*s\n", (int)str.size(), str.data());
     fflush(stdout);
 }
 
 static bool server_verbose = false;
 
-#define LOG_VERBOSE(MSG, ...) \
+#if SERVER_VERBOSE != 1
+# define LOG_VERBOSE(MSG, ...)
+#else
+# define LOG_VERBOSE(MSG, ...) \
     do { \
         if (server_verbose) { \
            server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
        } \
    } while(0)
+#endif
 
 #define LOG_ERROR(MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
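Two idioms are at work here: the `#if SERVER_VERBOSE != 1` gate compiles the verbose path out entirely (the macro expands to nothing), and `do { ... } while(0)` makes a multi-statement macro behave like a single statement. A minimal sketch of why the wrapper matters, with hypothetical macro names:

```cpp
#include <cstdio>

// Without do/while(0), a multi-statement macro breaks inside an unbraced
// if/else: only the first statement would be conditional, and a trailing
// `else` would fail to compile.
#define LOG_TWICE_BAD(MSG)  puts(MSG); puts(MSG)

#define LOG_TWICE_GOOD(MSG) \
    do {                    \
        puts(MSG);          \
        puts(MSG);          \
    } while (0)

int main(int argc, char **) {
    if (argc > 1)
        LOG_TWICE_GOOD("arg given");  // expands to exactly one statement
    else
        puts("no args");              // so this else still binds correctly
}
```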
@@ -101,7 +106,7 @@ static bool server_verbose = false;
 struct llama_server_context {
     bool stream = false;
     bool has_next_token = false;
-    std::string generated_text = "";
+    std::string generated_text;
 
     size_t num_tokens_predicted = 0;
     size_t n_past = 0;
@@ -118,8 +123,6 @@ struct llama_server_context {
     bool stopped_word = false;
     bool stopped_limit = false;
     std::string stopping_word;
-
-    int json_indent = -1;
     int32_t multibyte_pending = 0;
 
     ~llama_server_context() {
@@ -148,7 +151,7 @@ struct llama_server_context {
     bool loadModel(const gpt_params & params_) {
         params = params_;
         ctx = llama_init_from_gpt_params(params);
-        if (ctx == NULL) {
+        if (ctx == nullptr) {
             LOG_ERROR("unable to load model", { { "model", params_.model } });
             return false;
         }
@@ -265,7 +268,9 @@ struct llama_server_context {
         const float mirostat_tau = params.mirostat_tau;
         const float mirostat_eta = params.mirostat_eta;
         const bool penalize_nl = params.penalize_nl;
-        llama_token id = 0; {
+        llama_token id = 0;
+
+        {
             auto * logits = llama_get_logits(ctx);
             auto n_vocab = llama_n_vocab(ctx);
 
@@ -342,7 +347,7 @@ struct llama_server_context {
     }
 
     size_t findStoppingStrings(const std::string & text, const size_t last_token_size,
                                const stop_type type) {
         size_t stop_pos = std::string::npos;
         for (const std::string & word : params.antiprompt) {
             size_t pos;
@@ -368,9 +373,9 @@ struct llama_server_context {
     }
 
     std::string doCompletion() {
-        llama_token token = nextToken();
+        const llama_token token = nextToken();
 
-        std::string token_text = token == -1 ? "" : llama_token_to_str(ctx, token);
+        const std::string token_text = token == -1 ? "" : llama_token_to_str(ctx, token);
         generated_text += token_text;
 
         if (multibyte_pending > 0) {
@@ -380,10 +385,10 @@ struct llama_server_context {
             // 2-byte characters: 110xxxxx 10xxxxxx
             if ((c & 0xE0) == 0xC0) {
                 multibyte_pending = 1;
                 // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx
             } else if ((c & 0xF0) == 0xE0) {
                 multibyte_pending = 2;
                 // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             } else if ((c & 0xF8) == 0xF0) {
                 multibyte_pending = 3;
             } else {
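The `multibyte_pending` counter keeps the server from emitting a partial UTF-8 sequence: a lead byte announces how many continuation bytes follow, and output is buffered until they arrive. A standalone sketch of the same lead-byte classification:

```cpp
#include <cstdio>
#include <string>

// Returns how many continuation bytes follow a UTF-8 lead byte `c`
// (0 for ASCII and for continuation/invalid bytes, matching the
// fall-through behavior in the hunk above).
static int utf8_pending(unsigned char c) {
    if ((c & 0xE0) == 0xC0) { return 1; }  // 110xxxxx -> 2-byte sequence
    if ((c & 0xF0) == 0xE0) { return 2; }  // 1110xxxx -> 3-byte sequence
    if ((c & 0xF8) == 0xF0) { return 3; }  // 11110xxx -> 4-byte sequence
    return 0;
}

int main() {
    const std::string snowman = "\xE2\x98\x83";  // U+2603, a 3-byte sequence
    printf("pending after lead byte: %d\n",
           utf8_pending((unsigned char) snowman[0]));   // 2
    printf("pending after 'A': %d\n", utf8_pending('A'));  // 0
}
```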
@@ -416,12 +421,13 @@ struct llama_server_context {
     }
 };
 
-void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
+static void server_print_usage(const char * argv0, const gpt_params & params,
+                               const server_params & sparams) {
     fprintf(stderr, "usage: %s [options]\n", argv0);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -v, --verbose         verbose output (default: %s)\n", sparams.verbose ? "enabled" : "disabled");
+    fprintf(stderr, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -453,8 +459,8 @@ void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
     fprintf(stderr, "\n");
 }
 
-void server_params_parse(int argc, char ** argv, server_params & sparams,
+static void server_params_parse(int argc, char ** argv, server_params & sparams,
                          gpt_params & params) {
     gpt_params default_params;
     server_params default_sparams;
     std::string arg;
@@ -543,12 +549,12 @@ void server_params_parse(int argc, char ** argv, server_params & sparams,
             std::vector<std::string> split_arg{ it, {} };
             GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
 
-            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
-                if (i < split_arg.size()) {
-                    params.tensor_split[i] = std::stof(split_arg[i]);
+            for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) {
+                if (i_device < split_arg.size()) {
+                    params.tensor_split[i_device] = std::stof(split_arg[i_device]);
                 }
                 else {
-                    params.tensor_split[i] = 0.0f;
+                    params.tensor_split[i_device] = 0.0f;
                 }
             }
 #else
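For reference, `split_arg` comes from splitting a comma-separated `--tensor-split` value with a regex token iterator (the `split_arg{ it, {} }` line is visible in the hunk); the renamed loop then fills one fraction per device and zeroes the rest. A hedged sketch with `LLAMA_MAX_DEVICES` stubbed as a constant and the delimiter set assumed:

```cpp
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

// Stand-in for the real build-time constant.
constexpr size_t LLAMA_MAX_DEVICES = 4;

int main() {
    const std::string arg = "3,1";  // e.g. --tensor-split 3,1

    // Split on commas (and '/', assumed here) into string fractions.
    const std::regex regex{ R"([,/]+)" };
    std::sregex_token_iterator it{ arg.begin(), arg.end(), regex, -1 };
    std::vector<std::string> split_arg{ it, {} };

    float tensor_split[LLAMA_MAX_DEVICES];
    for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) {
        // One value per device; devices beyond the list get 0.0f.
        tensor_split[i_device] = i_device < split_arg.size()
            ? std::stof(split_arg[i_device])
            : 0.0f;
    }
    for (float f : tensor_split) {
        printf("%.1f ", f);  // 3.0 1.0 0.0 0.0
    }
    printf("\n");
}
```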
@@ -579,9 +585,10 @@ void server_params_parse(int argc, char ** argv, server_params & sparams,
             }
             params.lora_base = argv[i];
         } else if (arg == "-v" || arg == "--verbose") {
-            sparams.verbose = true;
 #if SERVER_VERBOSE != 1
             LOG_WARNING("server.cpp is not built with verbose logging.", {});
+#else
+            server_verbose = true;
 #endif
         } else if (arg == "--mlock") {
             params.use_mlock = true;
@@ -659,7 +666,7 @@ static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
     };
 }
 
-bool parse_options_completion(json body, llama_server_context & llama) {
+static void parse_options_completion(const json & body, llama_server_context & llama) {
     gpt_params default_params;
 
     llama.stream = body.value("stream", false);
@@ -687,7 +694,7 @@ bool parse_options_completion(json body, llama_server_context & llama) {
     }
 
     if (body["logit_bias"].is_array()) {
-        int n_vocab = llama_n_vocab(llama.ctx);
+        const int n_vocab = llama_n_vocab(llama.ctx);
         for (const auto & el : body["logit_bias"]) {
             if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) {
                 llama_token tok = el[0].get<llama_token>();
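The `logit_bias` field arrives as a JSON array of `[token_id, bias]` pairs, and this loop validates each entry before use. A simplified sketch of that validation with nlohmann::json; the vocab size, the bounds check, and the map it fills are illustrative assumptions:

```cpp
#include <cstdio>
#include <unordered_map>
#include <nlohmann/json.hpp>

using json = nlohmann::json;
using llama_token = int;  // stand-in for the real typedef

int main() {
    const json body = json::parse(R"({ "logit_bias": [[15043, 1.5], [2, -100.0]] })");
    const int n_vocab = 32000;  // illustrative vocab size

    std::unordered_map<llama_token, float> logit_bias;
    if (body["logit_bias"].is_array()) {
        for (const auto & el : body["logit_bias"]) {
            // Accept only well-formed [integer token, number bias] pairs.
            if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) {
                const llama_token tok = el[0].get<llama_token>();
                if (tok >= 0 && tok < n_vocab && el[1].is_number()) {
                    logit_bias[tok] = el[1].get<float>();
                }
            }
        }
    }
    printf("parsed %zu bias entries\n", logit_bias.size());  // 2
}
```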
@@ -711,8 +718,6 @@ bool parse_options_completion(json body, llama_server_context & llama) {
     }
 
     LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
-
-    return true;
 }
 
 static void log_server_request(const Request & req, const Response & res) {
@@ -736,10 +741,6 @@ int main(int argc, char ** argv) {
 
     server_params_parse(argc, argv, sparams, params);
 
-#if SERVER_VERBOSE == 1
-    server_verbose = sparams.verbose;
-#endif
-
     if (params.model_alias == "unknown") {
         params.model_alias = params.model;
     }
@@ -773,13 +774,10 @@ int main(int argc, char ** argv) {
     });
 
     svr.Post("/completion", [&llama](const Request & req, Response & res) {
-
         llama.rewind();
         llama_reset_timings(llama.ctx);
 
-        if (!parse_options_completion(json::parse(req.body), llama)) {
-            return;
-        }
+        parse_options_completion(json::parse(req.body), llama);
 
         llama.loadPrompt();
         llama.beginCompletion();
@@ -802,15 +800,13 @@ int main(int argc, char ** argv) {
                                     llama.generated_text.end());
             }
 
-            json data = format_final_response(llama, llama.generated_text);
+            const json data = format_final_response(llama, llama.generated_text);
 
             llama_print_timings(llama.ctx);
 
-            res.set_content(
-                data.dump(llama.json_indent, ' ', false, json::error_handler_t::replace),
-                "application/json");
-        }
-        else {
+            res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace),
+                            "application/json");
+        } else {
             const auto chunked_content_provider = [&](size_t, DataSink & sink) {
                 size_t sent_count = 0;
 
@@ -822,7 +818,7 @@ int main(int argc, char ** argv) {
 
                 size_t pos = std::min(sent_count, llama.generated_text.size());
 
-                const char* str_test = llama.generated_text.c_str() + pos;
+                const std::string str_test = llama.generated_text.substr(pos);
                 size_t stop_pos =
                     llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
                 if (stop_pos != std::string::npos) {
@@ -835,21 +831,17 @@ int main(int argc, char ** argv) {
                                               STOP_PARTIAL);
                 }
 
-                std::string to_send = llama.generated_text.substr(pos, stop_pos);
+                const std::string to_send = llama.generated_text.substr(pos, stop_pos);
                 sent_count += to_send.size();
 
-                json data;
-                if (llama.has_next_token) {
-                    data = format_partial_response(to_send);
-                } else {
-                    // Generation is done, send extra information.
-                    data = format_final_response(llama, to_send);
-                }
+                const json data = llama.has_next_token
+                                      ? format_partial_response(to_send)
+                                      // Generation is done, send extra information.
+                                      : format_final_response(llama, to_send);
 
-                std::string str =
+                const std::string str =
                     "data: " +
-                    data.dump(llama.has_next_token ? -1 : llama.json_indent, ' ', false,
-                              json::error_handler_t::replace) +
+                    data.dump(-1, ' ', false, json::error_handler_t::replace) +
                     "\n\n";
 
                 LOG_VERBOSE("data stream", {
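The refactor collapses the old if/else into a single `const json` initialization and frames each chunk as a server-sent event: a `data: ` prefix and a blank-line terminator. A minimal sketch of the framing, with hypothetical stand-ins for the server's response formatters:

```cpp
#include <cstdio>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Hypothetical stand-ins for the server's real helpers.
static json format_partial_response(const std::string & content) {
    return json{ { "content", content }, { "stop", false } };
}
static json format_final_response(const std::string & content) {
    return json{ { "content", content }, { "stop", true } };
}

int main() {
    const bool has_next_token = true;
    const json data = has_next_token
                          ? format_partial_response("Hello")
                          : format_final_response("Hello");

    // One SSE event: "data: <json>\n\n". The blank line ends the event,
    // which is why streaming clients can split the stream on "\n\n".
    const std::string str =
        "data: " +
        data.dump(-1, ' ', false, json::error_handler_t::replace) +
        "\n\n";
    fputs(str.c_str(), stdout);
}
```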
@@ -876,11 +868,11 @@ int main(int argc, char ** argv) {
     });
 
     svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
-        json body = json::parse(req.body);
-        std::string content = body["content"].get<std::string>();
-        std::vector<llama_token> tokens = ::llama_tokenize(llama.ctx, content, false);
-        json data = format_tokenizer_response(tokens);
-        return res.set_content(data.dump(llama.json_indent), "application/json");
+        const json body = json::parse(req.body);
+        const std::string content = body["content"].get<std::string>();
+        const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
+        const json data = format_tokenizer_response(tokens);
+        return res.set_content(data.dump(), "application/json");
     });
 
     svr.set_logger(log_server_request);
@@ -890,14 +882,14 @@ int main(int argc, char ** argv) {
         char buf[BUFSIZ];
         try {
             std::rethrow_exception(std::move(ep));
-        } catch (std::exception& e) {
+        } catch (std::exception & e) {
             snprintf(buf, sizeof(buf), fmt, e.what());
         } catch (...) {
             snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
         }
         res.set_content(buf, "text/plain");
         res.status = 500;
     });
 
     // set timeouts and change hostname and port
     svr.set_read_timeout(sparams.read_timeout);
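This error handler rethrows the captured `std::exception_ptr` to recover the concrete exception type, then falls back to a generic message; only the `catch` spacing changes in the hunk. The same pattern in isolation, with an assumed format string:

```cpp
#include <cstdio>
#include <exception>
#include <stdexcept>
#include <string>

// Turn an exception_ptr back into a printable message, as the 500 handler does.
static std::string describe(std::exception_ptr ep) {
    char buf[BUFSIZ];
    const char * fmt = "500 Internal Server Error\n%s";  // illustrative format
    try {
        std::rethrow_exception(ep);          // re-raise to inspect the real type
    } catch (std::exception & e) {
        snprintf(buf, sizeof(buf), fmt, e.what());
    } catch (...) {
        snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
    }
    return buf;
}

int main() {
    try {
        throw std::runtime_error("model file not found");
    } catch (...) {
        printf("%s\n", describe(std::current_exception()).c_str());
    }
}
```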