server : refactor middleware and /health endpoint
This commit is contained in:
parent
4b9afbbe90
commit
b337a7bf99
1 changed file with 111 additions and 163 deletions
|
@ -15,6 +15,8 @@
|
||||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||||
#define JSON_ASSERT GGML_ASSERT
|
#define JSON_ASSERT GGML_ASSERT
|
||||||
#include "json.hpp"
|
#include "json.hpp"
|
||||||
|
// mime type for sending response
|
||||||
|
#define MIMETYPE_JSON "application/json; charset=utf-8"
|
||||||
|
|
||||||
// auto generated files (update with ./deps.sh)
|
// auto generated files (update with ./deps.sh)
|
||||||
#include "colorthemes.css.hpp"
|
#include "colorthemes.css.hpp"
|
||||||
|
@ -67,7 +69,6 @@ enum slot_command {
|
||||||
enum server_state {
|
enum server_state {
|
||||||
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
||||||
SERVER_STATE_READY, // Server is ready and model is loaded
|
SERVER_STATE_READY, // Server is ready and model is loaded
|
||||||
SERVER_STATE_ERROR // An error occurred, load_model failed
|
|
||||||
};
|
};
|
||||||
|
|
||||||
enum server_task_type {
|
enum server_task_type {
|
||||||
|
@ -689,12 +690,11 @@ struct server_context {
|
||||||
if (model == nullptr) {
|
if (model == nullptr) {
|
||||||
LOG_ERROR("unable to load model", {{"model", params.model}});
|
LOG_ERROR("unable to load model", {{"model", params.model}});
|
||||||
return false;
|
return false;
|
||||||
|
} else {
|
||||||
|
n_ctx = llama_n_ctx(ctx);
|
||||||
|
add_bos_token = llama_add_bos_token(model);
|
||||||
|
has_eos_token = !llama_add_eos_token(model);
|
||||||
}
|
}
|
||||||
|
|
||||||
n_ctx = llama_n_ctx(ctx);
|
|
||||||
|
|
||||||
add_bos_token = llama_add_bos_token(model);
|
|
||||||
has_eos_token = !llama_add_eos_token(model);
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2555,19 +2555,19 @@ int main(int argc, char ** argv) {
|
||||||
svr->set_default_headers({{"Server", "llama.cpp"}});
|
svr->set_default_headers({{"Server", "llama.cpp"}});
|
||||||
|
|
||||||
// CORS preflight
|
// CORS preflight
|
||||||
svr->Options(R"(.*)", [](const httplib::Request & req, httplib::Response & res) {
|
svr->Options(R"(.*)", [](const httplib::Request &, httplib::Response & res) {
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
// Access-Control-Allow-Origin is already set by middleware
|
||||||
res.set_header("Access-Control-Allow-Credentials", "true");
|
res.set_header("Access-Control-Allow-Credentials", "true");
|
||||||
res.set_header("Access-Control-Allow-Methods", "POST");
|
res.set_header("Access-Control-Allow-Methods", "POST");
|
||||||
res.set_header("Access-Control-Allow-Headers", "*");
|
res.set_header("Access-Control-Allow-Headers", "*");
|
||||||
return res.set_content("", "application/json; charset=utf-8");
|
return res.set_content("", "text/html"); // blank response, no data
|
||||||
});
|
});
|
||||||
|
|
||||||
svr->set_logger(log_server_request);
|
svr->set_logger(log_server_request);
|
||||||
|
|
||||||
auto res_error = [](httplib::Response & res, json error_data) {
|
auto res_error = [](httplib::Response & res, json error_data) {
|
||||||
json final_response {{"error", error_data}};
|
json final_response {{"error", error_data}};
|
||||||
res.set_content(final_response.dump(), "application/json; charset=utf-8");
|
res.set_content(final_response.dump(), MIMETYPE_JSON);
|
||||||
res.status = json_value(error_data, "code", 500);
|
res.status = json_value(error_data, "code", 500);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -2597,11 +2597,6 @@ int main(int argc, char ** argv) {
|
||||||
svr->set_read_timeout (params.timeout_read);
|
svr->set_read_timeout (params.timeout_read);
|
||||||
svr->set_write_timeout(params.timeout_write);
|
svr->set_write_timeout(params.timeout_write);
|
||||||
|
|
||||||
if (!svr->bind_to_port(params.hostname, params.port)) {
|
|
||||||
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.hostname.c_str(), params.port);
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::unordered_map<std::string, std::string> log_data;
|
std::unordered_map<std::string, std::string> log_data;
|
||||||
|
|
||||||
log_data["hostname"] = params.hostname;
|
log_data["hostname"] = params.hostname;
|
||||||
|
@ -2617,35 +2612,6 @@ int main(int argc, char ** argv) {
|
||||||
// Necessary similarity of prompt for slot selection
|
// Necessary similarity of prompt for slot selection
|
||||||
ctx_server.slot_prompt_similarity = params.slot_prompt_similarity;
|
ctx_server.slot_prompt_similarity = params.slot_prompt_similarity;
|
||||||
|
|
||||||
// load the model
|
|
||||||
if (!ctx_server.load_model(params)) {
|
|
||||||
state.store(SERVER_STATE_ERROR);
|
|
||||||
return 1;
|
|
||||||
} else {
|
|
||||||
ctx_server.init();
|
|
||||||
state.store(SERVER_STATE_READY);
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_INFO("model loaded", {});
|
|
||||||
|
|
||||||
const auto model_meta = ctx_server.model_meta();
|
|
||||||
|
|
||||||
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
|
|
||||||
if (params.chat_template.empty()) {
|
|
||||||
if (!ctx_server.validate_model_chat_template()) {
|
|
||||||
LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
|
|
||||||
params.chat_template = "chatml";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// print sample chat example to make it clear which template is used
|
|
||||||
{
|
|
||||||
LOG_INFO("chat template", {
|
|
||||||
{"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
|
|
||||||
{"built_in", params.chat_template.empty()},
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// Middlewares
|
// Middlewares
|
||||||
//
|
//
|
||||||
|
@ -2689,8 +2655,6 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// API key is invalid or not provided
|
// API key is invalid or not provided
|
||||||
// TODO: make another middleware for CORS related logic
|
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
|
||||||
res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION));
|
res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION));
|
||||||
|
|
||||||
LOG_WARNING("Unauthorized: Invalid API Key", {});
|
LOG_WARNING("Unauthorized: Invalid API Key", {});
|
||||||
|
@ -2698,8 +2662,21 @@ int main(int argc, char ** argv) {
|
||||||
return false;
|
return false;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
auto middleware_server_state = [&res_error, &state](const httplib::Request &, httplib::Response & res) {
|
||||||
|
server_state current_state = state.load();
|
||||||
|
if (current_state == SERVER_STATE_LOADING_MODEL) {
|
||||||
|
res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
// register server middlewares
|
// register server middlewares
|
||||||
svr->set_pre_routing_handler([&middleware_validate_api_key](const httplib::Request & req, httplib::Response & res) {
|
svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) {
|
||||||
|
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
||||||
|
if (!middleware_server_state(req, res)) {
|
||||||
|
return httplib::Server::HandlerResponse::Handled;
|
||||||
|
}
|
||||||
if (!middleware_validate_api_key(req, res)) {
|
if (!middleware_validate_api_key(req, res)) {
|
||||||
return httplib::Server::HandlerResponse::Handled;
|
return httplib::Server::HandlerResponse::Handled;
|
||||||
}
|
}
|
||||||
|
@ -2710,57 +2687,10 @@ int main(int argc, char ** argv) {
|
||||||
// Route handlers (or controllers)
|
// Route handlers (or controllers)
|
||||||
//
|
//
|
||||||
|
|
||||||
const auto handle_health = [&](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_health = [&](const httplib::Request &, httplib::Response & res) {
|
||||||
server_state current_state = state.load();
|
// error and loading states are handled by middleware
|
||||||
switch (current_state) {
|
json health = {{"status", "ok"}};
|
||||||
case SERVER_STATE_READY:
|
res.set_content(health.dump(), "application/json");
|
||||||
{
|
|
||||||
// request slots data using task queue
|
|
||||||
server_task task;
|
|
||||||
task.id = ctx_server.queue_tasks.get_new_id();
|
|
||||||
task.type = SERVER_TASK_TYPE_METRICS;
|
|
||||||
task.id_target = -1;
|
|
||||||
|
|
||||||
ctx_server.queue_results.add_waiting_task_id(task.id);
|
|
||||||
ctx_server.queue_tasks.post(task);
|
|
||||||
|
|
||||||
// get the result
|
|
||||||
server_task_result result = ctx_server.queue_results.recv(task.id);
|
|
||||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
|
||||||
|
|
||||||
const int n_idle_slots = result.data.at("idle");
|
|
||||||
const int n_processing_slots = result.data.at("processing");
|
|
||||||
|
|
||||||
json health = {
|
|
||||||
{"status", "ok"},
|
|
||||||
{"slots_idle", n_idle_slots},
|
|
||||||
{"slots_processing", n_processing_slots}
|
|
||||||
};
|
|
||||||
|
|
||||||
res.status = 200; // HTTP OK
|
|
||||||
if (params.endpoint_slots && req.has_param("include_slots")) {
|
|
||||||
health["slots"] = result.data.at("slots");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (n_idle_slots == 0) {
|
|
||||||
health["status"] = "no slot available";
|
|
||||||
if (req.has_param("fail_on_no_slot")) {
|
|
||||||
res.status = 503; // HTTP Service Unavailable
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
res.set_content(health.dump(), "application/json");
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case SERVER_STATE_LOADING_MODEL:
|
|
||||||
{
|
|
||||||
res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
|
|
||||||
} break;
|
|
||||||
case SERVER_STATE_ERROR:
|
|
||||||
{
|
|
||||||
res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER));
|
|
||||||
} break;
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) {
|
const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) {
|
||||||
|
@ -2783,7 +2713,7 @@ int main(int argc, char ** argv) {
|
||||||
server_task_result result = ctx_server.queue_results.recv(task.id);
|
server_task_result result = ctx_server.queue_results.recv(task.id);
|
||||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||||
|
|
||||||
res.set_content(result.data.at("slots").dump(), "application/json");
|
res.set_content(result.data.at("slots").dump(), MIMETYPE_JSON);
|
||||||
res.status = 200; // HTTP OK
|
res.status = 200; // HTTP OK
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -2914,7 +2844,7 @@ int main(int argc, char ** argv) {
|
||||||
if (result.error) {
|
if (result.error) {
|
||||||
res_error(res, result.data);
|
res_error(res, result.data);
|
||||||
} else {
|
} else {
|
||||||
res.set_content(result.data.dump(), "application/json");
|
res.set_content(result.data.dump(), MIMETYPE_JSON);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -2944,7 +2874,7 @@ int main(int argc, char ** argv) {
|
||||||
if (result.error) {
|
if (result.error) {
|
||||||
res_error(res, result.data);
|
res_error(res, result.data);
|
||||||
} else {
|
} else {
|
||||||
res.set_content(result.data.dump(), "application/json");
|
res.set_content(result.data.dump(), MIMETYPE_JSON);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -2964,13 +2894,11 @@ int main(int argc, char ** argv) {
|
||||||
if (result.error) {
|
if (result.error) {
|
||||||
res_error(res, result.data);
|
res_error(res, result.data);
|
||||||
} else {
|
} else {
|
||||||
res.set_content(result.data.dump(), "application/json");
|
res.set_content(result.data.dump(), MIMETYPE_JSON);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_slots_action = [&res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_slots_action = [&res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) {
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
|
||||||
|
|
||||||
std::string id_slot_str = req.path_params.at("id_slot");
|
std::string id_slot_str = req.path_params.at("id_slot");
|
||||||
int id_slot;
|
int id_slot;
|
||||||
|
|
||||||
|
@ -2994,7 +2922,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_props = [&ctx_server](const httplib::Request &, httplib::Response & res) {
|
||||||
std::string template_key = "tokenizer.chat_template", curr_tmpl;
|
std::string template_key = "tokenizer.chat_template", curr_tmpl;
|
||||||
int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
|
int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
|
||||||
if (tlen > 0) {
|
if (tlen > 0) {
|
||||||
|
@ -3003,7 +2931,6 @@ int main(int argc, char ** argv) {
|
||||||
curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
|
curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
|
||||||
json data = {
|
json data = {
|
||||||
{ "system_prompt", ctx_server.system_prompt.c_str() },
|
{ "system_prompt", ctx_server.system_prompt.c_str() },
|
||||||
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
||||||
|
@ -3011,7 +2938,7 @@ int main(int argc, char ** argv) {
|
||||||
{ "chat_template", curr_tmpl.c_str() }
|
{ "chat_template", curr_tmpl.c_str() }
|
||||||
};
|
};
|
||||||
|
|
||||||
res.set_content(data.dump(), "application/json; charset=utf-8");
|
res.set_content(data.dump(), MIMETYPE_JSON);
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||||
|
@ -3020,8 +2947,6 @@ int main(int argc, char ** argv) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
|
||||||
|
|
||||||
json data = json::parse(req.body);
|
json data = json::parse(req.body);
|
||||||
|
|
||||||
const int id_task = ctx_server.queue_tasks.get_new_id();
|
const int id_task = ctx_server.queue_tasks.get_new_id();
|
||||||
|
@ -3032,7 +2957,7 @@ int main(int argc, char ** argv) {
|
||||||
if (!json_value(data, "stream", false)) {
|
if (!json_value(data, "stream", false)) {
|
||||||
server_task_result result = ctx_server.queue_results.recv(id_task);
|
server_task_result result = ctx_server.queue_results.recv(id_task);
|
||||||
if (!result.error && result.stop) {
|
if (!result.error && result.stop) {
|
||||||
res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
|
res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
|
||||||
} else {
|
} else {
|
||||||
res_error(res, result.data);
|
res_error(res, result.data);
|
||||||
}
|
}
|
||||||
|
@ -3095,9 +3020,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_models = [&params, &model_meta](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_models = [&params, &ctx_server](const httplib::Request &, httplib::Response & res) {
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
|
||||||
|
|
||||||
json models = {
|
json models = {
|
||||||
{"object", "list"},
|
{"object", "list"},
|
||||||
{"data", {
|
{"data", {
|
||||||
|
@ -3106,12 +3029,12 @@ int main(int argc, char ** argv) {
|
||||||
{"object", "model"},
|
{"object", "model"},
|
||||||
{"created", std::time(0)},
|
{"created", std::time(0)},
|
||||||
{"owned_by", "llamacpp"},
|
{"owned_by", "llamacpp"},
|
||||||
{"meta", model_meta}
|
{"meta", ctx_server.model_meta()}
|
||||||
},
|
},
|
||||||
}}
|
}}
|
||||||
};
|
};
|
||||||
|
|
||||||
res.set_content(models.dump(), "application/json; charset=utf-8");
|
res.set_content(models.dump(), MIMETYPE_JSON);
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_chat_completions = [&ctx_server, &params, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||||
|
@ -3119,8 +3042,6 @@ int main(int argc, char ** argv) {
|
||||||
res_error(res, format_error_response("This server does not support chat completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
res_error(res, format_error_response("This server does not support chat completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
|
||||||
json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
|
json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
|
||||||
|
|
||||||
const int id_task = ctx_server.queue_tasks.get_new_id();
|
const int id_task = ctx_server.queue_tasks.get_new_id();
|
||||||
|
@ -3135,7 +3056,7 @@ int main(int argc, char ** argv) {
|
||||||
if (!result.error && result.stop) {
|
if (!result.error && result.stop) {
|
||||||
json result_oai = format_final_response_oaicompat(data, result.data, completion_id);
|
json result_oai = format_final_response_oaicompat(data, result.data, completion_id);
|
||||||
|
|
||||||
res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
|
res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
|
||||||
} else {
|
} else {
|
||||||
res_error(res, result.data);
|
res_error(res, result.data);
|
||||||
}
|
}
|
||||||
|
@ -3197,8 +3118,6 @@ int main(int argc, char ** argv) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
|
||||||
|
|
||||||
json data = json::parse(req.body);
|
json data = json::parse(req.body);
|
||||||
|
|
||||||
const int id_task = ctx_server.queue_tasks.get_new_id();
|
const int id_task = ctx_server.queue_tasks.get_new_id();
|
||||||
|
@ -3209,7 +3128,7 @@ int main(int argc, char ** argv) {
|
||||||
if (!json_value(data, "stream", false)) {
|
if (!json_value(data, "stream", false)) {
|
||||||
server_task_result result = ctx_server.queue_results.recv(id_task);
|
server_task_result result = ctx_server.queue_results.recv(id_task);
|
||||||
if (!result.error && result.stop) {
|
if (!result.error && result.stop) {
|
||||||
res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
|
res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
|
||||||
} else {
|
} else {
|
||||||
res_error(res, result.data);
|
res_error(res, result.data);
|
||||||
}
|
}
|
||||||
|
@ -3257,7 +3176,6 @@ int main(int argc, char ** argv) {
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
|
||||||
const json body = json::parse(req.body);
|
const json body = json::parse(req.body);
|
||||||
|
|
||||||
std::vector<llama_token> tokens;
|
std::vector<llama_token> tokens;
|
||||||
|
@ -3266,11 +3184,10 @@ int main(int argc, char ** argv) {
|
||||||
tokens = ctx_server.tokenize(body.at("content"), add_special);
|
tokens = ctx_server.tokenize(body.at("content"), add_special);
|
||||||
}
|
}
|
||||||
const json data = format_tokenizer_response(tokens);
|
const json data = format_tokenizer_response(tokens);
|
||||||
return res.set_content(data.dump(), "application/json; charset=utf-8");
|
return res.set_content(data.dump(), MIMETYPE_JSON);
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_detokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_detokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
|
||||||
const json body = json::parse(req.body);
|
const json body = json::parse(req.body);
|
||||||
|
|
||||||
std::string content;
|
std::string content;
|
||||||
|
@ -3280,12 +3197,10 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const json data = format_detokenized_response(content);
|
const json data = format_detokenized_response(content);
|
||||||
return res.set_content(data.dump(), "application/json; charset=utf-8");
|
return res.set_content(data.dump(), MIMETYPE_JSON);
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_embeddings = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
|
||||||
|
|
||||||
const json body = json::parse(req.body);
|
const json body = json::parse(req.body);
|
||||||
bool is_openai = false;
|
bool is_openai = false;
|
||||||
|
|
||||||
|
@ -3331,11 +3246,10 @@ int main(int argc, char ** argv) {
|
||||||
json root = is_openai
|
json root = is_openai
|
||||||
? format_embeddings_response_oaicompat(body, responses)
|
? format_embeddings_response_oaicompat(body, responses)
|
||||||
: responses[0];
|
: responses[0];
|
||||||
return res.set_content(root.dump(), "application/json; charset=utf-8");
|
return res.set_content(root.dump(), MIMETYPE_JSON);
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_lora_adapters_list = [&](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
|
||||||
json result = json::array();
|
json result = json::array();
|
||||||
for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) {
|
for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) {
|
||||||
auto & la = ctx_server.lora_adapters[i];
|
auto & la = ctx_server.lora_adapters[i];
|
||||||
|
@ -3345,13 +3259,11 @@ int main(int argc, char ** argv) {
|
||||||
{"scale", la.scale},
|
{"scale", la.scale},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
res.set_content(result.dump(), "application/json");
|
res.set_content(result.dump(), MIMETYPE_JSON);
|
||||||
res.status = 200; // HTTP OK
|
res.status = 200; // HTTP OK
|
||||||
};
|
};
|
||||||
|
|
||||||
const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) {
|
||||||
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
|
|
||||||
|
|
||||||
const std::vector<json> body = json::parse(req.body);
|
const std::vector<json> body = json::parse(req.body);
|
||||||
int max_idx = ctx_server.lora_adapters.size();
|
int max_idx = ctx_server.lora_adapters.size();
|
||||||
|
|
||||||
|
@ -3379,7 +3291,7 @@ int main(int argc, char ** argv) {
|
||||||
server_task_result result = ctx_server.queue_results.recv(id_task);
|
server_task_result result = ctx_server.queue_results.recv(id_task);
|
||||||
ctx_server.queue_results.remove_waiting_task_id(id_task);
|
ctx_server.queue_results.remove_waiting_task_id(id_task);
|
||||||
|
|
||||||
res.set_content(result.data.dump(), "application/json");
|
res.set_content(result.data.dump(), MIMETYPE_JSON);
|
||||||
res.status = 200; // HTTP OK
|
res.status = 200; // HTTP OK
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -3455,35 +3367,75 @@ int main(int argc, char ** argv) {
|
||||||
log_data["n_threads_http"] = std::to_string(params.n_threads_http);
|
log_data["n_threads_http"] = std::to_string(params.n_threads_http);
|
||||||
svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
|
svr->new_task_queue = [&params] { return new httplib::ThreadPool(params.n_threads_http); };
|
||||||
|
|
||||||
LOG_INFO("HTTP server listening", log_data);
|
// clean up function, to be called before exit
|
||||||
|
auto clean_up = [&svr]() {
|
||||||
|
svr->stop();
|
||||||
|
llama_backend_free();
|
||||||
|
};
|
||||||
|
|
||||||
// run the HTTP server in a thread - see comment below
|
// bind HTTP listen port, run the HTTP server in a thread
|
||||||
std::thread t([&]() {
|
if (!svr->bind_to_port(params.hostname, params.port)) {
|
||||||
if (!svr->listen_after_bind()) {
|
LOG_ERROR("couldn't bind HTTP server socket", {
|
||||||
state.store(SERVER_STATE_ERROR);
|
{"hostname", params.hostname},
|
||||||
return 1;
|
{"port", params.port},
|
||||||
|
});
|
||||||
|
clean_up();
|
||||||
|
LOG_ERROR("exiting due to HTTP server error", {});
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
std::thread t([&]() { svr->listen_after_bind(); });
|
||||||
|
svr->wait_until_ready();
|
||||||
|
|
||||||
|
LOG_INFO("HTTP server is listening", log_data);
|
||||||
|
|
||||||
|
// load the model
|
||||||
|
LOG_INFO("loading model", log_data);
|
||||||
|
if (!ctx_server.load_model(params)) {
|
||||||
|
clean_up();
|
||||||
|
t.join();
|
||||||
|
LOG_ERROR("exiting due to model loading error", {});
|
||||||
|
return 1;
|
||||||
|
} else {
|
||||||
|
ctx_server.init();
|
||||||
|
state.store(SERVER_STATE_READY);
|
||||||
|
|
||||||
|
LOG_INFO("model loaded", {});
|
||||||
|
|
||||||
|
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
|
||||||
|
if (params.chat_template.empty()) {
|
||||||
|
if (!ctx_server.validate_model_chat_template()) {
|
||||||
|
LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
|
||||||
|
params.chat_template = "chatml";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
// print sample chat example to make it clear which template is used
|
||||||
});
|
{
|
||||||
|
LOG_INFO("chat template", {
|
||||||
|
{"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
|
||||||
|
{"built_in", params.chat_template.empty()},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
ctx_server.queue_tasks.on_new_task(std::bind(
|
ctx_server.queue_tasks.on_new_task(std::bind(
|
||||||
&server_context::process_single_task, &ctx_server, std::placeholders::_1));
|
&server_context::process_single_task, &ctx_server, std::placeholders::_1));
|
||||||
ctx_server.queue_tasks.on_finish_multitask(std::bind(
|
ctx_server.queue_tasks.on_finish_multitask(std::bind(
|
||||||
&server_context::on_finish_multitask, &ctx_server, std::placeholders::_1));
|
&server_context::on_finish_multitask, &ctx_server, std::placeholders::_1));
|
||||||
ctx_server.queue_tasks.on_update_slots(std::bind(
|
ctx_server.queue_tasks.on_update_slots(std::bind(
|
||||||
&server_context::update_slots, &ctx_server));
|
&server_context::update_slots, &ctx_server));
|
||||||
ctx_server.queue_results.on_multitask_update(std::bind(
|
ctx_server.queue_results.on_multitask_update(std::bind(
|
||||||
&server_queue::update_multitask,
|
&server_queue::update_multitask,
|
||||||
&ctx_server.queue_tasks,
|
&ctx_server.queue_tasks,
|
||||||
std::placeholders::_1,
|
std::placeholders::_1,
|
||||||
std::placeholders::_2,
|
std::placeholders::_2,
|
||||||
std::placeholders::_3
|
std::placeholders::_3
|
||||||
));
|
));
|
||||||
|
|
||||||
shutdown_handler = [&](int) {
|
shutdown_handler = [&](int) {
|
||||||
ctx_server.queue_tasks.terminate();
|
ctx_server.queue_tasks.terminate();
|
||||||
};
|
};
|
||||||
|
ctx_server.queue_tasks.start_loop();
|
||||||
|
}
|
||||||
|
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||||
struct sigaction sigint_action;
|
struct sigaction sigint_action;
|
||||||
|
@ -3499,12 +3451,8 @@ int main(int argc, char ** argv) {
|
||||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ctx_server.queue_tasks.start_loop();
|
clean_up();
|
||||||
|
|
||||||
svr->stop();
|
|
||||||
t.join();
|
t.join();
|
||||||
|
|
||||||
llama_backend_free();
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue