Merge branch 'master' into gg/llama-kv-cache

This commit is contained in:
Georgi Gerganov 2025-01-30 16:39:58 +02:00
commit a40ba49fa6
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
26 changed files with 1922 additions and 193 deletions

View file

@ -14,7 +14,7 @@
// mime type for sending response
#define MIMETYPE_JSON "application/json; charset=utf-8"
// auto generated files (update with ./deps.sh)
// auto generated files (see README.md for details)
#include "index.html.gz.hpp"
#include "loading.html.hpp"
@ -4124,6 +4124,14 @@ int main(int argc, char ** argv) {
res_ok(res, root);
};
// Handler for POST /apply-template: feeds the JSON request body through
// oaicompat_completion_params_parse and replies with only the "prompt" field of the result.
const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) {
    auto request_json = json::parse(req.body);

    // the tool-use template is chosen only when the request carries a "tools" key
    // and a tool-use template is set; otherwise the default template is used
    const auto & templates = ctx_server.chat_templates;
    const bool want_tool_template = request_json.contains("tools") && templates.template_tool_use;
    const auto & selected_template = want_tool_template ? *templates.template_tool_use : *templates.template_default;

    json parsed = oaicompat_completion_params_parse(request_json, selected_template, params.use_jinja);
    res_ok(res, {{ "prompt", parsed.at("prompt") }});
};
// Embeddings handler: forwards the request/response pair to handle_embeddings_impl
// with the OAI-compat type fixed to OAICOMPAT_TYPE_NONE.
const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & request, httplib::Response & response) {
    const auto compat_type = OAICOMPAT_TYPE_NONE;
    handle_embeddings_impl(request, response, compat_type);
};
@ -4300,6 +4308,7 @@ int main(int argc, char ** argv) {
svr->Post("/v1/reranking", handle_rerank);
svr->Post("/tokenize", handle_tokenize);
svr->Post("/detokenize", handle_detokenize);
// replies with the "prompt" produced by the chat-template parsing step (see handle_apply_template)
svr->Post("/apply-template", handle_apply_template);
// LoRA adapters hotswap
svr->Get ("/lora-adapters", handle_lora_adapters_list);
svr->Post("/lora-adapters", handle_lora_adapters_apply);
@ -4378,11 +4387,13 @@ int main(int argc, char ** argv) {
ctx_server.chat_templates.template_default->source().c_str(),
common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());
// Register the task-queue callbacks on ctx_server.
// NOTE(review): each callback appears twice below, once via std::bind and once via a lambda.
// This looks like the before/after of the same change; confirm only one form is meant to remain.
ctx_server.queue_tasks.on_new_task(std::bind(
&server_context::process_single_task, &ctx_server, std::placeholders::_1));
// lambda form: forwards each new task to ctx_server.process_single_task
ctx_server.queue_tasks.on_new_task([&ctx_server](const server_task & task) {
ctx_server.process_single_task(task);
});
ctx_server.queue_tasks.on_update_slots(std::bind(
&server_context::update_slots, &ctx_server));
// lambda form: calls ctx_server.update_slots with no arguments
ctx_server.queue_tasks.on_update_slots([&ctx_server]() {
ctx_server.update_slots();
});
shutdown_handler = [&](int) {
ctx_server.queue_tasks.terminate();