Merge branch 'master' into xsn/vision_2

This commit is contained in:
Xuan Son Nguyen 2025-01-22 13:28:31 +01:00
commit 32daa38333
65 changed files with 7551 additions and 952 deletions

View file

@ -126,7 +126,7 @@ The project is under active development, and we are [looking for feedback and co
| `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
| `--grammar-file FNAME` | file to read grammar from |
| `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + examples/json_schema_to_grammar.py instead |
| `--jinja` | Enable experimental Jinja templating engine (needed for tool use) |
**Example-specific params**

File diff suppressed because it is too large Load diff

View file

@ -19,6 +19,7 @@
#include "loading.html.hpp"
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <cinttypes>
@ -32,6 +33,8 @@
using json = nlohmann::ordered_json;
constexpr int HTTP_POLLING_SECONDS = 1;
enum stop_type {
STOP_TYPE_NONE,
STOP_TYPE_EOS,
@ -264,6 +267,11 @@ struct server_task {
params.speculative.n_min = std::max(params.speculative.n_min, 2);
params.speculative.n_max = std::max(params.speculative.n_max, 0);
// Use OpenAI API logprobs only if n_probs wasn't provided
if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){
params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs);
}
if (data.contains("lora")) {
if (data.at("lora").is_array()) {
params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
@ -1602,6 +1610,30 @@ struct server_response {
// should never reach here
}
// Same as recv(), but gives up after `timeout` seconds.
// Returns the first queued result whose id is contained in id_tasks (the result is
// removed from the queue), or nullptr if no such result shows up before the timeout.
server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
    std::unique_lock<std::mutex> lock(mutex_results);

    // index of the matching result inside queue_results, filled in by the predicate below
    int i_found = -1;

    // The predicate has to look for a result that belongs to *these* tasks. Checking only
    // for a non-empty queue is not enough: results queued for other tasks would make the
    // wait return immediately every time, so the function would spin without ever sleeping
    // and without ever reporting a timeout to the caller.
    // wait_for() with a predicate bounds the total waiting time, spurious wakeups included.
    const bool has_result = condition_results.wait_for(lock, std::chrono::seconds(timeout), [&]{
        for (int i = 0; i < (int) queue_results.size(); i++) {
            if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
                i_found = i;
                return true;
            }
        }
        return false;
    });
    if (!has_result) {
        // timed out without any result for the requested tasks
        return nullptr;
    }

    // the lock is still held here, so i_found is still a valid index
    server_task_result_ptr res = std::move(queue_results[i_found]);
    queue_results.erase(queue_results.begin() + i_found);
    return res;
}
// single-task version of recv()
server_task_result_ptr recv(int id_task) {
std::unordered_set<int> id_tasks = {id_task};
@ -1661,6 +1693,8 @@ struct server_context {
// Necessary similarity of prompt for slot selection
float slot_prompt_similarity = 0.0f;
common_chat_templates chat_templates;
~server_context() {
// Clear any sampling context
for (server_slot & slot : slots) {
@ -1701,13 +1735,16 @@ struct server_context {
add_bos_token = llama_vocab_get_add_bos(vocab);
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
if (!params_base.speculative.model.empty()) {
if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
auto params_dft = params_base;
params_dft.devices = params_base.speculative.devices;
params_dft.hf_file = params_base.speculative.hf_file;
params_dft.hf_repo = params_base.speculative.hf_repo;
params_dft.model = params_base.speculative.model;
params_dft.model_url = params_base.speculative.model_url;
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
params_dft.n_parallel = 1;
@ -1737,14 +1774,39 @@ struct server_context {
cparams_dft.type_v = GGML_TYPE_F16;
}
chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
GGML_ASSERT(chat_templates.template_default.get() != nullptr);
return true;
}
bool validate_builtin_chat_template() const {
bool validate_builtin_chat_template(bool use_jinja) const {
llama_chat_message chat[] = {{"user", "test"}};
const char * tmpl = llama_model_chat_template(model);
const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
return chat_res > 0;
if (use_jinja) {
auto templates = common_chat_templates_from_model(model, "");
GGML_ASSERT(templates.template_default);
try {
templates.template_default->apply({{
{"role", "user"},
{"content", "test"},
}}, json(), true);
if (templates.template_tool_use) {
templates.template_tool_use->apply({{
{"role", "user"},
{"content", "test"},
}}, json(), true);
}
return true;
} catch (const std::exception & e) {
SRV_ERR("failed to apply template: %s\n", e.what());
return false;
}
} else {
const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
return chat_res > 0;
}
}
void init() {
@ -2322,10 +2384,21 @@ struct server_context {
void receive_multi_results(
const std::unordered_set<int> & id_tasks,
const std::function<void(std::vector<server_task_result_ptr>&)> & result_handler,
const std::function<void(json)> & error_handler) {
const std::function<void(json)> & error_handler,
const std::function<bool()> & is_connection_closed) {
std::vector<server_task_result_ptr> results(id_tasks.size());
for (size_t i = 0; i < id_tasks.size(); i++) {
server_task_result_ptr result = queue_results.recv(id_tasks);
for (int i = 0; i < (int)id_tasks.size(); i++) {
server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
if (is_connection_closed()) {
cancel_tasks(id_tasks);
return;
}
if (result == nullptr) {
i--; // retry
continue;
}
if (result->is_error()) {
error_handler(result->to_json());
@ -2349,10 +2422,20 @@ struct server_context {
void receive_cmpl_results_stream(
const std::unordered_set<int> & id_tasks,
const std::function<bool(server_task_result_ptr&)> & result_handler,
const std::function<void(json)> & error_handler) {
const std::function<void(json)> & error_handler,
const std::function<bool()> & is_connection_closed) {
size_t n_finished = 0;
while (true) {
server_task_result_ptr result = queue_results.recv(id_tasks);
server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
if (is_connection_closed()) {
cancel_tasks(id_tasks);
return;
}
if (result == nullptr) {
continue; // retry
}
if (result->is_error()) {
error_handler(result->to_json());
@ -3609,9 +3692,12 @@ int main(int argc, char ** argv) {
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params_base.n_parallel },
{ "model_path", ctx_server.params_base.model },
{ "chat_template", common_get_builtin_chat_template(ctx_server.model) },
{ "chat_template", ctx_server.chat_templates.template_default->source() },
{ "build_info", build_info },
};
if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
}
res_ok(res, data);
};
@ -3634,6 +3720,7 @@ int main(int argc, char ** argv) {
const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
server_task_type type,
json & data,
std::function<bool()> is_connection_closed,
httplib::Response & res,
oaicompat_type oaicompat) {
GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
@ -3695,7 +3782,7 @@ int main(int argc, char ** argv) {
}
}, [&](const json & error_data) {
res_error(res, error_data);
});
}, is_connection_closed);
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
} else {
@ -3705,6 +3792,7 @@ int main(int argc, char ** argv) {
if (res_json.is_array()) {
for (const auto & res : res_json) {
if (!server_sent_event(sink, "data", res)) {
// sending failed (HTTP connection closed), cancel the generation
return false;
}
}
@ -3714,6 +3802,9 @@ int main(int argc, char ** argv) {
}
}, [&](const json & error_data) {
server_sent_event(sink, "error", error_data);
}, [&sink]() {
// note: do not use req.is_connection_closed here because req is already destroyed
return !sink.is_writable();
});
if (oaicompat != OAICOMPAT_TYPE_NONE) {
static const std::string ev_done = "data: [DONE]\n\n";
@ -3736,6 +3827,7 @@ int main(int argc, char ** argv) {
return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_NONE);
};
@ -3745,6 +3837,7 @@ int main(int argc, char ** argv) {
return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_COMPLETION);
};
@ -3821,6 +3914,7 @@ int main(int argc, char ** argv) {
return handle_completions_impl(
SERVER_TASK_TYPE_INFILL,
data,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
};
@ -3831,10 +3925,14 @@ int main(int argc, char ** argv) {
return;
}
json data = oaicompat_chat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
auto body = json::parse(req.body);
const auto & chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);
return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
req.is_connection_closed,
res,
OAICOMPAT_TYPE_CHAT);
};
@ -3981,7 +4079,7 @@ int main(int argc, char ** argv) {
}, [&](const json & error_data) {
res_error(res, error_data);
error = true;
});
}, req.is_connection_closed);
ctx_server.queue_results.remove_waiting_task_ids(task_ids);
}
@ -4071,7 +4169,7 @@ int main(int argc, char ** argv) {
}, [&](const json & error_data) {
res_error(res, error_data);
error = true;
});
}, req.is_connection_closed);
}
if (error) {
@ -4240,7 +4338,7 @@ int main(int argc, char ** argv) {
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
if (params.chat_template.empty()) {
if (!ctx_server.validate_builtin_chat_template()) {
if (!ctx_server.validate_builtin_chat_template(params.use_jinja)) {
LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
params.chat_template = "chatml";
}
@ -4248,8 +4346,8 @@ int main(int argc, char ** argv) {
// print sample chat example to make it clear which template is used
LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(),
common_chat_format_example(ctx_server.model, params.chat_template).c_str());
ctx_server.chat_templates.template_default->source().c_str(),
common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());
ctx_server.queue_tasks.on_new_task(std::bind(
&server_context::process_single_task, &ctx_server, std::placeholders::_1));

View file

@ -4,22 +4,26 @@ from utils import *
server = ServerPreset.tinyllama2()
@pytest.fixture(scope="module", autouse=True)
@pytest.fixture(autouse=True)
def create_server():
global server
server = ServerPreset.tinyllama2()
@pytest.mark.parametrize(
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
"model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template",
[
(None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
(None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length", False, None),
(None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length", True, None),
(None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None),
("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None),
]
)
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason, jinja, chat_template):
global server
server.jinja = jinja
server.chat_template = chat_template
server.start()
res = server.make_request("POST", "/chat/completions", data={
"model": model,

View file

@ -1,4 +1,5 @@
import pytest
import requests
import time
from openai import OpenAI
from utils import *
@ -405,3 +406,23 @@ def test_n_probs_post_sampling():
assert "bytes" in prob and type(prob["bytes"]) == list
# because the test model usually output token with either 100% or 0% probability, we need to check all the top_probs
assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])
def test_cancel_request():
    """Abort an in-flight completion request and check that its slot is released."""
    global server
    server.server_slots = True
    server.n_slots = 1
    server.n_predict = -1
    server.n_ctx = 4096
    server.start()

    # fire a request that will take a long time and drop it client-side
    # (via a very short timeout) before it can finish
    payload = {
        "prompt": "I believe the meaning of life is",
    }
    try:
        server.make_request("POST", "/completion", data=payload, timeout=0.1)
    except requests.exceptions.ReadTimeout:
        pass  # expected

    # wait for HTTP_POLLING_SECONDS, then make sure the slot is free
    time.sleep(1)
    slots_res = server.make_request("GET", "/slots")
    first_slot = slots_res.body[0]
    assert first_slot["is_processing"] == False

View file

@ -26,6 +26,9 @@ from re import RegexFlag
import wget
DEFAULT_HTTP_TIMEOUT = 10 if "LLAMA_SANITIZE" not in os.environ else 30
class ServerResponse:
headers: dict
status_code: int
@ -69,13 +72,14 @@ class ServerProcess:
pooling: str | None = None
draft: int | None = None
api_key: str | None = None
response_format: str | None = None
lora_files: List[str] | None = None
disable_ctx_shift: int | None = False
draft_min: int | None = None
draft_max: int | None = None
no_webui: bool | None = None
jinja: bool | None = None
chat_template: str | None = None
chat_template_file: str | None = None
# session variables
process: subprocess.Popen | None = None
@ -88,7 +92,7 @@ class ServerProcess:
if "PORT" in os.environ:
self.server_port = int(os.environ["PORT"])
def start(self, timeout_seconds: int = 10) -> None:
def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
if "LLAMA_SERVER_BIN_PATH" in os.environ:
server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
elif os.name == "nt":
@ -166,8 +170,12 @@ class ServerProcess:
server_args.extend(["--draft-min", self.draft_min])
if self.no_webui:
server_args.append("--no-webui")
if self.jinja:
server_args.append("--jinja")
if self.chat_template:
server_args.extend(["--chat-template", self.chat_template])
if self.chat_template_file:
server_args.extend(["--chat-template-file", self.chat_template_file])
args = [str(arg) for arg in [server_path, *server_args]]
print(f"bench: starting server with: {' '.join(args)}")
@ -219,17 +227,18 @@ class ServerProcess:
path: str,
data: dict | Any | None = None,
headers: dict | None = None,
timeout: float | None = None,
) -> ServerResponse:
url = f"http://{self.server_host}:{self.server_port}{path}"
parse_body = False
if method == "GET":
response = requests.get(url, headers=headers)
response = requests.get(url, headers=headers, timeout=timeout)
parse_body = True
elif method == "POST":
response = requests.post(url, headers=headers, json=data)
response = requests.post(url, headers=headers, json=data, timeout=timeout)
parse_body = True
elif method == "OPTIONS":
response = requests.options(url, headers=headers)
response = requests.options(url, headers=headers, timeout=timeout)
else:
raise ValueError(f"Unimplemented method: {method}")
result = ServerResponse()

View file

@ -16,6 +16,8 @@
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include "minja.hpp"
#include "chat-template.hpp"
#include <random>
#include <sstream>
@ -349,7 +351,7 @@ static llama_tokens format_infill(
}
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
std::vector<common_chat_msg> chat;
for (size_t i = 0; i < messages.size(); ++i) {
@ -377,7 +379,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
chat.push_back({role, content});
}
const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());
return formatted_chat;
@ -576,14 +578,23 @@ static json oaicompat_completion_params_parse(const json & body) {
return llama_params;
}
static json oaicompat_chat_completion_params_parse(
const struct llama_model * model,
const json & body, /* openai api json semantics */
const std::string & chat_template) {
static json oaicompat_completion_params_parse(
const json & body, /* openai api json semantics */
const common_chat_template & tmpl,
bool use_jinja)
{
json llama_params;
// Apply chat template to the list of messages
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
auto tools = json_value(body, "tools", json());
auto has_tools = tools.is_array() && !tools.empty();
if (has_tools) {
if (use_jinja) {
LOG_WRN("tools param is not fully supported yet\n");
} else {
throw std::runtime_error("tools param requires --jinja flag");
}
}
// Handle "stop" field
if (body.contains("stop") && body.at("stop").is_string()) {
@ -606,6 +617,13 @@ static json oaicompat_chat_completion_params_parse(
}
}
// Apply chat template to the list of messages
if (use_jinja) {
llama_params["prompt"] = tmpl.apply(body.at("messages"), tools, /* add_generation_prompt= */ true);
} else {
llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
}
// Handle "n" field
int n_choices = json_value(body, "n", 1);
if (n_choices != 1) {
@ -621,7 +639,7 @@ static json oaicompat_chat_completion_params_parse(
}
// Params supported by OAI but unsupported by llama.cpp
static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
static const std::vector<std::string> unsupported_params { "tool_choice" };
for (const auto & param : unsupported_params) {
if (body.contains(param)) {
throw std::runtime_error("Unsupported param: " + param);