Add Jinja template support (#11016)

* Copy minja from 58f0ca6dd7 * Add --jinja and --chat-template-file flags * Add missing <optional> include * Avoid print in get_hf_chat_template.py * No designated initializers yet * Try and work around msvc++ non-macro max resolution quirk * Update test_chat_completion.py * Wire LLM_KV_TOKENIZER_CHAT_TEMPLATE_N in llama_model_chat_template * Refactor test-chat-template * Test templates w/ minja * Fix deprecation * Add --jinja to llama-run * Update common_chat_format_example to use minja template wrapper * Test chat_template in e2e test * Update utils.py * Update test_chat_completion.py * Update run.cpp * Update arg.cpp * Refactor common_chat_* functions to accept minja template + use_jinja option * Attempt to fix linkage of LLAMA_CHATML_TEMPLATE * Revert LLAMA_CHATML_TEMPLATE refactor * Normalize newlines in test-chat-templates for windows tests * Forward decl minja::chat_template to avoid eager json dep * Flush stdout in chat template before potential crash * Fix copy elision warning * Rm unused optional include * Add missing optional include to server.cpp * Disable jinja test that has a cryptic windows failure * minja: fix vigogne (https://github.com/google/minja/pull/22) * Apply suggestions from code review Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Finish suggested renamings * Move chat_templates inside server_context + remove mutex * Update --chat-template-file w/ recent change to --chat-template * Refactor chat template validation * Guard against missing eos/bos tokens (null token otherwise throws in llama_vocab::impl::token_get_attr) * Warn against missing eos / bos tokens when jinja template references them * rename: common_chat_template[s] * reinstate assert on chat_templates.template_default * Update minja to b8437df626 * Update minja to https://github.com/google/minja/pull/25 * Update minja from https://github.com/google/minja/pull/27 * rm unused optional header --------- Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-01-21 13:18:51 +00:00 · 2025-01-21 13:18:51 +00:00 · 6171c9d258
commit 6171c9d258
parent e28245f35f
22 changed files with 3563 additions and 133 deletions
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1688,6 +1688,8 @@ struct server_context {
    // Necessary similarity of prompt for slot selection
    float slot_prompt_similarity = 0.0f;

+    common_chat_templates chat_templates;
+
    ~server_context() {
        // Clear any sampling context
        for (server_slot & slot : slots) {
@ -1767,14 +1769,39 @@ struct server_context {
            cparams_dft.type_v = GGML_TYPE_F16;
        }

+        chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
+        GGML_ASSERT(chat_templates.template_default.get() != nullptr);
+
        return true;
    }

-    bool validate_builtin_chat_template() const {
+    bool validate_builtin_chat_template(bool use_jinja) const {
        llama_chat_message chat[] = {{"user", "test"}};
-        const char * tmpl = llama_model_chat_template(model);
-        const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
-        return chat_res > 0;
+
+        if (use_jinja) {
+            auto templates = common_chat_templates_from_model(model, "");
+            GGML_ASSERT(templates.template_default);
+            try {
+                templates.template_default->apply({{
+                    {"role", "user"},
+                    {"content", "test"},
+                }}, json(), true);
+                if (templates.template_tool_use) {
+                    templates.template_tool_use->apply({{
+                        {"role", "user"},
+                        {"content", "test"},
+                    }}, json(), true);
+                }
+                return true;
+            } catch (const std::exception & e) {
+                SRV_ERR("failed to apply template: %s\n", e.what());
+                return false;
+            }
+        } else {
+            const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
+            const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
+            return chat_res > 0;
+        }
    }

    void init() {
@ -3659,9 +3686,12 @@ int main(int argc, char ** argv) {
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots",                 ctx_server.params_base.n_parallel },
            { "model_path",                  ctx_server.params_base.model },
-            { "chat_template",               common_get_builtin_chat_template(ctx_server.model) },
+            { "chat_template",               ctx_server.chat_templates.template_default->source() },
            { "build_info",                  build_info },
        };
+        if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
+            data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
+        }

        res_ok(res, data);
    };
@ -3889,7 +3919,10 @@ int main(int argc, char ** argv) {
            return;
        }

-        json data = oaicompat_chat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
+        auto body = json::parse(req.body);
+        const auto & chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
+        json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);
+
        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
@ -4299,7 +4332,7 @@ int main(int argc, char ** argv) {

    // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
    if (params.chat_template.empty()) {
-        if (!ctx_server.validate_builtin_chat_template()) {
+        if (!ctx_server.validate_builtin_chat_template(params.use_jinja)) {
            LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
            params.chat_template = "chatml";
        }
@ -4307,8 +4340,8 @@ int main(int argc, char ** argv) {

    // print sample chat example to make it clear which template is used
    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
-        params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(),
-        common_chat_format_example(ctx_server.model, params.chat_template).c_str());
+        ctx_server.chat_templates.template_default->source().c_str(),
+        common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());

    ctx_server.queue_tasks.on_new_task(std::bind(
                &server_context::process_single_task, &ctx_server, std::placeholders::_1));