Add Jinja template support (#11016)

* Copy minja from 58f0ca6dd7 * Add --jinja and --chat-template-file flags * Add missing <optional> include * Avoid print in get_hf_chat_template.py * No designated initializers yet * Try and work around msvc++ non-macro max resolution quirk * Update test_chat_completion.py * Wire LLM_KV_TOKENIZER_CHAT_TEMPLATE_N in llama_model_chat_template * Refactor test-chat-template * Test templates w/ minja * Fix deprecation * Add --jinja to llama-run * Update common_chat_format_example to use minja template wrapper * Test chat_template in e2e test * Update utils.py * Update test_chat_completion.py * Update run.cpp * Update arg.cpp * Refactor common_chat_* functions to accept minja template + use_jinja option * Attempt to fix linkage of LLAMA_CHATML_TEMPLATE * Revert LLAMA_CHATML_TEMPLATE refactor * Normalize newlines in test-chat-templates for windows tests * Forward decl minja::chat_template to avoid eager json dep * Flush stdout in chat template before potential crash * Fix copy elision warning * Rm unused optional include * Add missing optional include to server.cpp * Disable jinja test that has a cryptic windows failure * minja: fix vigogne (https://github.com/google/minja/pull/22) * Apply suggestions from code review Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Finish suggested renamings * Move chat_templates inside server_context + remove mutex * Update --chat-template-file w/ recent change to --chat-template * Refactor chat template validation * Guard against missing eos/bos tokens (null token otherwise throws in llama_vocab::impl::token_get_attr) * Warn against missing eos / bos tokens when jinja template references them * rename: common_chat_template[s] * reinstate assert on chat_templates.template_default * Update minja to b8437df626 * Update minja to https://github.com/google/minja/pull/25 * Update minja from https://github.com/google/minja/pull/27 * rm unused optional header --------- Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-01-21 13:18:51 +00:00 · 2025-01-21 13:18:51 +00:00 · 6171c9d258
commit 6171c9d258
parent e28245f35f
22 changed files with 3563 additions and 133 deletions
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -4,6 +4,7 @@
 #include "log.h"
 #include "sampling.h"
 #include "llama.h"
+#include "chat-template.hpp"

 #include <cstdio>
 #include <cstring>
@ -84,14 +85,6 @@ static void sigint_handler(int signo) {
 }
 #endif

-static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
-    common_chat_msg new_msg{role, content};
-    auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
-    chat_msgs.push_back({role, content});
-    LOG_DBG("formatted: '%s'\n", formatted.c_str());
-    return formatted;
-}
-
 int main(int argc, char ** argv) {
    common_params params;
    g_params = &params;
@ -165,6 +158,7 @@ int main(int argc, char ** argv) {
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);
+    auto chat_templates = common_chat_templates_from_model(model, params.chat_template);

    LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

@ -207,7 +201,7 @@ int main(int argc, char ** argv) {
    }

    // auto enable conversation mode if chat template is available
-    const bool has_chat_template = !common_get_builtin_chat_template(model).empty() || !params.chat_template.empty();
+    const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.template_default;
    if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
        if (has_chat_template) {
            LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
@ -225,7 +219,7 @@ int main(int argc, char ** argv) {
    // print chat template example in conversation mode
    if (params.conversation_mode) {
        if (params.enable_chat_template) {
-            LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
+            LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.template_default, params.use_jinja).c_str());
        } else {
            LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
        }
@ -269,10 +263,18 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd_inp;

+    auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
+        common_chat_msg new_msg{role, content};
+        auto formatted = common_chat_format_single(*chat_templates.template_default, chat_msgs, new_msg, role == "user", g_params->use_jinja);
+        chat_msgs.push_back({role, content});
+        LOG_DBG("formatted: '%s'\n", formatted.c_str());
+        return formatted;
+    };
+
    {
        auto prompt = (params.conversation_mode && params.enable_chat_template)
            // format the system prompt in conversation mode (fallback to default if empty)
-            ? chat_add_and_format(model, chat_msgs, "system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
+            ? chat_add_and_format("system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
            // otherwise use the prompt as is
            : params.prompt;
        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
@ -779,7 +781,7 @@ int main(int argc, char ** argv) {
                    }

                    if (params.enable_chat_template) {
-                        chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
+                        chat_add_and_format("assistant", assistant_ss.str());
                    }
                    is_interacting = true;
                    LOG("\n");
@ -844,7 +846,7 @@ int main(int argc, char ** argv) {

                    bool format_chat = params.conversation_mode && params.enable_chat_template;
                    std::string user_inp = format_chat
-                        ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
+                        ? chat_add_and_format("user", std::move(buffer))
                        : std::move(buffer);
                    // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                    const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@ -28,6 +28,7 @@
 #include "json.hpp"
 #include "linenoise.cpp/linenoise.h"
 #include "llama-cpp.h"
+#include "chat-template.hpp"

 #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
 [[noreturn]] static void sigint_handler(int) {
@ -105,6 +106,7 @@ class Opt {
    llama_model_params   model_params;
    std::string model_;
    std::string          user;
+    bool                 use_jinja   = false;
    int                  context_size = -1, ngl = -1;
    float                temperature = -1;
    bool                 verbose     = false;
@ -156,6 +158,8 @@ class Opt {
            } else if (options_parsing &&
                       (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
                verbose = true;
+            } else if (options_parsing && strcmp(argv[i], "--jinja") == 0) {
+                use_jinja = true;
            } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
                help = true;
                return 0;
@ -713,13 +717,31 @@ static void add_message(const char * role, const std::string & text, LlamaData &
 }

 // Function to apply the chat template and resize `formatted` if needed
-static int apply_chat_template(LlamaData & llama_data, const bool append) {
+static int apply_chat_template(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, bool use_jinja) {
+    if (use_jinja) {
+        json messages = json::array();
+        for (const auto & msg : llama_data.messages) {
+            messages.push_back({
+                {"role", msg.role},
+                {"content", msg.content},
+            });
+        }
+        try {
+            auto result = tmpl.apply(messages, /* tools= */ json(), append);
+            llama_data.fmtted.resize(result.size() + 1);
+            memcpy(llama_data.fmtted.data(), result.c_str(), result.size() + 1);
+            return result.size();
+        } catch (const std::exception & e) {
+            printe("failed to render the chat template: %s\n", e.what());
+            return -1;
+        }
+    }
    int result = llama_chat_apply_template(
-        llama_model_chat_template(llama_data.model.get()), llama_data.messages.data(), llama_data.messages.size(), append,
+        tmpl.source().c_str(), llama_data.messages.data(), llama_data.messages.size(), append,
        append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
    if (append && result > static_cast<int>(llama_data.fmtted.size())) {
        llama_data.fmtted.resize(result);
-        result = llama_chat_apply_template(llama_model_chat_template(llama_data.model.get()), llama_data.messages.data(),
+        result = llama_chat_apply_template(tmpl.source().c_str(), llama_data.messages.data(),
                                           llama_data.messages.size(), append, llama_data.fmtted.data(),
                                           llama_data.fmtted.size());
    }
@ -871,8 +893,8 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
 }

 // Helper function to apply the chat template and handle errors
-static int apply_chat_template_with_error_handling(LlamaData & llama_data, const bool append, int & output_length) {
-    const int new_len = apply_chat_template(llama_data, append);
+static int apply_chat_template_with_error_handling(const common_chat_template & tmpl, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) {
+    const int new_len = apply_chat_template(tmpl, llama_data, append, use_jinja);
    if (new_len < 0) {
        printe("failed to apply the chat template\n");
        return -1;
@ -931,9 +953,11 @@ static int get_user_input(std::string & user_input, const std::string & user) {
 }

 // Main chat loop function
-static int chat_loop(LlamaData & llama_data, const std::string & user) {
+static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_jinja) {
    int prev_len = 0;
    llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
+    auto chat_templates = common_chat_templates_from_model(llama_data.model.get(), "");
+    GGML_ASSERT(chat_templates.template_default);
    static const bool stdout_a_terminal = is_stdout_a_terminal();
    while (true) {
        // Get user input
@ -944,7 +968,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user) {

        add_message("user", user.empty() ? user_input : user, llama_data);
        int new_len;
-        if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
+        if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, true, new_len, use_jinja) < 0) {
            return 1;
        }

@ -959,7 +983,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user) {
        }

        add_message("assistant", response, llama_data);
-        if (apply_chat_template_with_error_handling(llama_data, false, prev_len) < 0) {
+        if (apply_chat_template_with_error_handling(*chat_templates.template_default, llama_data, false, prev_len, use_jinja) < 0) {
            return 1;
        }
    }
@ -1019,7 +1043,7 @@ int main(int argc, const char ** argv) {
        return 1;
    }

-    if (chat_loop(llama_data, opt.user)) {
+    if (chat_loop(llama_data, opt.user, opt.use_jinja)) {
        return 1;
    }

--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -126,7 +126,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--grammar GRAMMAR` | BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '') |
 | `--grammar-file FNAME` | file to read grammar from |
 | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
-
+| `--jinja` | Enable experimental Jinja templating engine (needed for tool use) |

 **Example-specific params**

--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1688,6 +1688,8 @@ struct server_context {
    // Necessary similarity of prompt for slot selection
    float slot_prompt_similarity = 0.0f;

+    common_chat_templates chat_templates;
+
    ~server_context() {
        // Clear any sampling context
        for (server_slot & slot : slots) {
@ -1767,14 +1769,39 @@ struct server_context {
            cparams_dft.type_v = GGML_TYPE_F16;
        }

+        chat_templates = common_chat_templates_from_model(model, params_base.chat_template);
+        GGML_ASSERT(chat_templates.template_default.get() != nullptr);
+
        return true;
    }

-    bool validate_builtin_chat_template() const {
+    bool validate_builtin_chat_template(bool use_jinja) const {
        llama_chat_message chat[] = {{"user", "test"}};
-        const char * tmpl = llama_model_chat_template(model);
-        const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
-        return chat_res > 0;
+
+        if (use_jinja) {
+            auto templates = common_chat_templates_from_model(model, "");
+            GGML_ASSERT(templates.template_default);
+            try {
+                templates.template_default->apply({{
+                    {"role", "user"},
+                    {"content", "test"},
+                }}, json(), true);
+                if (templates.template_tool_use) {
+                    templates.template_tool_use->apply({{
+                        {"role", "user"},
+                        {"content", "test"},
+                    }}, json(), true);
+                }
+                return true;
+            } catch (const std::exception & e) {
+                SRV_ERR("failed to apply template: %s\n", e.what());
+                return false;
+            }
+        } else {
+            const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);
+            const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
+            return chat_res > 0;
+        }
    }

    void init() {
@ -3659,9 +3686,12 @@ int main(int argc, char ** argv) {
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots",                 ctx_server.params_base.n_parallel },
            { "model_path",                  ctx_server.params_base.model },
-            { "chat_template",               common_get_builtin_chat_template(ctx_server.model) },
+            { "chat_template",               ctx_server.chat_templates.template_default->source() },
            { "build_info",                  build_info },
        };
+        if (ctx_server.params_base.use_jinja && ctx_server.chat_templates.template_tool_use) {
+            data["chat_template_tool_use"] = ctx_server.chat_templates.template_tool_use->source();
+        }

        res_ok(res, data);
    };
@ -3889,7 +3919,10 @@ int main(int argc, char ** argv) {
            return;
        }

-        json data = oaicompat_chat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
+        auto body = json::parse(req.body);
+        const auto & chat_template = body.contains("tools") && ctx_server.chat_templates.template_tool_use ? *ctx_server.chat_templates.template_tool_use : *ctx_server.chat_templates.template_default;
+        json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);
+
        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
@ -4299,7 +4332,7 @@ int main(int argc, char ** argv) {

    // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
    if (params.chat_template.empty()) {
-        if (!ctx_server.validate_builtin_chat_template()) {
+        if (!ctx_server.validate_builtin_chat_template(params.use_jinja)) {
            LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
            params.chat_template = "chatml";
        }
@ -4307,8 +4340,8 @@ int main(int argc, char ** argv) {

    // print sample chat example to make it clear which template is used
    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
-        params.chat_template.empty() ? "(built-in)" : params.chat_template.c_str(),
-        common_chat_format_example(ctx_server.model, params.chat_template).c_str());
+        ctx_server.chat_templates.template_default->source().c_str(),
+        common_chat_format_example(*ctx_server.chat_templates.template_default, ctx_server.params_base.use_jinja).c_str());

    ctx_server.queue_tasks.on_new_task(std::bind(
                &server_context::process_single_task, &ctx_server, std::placeholders::_1));
--- a/examples/server/tests/unit/test_chat_completion.py
+++ b/examples/server/tests/unit/test_chat_completion.py
@ -4,22 +4,26 @@ from utils import *

 server = ServerPreset.tinyllama2()

-
-@pytest.fixture(scope="module", autouse=True)
+@pytest.fixture(autouse=True)
 def create_server():
    global server
    server = ServerPreset.tinyllama2()


@pytest.mark.parametrize(
-    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason",
+    "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template",
    [
-        (None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"),
-        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"),
+        (None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length", False, None),
+        (None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length", True, None),
+        (None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"),
+        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None),
+        ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None),
    ]
 )
-def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason):
+def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason, jinja, chat_template):
    global server
+    server.jinja = jinja
+    server.chat_template = chat_template
    server.start()
    res = server.make_request("POST", "/chat/completions", data={
        "model": model,
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@ -72,13 +72,14 @@ class ServerProcess:
    pooling: str | None = None
    draft: int | None = None
    api_key: str | None = None
-    response_format: str | None = None
    lora_files: List[str] | None = None
    disable_ctx_shift: int | None = False
    draft_min: int | None = None
    draft_max: int | None = None
    no_webui: bool | None = None
+    jinja: bool | None = None
    chat_template: str | None = None
+    chat_template_file: str | None = None

    # session variables
    process: subprocess.Popen | None = None
@ -169,8 +170,12 @@ class ServerProcess:
            server_args.extend(["--draft-min", self.draft_min])
        if self.no_webui:
            server_args.append("--no-webui")
+        if self.jinja:
+            server_args.append("--jinja")
        if self.chat_template:
            server_args.extend(["--chat-template", self.chat_template])
+        if self.chat_template_file:
+            server_args.extend(["--chat-template-file", self.chat_template_file])

        args = [str(arg) for arg in [server_path, *server_args]]
        print(f"bench: starting server with: {' '.join(args)}")
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@ -16,6 +16,8 @@
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
+#include "minja.hpp"
+#include "chat-template.hpp"

 #include <random>
 #include <sstream>
@ -349,7 +351,7 @@ static llama_tokens format_infill(
 }

 // Format given chat. If tmpl is empty, we take the template from model metadata
-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
+inline std::string format_chat(const common_chat_template & tmpl, const std::vector<json> & messages) {
    std::vector<common_chat_msg> chat;

    for (size_t i = 0; i < messages.size(); ++i) {
@ -377,7 +379,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
        chat.push_back({role, content});
    }

-    const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
+    const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
    LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());

    return formatted_chat;
@ -576,14 +578,23 @@ static json oaicompat_completion_params_parse(const json & body) {
    return llama_params;
 }

-static json oaicompat_chat_completion_params_parse(
-        const struct llama_model * model,
-        const json & body, /* openai api json semantics */
-        const std::string & chat_template) {
+static json oaicompat_completion_params_parse(
+    const json & body, /* openai api json semantics */
+    const common_chat_template & tmpl,
+    bool use_jinja)
+{
    json llama_params;

-    // Apply chat template to the list of messages
-    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
+    auto tools = json_value(body, "tools", json());
+    auto has_tools = tools.is_array() && !tools.empty();
+
+    if (has_tools) {
+        if (use_jinja) {
+            LOG_WRN("tools param is not fully supported yet\n");
+        } else {
+            throw std::runtime_error("tools param requires --jinja flag");
+        }
+    }

    // Handle "stop" field
    if (body.contains("stop") && body.at("stop").is_string()) {
@ -606,6 +617,13 @@ static json oaicompat_chat_completion_params_parse(
        }
    }

+    // Apply chat template to the list of messages
+    if (use_jinja) {
+        llama_params["prompt"] = tmpl.apply(body.at("messages"), tools, /* add_generation_prompt= */ true);
+    } else {
+        llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
+    }
+
    // Handle "n" field
    int n_choices = json_value(body, "n", 1);
    if (n_choices != 1) {
@ -621,7 +639,7 @@ static json oaicompat_chat_completion_params_parse(
    }

    // Params supported by OAI but unsupported by llama.cpp
-    static const std::vector<std::string> unsupported_params { "tools", "tool_choice" };
+    static const std::vector<std::string> unsupported_params { "tool_choice" };
    for (const auto & param : unsupported_params) {
        if (body.contains(param)) {
            throw std::runtime_error("Unsupported param: " + param);
--- a/examples/simple-chat/simple-chat.cpp
+++ b/examples/simple-chat/simple-chat.cpp
@ -163,7 +163,7 @@ int main(int argc, char ** argv) {
            break;
        }

-        const char * tmpl = llama_model_chat_template(model);
+        const char * tmpl = llama_model_chat_template(model, /* name */ nullptr);

        // add the user input to the message list and format it
        messages.push_back({"user", strdup(user.c_str())});