Merge branch 'jinja' into tool-call

2025-01-18 11:26:56 +00:00 · 2025-01-18 11:26:56 +00:00 · 76893f5880
commit 76893f5880
parent acf7c240d8 5074e6fecd
6 changed files with 52 additions and 23 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -1866,11 +1866,18 @@ llama_chat_templates llama_chat_templates_from_model(const struct llama_model *
    auto eos_token = common_token_to_piece(vocab, llama_vocab_eos(vocab), true);
    std::string default_template_src = chat_template_override;
    std::string tool_use_template_src = chat_template_override;
+    bool has_explicit_template = !chat_template_override.empty();
    if (chat_template_override.empty()) {
        auto str = llama_model_chat_template(model, /* name */ nullptr);
-        if (str) default_template_src = str;
+        if (str) {
+            default_template_src = str;
+            has_explicit_template = true;
+        }
        str = llama_model_chat_template(model, /* name */ "tool_use");
-        if (str) tool_use_template_src = str;
+        if (str) {
+            tool_use_template_src = str;
+            has_explicit_template = true;
+        }
    }
    if (default_template_src.empty() || default_template_src == "chatml") {
        if (!tool_use_template_src.empty()) {
@ -1887,9 +1894,11 @@ llama_chat_templates llama_chat_templates_from_model(const struct llama_model *
        }
    }
    return {
-        /* .default_template = */  { default_template_src, bos_token, eos_token },
-        /* .tool_use_template = */ tool_use_template_src.empty() ? std::nullopt
-            : std::optional<minja::chat_template>({ tool_use_template_src, bos_token, eos_token }),
+        has_explicit_template,
+        std::make_unique<minja::chat_template>(default_template_src, bos_token, eos_token),
+        tool_use_template_src.empty()
+            ? nullptr
+            : std::make_unique<minja::chat_template>(tool_use_template_src, bos_token, eos_token)
    };
 }

--- a/common/common.h
+++ b/common/common.h
@ -3,7 +3,6 @@
 #pragma once

 #include "llama-cpp.h"
-#include "chat-template.hpp"

 #include <functional>
 #include <queue>
@ -606,8 +605,18 @@ struct common_chat_msg {
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);

+namespace minja {
+    class chat_template;
+}
+
 typedef minja::chat_template llama_chat_template;

+struct llama_chat_templates {
+    bool has_explicit_template; // Model had a built-in template or a template override was specified.
+    std::unique_ptr<llama_chat_template> default_template; // always set (defaults to chatml)
+    std::unique_ptr<llama_chat_template> tool_use_template; // null when the tool-use template source is empty
+};
+
 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
@ -629,11 +638,6 @@ std::string common_chat_format_single(
 std::string common_chat_format_example(
    const llama_chat_template & tmpl, bool use_jinja);

-struct llama_chat_templates {
-    llama_chat_template default_template;
-    std::optional<llama_chat_template> tool_use_template;
-};
-
 llama_chat_templates llama_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override);

 llama_chat_template llama_chat_template_from_model(
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -4,6 +4,7 @@
 #include "log.h"
 #include "sampling.h"
 #include "llama.h"
+#include "chat-template.hpp"

 #include <cstdio>
 #include <cstring>
@ -200,7 +201,7 @@ int main(int argc, char ** argv) {
    }

    // auto enable conversation mode if chat template is available
-    const bool has_chat_template = !chat_templates.default_template.source().empty();
+    const bool has_chat_template = chat_templates.has_explicit_template && chat_templates.default_template;
    if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
        if (has_chat_template) {
            LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
@ -218,7 +219,7 @@ int main(int argc, char ** argv) {
    // print chat template example in conversation mode
    if (params.conversation_mode) {
        if (params.enable_chat_template) {
-            LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.default_template, params.use_jinja).c_str());
+            LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(*chat_templates.default_template, params.use_jinja).c_str());
        } else {
            LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
        }
@ -264,7 +265,7 @@ int main(int argc, char ** argv) {

    auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
        common_chat_msg new_msg{role, content};
-        auto formatted = common_chat_format_single(chat_templates.default_template, chat_msgs, new_msg, role == "user", g_params->use_jinja);
+        auto formatted = common_chat_format_single(*chat_templates.default_template, chat_msgs, new_msg, role == "user", g_params->use_jinja);
        chat_msgs.push_back({role, content});
        LOG_DBG("formatted: '%s'\n", formatted.c_str());
        return formatted;
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@ -26,6 +26,7 @@
 #include "common.h"
 #include "json.hpp"
 #include "llama-cpp.h"
+#include "chat-template.hpp"

 #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
 [[noreturn]] static void sigint_handler(int) {
@ -936,6 +937,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_
    int prev_len = 0;
    llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
    auto chat_templates = llama_chat_templates_from_model(llama_data.model.get(), "");
+    GGML_ASSERT(chat_templates.default_template);
    static const bool stdout_a_terminal = is_stdout_a_terminal();
    while (true) {
        // Get user input
@ -946,7 +948,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_

        add_message("user", user.empty() ? user_input : user, llama_data);
        int new_len;
-        if (apply_chat_template_with_error_handling(chat_templates.default_template, llama_data, true, new_len, use_jinja) < 0) {
+        if (apply_chat_template_with_error_handling(*chat_templates.default_template, llama_data, true, new_len, use_jinja) < 0) {
            return 1;
        }

@ -961,7 +963,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user, bool use_
        }

        add_message("assistant", response, llama_data);
-        if (apply_chat_template_with_error_handling(chat_templates.default_template, llama_data, false, prev_len, use_jinja) < 0) {
+        if (apply_chat_template_with_error_handling(*chat_templates.default_template, llama_data, false, prev_len, use_jinja) < 0) {
            return 1;
        }
    }
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -1802,8 +1802,9 @@ struct server_context {

        if (use_jinja) {
            auto templates = llama_chat_templates_from_model(model, "");
+            GGML_ASSERT(templates.default_template);
            try {
-                templates.default_template.apply({{
+                templates.default_template->apply({{
                    {"role", "user"},
                    {"content", "test"},
                }}, json(), true);
@ -3722,6 +3723,7 @@ int main(int argc, char ** argv) {
        std::lock_guard<std::mutex> lock(chat_templates_mutex);
        if (!chat_templates) {
            chat_templates = llama_chat_templates_from_model(ctx_server.model, ctx_server.params_base.chat_template);
+            GGML_ASSERT(chat_templates->default_template);
        }
        return *chat_templates;
    };
@ -3736,7 +3738,7 @@ int main(int argc, char ** argv) {
            { "model_path",                  ctx_server.params_base.model },
            { "bos_token",                   common_token_to_piece(vocab, llama_vocab_bos(vocab), true) },
            { "eos_token",                   common_token_to_piece(vocab, llama_vocab_eos(vocab), true) },
-            { "chat_template",               templates.default_template.source() },
+            { "chat_template",               templates.default_template->source() },
            { "build_info",                  build_info },
        };
        if (ctx_server.params_base.use_jinja && templates.tool_use_template) {
@ -3965,7 +3967,7 @@ int main(int argc, char ** argv) {

        auto body = json::parse(req.body);
        const auto & templates = get_chat_templates();
-        const auto & chat_template = body.contains("tools") && templates.tool_use_template ? *templates.tool_use_template : templates.default_template;
+        const auto & chat_template = body.contains("tools") && templates.tool_use_template ? *templates.tool_use_template : *templates.default_template;
        auto tool_call_style = llama_tool_call_style_detect(chat_template);
        LOG_INF("Tool call style: %s\n", llama_tool_call_style_name(tool_call_style).c_str());

@ -4388,8 +4390,8 @@ int main(int argc, char ** argv) {

    // print sample chat example to make it clear which template is used
    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
-        get_chat_templates().default_template.source().c_str(),
-        common_chat_format_example(get_chat_templates().default_template, ctx_server.params_base.use_jinja).c_str());
+        get_chat_templates().default_template->source().c_str(),
+        common_chat_format_example(*get_chat_templates().default_template, ctx_server.params_base.use_jinja).c_str());

    ctx_server.queue_tasks.on_new_task(std::bind(
                &server_context::process_single_task, &ctx_server, std::placeholders::_1));
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@ -9,6 +9,15 @@
 #include "common.h"
 #include "chat-template.hpp"

+static std::string normalize_newlines(const std::string & s) { // On Windows returns s with CRLF sequences replaced by LF; elsewhere returns s unchanged.
+#ifdef _WIN32
+  static const std::regex nl_regex("\r\n"); // function-local static: the pattern is constructed once and reused
+  return std::regex_replace(s, nl_regex, "\n");
+#else // !_WIN32
+  return s;
+#endif // _WIN32
+}
+
 int main(void) {
    std::vector<llama_chat_message> conversation {
        {"system", "You are a helpful assistant"},
@ -282,6 +291,7 @@ int main(void) {
            printf("Expected:\n%s\n", test_case.expected_output.c_str());
            printf("-------------------------\n");
            printf("Actual:\n%s\n", output.c_str());
+            fflush(stdout);
            assert(output == test_case.expected_output);
        }
    }
@ -300,12 +310,13 @@ int main(void) {
        printf("\n\n=== %s (jinja) ===\n\n", test_case.name.c_str());
        try {
            minja::chat_template tmpl(test_case.template_str, test_case.bos_token, test_case.eos_token);
-            auto output = tmpl.apply(messages, json(), add_generation_prompt);
-            auto expected_output = test_case.expected_output_jinja.empty() ? test_case.expected_output : test_case.expected_output_jinja;
+            auto output = normalize_newlines(tmpl.apply(messages, json(), add_generation_prompt));
+            auto expected_output = normalize_newlines(test_case.expected_output_jinja.empty() ? test_case.expected_output : test_case.expected_output_jinja);
            if (output != expected_output) {
                printf("Expected:\n%s\n", expected_output.c_str());
                printf("-------------------------\n");
                printf("Actual:\n%s\n", output.c_str());
+                fflush(stdout);
                assert(output == expected_output);
            }
        } catch (const std::exception & e) {