Refactor common_chat_* functions to accept minja template + use_jinja option

2025-01-18 00:43:38 +00:00 · 2025-01-18 00:43:38 +00:00 · b75d0622e4
commit b75d0622e4
parent 3ed670b6dd
7 changed files with 82 additions and 80 deletions
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -84,14 +84,6 @@ static void sigint_handler(int signo) {
 }
 #endif

-static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
-    common_chat_msg new_msg{role, content};
-    auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
-    chat_msgs.push_back({role, content});
-    LOG_DBG("formatted: '%s'\n", formatted.c_str());
-    return formatted;
-}
-
 int main(int argc, char ** argv) {
    common_params params;
    g_params = &params;
@ -226,7 +218,7 @@ int main(int argc, char ** argv) {
    // print chat template example in conversation mode
    if (params.conversation_mode) {
        if (params.enable_chat_template) {
-            LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, chat_templates.default_template, params.use_jinja).c_str());
+            LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(chat_templates.default_template, params.use_jinja).c_str());
        } else {
            LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
        }
@ -270,10 +262,18 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd_inp;

+    // Formats a single message (role + content) against the current chat history using the
+    // default template, appends the message to chat_msgs, and returns the formatted text.
+    auto chat_add_and_format = [&chat_msgs, &chat_templates](const std::string & role, const std::string & content) {
+        common_chat_msg new_msg{role, content};
+        auto formatted = common_chat_format_single(chat_templates.default_template, chat_msgs, new_msg, role == "user", g_params->use_jinja);
+        // new_msg is not needed after formatting: move it into the history instead of rebuilding it from role/content
+        chat_msgs.push_back(std::move(new_msg));
+        LOG_DBG("formatted: '%s'\n", formatted.c_str());
+        return formatted;
+    };
+
    {
        auto prompt = (params.conversation_mode && params.enable_chat_template)
            // format the system prompt in conversation mode (fallback to default if empty)
-            ? chat_add_and_format(model, chat_msgs, "system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
+            ? chat_add_and_format("system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
            // otherwise use the prompt as is
            : params.prompt;
        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
@ -780,7 +780,7 @@ int main(int argc, char ** argv) {
                    }

                    if (params.enable_chat_template) {
-                        chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
+                        chat_add_and_format("assistant", assistant_ss.str());
                    }
                    is_interacting = true;
                    LOG("\n");
@ -845,7 +845,7 @@ int main(int argc, char ** argv) {

                    bool format_chat = params.conversation_mode && params.enable_chat_template;
                    std::string user_inp = format_chat
-                        ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
+                        ? chat_add_and_format("user", std::move(buffer))
                        : std::move(buffer);
                    // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                    const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@ -714,7 +714,7 @@ static void add_message(const char * role, const std::string & text, LlamaData &
 }

 // Function to apply the chat template and resize `formatted` if needed
-static int apply_chat_template(const minja::chat_template & tmpl, LlamaData & llama_data, const bool append, bool use_jinja) {
+static int apply_chat_template(const llama_chat_template & tmpl, LlamaData & llama_data, const bool append, bool use_jinja) {
    if (use_jinja) {
        json messages = json::array();
        for (const auto & msg : llama_data.messages) {
@ -868,7 +868,7 @@ static int generate_response(LlamaData & llama_data, const std::string & prompt,
 }

 // Helper function to apply the chat template and handle errors
-static int apply_chat_template_with_error_handling(const minja::chat_template & tmpl, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) {
+static int apply_chat_template_with_error_handling(const llama_chat_template & tmpl, LlamaData & llama_data, const bool append, int & output_length, bool use_jinja) {
    const int new_len = apply_chat_template(tmpl, llama_data, append, use_jinja);
    if (new_len < 0) {
        printe("failed to apply the chat template\n");
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -3869,7 +3869,7 @@ int main(int argc, char ** argv) {
        auto body = json::parse(req.body);
        const auto & templates = get_chat_templates();
        const auto & chat_template = body.contains("tools") && templates.tool_use_template ? *templates.tool_use_template : templates.default_template;
-        json data = oaicompat_completion_params_parse(ctx_server.model, body, chat_template, params.use_jinja);
+        json data = oaicompat_completion_params_parse(body, chat_template, params.use_jinja);

        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
@ -4288,7 +4288,7 @@ int main(int argc, char ** argv) {
    // print sample chat example to make it clear which template is used
    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
        get_chat_templates().default_template.source().c_str(),
-        common_chat_format_example(ctx_server.model, get_chat_templates().default_template, ctx_server.params_base.use_jinja).c_str());
+        common_chat_format_example(get_chat_templates().default_template, ctx_server.params_base.use_jinja).c_str());

    ctx_server.queue_tasks.on_new_task(std::bind(
                &server_context::process_single_task, &ctx_server, std::placeholders::_1));
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@ -351,7 +351,7 @@ static llama_tokens format_infill(
 }

-// Format given chat. If tmpl is empty, we take the template from model metadata
+// Format given chat with the supplied template (legacy, non-Jinja code path)
-inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
+inline std::string format_chat(const llama_chat_template & tmpl, const std::vector<json> & messages) {
    std::vector<common_chat_msg> chat;

    for (size_t i = 0; i < messages.size(); ++i) {
@ -379,7 +379,7 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
        chat.push_back({role, content});
    }

-    const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
+    const auto formatted_chat = common_chat_apply_template(tmpl, chat, true, /* use_jinja= */ false);
    LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());

    return formatted_chat;
@ -579,9 +579,8 @@ static json oaicompat_completion_params_parse(const json & body) {
 }

 static json oaicompat_completion_params_parse(
-    const struct llama_model * model,
    const json & body, /* openai api json semantics */
-    const minja::chat_template & tmpl,
+    const llama_chat_template & tmpl,
    bool use_jinja)
 {
    json llama_params;
@ -622,7 +621,7 @@ static json oaicompat_completion_params_parse(
    if (use_jinja) {
        llama_params["prompt"] = tmpl.apply(body.at("messages"), tools, /* add_generation_prompt= */ true);
    } else {
-        llama_params["prompt"] = format_chat(model, tmpl.source(), body.at("messages"));
+        llama_params["prompt"] = format_chat(tmpl, body.at("messages"));
    }

    // Handle "n" field