From 1e9acd2d312a6b3f9f005915ebecaedc97da2edb Mon Sep 17 00:00:00 2001 From: ochafik Date: Mon, 3 Feb 2025 04:07:11 +0000 Subject: [PATCH 1/5] tool-call: allow `--jinja --chat-template chatml` --- common/common.cpp | 21 +++-- examples/server/tests/unit/test_tool_call.py | 96 ++++++++++++++++---- 2 files changed, 91 insertions(+), 26 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 6c81d18f9..b9d1e0e30 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1869,11 +1869,19 @@ std::string common_chat_format_example(const common_chat_template & tmpl, bool u return common_chat_apply_template(tmpl, msgs, true, use_jinja); } +#define CHATML_TEMPLATE_SRC \ + "{%- for message in messages -%}\n" \ + " {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \ + "{%- endfor -%}\n" \ + "{%- if add_generation_prompt -%}\n" \ + " {{- '<|im_start|>assistant\n' -}}\n" \ + "{%- endif -%})" + common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override) { auto vocab = llama_model_get_vocab(model); - std::string default_template_src = chat_template_override; - std::string template_tool_use_src = chat_template_override; + std::string default_template_src = chat_template_override == "chatml" ? CHATML_TEMPLATE_SRC : chat_template_override; + std::string template_tool_use_src = chat_template_override == "chatml" ? CHATML_TEMPLATE_SRC : ""; bool has_explicit_template = !chat_template_override.empty(); if (chat_template_override.empty()) { auto str = llama_model_chat_template(model, /* name */ nullptr); @@ -1891,14 +1899,7 @@ common_chat_templates common_chat_templates_from_model(const struct llama_model if (!template_tool_use_src.empty()) { default_template_src = template_tool_use_src; } else { - default_template_src = R"( - {%- for message in messages -%} - {{- "<|im_start|>" + message.role + "\n" + message.content + "<|im_end|>\n" -}} - {%- endfor -%} - {%- if add_generation_prompt -%} - {{- "<|im_start|>assistant\n" -}} - {%- endif -%} - )"; + default_template_src = CHATML_TEMPLATE_SRC; } } const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) { diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py index e6ed9c9be..9c6e1b856 100644 --- a/examples/server/tests/unit/test_tool_call.py +++ b/examples/server/tests/unit/test_tool_call.py @@ -67,8 +67,8 @@ WEATHER_TOOL = { def do_test_completion_with_required_tool_tiny(template_name: str, tool: dict, argument_key: str | None): - n_predict = 512 global server + n_predict = 512 # server = ServerPreset.stories15m_moe() server.jinja = True server.n_predict = n_predict @@ -139,29 +139,49 @@ def test_completion_with_required_tool_tiny_slow(template_name: str, tool: dict, @pytest.mark.parametrize("tool,argument_key,hf_repo,template_override", [ (TEST_TOOL, "success", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), + + # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. 
(TEST_TOOL, "success", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + (TEST_TOOL, "success", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), + (PYTHON_TOOL, "code", "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), + (PYTHON_TOOL, "code", "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), + (PYTHON_TOOL, "code", "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), - (TEST_TOOL, "success", "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)), - (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)), + (PYTHON_TOOL, "code", "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), + + (TEST_TOOL, "success", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)), + (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", ("meetkai/functionary-medium-v3.2", None)), + (PYTHON_TOOL, "code", "bartowski/functionary-small-v3.2-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), (PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), + (PYTHON_TOOL, "code", "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), + (TEST_TOOL, "success", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), + (PYTHON_TOOL, "code", "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), # TODO: fix these # (TEST_TOOL, "success", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), # (PYTHON_TOOL, "code", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), ]) -def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: Tuple[str, str | None] | None): +def test_completion_with_required_tool_real_model(tool: dict, argument_key: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): + global server n_predict = 512 server.n_slots = 1 server.jinja = True @@ -169,10 +189,12 @@ def test_completion_with_required_tool_real_model(tool: dict, argument_key: str server.n_predict = n_predict server.model_hf_repo = hf_repo server.model_hf_file = None - if template_override: + if isinstance(template_override, tuple): (template_hf_repo, 
template_variant) = template_override server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja" assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template." + elif isinstance(template_override, str): + server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) res = server.make_request("POST", "/chat/completions", data={ "max_tokens": n_predict, @@ -252,18 +274,36 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t @pytest.mark.slow @pytest.mark.parametrize("hf_repo,template_override", [ ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), - ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), - ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), - ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), + ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), + + ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), + ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), + + ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")), + ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), + ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), + ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), + ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)), + ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), + ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), + ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), + + # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. + ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), # ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), ]) -def test_weather_tool_call(hf_repo: str, template_override: Tuple[str, str | None] | None): +def test_weather_tool_call(hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server server.n_slots = 1 server.jinja = True @@ -271,10 +311,12 @@ def test_weather_tool_call(hf_repo: str, template_override: Tuple[str, str | Non server.n_predict = 512 server.model_hf_repo = hf_repo server.model_hf_file = None - if template_override: + if isinstance(template_override, tuple): (template_hf_repo, template_variant) = template_override server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja" assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template." 
+ elif isinstance(template_override, str): + server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) res = server.make_request("POST", "/chat/completions", data={ "max_tokens": 256, @@ -298,19 +340,39 @@ def test_weather_tool_call(hf_repo: str, template_override: Tuple[str, str | Non @pytest.mark.slow @pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [ - (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), + (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), + ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), + ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), + + ('{"code":"print("}', "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), + (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), + ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), + ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), + (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), - (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")), + (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), + + (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), + (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), + + (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")), + (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), + (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), + (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), + + # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. + (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + # (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), ]) -def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: Tuple[str, str | None] | None): +def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server server.n_slots = 1 server.jinja = True @@ -318,10 +380,12 @@ def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: server.n_predict = 128 server.model_hf_repo = hf_repo server.model_hf_file = None - if template_override: + if isinstance(template_override, tuple): (template_hf_repo, template_variant) = template_override server.chat_template_file = f"../../../models/templates/{template_hf_repo.replace('/', '-') + ('-' + template_variant if template_variant else '')}.jinja" assert os.path.exists(server.chat_template_file), f"Template file {server.chat_template_file} does not exist. 
Run `python scripts/get_chat_template.py {template_hf_repo} {template_variant} > {server.chat_template_file}` to download the template." + elif isinstance(template_override, str): + server.chat_template = template_override server.start(timeout_seconds=TIMEOUT_SERVER_START) res = server.make_request("POST", "/chat/completions", data={ "max_tokens": 256, From 77ae97e7d6e8aa6a57e3b8f4f05584512f69ef19 Mon Sep 17 00:00:00 2001 From: ochafik Date: Mon, 3 Feb 2025 10:28:30 +0000 Subject: [PATCH 2/5] Update test_tool_call.py --- examples/server/tests/unit/test_tool_call.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py index 9c6e1b856..62a48a0d9 100644 --- a/examples/server/tests/unit/test_tool_call.py +++ b/examples/server/tests/unit/test_tool_call.py @@ -346,7 +346,7 @@ def test_weather_tool_call(hf_repo: str, template_override: str | Tuple[str, str (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), - ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), + (None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), ('{"code":"print("}', "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), From cf83623a4796f049228f8bca167d4d267e9c4a0a Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Mon, 3 Feb 2025 13:58:46 +0000 Subject: [PATCH 3/5] fix typo --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index b9d1e0e30..e7dcffabc 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1875,7 +1875,7 @@ std::string common_chat_format_example(const common_chat_template & tmpl, bool u "{%- endfor -%}\n" \ "{%- if add_generation_prompt -%}\n" \ " {{- '<|im_start|>assistant\n' -}}\n" \ - "{%- endif -%})" + "{%- endif -%}" common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override) { From 5d18d76b690bb5bfe0be1e444be94f6db7ae8f39 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Mon, 3 Feb 2025 13:59:16 +0000 Subject: [PATCH 4/5] fix double bos issue (drop bos/eos tokens from jinja template) --- common/common.cpp | 10 ++++++++-- examples/server/tests/unit/test_chat_completion.py | 7 +++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index e7dcffabc..24e66c5b6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1902,6 +1902,11 @@ common_chat_templates common_chat_templates_from_model(const struct llama_model default_template_src = CHATML_TEMPLATE_SRC; } } + std::string token_bos; + std::string token_eos; + // TODO: update logic that adds BOS and EOS tokens to the tokenized prompt, in favour of the template. 
+#if 0 + auto vocab = llama_model_get_vocab(model); const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) { if (token == LLAMA_TOKEN_NULL) { if (default_template_src.find(jinja_variable_name) != std::string::npos @@ -1913,8 +1918,9 @@ common_chat_templates common_chat_templates_from_model(const struct llama_model return common_token_to_piece(vocab, token, true); } }; - auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token"); - auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token"); + token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token"); + token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token"); +#endif return { has_explicit_template, std::make_unique(default_template_src, token_bos, token_eos), diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index f5d8b0572..f23d5cff4 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -13,9 +13,12 @@ def create_server(): @pytest.mark.parametrize( "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason,jinja,chat_template", [ + (None, "Book", "Hey", 8, "But she couldn't", 69, 8, "length", False, None), + (None, "Book", "Hey", 8, "But she couldn't", 69, 8, "length", True, None), (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", False, None), - (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, None), - (None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"), + (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, None), + (None, "Book", "What is the best book", 8, "(Suddenly)+|\\{ \" Sarax.", 77, 8, "length", True, 'chatml'), + (None, "Book", "What is the best book", 8, "^ blue", 23, 8, "length", True, "This is not a chat template, it is"), ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", False, None), ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length", True, None), ] From aa98e5903855705b539458071fd9b7af99b664e2 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Mon, 3 Feb 2025 14:01:49 +0000 Subject: [PATCH 5/5] fix bad merge --- common/common.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 24e66c5b6..f22b218e1 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1879,7 +1879,6 @@ std::string common_chat_format_example(const common_chat_template & tmpl, bool u common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override) { - auto vocab = llama_model_get_vocab(model); std::string default_template_src = chat_template_override == "chatml" ? CHATML_TEMPLATE_SRC : chat_template_override; std::string template_tool_use_src = chat_template_override == "chatml" ? CHATML_TEMPLATE_SRC : ""; bool has_explicit_template = !chat_template_override.empty();
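
A minimal usage sketch (not part of the patches above): with patch 1 applied, the literal override string "chatml" resolves to the built-in CHATML_TEMPLATE_SRC, which is what makes `--jinja --chat-template chatml` (as named in the patch subject) usable for tool calls. The Python below mirrors the style of examples/server/tests/unit/test_tool_call.py and assumes its fixtures (the global `server`, TEST_TOOL, TIMEOUT_SERVER_START); the helper name and the request payload beyond "max_tokens" are illustrative assumptions, not the literal test body.

    def do_tool_call_with_chatml_override(hf_repo: str):
        # Assumes the `server` fixture and constants from test_tool_call.py.
        global server
        server.jinja = True
        server.n_slots = 1
        server.n_predict = 512
        server.model_hf_repo = hf_repo
        server.model_hf_file = None
        # A plain string override is forwarded as the --chat-template value (here the
        # built-in "chatml"), instead of the (hf_repo, variant) tuple form that points
        # at a downloaded template file via server.chat_template_file.
        server.chat_template = "chatml"
        server.start(timeout_seconds=TIMEOUT_SERVER_START)
        # Illustrative payload: the real tests go on to assert on the tool call
        # returned in the response body; those assertions are omitted here.
        res = server.make_request("POST", "/chat/completions", data={
            "max_tokens": 512,
            "messages": [{"role": "user", "content": "say hello world with python"}],
            "tools": [TEST_TOOL],
        })
        return res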