From e6d9b52480ab0335c281537d87603b8b46c1f117 Mon Sep 17 00:00:00 2001
From: Olivier Chafik
Date: Wed, 5 Feb 2025 15:47:37 +0000
Subject: [PATCH] align Command R7B w/ --think / reasoning_content behaviour

---
 common/arg.cpp                               |  2 +-
 common/chat-template.hpp                     |  2 +-
 common/chat.cpp                              | 69 +++++++++++----
 common/chat.hpp                              |  1 +
 common/common.h                              |  1 -
 examples/server/README.md                    |  8 +-
 examples/server/server.cpp                   |  3 -
 examples/server/tests/unit/test_tool_call.py | 90 ++++++++++----------
 tests/test-chat.cpp                          | 87 +++++++++++++++----
 9 files changed, 176 insertions(+), 87 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index ba1999829..de2e97dca 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1978,7 +1978,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"--think"},
         "*experimental* thinking mode (default: disabled)\n"
-        "returns reasoning_content in messages, forcing model to think unless it supports native tags (DeepSeek R1)\n"
+        "returns reasoning_content in messages, forcing model to think unless it supports native tags (DeepSeek R1, Command R7B)\n"
         "only supported for non-streamed responses",
         [](common_params & params) {
             params.think = true;
diff --git a/common/chat-template.hpp b/common/chat-template.hpp
index 0e88fb361..36dff41db 100644
--- a/common/chat-template.hpp
+++ b/common/chat-template.hpp
@@ -316,7 +316,7 @@ class chat_template {
         auto polyfill_system_role = opts.polyfill_system_role && !caps_.supports_system_role;
         auto polyfill_tools = opts.polyfill_tools && has_tools && !caps_.supports_tools;
-        auto polyfill_tool_call_example = polyfill_tools && opts.polyfill_tool_call_examples;
+        auto polyfill_tool_call_example = polyfill_tools && opts.polyfill_tool_call_examples && caps_.supports_tool_calls;
         auto polyfill_tool_calls = opts.polyfill_tool_calls && has_tool_calls && !caps_.supports_tool_calls;
         auto polyfill_tool_responses = opts.polyfill_tool_responses && has_tool_responses && !caps_.supports_tool_responses;
         auto polyfill_object_arguments = opts.polyfill_object_arguments && has_tool_calls && caps_.requires_object_arguments;
diff --git a/common/chat.cpp b/common/chat.cpp
index cba713553..2ff9aa397 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -12,12 +12,13 @@ std::string common_chat_format_name(common_chat_format format) {
         case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
         case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
         case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
-        case COMMON_CHAT_FORMAT_DEEPSEEK_R1_THINK: return "DeepSeek R1 (extract <think>)";
+        case COMMON_CHAT_FORMAT_DEEPSEEK_R1_THINK: return "DeepSeek R1 (extract reasoning_content)";
         case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
         case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
         case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
         case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+        case COMMON_CHAT_FORMAT_COMMAND_R7B_THINK: return "Command R7B (extract reasoning_content)";
         default:
             throw std::runtime_error("Unknown chat format");
     }
@@ -469,22 +470,49 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
         "<|END_THINKING|>",
         "<|END_ACTION|>",
     };
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
-    data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
+    auto adjusted_messages = json::array();
+    for (const auto & msg : inputs.messages) {
+        auto has_reasoning_content = msg.contains("reasoning_content") && msg["reasoning_content"].is_string();
+        auto has_tool_calls = msg.contains("tool_calls") && msg["tool_calls"].is_array();
+        if (has_reasoning_content && has_tool_calls) {
+            auto adjusted_message = msg;
+            adjusted_message["tool_plan"] = msg["reasoning_content"];
+            adjusted_message.erase("reasoning_content");
+            adjusted_messages.push_back(adjusted_message);
+        } else {
+            adjusted_messages.push_back(msg);
+        }
+    }
+    data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
+    data.format = inputs.think ? COMMON_CHAT_FORMAT_COMMAND_R7B_THINK : COMMON_CHAT_FORMAT_COMMAND_R7B;
     return data;
 }
-static common_chat_msg common_chat_parse_command_r7b(const std::string & input) {
-    static std::regex response_regex("<\\|START_RESPONSE\\|>([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
-    static std::regex thought_action_regex("<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|><\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
+static common_chat_msg common_chat_parse_command_r7b(const std::string & input, bool think) {
+    static std::regex thought_regex("(<\\|START_THINKING\\|>([\\s\\S\\n\\r]*?)<\\|END_THINKING\\|>)([\\s\\S\\n\\r]*)");
+    static std::regex action_regex("<\\|START_ACTION\\|>([\\s\\S\\n\\r]*?)<\\|END_ACTION\\|>");
+    static std::regex response_regex("(?:<\\|START_RESPONSE\\|>)?([\\s\\S\\n\\r]*?)<\\|END_RESPONSE\\|>");
+
     std::smatch match;
     common_chat_msg result;
     result.role = "assistant";
-    if (std::regex_match(input, match, response_regex)) {
-        result.content = match[1].str();
-    } else if (std::regex_match(input, match, thought_action_regex)) {
-        result.tool_plan = match[1].str();
-        auto actions_str = match[2].str();
+
+    std::string rest = input;
+
+    if (std::regex_match(rest, match, thought_regex)) {
+        if (think) {
+            result.reasoning_content = match[2].str();
+        } else if (!match[2].str().empty()) {
+            // Let the unparsed thinking tags through in content only if their insides aren't empty.
+            result.content = match[1].str();
+        }
+        rest = match[3].str();
+    }
+    if (std::regex_match(rest, match, action_regex)) {
+        auto actions_str = match[1].str();
         auto actions = json::parse(actions_str);
         for (const auto & action : actions) {
             result.tool_calls.push_back({
@@ -493,9 +521,12 @@ static common_chat_msg common_chat_parse_command_r7b(const std::string & input)
             /* .id = */ action["tool_call_id"],
             });
         }
+    } else if (std::regex_match(rest, match, response_regex)) {
+        auto response = match[1].str();
+        result.content += response;
     } else {
         LOG_ERR("Failed to parse command_r output");
-        result.content = input;
+        result.content += rest;
     }
     return result;
 }
@@ -1038,6 +1069,11 @@ common_chat_params common_chat_params_init(const common_chat_template & tmpl, co
         return common_chat_params_init_deepseek_r1(tmpl, inputs);
     }
 
+    // Command R7B: use handler in all cases except json schema (thinking / tools).
+    if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos && inputs.json_schema.is_null()) {
+        return common_chat_params_init_command_r7b(tmpl, inputs);
+    }
+
     // Use generic handler when forcing thoughts or JSON schema for final output
     // TODO: support thinking mode and/or JSON schema in handlers below this.
     if (inputs.think || (!inputs.tools.is_null() && inputs.json_schema.is_object())) {
@@ -1081,11 +1117,6 @@ common_chat_params common_chat_params_init(const common_chat_template & tmpl, co
         return common_chat_params_init_mistral_nemo(tmpl, inputs);
     }
 
-    // Command R7B (w/ tools)
-    if (src.find("<|END_THINKING|><|START_ACTION|>") != std::string::npos) {
-        return common_chat_params_init_command_r7b(tmpl, inputs);
-    }
-
     // Generic fallback
     return common_chat_params_init_generic(tmpl, inputs);
 }
@@ -1123,7 +1154,9 @@ common_chat_msg common_chat_parse(const std::string & input, common_chat_format
         case COMMON_CHAT_FORMAT_FIREFUNCTION_V2:
             return common_chat_parse_firefunction_v2(input);
         case COMMON_CHAT_FORMAT_COMMAND_R7B:
-            return common_chat_parse_command_r7b(input);
+            return common_chat_parse_command_r7b(input, /* think= */ false);
+        case COMMON_CHAT_FORMAT_COMMAND_R7B_THINK:
+            return common_chat_parse_command_r7b(input, /* think= */ true);
         default:
             throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
     }
diff --git a/common/chat.hpp b/common/chat.hpp
index 9bd9dc5ef..d3272f70f 100644
--- a/common/chat.hpp
+++ b/common/chat.hpp
@@ -35,6 +35,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_COMMAND_R7B_THINK,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
diff --git a/common/common.h b/common/common.h
index e389a29d0..76de599f6 100644
--- a/common/common.h
+++ b/common/common.h
@@ -625,7 +625,6 @@ struct common_chat_msg {
     std::string content;
     std::vector<common_chat_tool_call> tool_calls;
     std::string reasoning_content = "";
-    std::string tool_plan = "";
 };
 
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
diff --git a/examples/server/README.md b/examples/server/README.md
index 359fd8578..944f1a885 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -127,6 +127,8 @@ The project is under active development, and we are [looking for feedback and co
 | `--grammar-file FNAME` | file to read grammar from |
 | `-j, --json-schema SCHEMA` | JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object<br/>For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead |
 | `--jinja` | Enable experimental Jinja templating engine (required for tool use) |
+| `--think` | Enable experimental thinking mode (extracts DeepSeek R1 & Command R7B's native thinking tags and forces any other model to think before responding; the resulting thoughts are returned in the `reasoning_content` output field) (requires `--jinja`) |
 
 **Example-specific params**
@@ -1223,10 +1225,10 @@ curl http://localhost:8080/v1/chat/completions \
 
   # Native support for DeepSeek R1 works best w/ our own template (official template buggy)
-  llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L \
+  llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q6_K_L --think \
     --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
 
-  llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M \
+  llama-server --jinja -fa -hf bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF:Q4_K_M --think \
     --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja
 
   # Native support requires the right template for these GGUFs:
@@ -1240,7 +1242,7 @@ curl http://localhost:8080/v1/chat/completions \
   llama-server --jinja -fa -hf bartowski/firefunction-v2-GGUF -hff firefunction-v2-IQ1_M.gguf \
     --chat-template-file <( python scripts/get_chat_template.py fireworks-ai/llama-3-firefunction-v2 tool_use )
 
-  llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L \
+  llama-server --jinja -fa -hf bartowski/c4ai-command-r7b-12-2024-GGUF:Q6_K_L --think \
     --chat-template-file <( python scripts/get_chat_template.py CohereForAI/c4ai-command-r7b-12-2024 tool_use )
 
   # Generic format support
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index bc0689d0f..05b73ef73 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -748,9 +748,6 @@ struct server_task_result_cmpl_final : server_task_result {
             if (!msg.reasoning_content.empty()) {
                 message["reasoning_content"] = msg.reasoning_content;
             }
-            if (!msg.tool_plan.empty()) {
-                message["tool_plan"] = msg.tool_plan;
-            }
 
             json choice {
                 {"finish_reason", finish_reason},
diff --git a/examples/server/tests/unit/test_tool_call.py b/examples/server/tests/unit/test_tool_call.py
index 538b42fea..de02e8184 100644
--- a/examples/server/tests/unit/test_tool_call.py
+++ b/examples/server/tests/unit/test_tool_call.py
@@ -274,43 +274,44 @@ def test_completion_without_tool_call_slow(template_name: str, n_predict: int, t
 
 @pytest.mark.slow
-@pytest.mark.parametrize("hf_repo,template_override", [
-    ("bartowski/c4ai-command-r7b-12-2024-GGUF:Q4_K_M", ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")),
+@pytest.mark.parametrize("think,hf_repo,template_override", [
+    (True, "bartowski/c4ai-command-r7b-12-2024-GGUF:Q4_K_M", ("CohereForAI/c4ai-command-r7b-12-2024", "tool_use")),
 
-    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
-    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
+    (False, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None),
+    (False, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"),
 
-    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
-    ("bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
+    (False, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
+    (False, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"),
 
-    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
-    ("bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
+    (False, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None),
+    (False, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"),
 
-    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
-    ("bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
+    (False, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")),
+    (False, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"),
 
-    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
-    ("bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
+    (False, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-3-Llama-3.1-8B", "tool_use")),
+    (False, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"),
 
-    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
-    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
+    (False, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None),
+    (False, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"),
 
-    ("bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
-    ("bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
+    (False, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai/functionary-medium-v3.2", None)),
+    (False, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"),
 
-    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
-    ("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
+    (False, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)),
+    (False, "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"),
 
-    ("bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
+    (True, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
 
     # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it.
- ("bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + (False, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), # ("bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama/Llama-3.2-3B-Instruct", None)), ]) -def test_weather(hf_repo: str, template_override: Tuple[str, str | None] | None): +def test_weather(think: bool, hf_repo: str, template_override: Tuple[str, str | None] | None): global server n_predict = 512 + server.think = think server.n_slots = 1 server.jinja = True server.n_ctx = 8192 @@ -488,44 +489,45 @@ def test_thoughts(n_predict: int, think: bool, expect_content: str | None, expec @pytest.mark.slow -@pytest.mark.parametrize("expected_arguments_override,hf_repo,template_override", [ - (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), - (None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", "chatml"), +@pytest.mark.parametrize("think,expected_arguments_override,hf_repo,template_override", [ + (True, None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None), + (True, None, "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), + (False, None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None), + (False, None, "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), - (None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), + (False, None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", ("meetkai-functionary-medium-v3.2", None)), + (False, None, "bartowski/functionary-small-v3.2-GGUF:Q8_0", "chatml"), - (None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), - ('{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), + (False, None, "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", None), + (False, '{"code":"print("}', "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Q4_K_M", "chatml"), - ('{"code":"print("}', "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), - (None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), + (False, '{"code":"print("}', "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), + (False, None, "bartowski/Llama-3.2-1B-Instruct-GGUF:Q4_K_M", "chatml"), - ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), - ('{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), + (False, '{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", ("meta-llama-Llama-3.2-3B-Instruct", None)), + (False, '{"code":"print("}', "bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), - (None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), + (False, None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", None), + (False, None, "bartowski/Qwen2.5-7B-Instruct-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), - (None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), + (False, None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", ("NousResearch/Hermes-2-Pro-Llama-3-8B", "tool_use")), + (False, None, "bartowski/Hermes-2-Pro-Llama-3-8B-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", 
("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")), - (None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), + (False, None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", ("NousResearch-Hermes-3-Llama-3.1-8B", "tool_use")), + (False, None, "bartowski/Hermes-3-Llama-3.1-8B-GGUF:Q4_K_M", "chatml"), - (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), - (None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), + (False, None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", None), + (False, None, "bartowski/Mistral-Nemo-Instruct-2407-GGUF:Q4_K_M", "chatml"), # Note: gemma-2-2b-it knows itself as "model", not "assistant", so we don't test the ill-suited chatml on it. - (None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), + (False, None, "bartowski/gemma-2-2b-it-GGUF:Q4_K_M", None), ]) -def test_hello_world_tool_call(expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): +def test_hello_world(think: bool, expected_arguments_override: str | None, hf_repo: str, template_override: str | Tuple[str, str | None] | None): global server server.n_slots = 1 server.jinja = True + server.think = think server.n_ctx = 8192 server.n_predict = 512 # High because of DeepSeek R1 server.model_hf_repo = hf_repo diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index a556098be..865e7fbfe 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -24,7 +24,7 @@ static common_chat_msg msg_from_json(const json & message) { ret.content = message.at("content"); } if (message.contains("tool_plan")) { - ret.tool_plan = message.at("tool_plan"); + ret.reasoning_content = message.at("tool_plan"); } if (message.contains("reasoning_content")) { ret.reasoning_content = message.at("reasoning_content"); @@ -109,7 +109,6 @@ static void assert_msg_equals(const common_chat_msg & expected, const common_cha assert_equals(expected.role, actual.role); assert_equals(expected.content, actual.content); assert_equals(expected.reasoning_content, actual.reasoning_content); - assert_equals(expected.tool_plan, actual.tool_plan); assert_equals(expected.tool_calls.size(), actual.tool_calls.size()); for (size_t i = 0; i < expected.tool_calls.size(); i++) { const auto & expected_tool_call = expected.tool_calls[i]; @@ -181,13 +180,15 @@ struct delta_data { static delta_data init_delta(const common_chat_template & tmpl, const std::vector & end_tokens, const json & user_message, const json & delta_message, const json & tools, - const json & tool_choice) { + const json & tool_choice, + bool think = false) { common_chat_inputs inputs; inputs.parallel_tool_calls = true; inputs.messages = json::array(); inputs.messages.push_back(user_message); inputs.tools = tools; inputs.tool_choice = tool_choice; + inputs.think = think; auto params_prefix = common_chat_params_init(tmpl, inputs); inputs.messages.push_back(delta_message); @@ -229,7 +230,8 @@ static delta_data init_delta(const common_chat_template & tmpl, const std::vecto static void test_template(const common_chat_template & tmpl, const std::vector & end_tokens, const json & test_message, const json & tools = {}, const std::string & expected_delta = "", bool expect_grammar_triggered = true, - bool test_grammar_if_triggered = true) { + bool test_grammar_if_triggered = true, + bool think = false) { common_chat_msg expected_msg = msg_from_json(test_message); auto user_message = json{ @@ -238,7 +240,7 @@ static void test_template(const common_chat_template & tmpl, const std::vectorI'm thinking
Hello, world!\nWhat's up?" }, }; + json message_assist_thoughts_unparsed_r7b { + { "role", "assistant" }, + { "content", "<|START_THINKING|>I'm thinking<|END_THINKING|>Hello, world!\nWhat's up?" }, + }; json message_assist_thoughts { { "role", "assistant" }, { "content", "Hello, world!\nWhat's up?" }, @@ -371,7 +377,6 @@ static void test_template_output_parsers() { json message_assist_call_idx { { "role", "assistant"}, { "content", {}}, - { "tool_plan", "I'm not so sure"}, { "tool_calls", { { { "type", "function" }, @@ -387,6 +392,8 @@ static void test_template_output_parsers() { { "content", {} }, { "tool_calls", tool_calls } }; + json message_assist_call_tool_plan_idx = message_assist_call_idx; + message_assist_call_tool_plan_idx["tool_plan"] = "I'm thinking"; auto python_message_assist_call = json{ { "role", "assistant" }, @@ -448,14 +455,52 @@ static void test_template_output_parsers() { const common_chat_template tmpl(read_file("models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja"), "", ""); std::vector end_tokens{ "<|END_OF_TURN_TOKEN|>" }; - assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_params_init(tmpl, inputs_no_tools).format); - assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B, common_chat_params_init(tmpl, inputs_tools).format); + assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B, common_chat_params_init(tmpl, inputs_no_tools).format); + assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B, common_chat_params_init(tmpl, inputs_tools).format); + assert_equals(COMMON_CHAT_FORMAT_COMMAND_R7B_THINK, common_chat_params_init(tmpl, inputs_tools_think).format); + + assert_msg_equals(msg_from_json(message_assist), + common_chat_parse( + "Hello, world!\nWhat's up?", + COMMON_CHAT_FORMAT_COMMAND_R7B)); + assert_msg_equals(msg_from_json(message_assist), + common_chat_parse( + "Hello, world!\nWhat's up?<|END_RESPONSE|>", + COMMON_CHAT_FORMAT_COMMAND_R7B)); + assert_msg_equals(msg_from_json(message_assist), + common_chat_parse( + "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", + COMMON_CHAT_FORMAT_COMMAND_R7B)); + assert_msg_equals(msg_from_json(message_assist_thoughts_unparsed_r7b), + common_chat_parse( + "<|START_THINKING|>I'm thinking<|END_THINKING|>" + "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", + COMMON_CHAT_FORMAT_COMMAND_R7B)); + assert_msg_equals(msg_from_json(message_assist_thoughts_unparsed_r7b), + common_chat_parse( + "<|START_THINKING|>I'm thinking<|END_THINKING|>" + "Hello, world!\nWhat's up?<|END_RESPONSE|>", + COMMON_CHAT_FORMAT_COMMAND_R7B)); + + assert_msg_equals(msg_from_json(message_assist_thoughts), + common_chat_parse( + "<|START_THINKING|>I'm thinking<|END_THINKING|>" + "<|START_RESPONSE|>Hello, world!\nWhat's up?<|END_RESPONSE|>", + COMMON_CHAT_FORMAT_COMMAND_R7B_THINK)); test_template(tmpl, end_tokens, message_assist_call_idx, tools, - "<|START_THINKING|>I'm not so sure<|END_THINKING|>" + "<|START_THINKING|><|END_THINKING|>" "<|START_ACTION|>[\n" " {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n" "]<|END_ACTION|>"); + test_template(tmpl, end_tokens, message_assist_call_tool_plan_idx, tools, + "<|START_THINKING|>I'm thinking<|END_THINKING|>" + "<|START_ACTION|>[\n" + " {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n" + "]<|END_ACTION|>", + /* expect_grammar_triggered= */ true, + /* test_grammar_if_triggered= */ true, + /* think= */ true); test_template(tmpl, end_tokens, message_assist, tools, "<|START_RESPONSE|>Hello, world!\n" 
"What's up?<|END_RESPONSE|>", @@ -616,12 +661,17 @@ static void test_template_output_parsers() { "", ""); std::vector end_tokens{ "<|end▁of▁sentence|>" }; - assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_params_init(tmpl, inputs_tools).format); + assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_params_init(tmpl, inputs_tools).format); + assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1_THINK, common_chat_params_init(tmpl, inputs_tools_think).format); test_template(tmpl, end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); test_template(tmpl, end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); - assert_msg_equals(msg_from_json(message_assist_thoughts_unparsed), common_chat_parse("I'm thinkingHello, world!\nWhat's up?", COMMON_CHAT_FORMAT_DEEPSEEK_R1)); - assert_msg_equals(msg_from_json(message_assist_thoughts), common_chat_parse("I'm thinkingHello, world!\nWhat's up?", COMMON_CHAT_FORMAT_DEEPSEEK_R1_THINK)); + assert_msg_equals(msg_from_json(message_assist_thoughts_unparsed_think), + common_chat_parse("I'm thinkingHello, world!\nWhat's up?", + COMMON_CHAT_FORMAT_DEEPSEEK_R1)); + assert_msg_equals(msg_from_json(message_assist_thoughts), + common_chat_parse("I'm thinkingHello, world!\nWhat's up?", + COMMON_CHAT_FORMAT_DEEPSEEK_R1_THINK)); // test_template(tmpl, end_tokens, message_assist_call, tools, // "<|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>special_function\n" // "```json\n" @@ -637,12 +687,17 @@ static void test_template_output_parsers() { "", ""); std::vector end_tokens{ "<|end▁of▁sentence|>" }; - assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_params_init(tmpl, inputs_tools).format); + assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1, common_chat_params_init(tmpl, inputs_tools).format); + assert_equals(COMMON_CHAT_FORMAT_DEEPSEEK_R1_THINK, common_chat_params_init(tmpl, inputs_tools_think).format); test_template(tmpl, end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); test_template(tmpl, end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false); - assert_msg_equals(msg_from_json(message_assist_thoughts_unparsed), common_chat_parse("I'm thinkingHello, world!\nWhat's up?", COMMON_CHAT_FORMAT_DEEPSEEK_R1)); - assert_msg_equals(msg_from_json(message_assist_thoughts), common_chat_parse("I'm thinkingHello, world!\nWhat's up?", COMMON_CHAT_FORMAT_DEEPSEEK_R1_THINK)); + assert_msg_equals(msg_from_json(message_assist_thoughts_unparsed_think), + common_chat_parse("I'm thinkingHello, world!\nWhat's up?", + COMMON_CHAT_FORMAT_DEEPSEEK_R1)); + assert_msg_equals(msg_from_json(message_assist_thoughts), + common_chat_parse("I'm thinkingHello, world!\nWhat's up?", + COMMON_CHAT_FORMAT_DEEPSEEK_R1_THINK)); assert_msg_equals(msg_from_json(message_assist_call_thoughts_unparsed), common_chat_parse(