minja: fix identifiers parsing (when start w/ not/is/etc) and lstrip_blocks corner case (needed by DeepSeek-V2.5)

This commit is contained in:
ochafik 2024-09-27 18:30:44 +01:00
parent 2f25ee30ef
commit 0093a5e527
9 changed files with 32 additions and 6 deletions

View file

@@ -1689,7 +1689,7 @@ private:
}
std::unique_ptr<VariableExpr> parseIdentifier() {
static std::regex ident_regex(R"((?!not|is|and|or|del)[a-zA-Z_]\w*)");
static std::regex ident_regex(R"((?!(?:not|is|and|or|del)\b)[a-zA-Z_]\w*)");
auto location = get_location();
auto ident = consumeToken(ident_regex);
if (ident.empty())
@@ -2165,7 +2165,7 @@ private:
static std::regex trailing_space_regex(R"((\s|\r|\n)+$)");
text = std::regex_replace(text, trailing_space_regex, "");
} else if (options.lstrip_blocks && it != end) {
static std::regex trailing_last_line_space_regex(R"((^|\n)[ \t]*$)");
static std::regex trailing_last_line_space_regex(R"((\n)[ \t]*$)");
text = std::regex_replace(text, trailing_last_line_space_regex, "$1");
}

View file

@@ -0,0 +1,3 @@
<|startoftext|>User: What's your favourite LLM framework?
Assistant: llama.cpp!<|endoftext|>Assistant:

View file

@@ -0,0 +1,5 @@
<|startoftext|>You only tell the truth.
User: What's your favourite LLM framework?
Assistant: llama.cpp!<|endoftext|>Assistant:

View file

@@ -0,0 +1 @@
<|startoftext|><User>What's your favourite LLM framework?<Assistant>llama.cpp!<end▁of▁sentence><Assistant>

View file

@@ -0,0 +1 @@
<|startoftext|>You only tell the truth.<User>What's your favourite LLM framework?<Assistant>llama.cpp!<end▁of▁sentence><Assistant>

View file

@@ -0,0 +1,5 @@
{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '
' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '
' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}

View file

@@ -0,0 +1 @@
{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %} {%- if message['role'] == 'system' %} {% set ns.system_prompt = message['content'] %} {%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %} {%- if message['role'] == 'user' %} {%- set ns.is_tool = false -%}{{'<User>' + message['content']}} {%- endif %} {%- if message['role'] == 'assistant' and message['content'] is none %} {%- set ns.is_tool = false -%} {%- for tool in message['tool_calls']%} {%- if not ns.is_first %}{{'<Assistant><tool▁calls▁begin><tool▁call▁begin>' + tool['type'] + '<tool▁sep>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<tool▁call▁end>'}} {%- set ns.is_first = true -%} {%- else %}{{'\n' + '<tool▁call▁begin>' + tool['type'] + '<tool▁sep>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<tool▁call▁end>'}}{{'<tool▁calls▁end><end▁of▁sentence>'}} {%- endif %} {%- endfor %} {%- endif %} {%- if message['role'] == 'assistant' and message['content'] is not none %} {%- if ns.is_tool %}{{'<tool▁outputs▁end>' + message['content'] + '<end▁of▁sentence>'}} {%- set ns.is_tool = false -%} {%- else %}{{'<Assistant>' + message['content'] + '<end▁of▁sentence>'}} {%- endif %} {%- endif %} {%- if message['role'] == 'tool' %} {%- set ns.is_tool = true -%} {%- if ns.is_output_first %}{{'<tool▁outputs▁begin><tool▁output▁begin>' + message['content'] + '<tool▁output▁end>'}} {%- set ns.is_output_first = false %} {%- else %}{{'\n<tool▁output▁begin>' + message['content'] + '<tool▁output▁end>'}} {%- endif %} {%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<tool▁outputs▁end>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<Assistant>'}}{% endif %}

View file

@@ -119,6 +119,17 @@ static void test_error_contains(const std::string & template_str, const json & b
cmake -B build -DCMAKE_BUILD_TYPE=Release && cmake --build build -t test-minja -j && ./build/bin/test-minja
*/
int main() {
test_render(R"({%- if True %} {% set _ = x %}{%- endif %}{{ 1 }})",
{},
{
.lstrip_blocks = true,
.trim_blocks = true
},
" 1"
);
test_render(R"( {{- 'a' -}}{{ ' ' }}{{- 'b' -}} )", {}, {}, "a b");
test_render(R"( {%- if True %}{%- endif %}{{ ' ' }}{%- for x in [] %}foo{% endfor %}end)", {}, {}, " end");
test_render(R"({% set ns = namespace(is_first=false, nottool=false, and_or=true, delme='') %}{{ ns.is_first }})", {}, {}, "False");
test_render(R"({{ {} is mapping }},{{ '' is mapping }})", {}, {}, "True,False");
test_render(R"({{ {} is iterable }},{{ '' is iterable }})", {}, {}, "True,True");
test_render(R"({% for x in ["a", "b"] %}{{ x }},{% endfor %})", {}, {}, "a,b,");

View file

@@ -33,9 +33,11 @@ model_ids = [
"abacusai/Fewshot-Metamath-OrcaVicuna-Mistral",
"bofenghuang/vigogne-2-70b-chat",
"deepseek-ai/deepseek-coder-33b-instruct",
"deepseek-ai/DeepSeek-Coder-V2-Instruct",
"deepseek-ai/DeepSeek-V2.5",
"indischepartij/MiniCPM-3B-OpenHermes-2.5-v2",
"meetkai/functionary-medium-v3.2",
"meetkai/functionary-medium-v3.1",
"meetkai/functionary-medium-v3.2",
"microsoft/Phi-3-medium-4k-instruct",
"microsoft/Phi-3-mini-4k-instruct",
"microsoft/Phi-3-small-8k-instruct",
@@ -57,9 +59,6 @@ model_ids = [
# "CohereForAI/c4ai-command-r-plus",
# "THUDM/chatglm3-6b",
# "derek33125/project-angel-chatglm4",
# "deepseek-ai/DeepSeek-Coder-V2-Instruct",
# "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
# "deepseek-ai/DeepSeek-V2.5",
# Cannot find chat template:
# "eachadea/vicuna-13b-1.1",