From 9a86ea79a22294993b9be68890fbfcfdbe05b468 Mon Sep 17 00:00:00 2001
From: ochafik
Date: Mon, 28 Oct 2024 00:26:40 +0000
Subject: [PATCH] `tool-call`: slow tool call integration tests

---
 common/arg.cpp                                |  2 +-
 examples/server/tests/features/steps/steps.py | 50 +++++++++++++++++--
 .../server/tests/features/tool_call.feature   | 40 ++++++++++++---
 examples/server/tests/tests.sh                |  2 +-
 4 files changed, 82 insertions(+), 12 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 77f40b4a4..ab249dc05 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -864,7 +864,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index edeb52c31..e21e20fa7 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -20,7 +20,7 @@ import aiohttp
 import numpy as np
 import openai
 from openai.types.chat import ChatCompletionChunk
-from behave import step  # pyright: ignore[reportAttributeAccessIssue]
+from behave import register_type, step  # pyright: ignore[reportAttributeAccessIssue]
 from behave.api.async_step import async_run_until_complete
 from prometheus_client import parser
 
@@ -28,6 +28,13 @@ from prometheus_client import parser
 DEFAULT_TIMEOUT_SECONDS = aiohttp.ClientTimeout(total=600)
 
 
+@parse.with_pattern(r".*")
+def parse_maybe_empty_string(text):
+    return text.strip()
+
+register_type(MaybeEmptyString=parse_maybe_empty_string)
+
+
 @step("a server listening on {server_fqdn}:{server_port}")
 def step_server_config(context, server_fqdn: str, server_port: str):
     context.server_fqdn = server_fqdn
@@ -82,6 +89,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
     context.temperature = None
     context.lora_file = None
     context.disable_ctx_shift = False
+    context.warmup = True
     context.use_jinja = False
     context.chat_template_file = None
 
@@ -98,7 +106,6 @@ def step_download_hf_model(context, hf_file: str, hf_repo: str):
     context.model_hf_repo = hf_repo
     context.model_hf_file = hf_file
-    context.model_file = os.path.basename(hf_file)
 
 
 @step('a lora adapter file from {lora_file_url}')
 def step_download_lora_file(context, lora_file_url: str):
@@ -172,11 +179,23 @@ def step_use_jinja(context):
     context.use_jinja = True
 
 
+@step('no warmup')
+def step_no_warmup(context):
+    context.warmup = False
+
+
 @step('a chat template file {file}')
-def step_use_jinja(context, file):
+def step_chat_template_file(context, file):
     context.chat_template_file = file
 
 
+@step('a test chat template file named {name:MaybeEmptyString}')
+def step_test_chat_template_file_named(context, name):
+    name = name.strip()
+    if name:
+        context.chat_template_file = f'../../../tests/chat/templates/{name}.jinja'
+
+
 @step('using slot id {id_slot:d}')
 def step_id_slot(context, id_slot: int):
     context.id_slot = id_slot
@@ -390,6 +409,29 @@ def step_response_format(context, response_format):
 def step_tools(context, tools):
     context.tools = json.loads(tools)
 
+
+@step('python tool')
+def step_python_tool(context):
+    if not context.tools:
+        context.tools = []
+    context.tools.append({
+        "type": "function",
+        "function": {
+            "name": "ipython",
+            "description": "",
+            "parameters": {
"type": "object", + "properties": { + "code": { + "type": "string", + "description": "" + } + }, + "required": ["code"] + } + } + }) + @step('a tool choice {tool_choice}') def step_tool_choice(context, tool_choice): context.tool_choice = tool_choice @@ -1552,6 +1594,8 @@ def start_server_background(context): server_args.extend(['--lora', context.lora_file]) if context.disable_ctx_shift: server_args.extend(['--no-context-shift']) + if not context.warmup: + server_args.extend(['--no-warmup']) args = [str(arg) for arg in [context.server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") diff --git a/examples/server/tests/features/tool_call.feature b/examples/server/tests/features/tool_call.feature index 5a59ae67c..530565cba 100644 --- a/examples/server/tests/features/tool_call.feature +++ b/examples/server/tests/features/tool_call.feature @@ -4,20 +4,18 @@ Feature: llama.cpp server Background: Server startup Given a server listening on localhost:8080 - And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models - And a model file test-model.gguf - And a model alias tinyllama-2 And BOS token is 1 And 42 as server seed And 8192 KV cache size And 32 as batch size - And 2 slots + And 1 slots And prometheus compatible metrics exposed And jinja templates are enabled Scenario Outline: OAI Compatibility w/ tools and required tool_choice ( template, tool) - Given a chat template file ../../../tests/chat/templates/.jinja + Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a test chat template file named And the server is starting And the server is healthy And a model test @@ -44,7 +42,8 @@ Feature: llama.cpp server Scenario Outline: OAI Compatibility w/ tools and auto tool_choice ( template) - Given a chat template file ../../../tests/chat/templates/.jinja + Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a test chat template file named And the server is starting And the server is healthy And a model test @@ -62,7 +61,8 @@ Feature: llama.cpp server Scenario: OAI Compatibility w/ no tool - Given a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja + Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja And the server is starting And the server is healthy And a model test @@ -73,3 +73,29 @@ Feature: llama.cpp server And an OAI compatible chat completions request with no api error Then no tool is called + + @slow + Scenario Outline: OAI Compatibility w/ tools ( / with template) + Given a model file from HF repo + And a test chat template file named + And no warmup + And the server is starting + And the server is healthy + And a model test + And 256 max tokens to predict + And a user prompt write a hello world in python (use single quotes for strings) + And python tool + And parallel tool calls is disabled + And an OAI compatible chat completions request with no api error + Then tool is called with arguments + + Examples: Prompts + | tool_name | tool_arguments | hf_repo | hf_file | template_override | + | ipython | {"code": "print('Hello, world!')"} | NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF | Hermes-2-Pro-Llama-3-8B-Q8_0.gguf | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use | + | ipython | {"code": "print('Hello, World!')\n"} | bartowski/Mistral-Nemo-Instruct-2407-GGUF | Mistral-Nemo-Instruct-2407-Q8_0.gguf | 
mistralai-Mistral-Nemo-Instruct-2407 | + | ipython | {"code": "print('Hello, World!'}"} | lmstudio-community/Llama-3.2-1B-Instruct-GGUF | Llama-3.2-1B-Instruct-Q4_K_M.gguf | meta-llama-Llama-3.2-3B-Instruct | + | ipython | {"code": "print("} | lmstudio-community/Llama-3.2-3B-Instruct-GGUF | Llama-3.2-3B-Instruct-Q6_K.gguf | meta-llama-Llama-3.2-3B-Instruct | + | ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF | Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf | | + | ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-70B-Instruct-GGUF | Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf | | + # | ipython | {"code": "print('Hello, World!')"} | meetkai/functionary-small-v3.2-GGUF | functionary-small-v3.2.Q4_0.gguf | meetkai-functionary-medium-v3.2 | + diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh index 72a0fbad8..370495afe 100755 --- a/examples/server/tests/tests.sh +++ b/examples/server/tests/tests.sh @@ -5,7 +5,7 @@ set -eu if [ $# -lt 1 ] then # Start @llama.cpp scenario - behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp + behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp,-slow else behave "$@" fi
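A possible way to exercise the new @slow scenarios locally (not part of the patch; it assumes a server binary is already built and the commands are run from examples/server/tests, where tests.sh forwards any extra arguments straight to behave):

  # Default run: @llama.cpp scenarios only, anything tagged @slow is excluded (the first branch above).
  ./tests.sh

  # Slow tool-call scenarios; these download the GGUF models listed in the Examples table.
  ./tests.sh --summary --stop --no-capture --tags slow features/tool_call.feature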