tool-call: slow tool call integration tests

ochafik 2024-10-28 00:26:40 +00:00
parent ec9f3b101b
commit 9a86ea79a2
4 changed files with 82 additions and 12 deletions

View file

@@ -864,7 +864,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.warmup = false;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--spm-infill"},
        string_format(
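
The hunk above widens the option whose handler sets params.warmup = false (i.e. --no-warmup) from LLAMA_EXAMPLE_MAIN to the server example as well, which is what lets the test harness pass --no-warmup when it starts the server. A minimal launch sketch, assuming a llama-server binary on PATH and the default test host/port; none of this is part of the diff itself:

import subprocess

# Assumed binary name and flags for illustration only; the tests build the
# command from context.server_path and the server_args list instead.
proc = subprocess.Popen([
    "llama-server",
    "--host", "localhost",
    "--port", "8080",
    "--no-warmup",   # accepted by the server example after this change
])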

View file

@@ -20,7 +20,7 @@ import aiohttp
import numpy as np
import openai
from openai.types.chat import ChatCompletionChunk
from behave import step # pyright: ignore[reportAttributeAccessIssue]
from behave import register_type, step # pyright: ignore[reportAttributeAccessIssue]
from behave.api.async_step import async_run_until_complete
from prometheus_client import parser
@@ -28,6 +28,13 @@ from prometheus_client import parser
DEFAULT_TIMEOUT_SECONDS = aiohttp.ClientTimeout(total=600)

@parse.with_pattern(r".*")
def parse_maybe_empty_string(text):
    return text.strip()

register_type(MaybeEmptyString=parse_maybe_empty_string)

@step("a server listening on {server_fqdn}:{server_port}")
def step_server_config(context, server_fqdn: str, server_port: str):
    context.server_fqdn = server_fqdn
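
For context, register_type plugs a custom parse converter into behave's step matcher; the .* pattern is what lets a {name:MaybeEmptyString} parameter match an empty value, which the default pattern would not. A standalone sketch of the same mechanism using the parse library directly, with an illustrative format string rather than code from this commit:

import parse

@parse.with_pattern(r".*")
def parse_maybe_empty_string(text):
    return text.strip()

# behave's register_type does roughly this wiring for step patterns.
matcher = parse.compile(
    "a test chat template file named {name:MaybeEmptyString}",
    extra_types={"MaybeEmptyString": parse_maybe_empty_string},
)

print(matcher.parse("a test chat template file named "))     # matches, name == ""
print(matcher.parse("a test chat template file named foo"))  # matches, name == "foo"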
@@ -82,6 +89,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
    context.temperature = None
    context.lora_file = None
    context.disable_ctx_shift = False
    context.warmup = True
    context.use_jinja = False
    context.chat_template_file = None
@@ -98,7 +106,6 @@ def step_server_config(context, server_fqdn: str, server_port: str):
def step_download_hf_model(context, hf_file: str, hf_repo: str):
    context.model_hf_repo = hf_repo
    context.model_hf_file = hf_file
    context.model_file = os.path.basename(hf_file)

@step('a lora adapter file from {lora_file_url}')
def step_download_lora_file(context, lora_file_url: str):
@@ -172,11 +179,23 @@ def step_use_jinja(context):
    context.use_jinja = True

@step('no warmup')
def step_no_warmup(context):
    context.warmup = False

@step('a chat template file {file}')
def step_use_jinja(context, file):
def step_chat_template_file(context, file):
    context.chat_template_file = file

@step('a test chat template file named {name:MaybeEmptyString}')
def step_test_chat_template_file_named(context, name):
    name = name.strip()
    if name:
        context.chat_template_file = f'../../../tests/chat/templates/{name}.jinja'

@step('using slot id {id_slot:d}')
def step_id_slot(context, id_slot: int):
    context.id_slot = id_slot
@@ -390,6 +409,29 @@ def step_response_format(context, response_format):
def step_tools(context, tools):
    context.tools = json.loads(tools)

@step('python tool')
def step_python_tool(context):
    if not context.tools:
        context.tools = []
    context.tools.append({
        "type": "function",
        "function": {
            "name": "ipython",
            "description": "",
            "parameters": {
                "type": "object",
                "properties": {
                    "code": {
                        "type": "string",
                        "description": ""
                    }
                },
                "required": ["code"]
            }
        }
    })

@step('a tool choice {tool_choice}')
def step_tool_choice(context, tool_choice):
    context.tool_choice = tool_choice
@@ -1552,6 +1594,8 @@ def start_server_background(context):
        server_args.extend(['--lora', context.lora_file])
    if context.disable_ctx_shift:
        server_args.extend(['--no-context-shift'])
    if not context.warmup:
        server_args.extend(['--no-warmup'])
    args = [str(arg) for arg in [context.server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(args)}")

View file

@@ -4,20 +4,18 @@ Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model file test-model.gguf
And a model alias tinyllama-2
And BOS token is 1
And 42 as server seed
And 8192 KV cache size
And 32 as batch size
And 2 slots
And 1 slots
And prometheus compatible metrics exposed
And jinja templates are enabled
Scenario Outline: OAI Compatibility w/ tools and required tool_choice (<template_name> template, <tool_name> tool)
Given a chat template file ../../../tests/chat/templates/<template_name>.jinja
Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a test chat template file named <template_name>
And the server is starting
And the server is healthy
And a model test
@@ -44,7 +42,8 @@ Feature: llama.cpp server
Scenario Outline: OAI Compatibility w/ tools and auto tool_choice (<template_name> template)
Given a chat template file ../../../tests/chat/templates/<template_name>.jinja
Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a test chat template file named <template_name>
And the server is starting
And the server is healthy
And a model test
@@ -62,7 +61,8 @@ Feature: llama.cpp server
Scenario: OAI Compatibility w/ no tool
Given a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja
Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja
And the server is starting
And the server is healthy
And a model test
@@ -73,3 +73,29 @@ Feature: llama.cpp server
And an OAI compatible chat completions request with no api error
Then no tool is called
@slow
Scenario Outline: OAI Compatibility w/ tools (<hf_repo> / <hf_file> with <template_override> template)
Given a model file <hf_file> from HF repo <hf_repo>
And a test chat template file named <template_override>
And no warmup
And the server is starting
And the server is healthy
And a model test
And 256 max tokens to predict
And a user prompt write a hello world in python (use single quotes for strings)
And python tool
And parallel tool calls is disabled
And an OAI compatible chat completions request with no api error
Then tool <tool_name> is called with arguments <tool_arguments>
Examples: Prompts
| tool_name | tool_arguments | hf_repo | hf_file | template_override |
| ipython | {"code": "print('Hello, world!')"} | NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF | Hermes-2-Pro-Llama-3-8B-Q8_0.gguf | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use |
| ipython | {"code": "print('Hello, World!')\n"} | bartowski/Mistral-Nemo-Instruct-2407-GGUF | Mistral-Nemo-Instruct-2407-Q8_0.gguf | mistralai-Mistral-Nemo-Instruct-2407 |
| ipython | {"code": "print('Hello, World!'}"} | lmstudio-community/Llama-3.2-1B-Instruct-GGUF | Llama-3.2-1B-Instruct-Q4_K_M.gguf | meta-llama-Llama-3.2-3B-Instruct |
| ipython | {"code": "print("} | lmstudio-community/Llama-3.2-3B-Instruct-GGUF | Llama-3.2-3B-Instruct-Q6_K.gguf | meta-llama-Llama-3.2-3B-Instruct |
| ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF | Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf | |
| ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-70B-Instruct-GGUF | Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf | |
# | ipython | {"code": "print('Hello, World!')"} | meetkai/functionary-small-v3.2-GGUF | functionary-small-v3.2.Q4_0.gguf | meetkai-functionary-medium-v3.2 |
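
The "tool <tool_name> is called with arguments <tool_arguments>" assertion amounts to checking the single tool call in the response against the expected name and JSON-decoded arguments from the Examples row. A hedged sketch of that check, reusing the response object from the request sketch above; the step's actual implementation in steps.py may differ in its details:

import json

tool_calls = response.choices[0].message.tool_calls or []
assert len(tool_calls) == 1, f"expected exactly one tool call, got {tool_calls!r}"

expected_name = "ipython"
expected_arguments = {"code": "print('Hello, world!')"}  # e.g. the Hermes-2-Pro row

assert tool_calls[0].function.name == expected_name
assert json.loads(tool_calls[0].function.arguments) == expected_arguments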

View file

@@ -5,7 +5,7 @@ set -eu
if [ $# -lt 1 ]
then
  # Start @llama.cpp scenario
  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
  behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp,-slow
else
  behave "$@"
fi