tool-call: slow tool call integration tests

ochafik 2024-10-28 00:26:40 +00:00
parent ec9f3b101b
commit 9a86ea79a2
4 changed files with 82 additions and 12 deletions

View file

@@ -864,7 +864,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.warmup = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--spm-infill"},
        string_format(

View file

@@ -20,7 +20,7 @@ import aiohttp
 import numpy as np
 import openai
 from openai.types.chat import ChatCompletionChunk
-from behave import step  # pyright: ignore[reportAttributeAccessIssue]
+from behave import register_type, step  # pyright: ignore[reportAttributeAccessIssue]
 from behave.api.async_step import async_run_until_complete
 from prometheus_client import parser
@@ -28,6 +28,13 @@ from prometheus_client import parser
 DEFAULT_TIMEOUT_SECONDS = aiohttp.ClientTimeout(total=600)


+@parse.with_pattern(r".*")
+def parse_maybe_empty_string(text):
+    return text.strip()
+
+
+register_type(MaybeEmptyString=parse_maybe_empty_string)
+
 @step("a server listening on {server_fqdn}:{server_port}")
 def step_server_config(context, server_fqdn: str, server_port: str):
     context.server_fqdn = server_fqdn
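Note on the MaybeEmptyString type added above: behave delegates step-pattern parsing to the parse library, and parse.with_pattern attaches a regex to a converter so a placeholder like {name:MaybeEmptyString} can also match an empty Examples-table cell (the import of parse is presumably elsewhere in steps.py and not shown in this hunk). The following is a minimal standalone sketch of that mechanism, not code from this commit; the step text and the given decorator are made up for illustration.

# Minimal standalone sketch of behave's custom-type mechanism (illustrative
# only, not part of this commit). parse.with_pattern sets the regex used to
# match the {name:MaybeEmptyString} placeholder, so an empty cell still
# matches and the converter receives an empty string.
import parse
from behave import given, register_type


@parse.with_pattern(r".*")
def parse_maybe_empty_string(text):
    return text.strip()


register_type(MaybeEmptyString=parse_maybe_empty_string)


# Hypothetical step using the type; stores None when the cell is empty.
@given('a template override named {name:MaybeEmptyString}')
def step_template_override(context, name):
    context.template_override = name or None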
@@ -82,6 +89,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
     context.temperature = None
     context.lora_file = None
     context.disable_ctx_shift = False
+    context.warmup = True
     context.use_jinja = False
     context.chat_template_file = None
@@ -98,7 +106,6 @@ def step_server_config(context, server_fqdn: str, server_port: str):
 def step_download_hf_model(context, hf_file: str, hf_repo: str):
     context.model_hf_repo = hf_repo
     context.model_hf_file = hf_file
-    context.model_file = os.path.basename(hf_file)

 @step('a lora adapter file from {lora_file_url}')
 def step_download_lora_file(context, lora_file_url: str):
@@ -172,11 +179,23 @@ def step_use_jinja(context):
     context.use_jinja = True


+@step('no warmup')
+def step_no_warmup(context):
+    context.warmup = False
+
+
 @step('a chat template file {file}')
-def step_use_jinja(context, file):
+def step_chat_template_file(context, file):
     context.chat_template_file = file


+@step('a test chat template file named {name:MaybeEmptyString}')
+def step_test_chat_template_file_named(context, name):
+    name = name.strip()
+    if name:
+        context.chat_template_file = f'../../../tests/chat/templates/{name}.jinja'
+
+
 @step('using slot id {id_slot:d}')
 def step_id_slot(context, id_slot: int):
     context.id_slot = id_slot
@@ -390,6 +409,29 @@ def step_response_format(context, response_format):
 def step_tools(context, tools):
     context.tools = json.loads(tools)


+@step('python tool')
+def step_python_tool(context):
+    if not context.tools:
+        context.tools = []
+    context.tools.append({
+        "type": "function",
+        "function": {
+            "name": "ipython",
+            "description": "",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "code": {
+                        "type": "string",
+                        "description": ""
+                    }
+                },
+                "required": ["code"]
+            }
+        }
+    })
+
+
 @step('a tool choice {tool_choice}')
 def step_tool_choice(context, tool_choice):
     context.tool_choice = tool_choice
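For context, the "python tool" step above registers an ipython function definition that the scenarios send through the server's OpenAI-compatible chat completions endpoint. The sketch below shows roughly what such a request looks like using the openai client that steps.py already imports; the base URL, API key, model alias, and parameter values are placeholders for illustration, not values taken from this commit.

# Rough sketch of the kind of request the tool-call scenarios exercise
# (illustrative only). Assumes a llama.cpp server is already running on
# localhost:8080 with jinja templates enabled; base_url, api_key and
# model are placeholder values.
import openai

client = openai.OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

tools = [{
    "type": "function",
    "function": {
        "name": "ipython",
        "description": "",
        "parameters": {
            "type": "object",
            "properties": {"code": {"type": "string", "description": ""}},
            "required": ["code"],
        },
    },
}]

response = client.chat.completions.create(
    model="test",
    messages=[{"role": "user", "content": "write a hello world in python (use single quotes for strings)"}],
    tools=tools,
    tool_choice="required",
    max_tokens=256,
)
print(response.choices[0].message.tool_calls)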
@@ -1552,6 +1594,8 @@ def start_server_background(context):
         server_args.extend(['--lora', context.lora_file])
     if context.disable_ctx_shift:
         server_args.extend(['--no-context-shift'])
+    if not context.warmup:
+        server_args.extend(['--no-warmup'])

     args = [str(arg) for arg in [context.server_path, *server_args]]
     print(f"bench: starting server with: {' '.join(args)}")

View file

@@ -4,20 +4,18 @@ Feature: llama.cpp server
  Background: Server startup
    Given a server listening on localhost:8080
-    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And a model file test-model.gguf
-    And a model alias tinyllama-2
    And BOS token is 1
    And 42 as server seed
    And 8192 KV cache size
    And 32 as batch size
-    And 2 slots
+    And 1 slots
    And prometheus compatible metrics exposed
    And jinja templates are enabled


  Scenario Outline: OAI Compatibility w/ tools and required tool_choice (<template_name> template, <tool_name> tool)
-    Given a chat template file ../../../tests/chat/templates/<template_name>.jinja
+    Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And a test chat template file named <template_name>
    And the server is starting
    And the server is healthy
    And a model test
@@ -44,7 +42,8 @@ Feature: llama.cpp server

  Scenario Outline: OAI Compatibility w/ tools and auto tool_choice (<template_name> template)
-    Given a chat template file ../../../tests/chat/templates/<template_name>.jinja
+    Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And a test chat template file named <template_name>
    And the server is starting
    And the server is healthy
    And a model test
@@ -62,7 +61,8 @@ Feature: llama.cpp server

  Scenario: OAI Compatibility w/ no tool
-    Given a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja
+    Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja
    And the server is starting
    And the server is healthy
    And a model test
@@ -73,3 +73,29 @@ Feature: llama.cpp server
    And an OAI compatible chat completions request with no api error
    Then no tool is called
+
+
+  @slow
+  Scenario Outline: OAI Compatibility w/ tools (<hf_repo> / <hf_file> with <template_override> template)
+    Given a model file <hf_file> from HF repo <hf_repo>
+    And a test chat template file named <template_override>
+    And no warmup
+    And the server is starting
+    And the server is healthy
+    And a model test
+    And 256 max tokens to predict
+    And a user prompt write a hello world in python (use single quotes for strings)
+    And python tool
+    And parallel tool calls is disabled
+    And an OAI compatible chat completions request with no api error
+    Then tool <tool_name> is called with arguments <tool_arguments>
+
+    Examples: Prompts
+      | tool_name | tool_arguments | hf_repo | hf_file | template_override |
+      | ipython | {"code": "print('Hello, world!')"} | NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF | Hermes-2-Pro-Llama-3-8B-Q8_0.gguf | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use |
+      | ipython | {"code": "print('Hello, World!')\n"} | bartowski/Mistral-Nemo-Instruct-2407-GGUF | Mistral-Nemo-Instruct-2407-Q8_0.gguf | mistralai-Mistral-Nemo-Instruct-2407 |
+      | ipython | {"code": "print('Hello, World!'}"} | lmstudio-community/Llama-3.2-1B-Instruct-GGUF | Llama-3.2-1B-Instruct-Q4_K_M.gguf | meta-llama-Llama-3.2-3B-Instruct |
+      | ipython | {"code": "print("} | lmstudio-community/Llama-3.2-3B-Instruct-GGUF | Llama-3.2-3B-Instruct-Q6_K.gguf | meta-llama-Llama-3.2-3B-Instruct |
+      | ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF | Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf | |
+      | ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-70B-Instruct-GGUF | Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf | |
+      # | ipython | {"code": "print('Hello, World!')"} | meetkai/functionary-small-v3.2-GGUF | functionary-small-v3.2.Q4_0.gguf | meetkai-functionary-medium-v3.2 |
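The "Then tool <tool_name> is called with arguments <tool_arguments>" assertion is implemented in steps.py and not shown in this diff. A rough sketch of the check it implies, given an openai chat completion response, might look like the following; the helper name and exact comparison are assumptions for illustration and may differ from the real step.

# Illustrative sketch of the assertion behind
# "Then tool <tool_name> is called with arguments <tool_arguments>"
# (the actual step implementation lives in steps.py and may differ).
import json


def assert_tool_called(response, expected_name, expected_arguments_json):
    tool_calls = response.choices[0].message.tool_calls
    assert tool_calls, "expected the model to call a tool"
    call = tool_calls[0].function
    assert call.name == expected_name
    # Compare parsed JSON so key order and whitespace do not matter.
    assert json.loads(call.arguments) == json.loads(expected_arguments_json)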

View file

@@ -5,7 +5,7 @@ set -eu
 if [ $# -lt 1 ]
 then
   # Start @llama.cpp scenario
-  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+  behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp,-slow
 else
   behave "$@"
 fi