From 9a86ea79a22294993b9be68890fbfcfdbe05b468 Mon Sep 17 00:00:00 2001
From: ochafik
Date: Mon, 28 Oct 2024 00:26:40 +0000
Subject: [PATCH] `tool-call`: slow tool call integration tests

---
 common/arg.cpp                                |  2 +-
 examples/server/tests/features/steps/steps.py | 50 +++++++++++++++++--
 .../server/tests/features/tool_call.feature   | 40 ++++++++++++---
 examples/server/tests/tests.sh                |  2 +-
 4 files changed, 82 insertions(+), 12 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 77f40b4a4..ab249dc05 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -864,7 +864,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index edeb52c31..e21e20fa7 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -20,7 +20,7 @@ import aiohttp
 import numpy as np
 import openai
 from openai.types.chat import ChatCompletionChunk
-from behave import step  # pyright: ignore[reportAttributeAccessIssue]
+from behave import register_type, step  # pyright: ignore[reportAttributeAccessIssue]
 from behave.api.async_step import async_run_until_complete
 from prometheus_client import parser
 
@@ -28,6 +28,13 @@ from prometheus_client import parser
 DEFAULT_TIMEOUT_SECONDS = aiohttp.ClientTimeout(total=600)
 
 
+@parse.with_pattern(r".*")
+def parse_maybe_empty_string(text):
+    return text.strip()
+
+register_type(MaybeEmptyString=parse_maybe_empty_string)
+
+
 @step("a server listening on {server_fqdn}:{server_port}")
 def step_server_config(context, server_fqdn: str, server_port: str):
     context.server_fqdn = server_fqdn
@@ -82,6 +89,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
     context.temperature = None
     context.lora_file = None
     context.disable_ctx_shift = False
+    context.warmup = True
     context.use_jinja = False
     context.chat_template_file = None
 
@@ -98,7 +106,6 @@ def step_download_hf_model(context, hf_file: str, hf_repo: str):
     context.model_hf_repo = hf_repo
     context.model_hf_file = hf_file
-    context.model_file = os.path.basename(hf_file)
 
 
 @step('a lora adapter file from {lora_file_url}')
 def step_download_lora_file(context, lora_file_url: str):
@@ -172,11 +179,23 @@ def step_use_jinja(context):
     context.use_jinja = True
 
 
+@step('no warmup')
+def step_no_warmup(context):
+    context.warmup = False
+
+
 @step('a chat template file {file}')
-def step_use_jinja(context, file):
+def step_chat_template_file(context, file):
     context.chat_template_file = file
 
 
+@step('a test chat template file named {name:MaybeEmptyString}')
+def step_test_chat_template_file_named(context, name):
+    name = name.strip()
+    if name:
+        context.chat_template_file = f'../../../tests/chat/templates/{name}.jinja'
+
+
 @step('using slot id {id_slot:d}')
 def step_id_slot(context, id_slot: int):
     context.id_slot = id_slot
@@ -390,6 +409,29 @@ def step_response_format(context, response_format):
 def step_tools(context, tools):
     context.tools = json.loads(tools)
 
+
+@step('python tool')
+def step_python_tool(context):
+    if not context.tools:
+        context.tools = []
+    context.tools.append({
+        "type": "function",
+        "function": {
+            "name": "ipython",
+            "description": "",
+            "parameters": {
"type": "object", + "properties": { + "code": { + "type": "string", + "description": "" + } + }, + "required": ["code"] + } + } + }) + @step('a tool choice {tool_choice}') def step_tool_choice(context, tool_choice): context.tool_choice = tool_choice @@ -1552,6 +1594,8 @@ def start_server_background(context): server_args.extend(['--lora', context.lora_file]) if context.disable_ctx_shift: server_args.extend(['--no-context-shift']) + if not context.warmup: + server_args.extend(['--no-warmup']) args = [str(arg) for arg in [context.server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") diff --git a/examples/server/tests/features/tool_call.feature b/examples/server/tests/features/tool_call.feature index 5a59ae67c..530565cba 100644 --- a/examples/server/tests/features/tool_call.feature +++ b/examples/server/tests/features/tool_call.feature @@ -4,20 +4,18 @@ Feature: llama.cpp server Background: Server startup Given a server listening on localhost:8080 - And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models - And a model file test-model.gguf - And a model alias tinyllama-2 And BOS token is 1 And 42 as server seed And 8192 KV cache size And 32 as batch size - And 2 slots + And 1 slots And prometheus compatible metrics exposed And jinja templates are enabled Scenario Outline: OAI Compatibility w/ tools and required tool_choice ( template, tool) - Given a chat template file ../../../tests/chat/templates/.jinja + Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a test chat template file named And the server is starting And the server is healthy And a model test @@ -44,7 +42,8 @@ Feature: llama.cpp server Scenario Outline: OAI Compatibility w/ tools and auto tool_choice ( template) - Given a chat template file ../../../tests/chat/templates/.jinja + Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a test chat template file named And the server is starting And the server is healthy And a model test @@ -62,7 +61,8 @@ Feature: llama.cpp server Scenario: OAI Compatibility w/ no tool - Given a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja + Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja And the server is starting And the server is healthy And a model test @@ -73,3 +73,29 @@ Feature: llama.cpp server And an OAI compatible chat completions request with no api error Then no tool is called + + @slow + Scenario Outline: OAI Compatibility w/ tools ( / with template) + Given a model file from HF repo + And a test chat template file named + And no warmup + And the server is starting + And the server is healthy + And a model test + And 256 max tokens to predict + And a user prompt write a hello world in python (use single quotes for strings) + And python tool + And parallel tool calls is disabled + And an OAI compatible chat completions request with no api error + Then tool is called with arguments + + Examples: Prompts + | tool_name | tool_arguments | hf_repo | hf_file | template_override | + | ipython | {"code": "print('Hello, world!')"} | NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF | Hermes-2-Pro-Llama-3-8B-Q8_0.gguf | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use | + | ipython | {"code": "print('Hello, World!')\n"} | bartowski/Mistral-Nemo-Instruct-2407-GGUF | Mistral-Nemo-Instruct-2407-Q8_0.gguf | 
mistralai-Mistral-Nemo-Instruct-2407 | + | ipython | {"code": "print('Hello, World!'}"} | lmstudio-community/Llama-3.2-1B-Instruct-GGUF | Llama-3.2-1B-Instruct-Q4_K_M.gguf | meta-llama-Llama-3.2-3B-Instruct | + | ipython | {"code": "print("} | lmstudio-community/Llama-3.2-3B-Instruct-GGUF | Llama-3.2-3B-Instruct-Q6_K.gguf | meta-llama-Llama-3.2-3B-Instruct | + | ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF | Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf | | + | ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-70B-Instruct-GGUF | Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf | | + # | ipython | {"code": "print('Hello, World!')"} | meetkai/functionary-small-v3.2-GGUF | functionary-small-v3.2.Q4_0.gguf | meetkai-functionary-medium-v3.2 | + diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh index 72a0fbad8..370495afe 100755 --- a/examples/server/tests/tests.sh +++ b/examples/server/tests/tests.sh @@ -5,7 +5,7 @@ set -eu if [ $# -lt 1 ] then # Start @llama.cpp scenario - behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp + behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp,-slow else behave "$@" fi
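A possible way to exercise the new @slow scenarios locally (not part of the patch; it assumes a server binary is already built and the commands are run from examples/server/tests, where tests.sh forwards any extra arguments straight to behave):

  # Default run: @llama.cpp scenarios only, anything tagged @slow is excluded (the first branch above).
  ./tests.sh

  # Slow tool-call scenarios; these download the GGUF models listed in the Examples table.
  ./tests.sh --summary --stop --no-capture --tags slow features/tool_call.feature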