tool-call: slow tool call integration tests

ochafik 2024-10-28 00:26:40 +00:00
parent ec9f3b101b
commit 9a86ea79a2
4 changed files with 82 additions and 12 deletions

View file

@@ -864,7 +864,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.warmup = false;
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--spm-infill"},
        string_format(
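
The hunk above widens the option whose handler sets params.warmup = false (i.e. --no-warmup) from LLAMA_EXAMPLE_MAIN to the server example as well, which is what lets the test harness pass --no-warmup when it starts the server. A minimal launch sketch, assuming a llama-server binary on PATH and the default test host/port; none of this is part of the diff itself:

import subprocess

# Assumed binary name and flags for illustration only; the tests build the
# command from context.server_path and the server_args list instead.
proc = subprocess.Popen([
    "llama-server",
    "--host", "localhost",
    "--port", "8080",
    "--no-warmup",   # accepted by the server example after this change
])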

View file

@@ -20,7 +20,7 @@ import aiohttp
import numpy as np
import openai
from openai.types.chat import ChatCompletionChunk
from behave import step # pyright: ignore[reportAttributeAccessIssue]
from behave import register_type, step # pyright: ignore[reportAttributeAccessIssue]
from behave.api.async_step import async_run_until_complete
from prometheus_client import parser
@@ -28,6 +28,13 @@ from prometheus_client import parser
DEFAULT_TIMEOUT_SECONDS = aiohttp.ClientTimeout(total=600)

@parse.with_pattern(r".*")
def parse_maybe_empty_string(text):
    return text.strip()

register_type(MaybeEmptyString=parse_maybe_empty_string)

@step("a server listening on {server_fqdn}:{server_port}")
def step_server_config(context, server_fqdn: str, server_port: str):
    context.server_fqdn = server_fqdn
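
For context, register_type plugs a custom parse converter into behave's step matcher; the .* pattern is what lets a {name:MaybeEmptyString} parameter match an empty value, which the default pattern would not. A standalone sketch of the same mechanism using the parse library directly, with an illustrative format string rather than code from this commit:

import parse

@parse.with_pattern(r".*")
def parse_maybe_empty_string(text):
    return text.strip()

# behave's register_type does roughly this wiring for step patterns.
matcher = parse.compile(
    "a test chat template file named {name:MaybeEmptyString}",
    extra_types={"MaybeEmptyString": parse_maybe_empty_string},
)

print(matcher.parse("a test chat template file named "))     # matches, name == ""
print(matcher.parse("a test chat template file named foo"))  # matches, name == "foo"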
@@ -82,6 +89,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
    context.temperature = None
    context.lora_file = None
    context.disable_ctx_shift = False
    context.warmup = True
    context.use_jinja = False
    context.chat_template_file = None
@@ -98,7 +106,6 @@ def step_server_config(context, server_fqdn: str, server_port: str):
def step_download_hf_model(context, hf_file: str, hf_repo: str):
    context.model_hf_repo = hf_repo
    context.model_hf_file = hf_file
    context.model_file = os.path.basename(hf_file)

@step('a lora adapter file from {lora_file_url}')
def step_download_lora_file(context, lora_file_url: str):
@@ -172,11 +179,23 @@ def step_use_jinja(context):
    context.use_jinja = True

@step('no warmup')
def step_no_warmup(context):
    context.warmup = False

@step('a chat template file {file}')
def step_use_jinja(context, file):
def step_chat_template_file(context, file):
    context.chat_template_file = file

@step('a test chat template file named {name:MaybeEmptyString}')
def step_test_chat_template_file_named(context, name):
    name = name.strip()
    if name:
        context.chat_template_file = f'../../../tests/chat/templates/{name}.jinja'

@step('using slot id {id_slot:d}')
def step_id_slot(context, id_slot: int):
    context.id_slot = id_slot
@@ -390,6 +409,29 @@ def step_response_format(context, response_format):
def step_tools(context, tools):
    context.tools = json.loads(tools)

@step('python tool')
def step_python_tool(context):
    if not context.tools:
        context.tools = []
    context.tools.append({
        "type": "function",
        "function": {
            "name": "ipython",
            "description": "",
            "parameters": {
                "type": "object",
                "properties": {
                    "code": {
                        "type": "string",
                        "description": ""
                    }
                },
                "required": ["code"]
            }
        }
    })

@step('a tool choice {tool_choice}')
def step_tool_choice(context, tool_choice):
    context.tool_choice = tool_choice
@@ -1552,6 +1594,8 @@ def start_server_background(context):
        server_args.extend(['--lora', context.lora_file])
    if context.disable_ctx_shift:
        server_args.extend(['--no-context-shift'])
    if not context.warmup:
        server_args.extend(['--no-warmup'])
    args = [str(arg) for arg in [context.server_path, *server_args]]
    print(f"bench: starting server with: {' '.join(args)}")

View file

@@ -4,20 +4,18 @@ Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model file test-model.gguf
And a model alias tinyllama-2
And BOS token is 1
And 42 as server seed
And 8192 KV cache size
And 32 as batch size
And 2 slots
And 1 slots
And prometheus compatible metrics exposed
And jinja templates are enabled
Scenario Outline: OAI Compatibility w/ tools and required tool_choice (<template_name> template, <tool_name> tool)
Given a chat template file ../../../tests/chat/templates/<template_name>.jinja
Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a test chat template file named <template_name>
And the server is starting
And the server is healthy
And a model test
@@ -44,7 +42,8 @@ Feature: llama.cpp server
Scenario Outline: OAI Compatibility w/ tools and auto tool_choice (<template_name> template)
Given a chat template file ../../../tests/chat/templates/<template_name>.jinja
Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a test chat template file named <template_name>
And the server is starting
And the server is healthy
And a model test
@@ -62,7 +61,8 @@ Feature: llama.cpp server
Scenario: OAI Compatibility w/ no tool
Given a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja
Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja
And the server is starting
And the server is healthy
And a model test
@@ -73,3 +73,29 @@ Feature: llama.cpp server
And an OAI compatible chat completions request with no api error
Then no tool is called
@slow
Scenario Outline: OAI Compatibility w/ tools (<hf_repo> / <hf_file> with <template_override> template)
Given a model file <hf_file> from HF repo <hf_repo>
And a test chat template file named <template_override>
And no warmup
And the server is starting
And the server is healthy
And a model test
And 256 max tokens to predict
And a user prompt write a hello world in python (use single quotes for strings)
And python tool
And parallel tool calls is disabled
And an OAI compatible chat completions request with no api error
Then tool <tool_name> is called with arguments <tool_arguments>
Examples: Prompts
| tool_name | tool_arguments | hf_repo | hf_file | template_override |
| ipython | {"code": "print('Hello, world!')"} | NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF | Hermes-2-Pro-Llama-3-8B-Q8_0.gguf | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use |
| ipython | {"code": "print('Hello, World!')\n"} | bartowski/Mistral-Nemo-Instruct-2407-GGUF | Mistral-Nemo-Instruct-2407-Q8_0.gguf | mistralai-Mistral-Nemo-Instruct-2407 |
| ipython | {"code": "print('Hello, World!'}"} | lmstudio-community/Llama-3.2-1B-Instruct-GGUF | Llama-3.2-1B-Instruct-Q4_K_M.gguf | meta-llama-Llama-3.2-3B-Instruct |
| ipython | {"code": "print("} | lmstudio-community/Llama-3.2-3B-Instruct-GGUF | Llama-3.2-3B-Instruct-Q6_K.gguf | meta-llama-Llama-3.2-3B-Instruct |
| ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF | Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf | |
| ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-70B-Instruct-GGUF | Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf | |
# | ipython | {"code": "print('Hello, World!')"} | meetkai/functionary-small-v3.2-GGUF | functionary-small-v3.2.Q4_0.gguf | meetkai-functionary-medium-v3.2 |
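
The "tool <tool_name> is called with arguments <tool_arguments>" assertion amounts to checking the single tool call in the response against the expected name and JSON-decoded arguments from the Examples row. A hedged sketch of that check, reusing the response object from the request sketch above; the step's actual implementation in steps.py may differ in its details:

import json

tool_calls = response.choices[0].message.tool_calls or []
assert len(tool_calls) == 1, f"expected exactly one tool call, got {tool_calls!r}"

expected_name = "ipython"
expected_arguments = {"code": "print('Hello, world!')"}  # e.g. the Hermes-2-Pro row

assert tool_calls[0].function.name == expected_name
assert json.loads(tool_calls[0].function.arguments) == expected_arguments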

View file

@@ -5,7 +5,7 @@ set -eu
if [ $# -lt 1 ]
then
  # Start @llama.cpp scenario
  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
  behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp,-slow
else
  behave "$@"
fi