tool-call: slow tool call integration tests

ochafik 2024-10-28 00:26:40 +00:00
parent ec9f3b101b
commit 9a86ea79a2
4 changed files with 82 additions and 12 deletions

View file

@@ -864,7 +864,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params) {
            params.warmup = false;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
    add_opt(common_arg(
        {"--spm-infill"},
        string_format(

View file

@@ -20,7 +20,7 @@ import aiohttp
 import numpy as np
 import openai
 from openai.types.chat import ChatCompletionChunk
-from behave import step  # pyright: ignore[reportAttributeAccessIssue]
+from behave import register_type, step  # pyright: ignore[reportAttributeAccessIssue]
 from behave.api.async_step import async_run_until_complete
 from prometheus_client import parser
@@ -28,6 +28,13 @@ from prometheus_client import parser
 DEFAULT_TIMEOUT_SECONDS = aiohttp.ClientTimeout(total=600)


+@parse.with_pattern(r".*")
+def parse_maybe_empty_string(text):
+    return text.strip()
+
+
+register_type(MaybeEmptyString=parse_maybe_empty_string)
+
 @step("a server listening on {server_fqdn}:{server_port}")
 def step_server_config(context, server_fqdn: str, server_port: str):
     context.server_fqdn = server_fqdn
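Note on the MaybeEmptyString type added above: behave delegates step-pattern parsing to the parse library, and parse.with_pattern attaches a regex to a converter so a placeholder like {name:MaybeEmptyString} can also match an empty Examples-table cell (the import of parse is presumably elsewhere in steps.py and not shown in this hunk). The following is a minimal standalone sketch of that mechanism, not code from this commit; the step text and the given decorator are made up for illustration.

# Minimal standalone sketch of behave's custom-type mechanism (illustrative
# only, not part of this commit). parse.with_pattern sets the regex used to
# match the {name:MaybeEmptyString} placeholder, so an empty cell still
# matches and the converter receives an empty string.
import parse
from behave import given, register_type


@parse.with_pattern(r".*")
def parse_maybe_empty_string(text):
    return text.strip()


register_type(MaybeEmptyString=parse_maybe_empty_string)


# Hypothetical step using the type; stores None when the cell is empty.
@given('a template override named {name:MaybeEmptyString}')
def step_template_override(context, name):
    context.template_override = name or None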
@@ -82,6 +89,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
     context.temperature = None
     context.lora_file = None
     context.disable_ctx_shift = False
+    context.warmup = True
     context.use_jinja = False
     context.chat_template_file = None
@@ -98,7 +106,6 @@ def step_server_config(context, server_fqdn: str, server_port: str):
 def step_download_hf_model(context, hf_file: str, hf_repo: str):
     context.model_hf_repo = hf_repo
     context.model_hf_file = hf_file
-    context.model_file = os.path.basename(hf_file)

 @step('a lora adapter file from {lora_file_url}')
 def step_download_lora_file(context, lora_file_url: str):
@@ -172,11 +179,23 @@ def step_use_jinja(context):
     context.use_jinja = True


+@step('no warmup')
+def step_no_warmup(context):
+    context.warmup = False
+
+
 @step('a chat template file {file}')
-def step_use_jinja(context, file):
+def step_chat_template_file(context, file):
     context.chat_template_file = file


+@step('a test chat template file named {name:MaybeEmptyString}')
+def step_test_chat_template_file_named(context, name):
+    name = name.strip()
+    if name:
+        context.chat_template_file = f'../../../tests/chat/templates/{name}.jinja'
+
+
 @step('using slot id {id_slot:d}')
 def step_id_slot(context, id_slot: int):
     context.id_slot = id_slot
@@ -390,6 +409,29 @@ def step_response_format(context, response_format):
 def step_tools(context, tools):
     context.tools = json.loads(tools)


+@step('python tool')
+def step_python_tool(context):
+    if not context.tools:
+        context.tools = []
+    context.tools.append({
+        "type": "function",
+        "function": {
+            "name": "ipython",
+            "description": "",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "code": {
+                        "type": "string",
+                        "description": ""
+                    }
+                },
+                "required": ["code"]
+            }
+        }
+    })
+
+
 @step('a tool choice {tool_choice}')
 def step_tool_choice(context, tool_choice):
     context.tool_choice = tool_choice
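For context, the "python tool" step above registers an ipython function definition that the scenarios send through the server's OpenAI-compatible chat completions endpoint. The sketch below shows roughly what such a request looks like using the openai client that steps.py already imports; the base URL, API key, model alias, and parameter values are placeholders for illustration, not values taken from this commit.

# Rough sketch of the kind of request the tool-call scenarios exercise
# (illustrative only). Assumes a llama.cpp server is already running on
# localhost:8080 with jinja templates enabled; base_url, api_key and
# model are placeholder values.
import openai

client = openai.OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

tools = [{
    "type": "function",
    "function": {
        "name": "ipython",
        "description": "",
        "parameters": {
            "type": "object",
            "properties": {"code": {"type": "string", "description": ""}},
            "required": ["code"],
        },
    },
}]

response = client.chat.completions.create(
    model="test",
    messages=[{"role": "user", "content": "write a hello world in python (use single quotes for strings)"}],
    tools=tools,
    tool_choice="required",
    max_tokens=256,
)
print(response.choices[0].message.tool_calls)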
@@ -1552,6 +1594,8 @@ def start_server_background(context):
         server_args.extend(['--lora', context.lora_file])
     if context.disable_ctx_shift:
         server_args.extend(['--no-context-shift'])
+    if not context.warmup:
+        server_args.extend(['--no-warmup'])

     args = [str(arg) for arg in [context.server_path, *server_args]]
     print(f"bench: starting server with: {' '.join(args)}")

View file

@@ -4,20 +4,18 @@ Feature: llama.cpp server
  Background: Server startup
    Given a server listening on localhost:8080
-    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And a model file test-model.gguf
-    And a model alias tinyllama-2
    And BOS token is 1
    And 42 as server seed
    And 8192 KV cache size
    And 32 as batch size
-    And 2 slots
+    And 1 slots
    And prometheus compatible metrics exposed
    And jinja templates are enabled


  Scenario Outline: OAI Compatibility w/ tools and required tool_choice (<template_name> template, <tool_name> tool)
-    Given a chat template file ../../../tests/chat/templates/<template_name>.jinja
+    Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And a test chat template file named <template_name>
    And the server is starting
    And the server is healthy
    And a model test
@@ -44,7 +42,8 @@ Feature: llama.cpp server

  Scenario Outline: OAI Compatibility w/ tools and auto tool_choice (<template_name> template)
-    Given a chat template file ../../../tests/chat/templates/<template_name>.jinja
+    Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And a test chat template file named <template_name>
    And the server is starting
    And the server is healthy
    And a model test
@@ -62,7 +61,8 @@ Feature: llama.cpp server

  Scenario: OAI Compatibility w/ no tool
-    Given a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja
+    Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja
    And the server is starting
    And the server is healthy
    And a model test
@@ -73,3 +73,29 @@ Feature: llama.cpp server
    And an OAI compatible chat completions request with no api error
    Then no tool is called
+
+
+  @slow
+  Scenario Outline: OAI Compatibility w/ tools (<hf_repo> / <hf_file> with <template_override> template)
+    Given a model file <hf_file> from HF repo <hf_repo>
+    And a test chat template file named <template_override>
+    And no warmup
+    And the server is starting
+    And the server is healthy
+    And a model test
+    And 256 max tokens to predict
+    And a user prompt write a hello world in python (use single quotes for strings)
+    And python tool
+    And parallel tool calls is disabled
+    And an OAI compatible chat completions request with no api error
+    Then tool <tool_name> is called with arguments <tool_arguments>
+
+    Examples: Prompts
+      | tool_name | tool_arguments | hf_repo | hf_file | template_override |
+      | ipython | {"code": "print('Hello, world!')"} | NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF | Hermes-2-Pro-Llama-3-8B-Q8_0.gguf | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use |
+      | ipython | {"code": "print('Hello, World!')\n"} | bartowski/Mistral-Nemo-Instruct-2407-GGUF | Mistral-Nemo-Instruct-2407-Q8_0.gguf | mistralai-Mistral-Nemo-Instruct-2407 |
+      | ipython | {"code": "print('Hello, World!'}"} | lmstudio-community/Llama-3.2-1B-Instruct-GGUF | Llama-3.2-1B-Instruct-Q4_K_M.gguf | meta-llama-Llama-3.2-3B-Instruct |
+      | ipython | {"code": "print("} | lmstudio-community/Llama-3.2-3B-Instruct-GGUF | Llama-3.2-3B-Instruct-Q6_K.gguf | meta-llama-Llama-3.2-3B-Instruct |
+      | ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF | Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf | |
+      | ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-70B-Instruct-GGUF | Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf | |
+      # | ipython | {"code": "print('Hello, World!')"} | meetkai/functionary-small-v3.2-GGUF | functionary-small-v3.2.Q4_0.gguf | meetkai-functionary-medium-v3.2 |
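The "Then tool <tool_name> is called with arguments <tool_arguments>" assertion is implemented in steps.py and not shown in this diff. A rough sketch of the check it implies, given an openai chat completion response, might look like the following; the helper name and exact comparison are assumptions for illustration and may differ from the real step.

# Illustrative sketch of the assertion behind
# "Then tool <tool_name> is called with arguments <tool_arguments>"
# (the actual step implementation lives in steps.py and may differ).
import json


def assert_tool_called(response, expected_name, expected_arguments_json):
    tool_calls = response.choices[0].message.tool_calls
    assert tool_calls, "expected the model to call a tool"
    call = tool_calls[0].function
    assert call.name == expected_name
    # Compare parsed JSON so key order and whitespace do not matter.
    assert json.loads(call.arguments) == json.loads(expected_arguments_json)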

View file

@@ -5,7 +5,7 @@ set -eu
 if [ $# -lt 1 ]
 then
   # Start @llama.cpp scenario
-  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+  behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp,-slow
 else
   behave "$@"
 fi