tool-call: slow tool call integration tests

parent ec9f3b101b
commit 9a86ea79a2
4 changed files with 82 additions and 12 deletions
@@ -864,7 +864,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
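Note (outside the diff): in common_arg, set_examples restricts which example binaries accept an option, so adding LLAMA_EXAMPLE_SERVER here exposes the existing --no-warmup flag (whose handler sets params.warmup = false) to the server binary; the test harness changes below pass --no-warmup when a scenario declares "no warmup".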
@@ -20,7 +20,7 @@ import aiohttp
 import numpy as np
 import openai
 from openai.types.chat import ChatCompletionChunk
-from behave import step  # pyright: ignore[reportAttributeAccessIssue]
+from behave import register_type, step  # pyright: ignore[reportAttributeAccessIssue]
 from behave.api.async_step import async_run_until_complete
 from prometheus_client import parser
 
@@ -28,6 +28,13 @@ from prometheus_client import parser
 
 DEFAULT_TIMEOUT_SECONDS = aiohttp.ClientTimeout(total=600)
 
+@parse.with_pattern(r".*")
+def parse_maybe_empty_string(text):
+    return text.strip()
+
+register_type(MaybeEmptyString=parse_maybe_empty_string)
+
+
 @step("a server listening on {server_fqdn}:{server_port}")
 def step_server_config(context, server_fqdn: str, server_port: str):
     context.server_fqdn = server_fqdn
@@ -82,6 +89,7 @@ def step_server_config(context, server_fqdn: str, server_port: str):
     context.temperature = None
     context.lora_file = None
     context.disable_ctx_shift = False
+    context.warmup = True
     context.use_jinja = False
     context.chat_template_file = None
 
@@ -98,7 +106,6 @@ def step_server_config(context, server_fqdn: str, server_port: str):
 def step_download_hf_model(context, hf_file: str, hf_repo: str):
     context.model_hf_repo = hf_repo
     context.model_hf_file = hf_file
-    context.model_file = os.path.basename(hf_file)
 
 @step('a lora adapter file from {lora_file_url}')
 def step_download_lora_file(context, lora_file_url: str):
@@ -172,11 +179,23 @@ def step_use_jinja(context):
     context.use_jinja = True
 
 
+@step('no warmup')
+def step_no_warmup(context):
+    context.warmup = False
+
+
 @step('a chat template file {file}')
-def step_use_jinja(context, file):
+def step_chat_template_file(context, file):
     context.chat_template_file = file
 
 
+@step('a test chat template file named {name:MaybeEmptyString}')
+def step_test_chat_template_file_named(context, name):
+    name = name.strip()
+    if name:
+        context.chat_template_file = f'../../../tests/chat/templates/{name}.jinja'
+
+
 @step('using slot id {id_slot:d}')
 def step_id_slot(context, id_slot: int):
     context.id_slot = id_slot
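
Note (outside the diff): a minimal, self-contained sketch of how the MaybeEmptyString type above is meant to work, using behave's register_type together with parse.with_pattern so that a {name:MaybeEmptyString} placeholder can also match an empty Examples cell (which the default placeholder typically does not). Names mirror the diff; the surrounding test harness is assumed.

# Illustrative sketch only, assuming a standard behave project layout.
import parse
from behave import register_type, step

@parse.with_pattern(r".*")           # ".*" also matches the empty string
def parse_maybe_empty_string(text):
    return text.strip()

register_type(MaybeEmptyString=parse_maybe_empty_string)

@step('a test chat template file named {name:MaybeEmptyString}')
def step_test_chat_template_file_named(context, name):
    # An empty <template_override> cell reaches this step as "", so the
    # default chat template is kept in that case.
    if name:
        context.chat_template_file = f'../../../tests/chat/templates/{name}.jinja'
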
@@ -390,6 +409,29 @@ def step_response_format(context, response_format):
 def step_tools(context, tools):
     context.tools = json.loads(tools)
 
+
+@step('python tool')
+def step_python_tool(context):
+    if not context.tools:
+        context.tools = []
+    context.tools.append({
+        "type": "function",
+        "function": {
+            "name": "ipython",
+            "description": "",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "code": {
+                        "type": "string",
+                        "description": ""
+                    }
+                },
+                "required": ["code"]
+            }
+        }
+    })
+
 @step('a tool choice {tool_choice}')
 def step_tool_choice(context, tool_choice):
     context.tool_choice = tool_choice
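
Note (outside the diff): a rough, illustrative sketch of the kind of OpenAI-compatible chat completion request these steps describe, sent to the llama.cpp server. The base URL, API key and model name are placeholders, and the actual harness builds its requests elsewhere.

# Illustrative only; assumes a llama.cpp server running locally on port 8080.
import openai

client = openai.OpenAI(base_url="http://localhost:8080/v1", api_key="unused")
response = client.chat.completions.create(
    model="test",  # placeholder model name
    max_tokens=256,
    messages=[{"role": "user",
               "content": "write a hello world in python (use single quotes for strings)"}],
    tools=[{       # same "ipython" tool as step_python_tool above
        "type": "function",
        "function": {
            "name": "ipython",
            "description": "",
            "parameters": {
                "type": "object",
                "properties": {"code": {"type": "string", "description": ""}},
                "required": ["code"],
            },
        },
    }],
    tool_choice="required",
)
print(response.choices[0].message.tool_calls)
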
@@ -1552,6 +1594,8 @@ def start_server_background(context):
         server_args.extend(['--lora', context.lora_file])
     if context.disable_ctx_shift:
         server_args.extend(['--no-context-shift'])
+    if not context.warmup:
+        server_args.extend(['--no-warmup'])
 
     args = [str(arg) for arg in [context.server_path, *server_args]]
     print(f"bench: starting server with: {' '.join(args)}")
@@ -4,20 +4,18 @@ Feature: llama.cpp server
 
   Background: Server startup
     Given a server listening on localhost:8080
-    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
-    And a model file test-model.gguf
-    And a model alias tinyllama-2
     And BOS token is 1
     And 42 as server seed
     And 8192 KV cache size
     And 32 as batch size
-    And 2 slots
+    And 1 slots
     And prometheus compatible metrics exposed
     And jinja templates are enabled
 
 
   Scenario Outline: OAI Compatibility w/ tools and required tool_choice (<template_name> template, <tool_name> tool)
-    Given a chat template file ../../../tests/chat/templates/<template_name>.jinja
+    Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And a test chat template file named <template_name>
     And the server is starting
     And the server is healthy
     And a model test
@@ -44,7 +42,8 @@ Feature: llama.cpp server
 
 
   Scenario Outline: OAI Compatibility w/ tools and auto tool_choice (<template_name> template)
-    Given a chat template file ../../../tests/chat/templates/<template_name>.jinja
+    Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And a test chat template file named <template_name>
     And the server is starting
     And the server is healthy
     And a model test
@@ -62,7 +61,8 @@ Feature: llama.cpp server
 
 
   Scenario: OAI Compatibility w/ no tool
-    Given a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja
+    Given a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And a chat template file ../../../tests/chat/templates/meta-llama-Meta-Llama-3.1-8B-Instruct.jinja
     And the server is starting
     And the server is healthy
     And a model test
@@ -73,3 +73,29 @@ Feature: llama.cpp server
     And an OAI compatible chat completions request with no api error
     Then no tool is called
 
+
+  @slow
+  Scenario Outline: OAI Compatibility w/ tools (<hf_repo> / <hf_file> with <template_override> template)
+    Given a model file <hf_file> from HF repo <hf_repo>
+    And a test chat template file named <template_override>
+    And no warmup
+    And the server is starting
+    And the server is healthy
+    And a model test
+    And 256 max tokens to predict
+    And a user prompt write a hello world in python (use single quotes for strings)
+    And python tool
+    And parallel tool calls is disabled
+    And an OAI compatible chat completions request with no api error
+    Then tool <tool_name> is called with arguments <tool_arguments>
+
+    Examples: Prompts
+      | tool_name | tool_arguments | hf_repo | hf_file | template_override |
+      | ipython | {"code": "print('Hello, world!')"} | NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF | Hermes-2-Pro-Llama-3-8B-Q8_0.gguf | NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use |
+      | ipython | {"code": "print('Hello, World!')\n"} | bartowski/Mistral-Nemo-Instruct-2407-GGUF | Mistral-Nemo-Instruct-2407-Q8_0.gguf | mistralai-Mistral-Nemo-Instruct-2407 |
+      | ipython | {"code": "print('Hello, World!'}"} | lmstudio-community/Llama-3.2-1B-Instruct-GGUF | Llama-3.2-1B-Instruct-Q4_K_M.gguf | meta-llama-Llama-3.2-3B-Instruct |
+      | ipython | {"code": "print("} | lmstudio-community/Llama-3.2-3B-Instruct-GGUF | Llama-3.2-3B-Instruct-Q6_K.gguf | meta-llama-Llama-3.2-3B-Instruct |
+      | ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF | Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf | |
+      | ipython | {"code": "print("} | lmstudio-community/Meta-Llama-3.1-70B-Instruct-GGUF | Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf | |
+      # | ipython | {"code": "print('Hello, World!')"} | meetkai/functionary-small-v3.2-GGUF | functionary-small-v3.2.Q4_0.gguf | meetkai-functionary-medium-v3.2 |
@@ -5,7 +5,7 @@ set -eu
 if [ $# -lt 1 ]
 then
   # Start @llama.cpp scenario
-  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+  behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp,-slow
 else
   behave "$@"
 fi
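Note (outside the diff): scenarios tagged @slow are now skipped in the default run via behave's tag filtering (--tags llama.cpp,-slow). Since the script forwards any arguments straight to behave, the slow tool-call scenarios can still be run explicitly, for example with: behave --summary --stop --no-capture --tags slow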