Server: add tests for consistent results
This commit is contained in:
parent
192090bae4
commit
b1a189115e
2 changed files with 91 additions and 0 deletions
57
examples/server/tests/features/results.feature
Normal file
57
examples/server/tests/features/results.feature
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
@llama.cpp
|
||||||
|
@results
|
||||||
|
Feature: Results
|
||||||
|
|
||||||
|
Background: Server startup
|
||||||
|
Given a server listening on localhost:8080
|
||||||
|
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
|
||||||
|
And a model file test-model-00001-of-00003.gguf
|
||||||
|
And 128 as batch size
|
||||||
|
And 256 KV cache size
|
||||||
|
And 128 max tokens to predict
|
||||||
|
|
||||||
|
Scenario Outline: Multi users completion
|
||||||
|
Given <n_slots> slots
|
||||||
|
And continuous batching
|
||||||
|
Then the server is starting
|
||||||
|
Then the server is healthy
|
||||||
|
|
||||||
|
Given 42 as seed
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
Write a very long story about AI.
|
||||||
|
"""
|
||||||
|
|
||||||
|
Given 42 as seed
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
Write a very long story about AI.
|
||||||
|
"""
|
||||||
|
|
||||||
|
Given 42 as seed
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
Write a very long story about AI.
|
||||||
|
"""
|
||||||
|
|
||||||
|
Given 42 as seed
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
Write a very long story about AI.
|
||||||
|
"""
|
||||||
|
|
||||||
|
Given 42 as seed
|
||||||
|
And a prompt:
|
||||||
|
"""
|
||||||
|
Write a very long story about AI.
|
||||||
|
"""
|
||||||
|
|
||||||
|
Given concurrent completion requests
|
||||||
|
Then the server is busy
|
||||||
|
Then the server is idle
|
||||||
|
And all slots are idle
|
||||||
|
Then all predictions are equal
|
||||||
|
Examples:
|
||||||
|
| n_slots |
|
||||||
|
| 1 |
|
||||||
|
| 2 |
|
|
@ -61,6 +61,7 @@ def step_server_config(context, server_fqdn, server_port):
|
||||||
context.server_metrics = False
|
context.server_metrics = False
|
||||||
context.server_process = None
|
context.server_process = None
|
||||||
context.seed = None
|
context.seed = None
|
||||||
|
context.draft = None
|
||||||
context.server_seed = None
|
context.server_seed = None
|
||||||
context.user_api_key = None
|
context.user_api_key = None
|
||||||
context.response_format = None
|
context.response_format = None
|
||||||
|
@ -107,6 +108,11 @@ def step_n_gpu_layer(context, ngl):
|
||||||
context.n_gpu_layer = ngl
|
context.n_gpu_layer = ngl
|
||||||
|
|
||||||
|
|
||||||
|
@step('{draft:d} as draft')
def step_draft(context, draft):
    """Record the draft value parsed from the step text on the scenario context.

    :param context: behave context object shared between steps.
    :param draft: integer captured by the ``{draft:d}`` placeholder.
    """
    # Only stored here; the server start-up code reads ``context.draft`` later.
    context.draft = draft
|
||||||
|
|
||||||
|
|
||||||
@step('{n_ctx:d} KV cache size')
|
@step('{n_ctx:d} KV cache size')
|
||||||
def step_n_ctx(context, n_ctx):
|
def step_n_ctx(context, n_ctx):
|
||||||
context.n_ctx = n_ctx
|
context.n_ctx = n_ctx
|
||||||
|
@ -254,6 +260,15 @@ def step_n_tokens_predicted(context, predicted_n):
|
||||||
assert_n_tokens_predicted(context.completion, predicted_n)
|
assert_n_tokens_predicted(context.completion, predicted_n)
|
||||||
|
|
||||||
|
|
||||||
|
@step('all predictions are equal')
@async_run_until_complete
async def step_predictions_equal(context):
    """Wait for the pending completions, then check they all produced the same content.

    :param context: behave context; ``context.tasks_result`` is compared and
        then reset to an empty list once the comparison succeeded.
    :raises AssertionError: if fewer than two completions were gathered or
        if the gathered contents differ.
    """
    # The awaited value is treated as the number of gathered completions.
    completed_count = await gather_tasks_results(context)
    assert completed_count >= 2, "need at least 2 completions"

    assert_all_predictions_equal(context.tasks_result)

    # Clear the collected results so they do not carry over to later steps.
    context.tasks_result = []
|
||||||
|
|
||||||
|
|
||||||
@step('the completion is truncated')
|
@step('the completion is truncated')
|
||||||
def step_assert_completion_truncated(context):
|
def step_assert_completion_truncated(context):
|
||||||
step_assert_completion_truncated(context, '')
|
step_assert_completion_truncated(context, '')
|
||||||
|
@ -1020,6 +1035,23 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
|
||||||
assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
|
assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
|
||||||
f' {n_predicted} <> {expected_predicted_n}')
|
f' {n_predicted} <> {expected_predicted_n}')
|
||||||
|
|
||||||
|
def assert_all_predictions_equal(completion_responses):
    """Assert that every completion response carries the same ``'content'``.

    The first response is the reference; the ``'content'`` of each later
    response must compare equal to it. A single response passes trivially.

    When the ``DEBUG`` environment variable is exactly ``'ON'``, each content
    is printed, prefixed with its index, as it is compared.

    :param completion_responses: sequence of mappings with a ``'content'`` key.
    :raises AssertionError: if the sequence is empty or any content differs
        from the first one.
    """
    # Fail with a readable message instead of an IndexError on empty input.
    assert completion_responses, "no completion responses to compare"

    # Read the flag once rather than on every iteration.
    debug = os.environ.get('DEBUG') == 'ON'

    content_0 = completion_responses[0]['content']
    if debug:
        print(f"content 0: {content_0}")

    for i, response in enumerate(completion_responses[1:], start=1):
        content = response['content']
        if debug:
            print(f"content {i}: {content}")

        assert content == content_0, "contents not equal"
|
||||||
|
|
||||||
|
|
||||||
async def gather_tasks_results(context):
|
async def gather_tasks_results(context):
|
||||||
n_tasks = len(context.concurrent_tasks)
|
n_tasks = len(context.concurrent_tasks)
|
||||||
|
@ -1148,6 +1180,8 @@ def start_server_background(context):
|
||||||
server_args.extend(['--ubatch-size', context.n_ubatch])
|
server_args.extend(['--ubatch-size', context.n_ubatch])
|
||||||
if context.n_gpu_layer:
|
if context.n_gpu_layer:
|
||||||
server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
|
server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
|
||||||
|
if context.draft is not None:
|
||||||
|
server_args.extend(['--draft', context.draft])
|
||||||
if context.server_continuous_batching:
|
if context.server_continuous_batching:
|
||||||
server_args.append('--cont-batching')
|
server_args.append('--cont-batching')
|
||||||
if context.server_embeddings:
|
if context.server_embeddings:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue