server: add test for token probs (#7347)
parent 41858392e1
commit 1b01f06db0

3 changed files with 81 additions and 10 deletions
examples/server/tests/features/results.feature
@@ -70,12 +70,48 @@ Feature: Results
     Then all predictions are equal
     Examples:
       | n_parallel | temp |
       | 1          | 0.0  |
       | 2          | 0.0  |
       | 4          | 0.0  |
       | 1          | 1.0  |
-      # FIXME: These tests fail on master. The problem seems to be the unified KV cache.
+      # FIXME: These tests fail on master.
+      # Problems: unified KV cache (except for CPU backend with LLAMA_NO_LLAMAFILE=1), SIMD nondeterminism.
       # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
-      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 .
+      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
+      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
       # | 2          | 1.0  |
       # | 4          | 1.0  |
+
+  Scenario Outline: consistent token probs with same seed and prompt
+    Given <n_slots> slots
+    And <n_kv> KV cache size
+    And 1.0 temperature
+    And <n_predict> max tokens to predict
+    Then the server is starting
+    Then the server is healthy
+
+    Given 1 prompts "The meaning of life is" with seed 42
+    And concurrent completion requests
+    # Then the server is busy # Not all slots will be utilized.
+    Then the server is idle
+    And all slots are idle
+
+    Given <n_parallel> prompts "The meaning of life is" with seed 42
+    And concurrent completion requests
+    # Then the server is busy # Not all slots will be utilized.
+    Then the server is idle
+    And all slots are idle
+
+    Then all token probabilities are equal
+    Examples:
+      | n_slots | n_kv | n_predict | n_parallel |
+      | 4       | 1024 | 1         | 1          |
+      | 4       | 1024 | 1         | 4          |
+      # FIXME: These tests fail on master.
+      # Problems: unified KV cache (except for CPU backend with LLAMA_NO_LLAMAFILE=1), SIMD nondeterminism.
+      # See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
+      # and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574
+      # and https://github.com/ggerganov/llama.cpp/pull/7347 .
+      # | 4       | 1024 | 100       | 1          |
+      # This test still fails even with the above patches; the first token probabilities are already different.
+      # | 4       | 1024 | 100       | 4          |
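For context, the new scenario exercises the server's `n_probs` option: with a fixed seed, repeated /completion requests should return identical per-token probability lists. Below is a minimal standalone sketch of the same check outside the behave harness; it assumes a llama.cpp server already listening on localhost:8080, and the payload fields mirror what steps.py sends in this commit.

    # Minimal sketch: send the same seeded request twice, compare token probs.
    # Assumes a llama.cpp server is listening on localhost:8080.
    import requests

    def get_probs(seed):
        resp = requests.post("http://localhost:8080/completion", json={
            "prompt": "The meaning of life is",
            "n_predict": 1,
            "temperature": 1.0,
            "seed": seed,
            "n_probs": 2,  # request the top-2 candidate probabilities per position
        }, timeout=3600)
        resp.raise_for_status()
        # One entry per generated position; 'probs' holds the candidates.
        return [pos["probs"] for pos in resp.json()["completion_probabilities"]]

    if __name__ == "__main__":
        first, second = get_probs(42), get_probs(42)
        assert first == second, "token probabilities differ across seeded runs"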
examples/server/tests/features/steps/steps.py
@@ -23,6 +23,7 @@ from prometheus_client import parser
 def step_server_config(context, server_fqdn, server_port):
     context.server_fqdn = server_fqdn
     context.server_port = int(server_port)
+    context.n_threads = None
     context.n_gpu_layer = None
     if 'PORT' in os.environ:
         context.server_port = int(os.environ['PORT'])
@@ -109,6 +110,11 @@ def step_n_gpu_layer(context, ngl):
     context.n_gpu_layer = ngl


+@step('{n_threads:d} threads')
+def step_n_threads(context, n_threads):
+    context.n_threads = n_threads
+
+
 @step('{draft:d} as draft')
 def step_draft(context, draft):
     context.draft = draft
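The `{n_threads:d} threads` pattern is resolved by behave through the parse library, so a feature line such as `And 8 threads` binds n_threads=8. A quick standalone illustration of that matching (using the parse package directly; not part of the commit):

    # How behave-style patterns match: '{n_threads:d}' captures an int.
    import parse

    result = parse.parse('{n_threads:d} threads', '8 threads')
    assert result['n_threads'] == 8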
@@ -274,13 +280,22 @@ async def step_predictions_equal(context):

 @step('all predictions are different')
 @async_run_until_complete
-async def step_predictions_equal(context):
+async def step_predictions_different(context):
     n_completions = await gather_tasks_results(context)
     assert n_completions >= 2, "need at least 2 completions"
     assert_all_predictions_different(context.tasks_result)
     context.tasks_result = []


+@step('all token probabilities are equal')
+@async_run_until_complete
+async def step_token_probabilities_equal(context):
+    n_completions = await gather_tasks_results(context)
+    assert n_completions >= 2, "need at least 2 completions"
+    assert_all_token_probabilities_equal(context.tasks_result)
+    context.tasks_result = []
+
+
 @step('the completion is truncated')
 def step_assert_completion_truncated(context):
     step_assert_completion_truncated(context, '')
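A side fix in this hunk: the function behind 'all predictions are different' was previously also named step_predictions_equal, shadowing the definition a few lines above. behave dispatches on the step text captured at decoration time, not on the Python name, so matching still worked; the rename just removes the confusing shadowing. A toy registry illustrating why the duplicate name was harmless (a hypothetical decorator standing in for behave's @step):

    # Toy registry mimicking decorator-time registration: the function object
    # is stored when @step runs, so a later redefinition of the same Python
    # name does not disturb already-registered steps.
    registry = {}

    def step(text):
        def wrap(fn):
            registry[text] = fn
            return fn
        return wrap

    @step('all predictions are equal')
    def step_impl():
        return 'equal'

    @step('all predictions are different')
    def step_impl():  # shadows the first name, but the registry kept the object
        return 'different'

    assert registry['all predictions are equal']() == 'equal'
    assert registry['all predictions are different']() == 'different'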
@@ -869,6 +884,7 @@ async def request_completion(prompt,
             "id_slot": id_slot,
             "seed": seed if seed is not None else 42,
             "temperature": temperature if temperature is not None else "0.8f",
+            "n_probs": 2,
         },
         headers=headers,
         timeout=3600) as response:
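Hardcoding "n_probs": 2 makes every request issued by the harness ask for the top-2 candidate probabilities per generated token, which is what the new assertion helper below inspects. For orientation, here is an illustrative fragment of the response shape as a Python literal; only the keys the test code actually touches ('completion_probabilities' and 'probs') are taken from this commit, while the inner field names and values are an assumption about the server's output and are invented for illustration:

    # Illustrative response fragment for "n_probs": 2 (values invented;
    # the tests only compare the 'probs' lists wholesale, so they do not
    # depend on the inner field names shown here).
    response_fragment = {
        "completion_probabilities": [
            {  # one entry per generated token position
                "probs": [
                    {"tok_str": " to", "prob": 0.61},
                    {"tok_str": " a",  "prob": 0.12},
                ],
            },
        ],
    }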
@@ -1123,6 +1139,23 @@ def assert_all_predictions_different(completion_responses):
             assert content_i != content_j, "contents not different"


+def assert_all_token_probabilities_equal(completion_responses):
+    n_predict = len(completion_responses[0]['completion_probabilities'])
+    if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+        for pos in range(n_predict):
+            for i, response_i in enumerate(completion_responses):
+                probs_i = response_i['completion_probabilities'][pos]['probs']
+                print(f"pos {pos}, probs {i}: {probs_i}")
+    for pos in range(n_predict):
+        for i, response_i in enumerate(completion_responses):
+            probs_i = response_i['completion_probabilities'][pos]['probs']
+            for j, response_j in enumerate(completion_responses):
+                if i == j:
+                    continue
+                probs_j = response_j['completion_probabilities'][pos]['probs']
+                assert probs_i == probs_j, "contents not equal"


 async def gather_tasks_results(context):
     n_tasks = len(context.concurrent_tasks)
     if context.debug:
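The helper compares every ordered pair of responses, so each mismatch is checked twice; because equality is transitive, comparing every response against the first is an equivalent and cheaper formulation. A sketch of that variant (hypothetical helper name, not part of the commit):

    def assert_token_probabilities_match_first(completion_responses):
        # Equivalent check: every response must agree with response 0,
        # position by position; fails fast on the first mismatch.
        reference = completion_responses[0]['completion_probabilities']
        for i, response in enumerate(completion_responses[1:], start=1):
            probs = response['completion_probabilities']
            for pos, (ref_pos, got_pos) in enumerate(zip(reference, probs)):
                assert ref_pos['probs'] == got_pos['probs'], \
                    f"response {i} differs from response 0 at position {pos}"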
@@ -1261,6 +1294,8 @@ def start_server_background(context):
         server_args.extend(['--batch-size', context.n_batch])
     if context.n_ubatch:
         server_args.extend(['--ubatch-size', context.n_ubatch])
+    if context.n_threads:
+        server_args.extend(['--threads', context.n_threads])
     if context.n_gpu_layer:
         server_args.extend(['--n-gpu-layers', context.n_gpu_layer])
     if context.draft is not None:
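With this wiring in place, a feature file can request a thread count with a step like `And 8 threads`, and the harness launches the server with the corresponding `--threads` argument.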