server: tests: fix passkey, add doc, fix regex content matching, fix timeout

Pierrick HYMBERT 2024-03-02 18:53:01 +01:00
parent 8abf8d3a08
commit 407cc609d3
6 changed files with 34 additions and 40 deletions


@@ -1,4 +1,5 @@
# List of ongoing issues
# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
@bug
Feature: Issues
# No confirmed issue at the moment


@@ -1,8 +1,6 @@
#@llama.cpp
# run with: ./tests.sh --no-skipped --tags passkey
@passkey
@wip
@slow
@bug
Feature: Passkey / Self-extend with context shift
Background: Server startup
@@ -17,10 +15,8 @@ Feature: Passkey / Self-extend with context shift
And <n_batch> as batch size
And <n_junk> as number of junk
And <n_predicted> server max tokens to predict
And a self-extend context with a factor of <n_grp>
And <seed> as seed
And a KV cache size based on the model trained context <n_ctx_train> extended by <n_grp> with additional <n_keep> tokens
And <n_slots> slots
And 42 as seed
And <n_ctx> KV cache size
And <n_ga> group attention factor to extend context size through self-extend
And <n_ga_w> group attention width to extend context size through self-extend
# Can be override with N_GPU_LAYERS
@@ -50,5 +46,5 @@ Feature: Passkey / Self-extend with context shift
Then <n_predicted> tokens are predicted matching <re_content>
Examples:
| hf_repo | hf_file | n_ctx_train | ngl | n_batch | n_slots | n_ga | n_ga_w | n_junk | n_grp | i_pos | seed | n_keep | passkey | n_predicted | re_content |
| TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 512 | 1 | 8 | 512 | 250 | 4 | 50 | 86 | 32 | 42 | 32 | .*42.* |
| hf_repo | hf_file | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content |
| TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 16384 | 512 | 16 | 512 | 250 | 50 | 42 | 1 | 42 |
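The scenario builds a "needle in a haystack" prompt: a junk sentence is repeated <n_junk> times and the passkey sentence is inserted <i_pos> junk blocks in, so the model has to retrieve "42" from far back in the extended context. Below is a minimal sketch of that assembly, mirroring the step_prompt_passkey step definition further down in this commit; the junk and passkey sentences here are placeholders, not the feature's actual prompt template:

```python
# Minimal sketch of the passkey challenge prompt assembly (placeholder junk/passkey text).
def build_passkey_prompt(n_junk: int, i_pos: int, passkey: str) -> str:
    junk = "The grass is green. The sky is blue. The sun is yellow. "  # placeholder junk block
    passkey_sentence = f"The pass key is {passkey}. Remember it. {passkey} is the pass key. "
    prompt = ""
    for i in range(n_junk):
        if i == i_pos:
            prompt += passkey_sentence  # insert the needle once, i_pos junk blocks in
        prompt += junk
    return prompt

# Values from the Examples row above: 250 junk blocks, passkey "42" inserted at position 50.
prompt = build_passkey_prompt(n_junk=250, i_pos=50, passkey="42")
assert "42" in prompt
```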


@@ -31,9 +31,9 @@ Feature: llama.cpp server
And prometheus metrics are exposed
Examples: Prompts
| prompt | n_predict | re_content | n_predicted |
| I believe the meaning of life is | 8 | (read<or>going)+ | 8 |
| Write a joke about AI | 64 | (park<or>friends<or>scared<or>always)+ | 32 |
| prompt | n_predict | re_content | n_predicted |
| I believe the meaning of life is | 8 | (read\|going)+ | 8 |
| Write a joke about AI | 64 | (park\|friends\|scared\|always)+ | 32 |
Scenario Outline: OAI Compatibility
Given a model <model>
@@ -45,9 +45,9 @@ Feature: llama.cpp server
Then <n_predicted> tokens are predicted matching <re_content>
Examples: Prompts
| model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming |
| llama-2 | Book | What is the best book | 8 | (Mom<or>what)+ | 8 | disabled |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks<or>happy<or>bird)+ | 32 | enabled |
| model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming |
| llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled |
Scenario: Embedding
When embeddings are computed for:
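The re_content cells in the tables above now carry plain regular expressions: the old <or> placeholder, which the previous assertion translated into |, is replaced by \|, presumably the Gherkin escape for a literal pipe inside a table cell. A quick sanity check of one resulting pattern (the sample completion text is made up):

```python
import re

# After table parsing, the cell "(read\|going)+" should yield the pattern below.
pattern = "(read|going)+"
sample = "I believe the meaning of life is going on a long read."  # made-up completion
assert re.search(pattern, sample, flags=re.IGNORECASE) is not None
```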


@@ -84,12 +84,6 @@ def step_n_ctx(context, n_ctx):
context.n_ctx = n_ctx
@step(u'a KV cache size based on the model trained context {n_ctx_train:d}'
u' extended by {n_grp:d} with additional {n_keep:d} tokens')
def step_kv_cache_size_extended(context, n_ctx_train, n_grp, n_keep):
context.n_ctx = n_ctx_train * n_grp + n_keep
@step(u'{n_slots:d} slots')
def step_n_slots(context, n_slots):
context.n_slots = n_slots
@@ -146,7 +140,8 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
slots_idle=context.n_slots,
slots_processing=0,
expected_slots=[{'id': slot_id, 'state': 0}
for slot_id in range(context.n_slots if context.n_slots else 1)])
for slot_id in
range(context.n_slots if context.n_slots else 1)])
case 'busy':
await wait_for_health_status(context, context.base_url, 503,
'no slot available',
@@ -154,7 +149,8 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
slots_idle=0,
slots_processing=context.n_slots,
expected_slots=[{'id': slot_id, 'state': 1}
for slot_id in range(context.n_slots if context.n_slots else 1)])
for slot_id in
range(context.n_slots if context.n_slots else 1)])
case _:
assert False, "unknown status"
@@ -258,11 +254,6 @@ def step_n_batch(context, n_batch):
context.n_batch = n_batch
@step(u'a self-extend context with a factor of {n_grp:d}')
def step_n_grp(context, n_grp):
context.n_grp = n_grp
@step(u'{seed:d} as seed')
def step_seed(context, seed):
context.seed = seed
@@ -282,6 +273,7 @@ def step_prompt_junk_suffix(context):
def step_prompt_suffix(context):
context.prompt_suffix = context.text
@step(u'{n_ga:d} group attention factor'
u' to extend context size through self-extend')
def step_impl(context, n_ga):
@@ -294,8 +286,8 @@ def step_impl(context, n_ga_w):
@step(u'a passkey prompt template')
def step_prompt_passkey_template(context):
context.prompt_passkey_template = context.text
def step_prompt_passkey(context):
context.prompt_passkey = context.text
@step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
@@ -303,10 +295,11 @@ def step_prompt_passkey(context, passkey, i_pos):
prompt = ""
for i in range(context.n_junk):
if i % context.n_junk == i_pos:
prompt += context.prompt_passkey_template
prompt += context.prompt_passkey # the passkey is already substituted
prompt += context.prompt_junk_suffix
if context.debug:
print(f"Passkey challenge:\n```\n{prompt}\n```\n")
passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
@@ -816,14 +809,18 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
content = completion_response['content']
n_predicted = completion_response['timings']['predicted_n']
assert len(content) > 0, "no token predicted"
if re_content is not None:
re_content = f'^(.*)({re_content})(.*)$'
p = re.compile(re_content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL)
match = p.match(content)
assert match and len(match.groups()) == 3, f'/{re_content}/g must match ```{content}```'
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
highlighted = p.sub(r"\1<hi>\2</hi>\3", content).replace('<hi>', '\x1b[33m').replace('</hi>', '\x1b[0m')
print(f"Checking completion response: {highlighted}\n")
if expected_predicted_n and expected_predicted_n > 0:
assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
f' {n_predicted} <> {expected_predicted_n}')
if re_content is not None:
re_content = '^.*' + re_content.replace('<or>', '|') + '.*$'
assert re.match(re_content, content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL), (
f'invalid tokens predicted:'
f' ```\n{content}\n``` do not match /{re_content}/')
async def gather_tasks_results(context):
@@ -840,7 +837,7 @@ async def wait_for_health_status(context,
base_url,
expected_http_status_code,
expected_health_status,
timeout = 3,
timeout=3,
params=None,
slots_idle=None,
slots_processing=None,
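The reworked assert_n_tokens_predicted above anchors the expected pattern inside ^(.*)(...)(.*)$ so the whole completion is captured in three groups, then reuses those groups to highlight the matching fragment when DEBUG=ON. A standalone sketch of that check, with an illustrative function name and sample strings:

```python
import re

def content_matches(content: str, re_content: str, debug: bool = False) -> bool:
    # Capture prefix, expected pattern, and suffix so the full response is matched.
    pattern = re.compile(f'^(.*)({re_content})(.*)$',
                         flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
    match = pattern.match(content)
    if match is None:
        return False
    if debug:
        # Re-emit the content with the matched fragment wrapped in ANSI yellow.
        highlighted = (pattern.sub(r"\1<hi>\2</hi>\3", content)
                       .replace('<hi>', '\x1b[33m')
                       .replace('</hi>', '\x1b[0m'))
        print(f"Checking completion response: {highlighted}")
    return True

# Example with the re_content value from the passkey scenario above.
assert content_matches("Sure, the pass key is 42.", "42", debug=True)
```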


@@ -1,4 +1,4 @@
# run with ./test.sh --tags wrong_usage
# run with: ./tests.sh --no-skipped --tags wrong_usage
@wrong_usage
Feature: Wrong usage of llama.cpp server


@@ -5,7 +5,7 @@ set -eu
if [ $# -lt 1 ]
then
# Start @llama.cpp scenario
behave --summary --stop --no-capture --exclude 'issues|wrong_usages|slow' --tags llama.cpp
behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
else
behave "$@"
fi