server: tests: fix passkey, add doc, fix regex content matching, fix timeout
parent 8abf8d3a08
commit 407cc609d3

6 changed files with 34 additions and 40 deletions
examples/server/tests/features/issues.feature
@@ -1,4 +1,5 @@
 # List of ongoing issues
+# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
 @bug
 Feature: Issues
   # No confirmed issue at the moment
examples/server/tests/features/passkey.feature
@@ -1,8 +1,6 @@
-#@llama.cpp
+# run with: ./tests.sh --no-skipped --tags passkey
 @passkey
-@wip
 @slow
-@bug
 Feature: Passkey / Self-extend with context shift

   Background: Server startup
@@ -17,10 +15,8 @@ Feature: Passkey / Self-extend with context shift
     And <n_batch> as batch size
     And <n_junk> as number of junk
     And <n_predicted> server max tokens to predict
-    And a self-extend context with a factor of <n_grp>
-    And <seed> as seed
-    And a KV cache size based on the model trained context <n_ctx_train> extended by <n_grp> with additional <n_keep> tokens
-    And <n_slots> slots
+    And 42 as seed
+    And <n_ctx> KV cache size
     And <n_ga> group attention factor to extend context size through self-extend
     And <n_ga_w> group attention width to extend context size through self-extend
     # Can be override with N_GPU_LAYERS
@@ -50,5 +46,5 @@ Feature: Passkey / Self-extend with context shift
     Then <n_predicted> tokens are predicted matching <re_content>

     Examples:
-      | hf_repo | hf_file | n_ctx_train | ngl | n_batch | n_slots | n_ga | n_ga_w | n_junk | n_grp | i_pos | seed | n_keep | passkey | n_predicted | re_content |
-      | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 512 | 1 | 8 | 512 | 250 | 4 | 50 | 86 | 32 | 42 | 32 | .*42.* |
+      | hf_repo | hf_file | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content |
+      | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 16384 | 512 | 16 | 512 | 250 | 50 | 42 | 1 | 42 |
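Editor's aside, not part of the commit: the removed Python step derived the KV cache size as n_ctx_train * n_grp + n_keep = 2048 * 4 + 32 = 8224 tokens, while the new Examples row pins an explicit 16384-token cache, which the doubled group attention factor can plausibly cover if, as I understand self-extend, the usable context scales with that factor (2048 * 16 = 32768). A minimal sketch of the two sizings, under those assumptions:

# Sketch (editor's annotation): the two KV cache sizings side by side.
n_ctx_train = 2048                          # phi-2 trained context, from the table
n_grp, n_keep = 4, 32                       # old Examples row
old_n_ctx = n_ctx_train * n_grp + n_keep    # 8224, computed by the removed step
new_n_ctx = 16384                           # stated directly in the new row
n_ga = 16                                   # new group attention factor
assert new_n_ctx <= n_ctx_train * n_ga      # 16384 <= 32768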
examples/server/tests/features/server.feature
@@ -31,9 +31,9 @@ Feature: llama.cpp server
     And prometheus metrics are exposed

     Examples: Prompts
-      | prompt | n_predict | re_content | n_predicted |
-      | I believe the meaning of life is | 8 | (read<or>going)+ | 8 |
-      | Write a joke about AI | 64 | (park<or>friends<or>scared<or>always)+ | 32 |
+      | prompt | n_predict | re_content | n_predicted |
+      | I believe the meaning of life is | 8 | (read\|going)+ | 8 |
+      | Write a joke about AI | 64 | (park\|friends\|scared\|always)+ | 32 |

   Scenario Outline: OAI Compatibility
     Given a model <model>
@@ -45,9 +45,9 @@ Feature: llama.cpp server
     Then <n_predicted> tokens are predicted matching <re_content>

     Examples: Prompts
-      | model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming |
-      | llama-2 | Book | What is the best book | 8 | (Mom<or>what)+ | 8 | disabled |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks<or>happy<or>bird)+ | 32 | enabled |
+      | model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming |
+      | llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled |
+      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled |

   Scenario: Embedding
     When embeddings are computed for:
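Editor's note, not part of the commit: `|` is the cell delimiter in Gherkin tables, so the old tables spelled regex alternation as `<or>` and the Python assertion translated it back; the new tables use the standard Gherkin escape `\|`, which behave hands to the step as a literal `|`, so the cell already holds a plain regex. A minimal sketch of the difference, assuming behave's standard table unescaping:

import re

# Old scheme: '<or>' stood in for '|' and was translated in the assertion.
old_pattern = '(read<or>going)+'.replace('<or>', '|')   # -> '(read|going)+'

# New scheme: the cell is written '(read\|going)+'; Gherkin's '\|' escape
# yields a literal '|', so the step already receives a usable pattern.
new_pattern = '(read|going)+'

text = 'I believe the meaning of life is going well'
assert re.search(old_pattern, text) and re.search(new_pattern, text)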
examples/server/tests/features/steps/steps.py
@@ -84,12 +84,6 @@ def step_n_ctx(context, n_ctx):
     context.n_ctx = n_ctx


-@step(u'a KV cache size based on the model trained context {n_ctx_train:d}'
-      u' extended by {n_grp:d} with additional {n_keep:d} tokens')
-def step_kv_cache_size_extended(context, n_ctx_train, n_grp, n_keep):
-    context.n_ctx = n_ctx_train * n_grp + n_keep
-
-
 @step(u'{n_slots:d} slots')
 def step_n_slots(context, n_slots):
     context.n_slots = n_slots
@@ -146,7 +140,8 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
                                          slots_idle=context.n_slots,
                                          slots_processing=0,
                                          expected_slots=[{'id': slot_id, 'state': 0}
-                                                         for slot_id in range(context.n_slots if context.n_slots else 1)])
+                                                         for slot_id in
+                                                         range(context.n_slots if context.n_slots else 1)])
         case 'busy':
             await wait_for_health_status(context, context.base_url, 503,
                                          'no slot available',
@@ -154,7 +149,8 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
                                          slots_idle=0,
                                          slots_processing=context.n_slots,
                                          expected_slots=[{'id': slot_id, 'state': 1}
-                                                         for slot_id in range(context.n_slots if context.n_slots else 1)])
+                                                         for slot_id in
+                                                         range(context.n_slots if context.n_slots else 1)])
         case _:
             assert False, "unknown status"

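For context (editor's annotation): the `expected_slots` comprehension that these two hunks re-wrap builds the slot-state list the health check compares against, and the `context.n_slots if context.n_slots else 1` guard falls back to a single slot when none were configured. A quick illustration of the resulting value:

# What the re-wrapped comprehension evaluates to for two idle slots:
n_slots = 2
expected_slots = [{'id': slot_id, 'state': 0}
                  for slot_id in
                  range(n_slots if n_slots else 1)]
assert expected_slots == [{'id': 0, 'state': 0}, {'id': 1, 'state': 0}]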
@@ -258,11 +254,6 @@ def step_n_batch(context, n_batch):
     context.n_batch = n_batch


-@step(u'a self-extend context with a factor of {n_grp:d}')
-def step_n_grp(context, n_grp):
-    context.n_grp = n_grp
-
-
 @step(u'{seed:d} as seed')
 def step_seed(context, seed):
     context.seed = seed
@@ -282,6 +273,7 @@ def step_prompt_junk_suffix(context):
 def step_prompt_suffix(context):
     context.prompt_suffix = context.text

+
 @step(u'{n_ga:d} group attention factor'
       u' to extend context size through self-extend')
 def step_impl(context, n_ga):
@@ -294,8 +286,8 @@ def step_impl(context, n_ga_w):


 @step(u'a passkey prompt template')
-def step_prompt_passkey_template(context):
-    context.prompt_passkey_template = context.text
+def step_prompt_passkey(context):
+    context.prompt_passkey = context.text


 @step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
@@ -303,10 +295,11 @@ def step_prompt_passkey(context, passkey, i_pos):
     prompt = ""
     for i in range(context.n_junk):
         if i % context.n_junk == i_pos:
-            prompt += context.prompt_passkey_template
+            prompt += context.prompt_passkey  # the passkey is already substituted
         prompt += context.prompt_junk_suffix
     if context.debug:
-        print(f"Passkey challenge:\n```\n{prompt}\n```\n")
+        passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
+        print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
     context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)

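A side note from the editor, not the commit: since `i` ranges over `range(context.n_junk)`, `i % context.n_junk` is just `i`, so the condition inserts the passkey exactly once, at junk position `i_pos`. A toy version of the loop, with invented stand-in strings:

# Toy reconstruction of the passkey prompt assembly; the strings are
# hypothetical stand-ins for the real junk and passkey templates.
n_junk, i_pos = 5, 2
prompt_passkey = "The pass key is 42. "   # hypothetical, already substituted
junk = "The grass is green. "             # hypothetical junk suffix

prompt = ""
for i in range(n_junk):
    if i % n_junk == i_pos:   # equivalent to i == i_pos, since i < n_junk
        prompt += prompt_passkey
    prompt += junk

assert prompt.count("42") == 1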
@@ -816,14 +809,18 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
     content = completion_response['content']
     n_predicted = completion_response['timings']['predicted_n']
     assert len(content) > 0, "no token predicted"
+    if re_content is not None:
+        re_content = f'^(.*)({re_content})(.*)$'
+        p = re.compile(re_content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL)
+        match = p.match(content)
+        assert match and len(match.groups()) == 3, f'/{re_content}/g must match ```{content}```'
+        if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+            highlighted = p.sub(r"\1<hi>\2</hi>\3", content) \
+                .replace('<hi>', '\x1b[33m').replace('</hi>', '\x1b[0m')
+            print(f"Checking completion response: {highlighted}\n")
     if expected_predicted_n and expected_predicted_n > 0:
         assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
                                                      f' {n_predicted} <> {expected_predicted_n}')
-    if re_content is not None:
-        re_content = '^.*' + re_content.replace('<or>', '|') + '.*$'
-        assert re.match(re_content, content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL), (
-            f'invalid tokens predicted:'
-            f' ```\n{content}\n``` do not match /{re_content}/')


 async def gather_tasks_results(context):
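For illustration (editor's sketch, with invented sample data): the new assertion wraps the expected pattern in three groups, `^(.*)(pattern)(.*)$`, compiles it once with IGNORECASE | MULTILINE | DOTALL, and reuses the compiled pattern both to assert the match and, under DEBUG=ON, to highlight the matched span. Note the `len(match.groups()) == 3` check holds only when the inner pattern adds no capture groups of its own:

import re
from re import RegexFlag

# Invented sample data; a group-free inner pattern keeps groups() == 3.
content = "Lots of junk... the hidden pass key is 42, as planned."
re_content = '42'

pattern = f'^(.*)({re_content})(.*)$'
p = re.compile(pattern, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL)
match = p.match(content)
assert match and len(match.groups()) == 3, f'/{pattern}/ must match ```{content}```'

# The DEBUG=ON branch reuses the compiled pattern to highlight the match:
# tag group 2, then swap the tags for ANSI color codes.
highlighted = (p.sub(r"\1<hi>\2</hi>\3", content)
                 .replace('<hi>', '\x1b[33m').replace('</hi>', '\x1b[0m'))
print(f"Checking completion response: {highlighted}")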
@@ -840,7 +837,7 @@ async def wait_for_health_status(context,
                                  base_url,
                                  expected_http_status_code,
                                  expected_health_status,
-                                 timeout = 3,
+                                 timeout=3,
                                  params=None,
                                  slots_idle=None,
                                  slots_processing=None,
examples/server/tests/features/wrong_usages.feature
@@ -1,4 +1,4 @@
-# run with ./test.sh --tags wrong_usage
+# run with: ./tests.sh --no-skipped --tags wrong_usage
 @wrong_usage
 Feature: Wrong usage of llama.cpp server

examples/server/tests/tests.sh
@@ -5,7 +5,7 @@ set -eu
 if [ $# -lt 1 ]
 then
   # Start @llama.cpp scenario
-  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|slow' --tags llama.cpp
+  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 else
   behave "$@"
 fi
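Editor's closing note: with passkey.feature now excluded by name from the default behave run and tagged @slow, the suite documents the explicit invocation in the comments added above: ./tests.sh --no-skipped --tags passkey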