From 407cc609d376e1d803a74ff67248a3a0722bb762 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Sat, 2 Mar 2024 18:53:01 +0100
Subject: [PATCH] server: tests: fix passkey, add doc, fix regex content matching, fix timeout

---
 examples/server/tests/features/issues.feature |  1 +
 .../server/tests/features/passkey.feature     | 14 +++---
 examples/server/tests/features/server.feature | 12 +++---
 examples/server/tests/features/steps/steps.py | 43 +++++++++----------
 .../tests/features/wrong_usages.feature       |  2 +-
 examples/server/tests/tests.sh                |  2 +-
 6 files changed, 34 insertions(+), 40 deletions(-)

diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature
index bf5a175a3..7b13e44ca 100644
--- a/examples/server/tests/features/issues.feature
+++ b/examples/server/tests/features/issues.feature
@@ -1,4 +1,5 @@
 # List of ongoing issues
+# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
 @bug
 Feature: Issues
   # No confirmed issue at the moment

diff --git a/examples/server/tests/features/passkey.feature b/examples/server/tests/features/passkey.feature
index 0825046b0..f93b47d49 100644
--- a/examples/server/tests/features/passkey.feature
+++ b/examples/server/tests/features/passkey.feature
@@ -1,8 +1,6 @@
-#@llama.cpp
+# run with: ./tests.sh --no-skipped --tags passkey
 @passkey
-@wip
 @slow
-@bug
 Feature: Passkey / Self-extend with context shift

   Background: Server startup
@@ -17,10 +15,8 @@ Feature: Passkey / Self-extend with context shift
     And <n_batch> as batch size
     And <n_junk> as number of junk
     And <n_predicted> server max tokens to predict
-    And a self-extend context with a factor of <n_grp>
-    And <seed> as seed
-    And a KV cache size based on the model trained context <n_ctx_train> extended by <n_grp> with additional <n_keep> tokens
-    And <n_slots> slots
+    And 42 as seed
+    And <n_ctx> KV cache size
     And <n_ga> group attention factor to extend context size through self-extend
     And <n_ga_w> group attention width to extend context size through self-extend
     # Can be override with N_GPU_LAYERS
@@ -50,5 +46,5 @@ Feature: Passkey / Self-extend with context shift
     Then <n_predicted> tokens are predicted matching <re_content>

     Examples:
-      | hf_repo             | hf_file           | n_ctx_train | ngl | n_batch | n_slots | n_ga | n_ga_w | n_junk | n_grp | i_pos | seed | n_keep | passkey | n_predicted | re_content |
-      | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048        | 5   | 512     | 1       | 8    | 512    | 250    | 4     | 50    | 86   | 32     | 42      | 32          | .*42.*     |
+      | hf_repo             | hf_file           | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content |
+      | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048        | 5   | 16384 | 512     | 16   | 512    | 250    | 50    | 42      | 1           | 42         |

diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index 6b8dbf891..7c977bcce 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -31,23 +31,23 @@ Feature: llama.cpp server
     And prometheus metrics are exposed

     Examples: Prompts
-      | prompt                           | n_predict | re_content                             | n_predicted |
-      | I believe the meaning of life is | 8         | (read<or>going)+                       | 8           |
-      | Write a joke about AI            | 64        | (park<or>friends<or>scared<or>always)+ | 32          |
+      | prompt                           | n_predict | re_content                       | n_predicted |
+      | I believe the meaning of life is | 8         | (read\|going)+                   | 8           |
+      | Write a joke about AI            | 64        | (park\|friends\|scared\|always)+ | 32          |

   Scenario Outline: OAI Compatibility
     Given a model <model>
     And a system prompt <system_prompt>
     And a user prompt <user_prompt>
     And <max_tokens> max tokens to predict
     And streaming is <enable_streaming>
     Given an OAI compatible chat completions request with no api error
     Then <max_tokens> tokens are predicted matching <re_content>

     Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | re_content                 | n_predicted | enable_streaming |
-      | llama-2      | Book                        | What is the best book                | 8          | (Mom<or>what)+             | 8           | disabled         |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64         | (thanks<or>happy<or>bird)+ | 32          | enabled          |
+      | model        | system_prompt               | user_prompt                          | max_tokens | re_content                 | n_predicted | enable_streaming |
+      | llama-2      | Book                        | What is the best book                | 8          | (Mom\|what)+               | 8           | disabled         |
+      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64         | (thanks\|happy\|bird)+     | 32          | enabled         |

   Scenario: Embedding
     When embeddings are computed for:

diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index f73dc9a56..af097fa61 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -84,12 +84,6 @@ def step_n_ctx(context, n_ctx):
     context.n_ctx = n_ctx


-@step(u'a KV cache size based on the model trained context {n_ctx_train:d}'
-      u' extended by {n_grp:d} with additional {n_keep:d} tokens')
-def step_kv_cache_size_extended(context, n_ctx_train, n_grp, n_keep):
-    context.n_ctx = n_ctx_train * n_grp + n_keep
-
-
 @step(u'{n_slots:d} slots')
 def step_n_slots(context, n_slots):
     context.n_slots = n_slots
@@ -146,7 +140,8 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
                                          slots_idle=context.n_slots,
                                          slots_processing=0,
                                          expected_slots=[{'id': slot_id, 'state': 0}
-                                                         for slot_id in range(context.n_slots if context.n_slots else 1)])
+                                                         for slot_id in
+                                                         range(context.n_slots if context.n_slots else 1)])
         case 'busy':
             await wait_for_health_status(context, context.base_url, 503,
                                          'no slot available',
@@ -154,7 +149,8 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
                                          slots_idle=0,
                                          slots_processing=context.n_slots,
                                          expected_slots=[{'id': slot_id, 'state': 1}
-                                                         for slot_id in range(context.n_slots if context.n_slots else 1)])
+                                                         for slot_id in
+                                                         range(context.n_slots if context.n_slots else 1)])
         case _:
             assert False, "unknown status"
@@ -258,11 +254,6 @@ def step_n_batch(context, n_batch):
     context.n_batch = n_batch


-@step(u'a self-extend context with a factor of {n_grp:d}')
-def step_n_grp(context, n_grp):
-    context.n_grp = n_grp
-
-
 @step(u'{seed:d} as seed')
 def step_seed(context, seed):
     context.seed = seed
@@ -282,6 +273,7 @@ def step_prompt_junk_suffix(context):
 def step_prompt_suffix(context):
     context.prompt_suffix = context.text

+
 @step(u'{n_ga:d} group attention factor'
       u' to extend context size through self-extend')
 def step_impl(context, n_ga):
@@ -294,8 +286,8 @@ def step_impl(context, n_ga_w):


 @step(u'a passkey prompt template')
-def step_prompt_passkey_template(context):
-    context.prompt_passkey_template = context.text
+def step_prompt_passkey(context):
+    context.prompt_passkey = context.text


 @step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
@@ -303,10 +295,11 @@ def step_prompt_passkey(context, passkey, i_pos):
     prompt = ""
     for i in range(context.n_junk):
         if i % context.n_junk == i_pos:
-            prompt += context.prompt_passkey_template
+            prompt += context.prompt_passkey  # the passkey is already substituted
         prompt += context.prompt_junk_suffix
     if context.debug:
-        print(f"Passkey challenge:\n```\n{prompt}\n```\n")
+        passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
+        print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
     context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)

@@ -816,14 +809,18 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
     content = completion_response['content']
     n_predicted = completion_response['timings']['predicted_n']
     assert len(content) > 0, "no token predicted"
+    if re_content is not None:
+        re_content = f'^(.*)({re_content})(.*)$'
+        p = re.compile(re_content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL)
+        match = p.match(content)
+        assert match and len(match.groups()) == 3, f'/{re_content}/g must match ```{content}```'
+        if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+            highlighted = p.sub(r"\1<hl>\2</hl>\3", content).replace('<hl>', '\x1b[33m').replace('</hl>', '\x1b[0m')
+            print(f"Checking completion response: {highlighted}\n")
     if expected_predicted_n and expected_predicted_n > 0:
         assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
                                                      f' {n_predicted} <> {expected_predicted_n}')
-    if re_content is not None:
-        re_content = '^.*' + re_content.replace('<or>', '|') + '.*$'
-        assert re.match(re_content, content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL), (
-            f'invalid tokens predicted:'
-            f' ```\n{content}\n``` do not match /{re_content}/')
+


 async def gather_tasks_results(context):
@@ -840,7 +837,7 @@ async def wait_for_health_status(context,
                                  base_url,
                                  expected_http_status_code,
                                  expected_health_status,
-                                 timeout = 3,
+                                 timeout=3,
                                  params=None,
                                  slots_idle=None,
                                  slots_processing=None,

diff --git a/examples/server/tests/features/wrong_usages.feature b/examples/server/tests/features/wrong_usages.feature
index bfc7a7f55..cf14b3b44 100644
--- a/examples/server/tests/features/wrong_usages.feature
+++ b/examples/server/tests/features/wrong_usages.feature
@@ -1,4 +1,4 @@
-# run with ./test.sh --tags wrong_usage
+# run with: ./tests.sh --no-skipped --tags wrong_usage
 @wrong_usage
 Feature: Wrong usage of llama.cpp server

diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh
index a6f4e798e..1c6c5695f 100755
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -5,7 +5,7 @@ set -eu
 if [ $# -lt 1 ]
 then
   # Start @llama.cpp scenario
-  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|slow' --tags llama.cpp
+  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 else
   behave "$@"
 fi
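
Note for reviewers: the new content matching in assert_n_tokens_predicted can be exercised on its own, outside the behave harness. Below is a minimal standalone sketch of the same logic; the sample content string and the 42 pattern (taken from the passkey Examples row) are illustrative only:

import re
from re import RegexFlag

# Sample completion output; invented for illustration.
content = "The pass key is 42.\nWhat is the pass key? The pass key is 42."
re_content = "42"  # expected pattern, as in the passkey Examples row

# Same anchored-wrapper approach as assert_n_tokens_predicted above:
# DOTALL lets '.*' span newlines, so one match covers the whole output.
pattern = f'^(.*)({re_content})(.*)$'
p = re.compile(pattern, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL)
match = p.match(content)
assert match and len(match.groups()) == 3, f'/{pattern}/g must match ```{content}```'

# Same highlighting trick as the DEBUG=ON branch: wrap the matched span
# in markers, then swap the markers for ANSI yellow escape codes.
highlighted = p.sub(r"\1<hl>\2</hl>\3", content).replace('<hl>', '\x1b[33m').replace('</hl>', '\x1b[0m')
print(f"Checking completion response: {highlighted}")

One caveat: if re_content contributes capturing groups of its own (e.g. an alternation wrapped in parentheses), match.groups() grows beyond three; non-capturing groups (?:...) keep the == 3 assertion meaningful.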
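
Similarly, step_prompt_passkey assembles the challenge prompt by dropping the (already substituted) passkey sentence at junk position i_pos among n_junk junk blocks. A small sketch with invented stand-in sentences and values:

# Standalone sketch of the passkey prompt assembly done by step_prompt_passkey.
# The junk and passkey sentences are invented stand-ins for the feature's text.
n_junk = 5
i_pos = 2
prompt_passkey = "The pass key is 42. Remember it. 42 is the pass key.\n"
prompt_junk_suffix = "The grass is green. The sky is blue. The sun is yellow.\n"

prompt = ""
for i in range(n_junk):
    # i % n_junk == i for i in range(n_junk), so this fires exactly once, at i_pos
    if i % n_junk == i_pos:
        prompt += prompt_passkey
    prompt += prompt_junk_suffix

assert prompt.count(prompt_passkey) == 1
assert prompt.splitlines()[i_pos].startswith("The pass key")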