server: tests: fix passkey, add doc, fix regex content matching, fix timeout

Pierrick HYMBERT 2024-03-02 18:53:01 +01:00
parent 8abf8d3a08
commit 407cc609d3
6 changed files with 34 additions and 40 deletions


@@ -1,4 +1,5 @@
# List of ongoing issues
# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
@bug
Feature: Issues
# No confirmed issue at the moment


@@ -1,8 +1,6 @@
#@llama.cpp
# run with: ./tests.sh --no-skipped --tags passkey
@passkey
@wip
@slow
@bug
Feature: Passkey / Self-extend with context shift
Background: Server startup
@@ -17,10 +15,8 @@ Feature: Passkey / Self-extend with context shift
And <n_batch> as batch size
And <n_junk> as number of junk
And <n_predicted> server max tokens to predict
And a self-extend context with a factor of <n_grp>
And <seed> as seed
And a KV cache size based on the model trained context <n_ctx_train> extended by <n_grp> with additional <n_keep> tokens
And <n_slots> slots
And 42 as seed
And <n_ctx> KV cache size
And <n_ga> group attention factor to extend context size through self-extend
And <n_ga_w> group attention width to extend context size through self-extend
# Can be override with N_GPU_LAYERS
@@ -50,5 +46,5 @@ Feature: Passkey / Self-extend with context shift
Then <n_predicted> tokens are predicted matching <re_content>
Examples:
| hf_repo | hf_file | n_ctx_train | ngl | n_batch | n_slots | n_ga | n_ga_w | n_junk | n_grp | i_pos | seed | n_keep | passkey | n_predicted | re_content |
| TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 512 | 1 | 8 | 512 | 250 | 4 | 50 | 86 | 32 | 42 | 32 | .*42.* |
| hf_repo | hf_file | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content |
| TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 16384 | 512 | 16 | 512 | 250 | 50 | 42 | 1 | 42 |
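The scenario builds a "needle in a haystack" prompt: a junk sentence is repeated <n_junk> times and the passkey sentence is inserted <i_pos> junk blocks in, so the model has to retrieve "42" from far back in the extended context. Below is a minimal sketch of that assembly, mirroring the step_prompt_passkey step definition further down in this commit; the junk and passkey sentences here are placeholders, not the feature's actual prompt template:

```python
# Minimal sketch of the passkey challenge prompt assembly (placeholder junk/passkey text).
def build_passkey_prompt(n_junk: int, i_pos: int, passkey: str) -> str:
    junk = "The grass is green. The sky is blue. The sun is yellow. "  # placeholder junk block
    passkey_sentence = f"The pass key is {passkey}. Remember it. {passkey} is the pass key. "
    prompt = ""
    for i in range(n_junk):
        if i == i_pos:
            prompt += passkey_sentence  # insert the needle once, i_pos junk blocks in
        prompt += junk
    return prompt

# Values from the Examples row above: 250 junk blocks, passkey "42" inserted at position 50.
prompt = build_passkey_prompt(n_junk=250, i_pos=50, passkey="42")
assert "42" in prompt
```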


@@ -31,9 +31,9 @@ Feature: llama.cpp server
And prometheus metrics are exposed
Examples: Prompts
| prompt | n_predict | re_content | n_predicted |
| I believe the meaning of life is | 8 | (read<or>going)+ | 8 |
| Write a joke about AI | 64 | (park<or>friends<or>scared<or>always)+ | 32 |
| prompt | n_predict | re_content | n_predicted |
| I believe the meaning of life is | 8 | (read\|going)+ | 8 |
| Write a joke about AI | 64 | (park\|friends\|scared\|always)+ | 32 |
Scenario Outline: OAI Compatibility
Given a model <model>
@@ -45,9 +45,9 @@ Feature: llama.cpp server
Then <n_predicted> tokens are predicted matching <re_content>
Examples: Prompts
| model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming |
| llama-2 | Book | What is the best book | 8 | (Mom<or>what)+ | 8 | disabled |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks<or>happy<or>bird)+ | 32 | enabled |
| model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming |
| llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled |
Scenario: Embedding
When embeddings are computed for:
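The re_content cells in the tables above now carry plain regular expressions: the old <or> placeholder, which the previous assertion translated into |, is replaced by \|, presumably the Gherkin escape for a literal pipe inside a table cell. A quick sanity check of one resulting pattern (the sample completion text is made up):

```python
import re

# After table parsing, the cell "(read\|going)+" should yield the pattern below.
pattern = "(read|going)+"
sample = "I believe the meaning of life is going on a long read."  # made-up completion
assert re.search(pattern, sample, flags=re.IGNORECASE) is not None
```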


@@ -84,12 +84,6 @@ def step_n_ctx(context, n_ctx):
context.n_ctx = n_ctx
@step(u'a KV cache size based on the model trained context {n_ctx_train:d}'
u' extended by {n_grp:d} with additional {n_keep:d} tokens')
def step_kv_cache_size_extended(context, n_ctx_train, n_grp, n_keep):
context.n_ctx = n_ctx_train * n_grp + n_keep
@step(u'{n_slots:d} slots')
def step_n_slots(context, n_slots):
context.n_slots = n_slots
@@ -146,7 +140,8 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
slots_idle=context.n_slots,
slots_processing=0,
expected_slots=[{'id': slot_id, 'state': 0}
for slot_id in range(context.n_slots if context.n_slots else 1)])
for slot_id in
range(context.n_slots if context.n_slots else 1)])
case 'busy':
await wait_for_health_status(context, context.base_url, 503,
'no slot available',
@@ -154,7 +149,8 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
slots_idle=0,
slots_processing=context.n_slots,
expected_slots=[{'id': slot_id, 'state': 1}
for slot_id in range(context.n_slots if context.n_slots else 1)])
for slot_id in
range(context.n_slots if context.n_slots else 1)])
case _:
assert False, "unknown status"
@@ -258,11 +254,6 @@ def step_n_batch(context, n_batch):
context.n_batch = n_batch
@step(u'a self-extend context with a factor of {n_grp:d}')
def step_n_grp(context, n_grp):
context.n_grp = n_grp
@step(u'{seed:d} as seed')
def step_seed(context, seed):
context.seed = seed
@@ -282,6 +273,7 @@ def step_prompt_junk_suffix(context):
def step_prompt_suffix(context):
context.prompt_suffix = context.text
@step(u'{n_ga:d} group attention factor'
u' to extend context size through self-extend')
def step_impl(context, n_ga):
@@ -294,8 +286,8 @@ def step_impl(context, n_ga_w):
@step(u'a passkey prompt template')
def step_prompt_passkey_template(context):
context.prompt_passkey_template = context.text
def step_prompt_passkey(context):
context.prompt_passkey = context.text
@step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
@@ -303,10 +295,11 @@ def step_prompt_passkey(context, passkey, i_pos):
prompt = ""
for i in range(context.n_junk):
if i % context.n_junk == i_pos:
prompt += context.prompt_passkey_template
prompt += context.prompt_passkey # the passkey is already substituted
prompt += context.prompt_junk_suffix
if context.debug:
print(f"Passkey challenge:\n```\n{prompt}\n```\n")
passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)
@@ -816,14 +809,18 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
content = completion_response['content']
n_predicted = completion_response['timings']['predicted_n']
assert len(content) > 0, "no token predicted"
if re_content is not None:
re_content = f'^(.*)({re_content})(.*)$'
p = re.compile(re_content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL)
match = p.match(content)
assert match and len(match.groups()) == 3, f'/{re_content}/g must match ```{content}```'
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
highlighted = p.sub(r"\1<hi>\2</hi>\3", content).replace('<hi>', '\x1b[33m').replace('</hi>', '\x1b[0m')
print(f"Checking completion response: {highlighted}\n")
if expected_predicted_n and expected_predicted_n > 0:
assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
f' {n_predicted} <> {expected_predicted_n}')
if re_content is not None:
re_content = '^.*' + re_content.replace('<or>', '|') + '.*$'
assert re.match(re_content, content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL), (
f'invalid tokens predicted:'
f' ```\n{content}\n``` do not match /{re_content}/')
async def gather_tasks_results(context):
@@ -840,7 +837,7 @@ async def wait_for_health_status(context,
base_url,
expected_http_status_code,
expected_health_status,
timeout = 3,
timeout=3,
params=None,
slots_idle=None,
slots_processing=None,
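The reworked assert_n_tokens_predicted above anchors the expected pattern inside ^(.*)(...)(.*)$ so the whole completion is captured in three groups, then reuses those groups to highlight the matching fragment when DEBUG=ON. A standalone sketch of that check, with an illustrative function name and sample strings:

```python
import re

def content_matches(content: str, re_content: str, debug: bool = False) -> bool:
    # Capture prefix, expected pattern, and suffix so the full response is matched.
    pattern = re.compile(f'^(.*)({re_content})(.*)$',
                         flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
    match = pattern.match(content)
    if match is None:
        return False
    if debug:
        # Re-emit the content with the matched fragment wrapped in ANSI yellow.
        highlighted = (pattern.sub(r"\1<hi>\2</hi>\3", content)
                       .replace('<hi>', '\x1b[33m')
                       .replace('</hi>', '\x1b[0m'))
        print(f"Checking completion response: {highlighted}")
    return True

# Example with the re_content value from the passkey scenario above.
assert content_matches("Sure, the pass key is 42.", "42", debug=True)
```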


@@ -1,4 +1,4 @@
# run with ./test.sh --tags wrong_usage
# run with: ./tests.sh --no-skipped --tags wrong_usage
@wrong_usage
Feature: Wrong usage of llama.cpp server


@@ -5,7 +5,7 @@ set -eu
if [ $# -lt 1 ]
then
# Start @llama.cpp scenario
behave --summary --stop --no-capture --exclude 'issues|wrong_usages|slow' --tags llama.cpp
behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
else
behave "$@"
fi