From 407cc609d376e1d803a74ff67248a3a0722bb762 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Sat, 2 Mar 2024 18:53:01 +0100
Subject: [PATCH] server: tests: fix passkey, add doc, fix regex content matching, fix timeout

---
 examples/server/tests/features/issues.feature |  1 +
 .../server/tests/features/passkey.feature     | 14 +++---
 examples/server/tests/features/server.feature | 12 +++---
 examples/server/tests/features/steps/steps.py | 43 +++++++++----------
 .../tests/features/wrong_usages.feature       |  2 +-
 examples/server/tests/tests.sh                |  2 +-
 6 files changed, 34 insertions(+), 40 deletions(-)

diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature
index bf5a175a3..7b13e44ca 100644
--- a/examples/server/tests/features/issues.feature
+++ b/examples/server/tests/features/issues.feature
@@ -1,4 +1,5 @@
 # List of ongoing issues
+# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug
 @bug
 Feature: Issues
   # No confirmed issue at the moment

diff --git a/examples/server/tests/features/passkey.feature b/examples/server/tests/features/passkey.feature
index 0825046b0..f93b47d49 100644
--- a/examples/server/tests/features/passkey.feature
+++ b/examples/server/tests/features/passkey.feature
@@ -1,8 +1,6 @@
-#@llama.cpp
+# run with: ./tests.sh --no-skipped --tags passkey
 @passkey
-@wip
 @slow
-@bug
 Feature: Passkey / Self-extend with context shift

   Background: Server startup
@@ -17,10 +15,8 @@ Feature: Passkey / Self-extend with context shift
     And <n_batch> as batch size
     And <n_junk> as number of junk
     And <n_predicted> server max tokens to predict
-    And a self-extend context with a factor of <n_grp>
-    And <seed> as seed
-    And a KV cache size based on the model trained context <n_ctx_train> extended by <n_grp> with additional <n_keep> tokens
-    And <n_slots> slots
+    And 42 as seed
+    And <n_ctx> KV cache size
     And <n_ga> group attention factor to extend context size through self-extend
     And <n_ga_w> group attention width to extend context size through self-extend
     # Can be override with N_GPU_LAYERS
@@ -50,5 +46,5 @@ Feature: Passkey / Self-extend with context shift
     Then <n_predicted> tokens are predicted matching <re_content>

     Examples:
-      | hf_repo             | hf_file           | n_ctx_train | ngl | n_batch | n_slots | n_ga | n_ga_w | n_junk | n_grp | i_pos | seed | n_keep | passkey | n_predicted | re_content |
-      | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048        | 5   | 512     | 1       | 8    | 512    | 250    | 4     | 50    | 86   | 32     | 42      | 32          | .*42.*     |
+      | hf_repo             | hf_file           | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content |
+      | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048        | 5   | 16384 | 512     | 16   | 512    | 250    | 50    | 42      | 1           | 42         |

diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index 6b8dbf891..7c977bcce 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -31,23 +31,23 @@ Feature: llama.cpp server
     And prometheus metrics are exposed

     Examples: Prompts
-      | prompt                           | n_predict | re_content                             | n_predicted |
-      | I believe the meaning of life is | 8         | (read<or>going)+                       | 8           |
-      | Write a joke about AI            | 64        | (park<or>friends<or>scared<or>always)+ | 32          |
+      | prompt                           | n_predict | re_content                       | n_predicted |
+      | I believe the meaning of life is | 8         | (read\|going)+                   | 8           |
+      | Write a joke about AI            | 64        | (park\|friends\|scared\|always)+ | 32          |

   Scenario Outline: OAI Compatibility
     Given a model <model>
     And a system prompt <system_prompt>
     And a user prompt <user_prompt>
     And <max_tokens> max tokens to predict
     And streaming is <enable_streaming>
     Given an OAI compatible chat completions request with no api error
     Then <max_tokens> tokens are predicted matching <re_content>

     Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | re_content                 | n_predicted | enable_streaming |
-      | llama-2      | Book                        | What is the best book                | 8          | (Mom<or>what)+             | 8           | disabled         |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64         | (thanks<or>happy<or>bird)+ | 32          | enabled          |
+      | model        | system_prompt               | user_prompt                          | max_tokens | re_content                 | n_predicted | enable_streaming |
+      | llama-2      | Book                        | What is the best book                | 8          | (Mom\|what)+               | 8           | disabled         |
+      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64         | (thanks\|happy\|bird)+     | 32          | enabled         |

   Scenario: Embedding
     When embeddings are computed for:

diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index f73dc9a56..af097fa61 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -84,12 +84,6 @@ def step_n_ctx(context, n_ctx):
     context.n_ctx = n_ctx


-@step(u'a KV cache size based on the model trained context {n_ctx_train:d}'
-      u' extended by {n_grp:d} with additional {n_keep:d} tokens')
-def step_kv_cache_size_extended(context, n_ctx_train, n_grp, n_keep):
-    context.n_ctx = n_ctx_train * n_grp + n_keep
-
-
 @step(u'{n_slots:d} slots')
 def step_n_slots(context, n_slots):
     context.n_slots = n_slots
@@ -146,7 +140,8 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
                                          slots_idle=context.n_slots,
                                          slots_processing=0,
                                          expected_slots=[{'id': slot_id, 'state': 0}
-                                                         for slot_id in range(context.n_slots if context.n_slots else 1)])
+                                                         for slot_id in
+                                                         range(context.n_slots if context.n_slots else 1)])
         case 'busy':
             await wait_for_health_status(context, context.base_url, 503,
                                          'no slot available',
@@ -154,7 +149,8 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status):
                                          slots_idle=0,
                                          slots_processing=context.n_slots,
                                          expected_slots=[{'id': slot_id, 'state': 1}
-                                                         for slot_id in range(context.n_slots if context.n_slots else 1)])
+                                                         for slot_id in
+                                                         range(context.n_slots if context.n_slots else 1)])
         case _:
             assert False, "unknown status"
@@ -258,11 +254,6 @@ def step_n_batch(context, n_batch):
     context.n_batch = n_batch


-@step(u'a self-extend context with a factor of {n_grp:d}')
-def step_n_grp(context, n_grp):
-    context.n_grp = n_grp
-
-
 @step(u'{seed:d} as seed')
 def step_seed(context, seed):
     context.seed = seed
@@ -282,6 +273,7 @@ def step_prompt_junk_suffix(context):
 def step_prompt_suffix(context):
     context.prompt_suffix = context.text

+
 @step(u'{n_ga:d} group attention factor'
       u' to extend context size through self-extend')
 def step_impl(context, n_ga):
@@ -294,8 +286,8 @@ def step_impl(context, n_ga_w):


 @step(u'a passkey prompt template')
-def step_prompt_passkey_template(context):
-    context.prompt_passkey_template = context.text
+def step_prompt_passkey(context):
+    context.prompt_passkey = context.text


 @step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk')
@@ -303,10 +295,11 @@ def step_prompt_passkey(context, passkey, i_pos):
     prompt = ""
     for i in range(context.n_junk):
         if i % context.n_junk == i_pos:
-            prompt += context.prompt_passkey_template
+            prompt += context.prompt_passkey  # the passkey is already substituted
         prompt += context.prompt_junk_suffix
     if context.debug:
-        print(f"Passkey challenge:\n```\n{prompt}\n```\n")
+        passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m"
+        print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n")
     context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix)

@@ -816,14 +809,18 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None):
     content = completion_response['content']
     n_predicted = completion_response['timings']['predicted_n']
     assert len(content) > 0, "no token predicted"
+    if re_content is not None:
+        re_content = f'^(.*)({re_content})(.*)$'
+        p = re.compile(re_content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL)
+        match = p.match(content)
+        assert match and len(match.groups()) == 3, f'/{re_content}/g must match ```{content}```'
+        if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
+            highlighted = p.sub(r"\1<hl>\2</hl>\3", content).replace('<hl>', '\x1b[33m').replace('</hl>', '\x1b[0m')
+            print(f"Checking completion response: {highlighted}\n")
     if expected_predicted_n and expected_predicted_n > 0:
         assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:'
                                                      f' {n_predicted} <> {expected_predicted_n}')
-    if re_content is not None:
-        re_content = '^.*' + re_content.replace('<or>', '|') + '.*$'
-        assert re.match(re_content, content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL), (
-            f'invalid tokens predicted:'
-            f' ```\n{content}\n``` do not match /{re_content}/')
+


 async def gather_tasks_results(context):
@@ -840,7 +837,7 @@ async def wait_for_health_status(context,
                                  base_url,
                                  expected_http_status_code,
                                  expected_health_status,
-                                 timeout = 3,
+                                 timeout=3,
                                  params=None,
                                  slots_idle=None,
                                  slots_processing=None,

diff --git a/examples/server/tests/features/wrong_usages.feature b/examples/server/tests/features/wrong_usages.feature
index bfc7a7f55..cf14b3b44 100644
--- a/examples/server/tests/features/wrong_usages.feature
+++ b/examples/server/tests/features/wrong_usages.feature
@@ -1,4 +1,4 @@
-# run with ./test.sh --tags wrong_usage
+# run with: ./tests.sh --no-skipped --tags wrong_usage
 @wrong_usage
 Feature: Wrong usage of llama.cpp server

diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh
index a6f4e798e..1c6c5695f 100755
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -5,7 +5,7 @@ set -eu
 if [ $# -lt 1 ]
 then
   # Start @llama.cpp scenario
-  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|slow' --tags llama.cpp
+  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
 else
   behave "$@"
 fi
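
Note for reviewers: the new content matching in assert_n_tokens_predicted can be exercised on its own, outside the behave harness. Below is a minimal standalone sketch of the same logic; the sample content string and the 42 pattern (taken from the passkey Examples row) are illustrative only:

import re
from re import RegexFlag

# Sample completion output; invented for illustration.
content = "The pass key is 42.\nWhat is the pass key? The pass key is 42."
re_content = "42"  # expected pattern, as in the passkey Examples row

# Same anchored-wrapper approach as assert_n_tokens_predicted above:
# DOTALL lets '.*' span newlines, so one match covers the whole output.
pattern = f'^(.*)({re_content})(.*)$'
p = re.compile(pattern, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL)
match = p.match(content)
assert match and len(match.groups()) == 3, f'/{pattern}/g must match ```{content}```'

# Same highlighting trick as the DEBUG=ON branch: wrap the matched span
# in markers, then swap the markers for ANSI yellow escape codes.
highlighted = p.sub(r"\1<hl>\2</hl>\3", content).replace('<hl>', '\x1b[33m').replace('</hl>', '\x1b[0m')
print(f"Checking completion response: {highlighted}")

One caveat: if re_content contributes capturing groups of its own (e.g. an alternation wrapped in parentheses), match.groups() grows beyond three; non-capturing groups (?:...) keep the == 3 assertion meaningful.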
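
Similarly, step_prompt_passkey assembles the challenge prompt by dropping the (already substituted) passkey sentence at junk position i_pos among n_junk junk blocks. A small sketch with invented stand-in sentences and values:

# Standalone sketch of the passkey prompt assembly done by step_prompt_passkey.
# The junk and passkey sentences are invented stand-ins for the feature's text.
n_junk = 5
i_pos = 2
prompt_passkey = "The pass key is 42. Remember it. 42 is the pass key.\n"
prompt_junk_suffix = "The grass is green. The sky is blue. The sun is yellow.\n"

prompt = ""
for i in range(n_junk):
    # i % n_junk == i for i in range(n_junk), so this fires exactly once, at i_pos
    if i % n_junk == i_pos:
        prompt += prompt_passkey
    prompt += prompt_junk_suffix

assert prompt.count(prompt_passkey) == 1
assert prompt.splitlines()[i_pos].startswith("The pass key")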