server: tests: add infinite loop

Pierrick HYMBERT 2024-02-20 23:05:27 +01:00
parent 0772884b06
commit 6b9dc4f291
2 changed files with 30 additions and 19 deletions


@@ -36,24 +36,13 @@ Feature: llama.cpp server
 Scenario: Multi users
 Given a prompt:
 """
-Write a formal complaint email to Air France about my delayed
-baggage from my flight on Tuesday, January 17th, from Paris to Toulouse. Be verbose.
+Write a very long story about AI.
 """
 And a prompt:
 """
-Translate the following War & Peace chapter into Russian: WELL, PRINCE,
-Genoa and Lucca are now no more than private estates of the Bonaparte
-family. No, I warn you, that if you do not tell me we are at war,
-if you again allow yourself to palliate all the infamies and atrocities
-of this Antichrist (upon my word, I believe he is), I don't know you
-in future, you are no longer my friend, no longer my faithful slave,
-as you say. There, how do you do, how do you do? I see I'm scaring you,
-sit down and talk to me. These words were uttered in July 1805 by
-Anna Pavlovna Scherer, a distinguished lady of the court,
-and confidential maid-of-honour to the Empress Marya Fyodorovna.
-It was her greeting to Prince Vassily, a man high in rank
-and office, who was the first to arrive at her soirée.
+Write another very long music lyrics.
 """
+And 512 max tokens to predict
 Given concurrent completion requests
 Then the server is busy
 And all slots are busy
@@ -65,8 +54,6 @@ Feature: llama.cpp server
 Scenario: Multi users OAI Compatibility
 Given a system prompt "You are an AI assistant."
 And a model tinyllama-2
-And 1024 max tokens to predict
-And streaming is enabled
 Given a prompt:
 """
 Write a very long story about AI.
@@ -75,6 +62,8 @@ Feature: llama.cpp server
 """
 Write another very long music lyrics.
 """
+And 512 max tokens to predict
+And streaming is enabled
 Given concurrent OAI completions requests
 Then the server is busy
 And all slots are busy
@@ -82,3 +71,25 @@ Feature: llama.cpp server
 And all slots are idle
 Then all prompts are predicted
+
+# FIXME: infinite loop on the CI, not locally, if n_prompt * n_predict > kv_size
+Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size
+Given a prompt:
+"""
+Write a very long story about AI.
+"""
+And a prompt:
+"""
+Write another very long music lyrics.
+"""
+And a prompt:
+"""
+Write a very long poem.
+"""
+And 1024 max tokens to predict
+Given concurrent completion requests
+Then the server is busy
+And all slots are busy
+Then the server is idle
+And all slots are idle
+Then all prompts are predicted
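
The FIXME above can be made concrete: the new scenario issues 3 prompts with 1024 max tokens each, so concurrent generation can demand up to 3 × 1024 = 3072 tokens of KV cache. A minimal sketch of the overflow condition, with kv_size = 2048 as a purely hypothetical value (the actual CI cache size is not shown in this diff):

# Condition from the FIXME comment: n_prompt * n_predict > kv_size.
n_prompt = 3      # three "Given a prompt" blocks in the new scenario
n_predict = 1024  # "And 1024 max tokens to predict"
kv_size = 2048    # hypothetical; not taken from the CI configuration

if n_prompt * n_predict > kv_size:
    print(f"possible infinite loop: {n_prompt * n_predict} tokens needed > kv_size {kv_size}")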


@@ -105,7 +105,7 @@ def step_model(context, model):
 @step(u'{max_tokens} max tokens to predict')
 def step_max_tokens(context, max_tokens):
-    context.max_tokens = int(max_tokens)
+    context.n_predict = int(max_tokens)
 
 @step(u'streaming is {enable_streaming}')
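
Behave's default matcher is the parse library, so the {max_tokens} placeholder in the decorator above captures the leading number of a feature line such as "And 512 max tokens to predict". A minimal sketch of that binding, standalone, using the parse package directly rather than a full behave run:

from parse import parse

# The step pattern from the diff above; behave matches feature lines against it
# and passes the captured group to step_max_tokens as the max_tokens argument.
step_pattern = u'{max_tokens} max tokens to predict'
result = parse(step_pattern, u'512 max tokens to predict')
print(int(result['max_tokens']))  # 512, which the step stores as context.n_predict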
@@ -154,7 +154,7 @@ def concurrent_requests(context, f_completion):
 def request_completion(context, prompt, n_predict=None):
     response = requests.post(f'{context.base_url}/completion', json={
         "prompt": prompt,
-        "n_predict": int(n_predict) if n_predict is not None else 4096,
+        "n_predict": int(n_predict) if n_predict is not None else context.n_predict,
         "seed": context.seed
     })
     assert response.status_code == 200
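
With this change, a scenario that passes no explicit limit falls back to whatever context.n_predict was set by the "max tokens to predict" step, instead of the old hardcoded 4096. The same request can be issued outside the harness; a minimal sketch, assuming the server listens on http://localhost:8080 (the tests take this from context.base_url) and using a hypothetical seed:

import requests

# Mirrors request_completion above: POST /completion with an explicit n_predict
# so generation stops after that many tokens.
base_url = "http://localhost:8080"  # assumption; the tests use context.base_url
response = requests.post(f"{base_url}/completion", json={
    "prompt": "Write a very long story about AI.",
    "n_predict": 512,  # same limit as "And 512 max tokens to predict"
    "seed": 42,        # hypothetical; the tests use context.seed
})
assert response.status_code == 200
print(response.json()["content"])  # the generated text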
@@ -174,7 +174,7 @@ def oai_chat_completions(context, user_prompt):
             }
         ],
         model=context.model,
-        max_tokens=context.max_tokens,
+        max_tokens=context.n_predict,
         stream=context.enable_streaming,
         seed=context.seed
     )
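
The OAI-compatible route now enforces the same limit through max_tokens. A minimal standalone sketch of the call the step wraps, assuming the v1 openai Python client pointed at the server's /v1 endpoint (base URL, API key, and seed are assumptions; the model name comes from the scenario above):

import openai

# Equivalent to oai_chat_completions above: max_tokens carries the same
# "max tokens to predict" value that the raw /completion route receives as n_predict.
client = openai.OpenAI(base_url="http://localhost:8080/v1", api_key="no-key")  # assumptions
completion = client.chat.completions.create(
    model="tinyllama-2",  # from "And a model tinyllama-2"
    messages=[
        {"role": "system", "content": "You are an AI assistant."},
        {"role": "user", "content": "Write a very long story about AI."},
    ],
    max_tokens=512,  # same limit as "And 512 max tokens to predict"
    stream=False,
    seed=42,  # hypothetical; the tests use context.seed
)
print(completion.choices[0].message.content)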