server: tests: add infinite loop

commit 6b9dc4f291
parent 0772884b06
2 changed files with 30 additions and 19 deletions

Changed file 1 of 2 (Gherkin feature file):

@@ -36,24 +36,13 @@ Feature: llama.cpp server
   Scenario: Multi users
     Given a prompt:
       """
-      Write a formal complaint email to Air France about my delayed
-      baggage from my flight on Tuesday, January 17th, from Paris to Toulouse. Be verbose.
+      Write a very long story about AI.
       """
     And a prompt:
       """
-      Translate the following War & Peace chapter into Russian: WELL, PRINCE,
-      Genoa and Lucca are now no more than private estates of the Bonaparte
-      family. No, I warn you, that if you do not tell me we are at war,
-      if you again allow yourself to palliate all the infamies and atrocities
-      of this Antichrist (upon my word, I believe he is), I don’t know you
-      in future, you are no longer my friend, no longer my faithful slave,
-      as you say. There, how do you do, how do you do? I see I’m scaring you,
-      sit down and talk to me.” These words were uttered in July 1805 by
-      Anna Pavlovna Scherer, a distinguished lady of the court,
-      and confidential maid-of-honour to the Empress Marya Fyodorovna.
-      It was her greeting to Prince Vassily, a man high in rank
-      and office, who was the first to arrive at her soirée.
+      Write another very long music lyrics.
       """
+    And 512 max tokens to predict
     Given concurrent completion requests
     Then the server is busy
     And all slots are busy
@@ -65,8 +54,6 @@ Feature: llama.cpp server
   Scenario: Multi users OAI Compatibility
     Given a system prompt "You are an AI assistant."
     And a model tinyllama-2
-    And 1024 max tokens to predict
-    And streaming is enabled
     Given a prompt:
       """
       Write a very long story about AI.
@@ -75,6 +62,8 @@ Feature: llama.cpp server
       """
       Write another very long music lyrics.
       """
+    And 512 max tokens to predict
+    And streaming is enabled
     Given concurrent OAI completions requests
     Then the server is busy
     And all slots are busy
@@ -82,3 +71,25 @@ Feature: llama.cpp server
     And all slots are idle
     Then all prompts are predicted

+  # FIXME: infinite loop on the CI, not locally, if n_prompt * n_predict > kv_size
+  Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size
+    Given a prompt:
+      """
+      Write a very long story about AI.
+      """
+    And a prompt:
+      """
+      Write another very long music lyrics.
+      """
+    And a prompt:
+      """
+      Write a very long poem.
+      """
+    And 1024 max tokens to predict
+    Given concurrent completion requests
+    Then the server is busy
+    And all slots are busy
+    Then the server is idle
+    And all slots are idle
+    Then all prompts are predicted
+
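The FIXME in the new scenario comes down to a token budget: three prompts run in parallel with 1024 tokens to predict each, i.e. 3 * 1024 = 3072 tokens of generation, which can exceed the server's KV cache and leave the slots spinning on CI. The sketch below only illustrates that arithmetic; the kv_size value is an assumption for illustration, not a value taken from the test harness.

```python
# Hypothetical sanity check mirroring the FIXME above; kv_size is an assumption.
n_prompts = 3      # the three "Given a prompt:" blocks in the new scenario
n_predict = 1024   # "And 1024 max tokens to predict"
kv_size = 2048     # assumed KV cache size of the server under test

total_budget = n_prompts * n_predict  # 3072 tokens requested concurrently
if total_budget > kv_size:
    # This is the condition the FIXME flags: on CI the scenario can loop forever.
    print(f"generation budget {total_budget} exceeds kv_size {kv_size}")
```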
Changed file 2 of 2 (Python step definitions):

@@ -105,7 +105,7 @@ def step_model(context, model):

 @step(u'{max_tokens} max tokens to predict')
 def step_max_tokens(context, max_tokens):
-    context.max_tokens = int(max_tokens)
+    context.n_predict = int(max_tokens)


 @step(u'streaming is {enable_streaming}')
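For context on the step above: behave's default matcher uses the parse library, so a scenario line such as "And 512 max tokens to predict" binds the leading number to the {max_tokens} placeholder as a string, which the step then stores as an int on context.n_predict. A tiny sketch of that matching, assuming the parse package is installed:

```python
# Illustrative only: behave performs this matching internally via the parse library.
import parse

result = parse.parse('{max_tokens} max tokens to predict',
                     '512 max tokens to predict')
assert result['max_tokens'] == '512'  # the step definition converts this with int(...)
```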

@@ -154,7 +154,7 @@ def concurrent_requests(context, f_completion):
 def request_completion(context, prompt, n_predict=None):
     response = requests.post(f'{context.base_url}/completion', json={
         "prompt": prompt,
-        "n_predict": int(n_predict) if n_predict is not None else 4096,
+        "n_predict": int(n_predict) if n_predict is not None else context.n_predict,
         "seed": context.seed
     })
     assert response.status_code == 200

@@ -174,7 +174,7 @@ def oai_chat_completions(context, user_prompt):
             }
         ],
         model=context.model,
-        max_tokens=context.max_tokens,
+        max_tokens=context.n_predict,
         stream=context.enable_streaming,
         seed = context.seed
     )
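Taken together, the rename means one scenario step now sets the prediction budget for both the raw /completion requests and the OAI-compatible chat requests. Below is a small, standalone sketch of that wiring; the SimpleNamespace context and the build_completion_payload helper are illustrative stand-ins for behave's context object and the real request code, and no server is contacted.

```python
# Standalone illustration of the context.n_predict wiring; nothing here talks to a server.
from types import SimpleNamespace

context = SimpleNamespace(seed=42, n_predict=None)

def step_max_tokens(context, max_tokens):
    # Mirrors the step definition: "And 512 max tokens to predict"
    context.n_predict = int(max_tokens)

def build_completion_payload(context, prompt, n_predict=None):
    # Same fallback as request_completion: an explicit n_predict wins,
    # otherwise the scenario-wide budget stored on the context is used.
    return {
        "prompt": prompt,
        "n_predict": int(n_predict) if n_predict is not None else context.n_predict,
        "seed": context.seed,
    }

step_max_tokens(context, "512")
payload = build_completion_payload(context, "Write a very long story about AI.")
assert payload["n_predict"] == 512
```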