server: tests: add OAI stream test, fix file end of line, fast fail behave

Pierrick HYMBERT 2024-02-20 21:34:35 +01:00
parent 56583bee41
commit 9b7ea97979
5 changed files with 35 additions and 14 deletions

View file

@@ -47,5 +47,3 @@ jobs:
run: |
cd examples/server/tests
./tests.sh ../../../stories260K.gguf

View file

@@ -23,13 +23,14 @@ Feature: llama.cpp server
And a user prompt <user_prompt>
And a model <model>
And <max_tokens> max tokens to predict
And streaming is <enable_streaming>
Given an OAI compatible chat completions request
Then <predicted_n> tokens are predicted
Examples: Prompts
| model | system_prompt | user_prompt | max_tokens | predicted_n |
| llama-2 | You are ChatGPT. | Say hello. | 64 | 64 |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 512 | 512 |
| model | system_prompt | user_prompt | max_tokens | enable_streaming | predicted_n |
| llama-2 | You are ChatGPT. | Say hello. | 64 | false | 64 |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 512 | true | 512 |
Scenario: Multi users
Given a prompt:
@@ -55,4 +56,4 @@ Feature: llama.cpp server
Given concurrent completion requests
Then the server is busy
Then the server is idle
Then all prompts are predicted
Then all prompts are predicted
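
The new enable_streaming column feeds the "OAI compatible chat completions request" step implemented in steps.py below. For reference, a minimal standalone sketch of the kind of request the non-streaming llama-2 row boils down to; the host/port and endpoint path are assumptions for illustration, not values taken from this commit:

import requests

# Hypothetical example mirroring the llama-2 row of the Examples table (non-streaming).
payload = {
    "model": "llama-2",
    "messages": [
        {"role": "system", "content": "You are ChatGPT."},
        {"role": "user", "content": "Say hello."},
    ],
    "max_tokens": 64,
    "stream": False,
}
resp = requests.post("http://localhost:8080/v1/chat/completions", json=payload)
body = resp.json()
print(body["choices"][0]["message"]["content"])   # generated text
print(body["usage"]["completion_tokens"])         # what predicted_n is checked against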

View file

@@ -74,6 +74,11 @@ def step_max_tokens(context, max_tokens):
context.max_tokens = int(max_tokens)
@step(u'streaming is {enable_streaming}')
def step_streaming(context, enable_streaming):
context.enable_streaming = enable_streaming == 'true'
@step(u'an OAI compatible chat completions request')
def step_oai_chat_completions(context):
chat_completion = openai.Completion.create(
@@ -88,14 +93,31 @@ def step_oai_chat_completions(context):
}
],
model=context.model,
max_tokens=context.max_tokens
max_tokens=context.max_tokens,
stream=context.enable_streaming
)
context.completions.append({
'content': chat_completion.choices[0].message,
'timings': {
'predicted_n': chat_completion.usage.completion_tokens
if context.enable_streaming:
completion_response = {
'content': '',
'timings': {
'predicted_n': 0
}
}
})
for chunk in chat_completion:
assert len(chunk.choices) == 1
delta = chunk.choices[0].delta
if 'content' in delta:
completion_response['content'] += delta['content']
completion_response['timings']['predicted_n'] += 1
context.completions.append(completion_response)
else:
assert len(chat_completion.choices) == 1
context.completions.append({
'content': chat_completion.choices[0].message,
'timings': {
'predicted_n': chat_completion.usage.completion_tokens
}
})
@step(u'a prompt')
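
The streaming branch above accumulates delta chunks into a single completion record. A small self-contained sketch (not part of the commit, with stubbed chunk objects standing in for what the openai 0.x streaming iterator yields) that exercises the same aggregation logic without a running server:

from types import SimpleNamespace

def fake_chunk(content):
    # Stand-in for a streamed chunk; an empty delta mimics a role-only/final chunk.
    delta = {} if content is None else {'content': content}
    return SimpleNamespace(choices=[SimpleNamespace(delta=delta)])

chunks = [fake_chunk('Hello'), fake_chunk(' world'), fake_chunk(None)]

completion_response = {'content': '', 'timings': {'predicted_n': 0}}
for chunk in chunks:
    assert len(chunk.choices) == 1
    delta = chunk.choices[0].delta
    if 'content' in delta:
        completion_response['content'] += delta['content']
        completion_response['timings']['predicted_n'] += 1

assert completion_response['content'] == 'Hello world'
assert completion_response['timings']['predicted_n'] == 2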

View file

@@ -1,2 +1,2 @@
behave~=1.2.6
openai~=0.25.0
openai~=0.25.0

View file

@@ -32,4 +32,4 @@ set -eu
"$@" &
# Start tests
behave
behave --summary --stop