From 9b7ea97979a087a8ffbcba5368fa81385d6580bf Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Tue, 20 Feb 2024 21:34:35 +0100
Subject: [PATCH] server: tests: add OAI stream test, fix file end of line,
 fast fail behave

---
 .github/workflows/server-test.yml             |  2 --
 examples/server/tests/features/server.feature |  9 ++---
 examples/server/tests/features/steps/steps.py | 34 +++++++++++++++----
 examples/server/tests/requirements.txt        |  2 +-
 examples/server/tests/tests.sh                |  2 +-
 5 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/server-test.yml b/.github/workflows/server-test.yml
index d05230fbd..b70006e04 100644
--- a/.github/workflows/server-test.yml
+++ b/.github/workflows/server-test.yml
@@ -47,5 +47,3 @@ jobs:
         run: |
           cd examples/server/tests
           ./tests.sh ../../../stories260K.gguf
-
-
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index d2e691f12..a14d1459a 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -23,13 +23,14 @@ Feature: llama.cpp server
     And a user prompt
     And a model
     And max tokens to predict
+    And streaming is <enable_streaming>
     Given an OAI compatible chat completions request
     Then tokens are predicted

     Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | predicted_n |
-      | llama-2      | You are ChatGPT.            | Say hello.                           | 64         | 64          |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 512        | 512         |
+      | model        | system_prompt               | user_prompt                          | max_tokens | enable_streaming | predicted_n |
+      | llama-2      | You are ChatGPT.            | Say hello.                           | 64         | false            | 64          |
+      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 512        | true             | 512         |

   Scenario: Multi users
     Given a prompt:
@@ -55,4 +56,4 @@ Feature: llama.cpp server
     Given concurrent completion requests
     Then the server is busy
     Then the server is idle
-    Then all prompts are predicted
\ No newline at end of file
+    Then all prompts are predicted
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index c6fbff84d..f9823b51f 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -74,6 +74,11 @@ def step_max_tokens(context, max_tokens):
     context.max_tokens = int(max_tokens)


+@step(u'streaming is {enable_streaming}')
+def step_streaming(context, enable_streaming):
+    context.enable_streaming = enable_streaming == 'true'
+
+
 @step(u'an OAI compatible chat completions request')
 def step_oai_chat_completions(context):
     chat_completion = openai.Completion.create(
@@ -88,14 +93,31 @@ def step_oai_chat_completions(context):
             }
         ],
         model=context.model,
-        max_tokens=context.max_tokens
+        max_tokens=context.max_tokens,
+        stream=context.enable_streaming
     )
-    context.completions.append({
-        'content': chat_completion.choices[0].message,
-        'timings': {
-            'predicted_n': chat_completion.usage.completion_tokens
+    if context.enable_streaming:
+        completion_response = {
+            'content': '',
+            'timings': {
+                'predicted_n': 0
+            }
         }
-    })
+        for chunk in chat_completion:
+            assert len(chunk.choices) == 1
+            delta = chunk.choices[0].delta
+            if 'content' in delta:
+                completion_response['content'] += delta['content']
+                completion_response['timings']['predicted_n'] += 1
+        context.completions.append(completion_response)
+    else:
+        assert len(chat_completion.choices) == 1
+        context.completions.append({
+            'content': chat_completion.choices[0].message,
+            'timings': {
+                'predicted_n': chat_completion.usage.completion_tokens
+            }
+        })


 @step(u'a prompt')
diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt
index f5c6f2e4a..b64fbc6ba 100644
--- a/examples/server/tests/requirements.txt
+++ b/examples/server/tests/requirements.txt
@@ -1,2 +1,2 @@
 behave~=1.2.6
-openai~=0.25.0
\ No newline at end of file
+openai~=0.25.0
diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh
index 01b2f5d4d..230ee45ad 100755
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -32,4 +32,4 @@ set -eu
 "$@" &

 # Start tests
-behave
\ No newline at end of file
+behave --summary --stop
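
Note on step_streaming: behave hands the captured {enable_streaming} value over as a raw
string, and any non-empty string is truthy in Python, so bool('false') would silently turn
streaming on for the non-streaming example row; hence the explicit comparison against the
literal 'true' in the step definition above. A quick plain-Python illustration of the trap:

    def parse_flag(value):
        # Compare against the literal string, as step_streaming does.
        return value == 'true'

    assert bool('false') is True           # the trap: non-empty strings are truthy
    assert parse_flag('false') is False    # the explicit comparison parses correctly
    assert parse_flag('true') is True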
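
For anyone wanting to exercise the new streaming path by hand, the aggregation loop in
step_oai_chat_completions can be replayed outside behave with the same 0.25-era openai
client. A minimal sketch, assuming a llama.cpp server is already listening locally; the
endpoint address and model alias below are placeholders, not part of this patch:

    import openai

    # Placeholder endpoint: point the legacy client at a local llama.cpp server.
    openai.api_key = 'no-key-required'
    openai.api_base = 'http://localhost:8080/v1/chat'

    chat_completion = openai.Completion.create(
        messages=[{'role': 'user', 'content': 'Say hello.'}],
        model='llama-2',  # placeholder model alias
        max_tokens=64,
        stream=True
    )

    # Same aggregation as the test step: concatenate the content deltas and
    # count one predicted token per chunk that carries content.
    content = ''
    predicted_n = 0
    for chunk in chat_completion:
        delta = chunk.choices[0].delta
        if 'content' in delta:
            content += delta['content']
            predicted_n += 1

    print(f'{predicted_n} tokens: {content}')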