server: tests: fix concurrent OAI streaming request
This commit is contained in:
parent
77b8589dbb
commit
71831494b1
2 changed files with 19 additions and 14 deletions
|
@@ -51,7 +51,7 @@ Feature: Parallel
     Examples:
       | streaming | n_predict |
       | disabled  | 128       |
-      #| enabled   | 64        | FIXME: phymbert: need to investigate why in aiohttp with streaming only one token is generated
+      | enabled   | 64        |

   Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
     Given a prompt:
|
|
@@ -485,20 +485,25 @@ async def oai_chat_completions(user_prompt,
             assert response.status == 200
             assert response.headers['Access-Control-Allow-Origin'] == origin
             assert response.headers['Content-Type'] == "text/event-stream"
-            event_received = True
-            while event_received:
-                event_received = False
-                async for line_in_bytes in response.content:
-                    line = line_in_bytes.decode('utf8')
-                    line = line.rstrip('\n').rstrip('\r')
-                    if line == '':
-                        continue
-                    event_data = line.split(': ', 1)
-                    assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
-                    chunk_raw = event_data[1]
-
-                    chunk = json.loads(chunk_raw)
-                    assert len(chunk['choices']) == 1
-                    delta = chunk['choices'][0]['delta']
-                    if 'content' in delta:
-                        completion_response['content'] += delta['content']
-                        completion_response['timings']['predicted_n'] += 1
-                        print(f"DEBUG completion_response: {completion_response}")
+            async for line_in_bytes in response.content:
+                line = line_in_bytes.decode('utf8')
+                event_data = line.split(': ', 1)
+                assert event_data[0] == 'data', f'{event_data}'
+                chunk_raw = event_data[1]
+
+                chunk = json.loads(chunk_raw)
+                assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
+                delta = chunk['choices'][0]['delta']
+                if 'content' in delta:
+                    completion_response['content'] += delta['content']
+                    completion_response['timings']['predicted_n'] += 1
+                    print(f"DEBUG completion_response: {completion_response}")
         else:
             if expect_api_error is None or not expect_api_error:
                 assert response.status == 200
|
Loading…
Add table
Add a link
Reference in a new issue