diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature index 07be39ef5..802d624ff 100644 --- a/examples/server/tests/features/parallel.feature +++ b/examples/server/tests/features/parallel.feature @@ -51,7 +51,7 @@ Feature: Parallel Examples: | streaming | n_predict | | disabled | 128 | - #| enabled | 64 | FIXME: phymbert: need to investigate why in aiohttp with streaming only one token is generated + | enabled | 64 | Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969 Given a prompt: diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 5e2b729eb..fda8aab8f 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -485,20 +485,25 @@ async def oai_chat_completions(user_prompt, assert response.status == 200 assert response.headers['Access-Control-Allow-Origin'] == origin assert response.headers['Content-Type'] == "text/event-stream" + event_received = True + while event_received: + event_received = False + async for line_in_bytes in response.content: + line = line_in_bytes.decode('utf8') + line = line.rstrip('\n').rstrip('\r') + if line == '': + continue + event_data = line.split(': ', 1) + assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```' + chunk_raw = event_data[1] - async for line_in_bytes in response.content: - line = line_in_bytes.decode('utf8') - event_data = line.split(': ', 1) - assert event_data[0] == 'data', f'{event_data}' - chunk_raw = event_data[1] - - chunk = json.loads(chunk_raw) - assert len(chunk['choices']) == 1 - delta = chunk['choices'][0]['delta'] - if 'content' in delta: - completion_response['content'] += delta['content'] - completion_response['timings']['predicted_n'] += 1 - print(f"DEBUG completion_response: {completion_response}") + chunk = json.loads(chunk_raw) + assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```" + delta = chunk['choices'][0]['delta'] + if 'content' in delta: + completion_response['content'] += delta['content'] + completion_response['timings']['predicted_n'] += 1 + print(f"DEBUG completion_response: {completion_response}") else: if expect_api_error is None or not expect_api_error: assert response.status == 200