server: tests: fix concurrent OAI streaming request
parent 77b8589dbb
commit 71831494b1
2 changed files with 19 additions and 14 deletions
@@ -51,7 +51,7 @@ Feature: Parallel
     Examples:
       | streaming | n_predict |
       | disabled  | 128       |
-      #| enabled  | 64        | FIXME: phymbert: need to investigate why in aiohttp with streaming only one token is generated
+      | enabled   | 64        |
 
   Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
     Given a prompt:
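The Examples rows above parameterize a Scenario Outline: behave re-runs the scenario once per row. The sketch below shows how a row such as "| enabled | 64 |" can reach a step implementation; the step wording and context attributes are illustrative assumptions, not the repository's actual step code.

    # Hypothetical behave step, for illustration only: Scenario Outline
    # substitutes each Examples row (streaming, n_predict) into the step text,
    # so the parallel scenario now also runs with streaming enabled.
    from behave import step


    @step('{streaming} streaming with {n_predict:d} tokens to predict')
    def step_streaming(context, streaming, n_predict):
        context.enable_streaming = streaming == 'enabled'
        context.n_predict = n_predict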
@@ -485,20 +485,25 @@ async def oai_chat_completions(user_prompt,
                     assert response.status == 200
                     assert response.headers['Access-Control-Allow-Origin'] == origin
                     assert response.headers['Content-Type'] == "text/event-stream"
-                    async for line_in_bytes in response.content:
-                        line = line_in_bytes.decode('utf8')
-                        event_data = line.split(': ', 1)
-                        assert event_data[0] == 'data', f'{event_data}'
-                        chunk_raw = event_data[1]
-
-                        chunk = json.loads(chunk_raw)
-                        assert len(chunk['choices']) == 1
-                        delta = chunk['choices'][0]['delta']
-                        if 'content' in delta:
-                            completion_response['content'] += delta['content']
-                            completion_response['timings']['predicted_n'] += 1
-                            print(f"DEBUG completion_response: {completion_response}")
+                    event_received = True
+                    while event_received:
+                        event_received = False
+                        async for line_in_bytes in response.content:
+                            line = line_in_bytes.decode('utf8')
+                            line = line.rstrip('\n').rstrip('\r')
+                            if line == '':
+                                continue
+                            event_data = line.split(': ', 1)
+                            assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```'
+                            chunk_raw = event_data[1]
+
+                            chunk = json.loads(chunk_raw)
+                            assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```"
+                            delta = chunk['choices'][0]['delta']
+                            if 'content' in delta:
+                                completion_response['content'] += delta['content']
+                                completion_response['timings']['predicted_n'] += 1
+                                print(f"DEBUG completion_response: {completion_response}")
                 else:
                     if expect_api_error is None or not expect_api_error:
                         assert response.status == 200
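As a standalone illustration of the streaming pattern used above: the reader re-enters the read loop until a pass completes without receiving an event, strips the trailing CR/LF from each SSE line, and skips the blank separator lines before parsing the JSON payload. This is a minimal sketch, not the test-suite code: the URL, the payload shape, the '[DONE]' sentinel handling, and the re-arming of event_received after each parsed event are assumptions for illustration.

    import asyncio
    import json

    import aiohttp


    async def drain_oai_stream(url, prompt):
        # Assumed OAI-compatible chat endpoint and payload shape.
        payload = {'messages': [{'role': 'user', 'content': prompt}],
                   'stream': True}
        content, n_chunks = '', 0
        async with aiohttp.ClientSession() as session:
            async with session.post(url, json=payload) as response:
                assert response.status == 200
                event_received = True
                while event_received:
                    event_received = False
                    async for line_in_bytes in response.content:
                        line = line_in_bytes.decode('utf8').rstrip('\n').rstrip('\r')
                        if line == '':
                            continue  # blank line separating SSE events
                        field, _, value = line.partition(': ')
                        assert field == 'data', f'unexpected SSE field: {line}'
                        if value == '[DONE]':
                            break  # terminal sentinel, if the server sends one
                        event_received = True  # assumption: re-arm on every parsed event
                        chunk = json.loads(value)
                        delta = chunk['choices'][0]['delta']
                        if 'content' in delta:
                            content += delta['content']
                            n_chunks += 1
        return content, n_chunks


    # Example usage against a local server:
    # asyncio.run(drain_oai_stream('http://localhost:8080/v1/chat/completions', 'Hello'))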