server: tests: slots endpoint checks
This commit is contained in:
parent
11adf1d864
commit
c355f76427
2 changed files with 60 additions and 7 deletions
|
@ -8,6 +8,7 @@ Feature: llama.cpp server
|
|||
Scenario: Health
|
||||
When the server is healthy
|
||||
Then the server is ready
|
||||
And all slots are idle
|
||||
|
||||
Scenario Outline: Completion
|
||||
Given a <prompt> completion request with maximum <n_predict> tokens
|
||||
|
@ -55,7 +56,9 @@ Feature: llama.cpp server
|
|||
"""
|
||||
Given concurrent completion requests
|
||||
Then the server is busy
|
||||
And all slots are busy
|
||||
Then the server is idle
|
||||
And all slots are idle
|
||||
Then all prompts are predicted
|
||||
|
||||
|
||||
|
@ -78,5 +81,7 @@ Feature: llama.cpp server
|
|||
"""
|
||||
Given concurrent OAI completions requests
|
||||
Then the server is busy
|
||||
And all slots are busy
|
||||
Then the server is idle
|
||||
And all slots are idle
|
||||
Then all prompts are predicted
|
|
@ -1,6 +1,5 @@
|
|||
import socket
|
||||
import threading
|
||||
import time
|
||||
from contextlib import closing
|
||||
|
||||
import openai
|
||||
|
@ -38,13 +37,46 @@ def step_wait_for_the_server_to_be_started(context, expecting_status):
|
|||
case 'healthy':
|
||||
wait_for_health_status(context, 200, 'ok')
|
||||
case 'ready' | 'idle':
|
||||
wait_for_health_status(context, 200, 'ok', params={'fail_on_no_slot': True})
|
||||
wait_for_health_status(context, 200, 'ok',
|
||||
params={'fail_on_no_slot': True},
|
||||
slots_idle=context.n_slots,
|
||||
slots_processing=0)
|
||||
request_slots_status(context, [
|
||||
{'id': 0, 'state': 0},
|
||||
{'id': 1, 'state': 0}
|
||||
])
|
||||
case 'busy':
|
||||
wait_for_health_status(context, 503, 'no slot available', params={'fail_on_no_slot': True})
|
||||
wait_for_health_status(context, 503, 'no slot available',
|
||||
params={'fail_on_no_slot': True},
|
||||
slots_idle=0,
|
||||
slots_processing=context.n_slots)
|
||||
request_slots_status(context, [
|
||||
{'id': 0, 'state': 1},
|
||||
{'id': 1, 'state': 1}
|
||||
])
|
||||
case _:
|
||||
assert False, "unknown status"
|
||||
|
||||
|
||||
@step(u'all slots are {expected_slot_status_string}')
def step_all_slots_status(context, expected_slot_status_string):
    """Assert that every server slot reports the given status ('idle' or 'busy')."""
    # Map the human-readable status word to the numeric state exposed by /slots.
    status_codes = {'idle': 0, 'busy': 1}
    assert expected_slot_status_string in status_codes, "unknown status"
    expected_slot_status = status_codes[expected_slot_status_string]

    # One expected entry per configured slot, all in the same state.
    expected_slots = [{'id': slot_id, 'state': expected_slot_status}
                      for slot_id in range(context.n_slots)]
    request_slots_status(context, expected_slots)
|
||||
|
||||
|
||||
@step(u'a {prompt} completion request with maximum {n_predict} tokens')
def step_request_completion(context, prompt, n_predict):
    """Behave step wrapper: forward the parsed prompt and token limit to request_completion."""
    request_completion(context, prompt, n_predict)
|
||||
|
@ -123,8 +155,7 @@ def request_completion(context, prompt, n_predict=None):
|
|||
"prompt": prompt,
|
||||
"n_predict": int(n_predict) if n_predict is not None else 4096,
|
||||
})
|
||||
status_code = response.status_code
|
||||
assert status_code == 200
|
||||
assert response.status_code == 200
|
||||
context.completions.append(response.json())
|
||||
|
||||
|
||||
|
@ -177,10 +208,27 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None):
|
|||
f' "{n_predicted}" <> "{expected_predicted_n}"')
|
||||
|
||||
|
||||
def wait_for_health_status(context, expected_http_status_code,
                           expected_health_status,
                           params=None,
                           slots_idle=None,
                           slots_processing=None,
                           timeout=30,
                           interval=0.125):
    """Poll the /health endpoint until it matches the expected state.

    Args:
        context: behave context; provides base_url of the server under test.
        expected_http_status_code: HTTP status code the endpoint must return.
        expected_health_status: value expected in the JSON 'status' field.
        params: optional query parameters (e.g. {'fail_on_no_slot': True}).
        slots_idle: if not None, required value of 'slots_idle' in the response.
        slots_processing: if not None, required value of 'slots_processing'.
        timeout: maximum seconds to keep polling before failing.
        interval: delay between polls, to avoid a tight busy-wait loop.

    Raises:
        AssertionError: if the expected state is not reached within timeout.
    """
    deadline = time.time() + timeout
    while True:
        health_response = requests.get(f'{context.base_url}/health', params)
        status_code = health_response.status_code
        health = health_response.json()
        if (status_code == expected_http_status_code
                and health['status'] == expected_health_status
                and (slots_idle is None or health['slots_idle'] == slots_idle)
                and (slots_processing is None or health['slots_processing'] == slots_processing)):
            break
        # The original looped forever with no sleep: an unreachable state hung
        # the test run and hammered the server. Fail loudly after the deadline.
        assert time.time() < deadline, (
            f"health status not reached within {timeout}s: "
            f"expected {expected_http_status_code}/{expected_health_status}, "
            f"got {status_code}/{health['status']}")
        time.sleep(interval)
|
||||
|
||||
|
||||
def request_slots_status(context, expected_slots):
    """Fetch /slots and assert each slot matches the expected key/value pairs.

    Only the keys present in each expected entry are compared, so callers may
    check a subset of slot fields (e.g. just 'id' and 'state').
    """
    response = requests.get(f'{context.base_url}/slots')
    assert response.status_code == 200
    actual_slots = response.json()
    assert len(actual_slots) == len(expected_slots)
    # Compare pairwise; expected entries drive which fields are verified.
    for expected, actual in zip(expected_slots, actual_slots):
        for key in expected:
            assert expected[key] == actual[key], f"expected[{key}] != slot[{key}]"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue