server : refactor middleware and /health endpoint (#9056)
* server : refactor middleware and /health endpoint * move "fail_on_no_slot" to /slots * Update examples/server/server.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * fix server tests * fix CI * update server docs --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
d565bb2fd5
commit
8b3befc0e2
3 changed files with 177 additions and 217 deletions
|
@ -205,27 +205,20 @@ def step_start_server(context):
|
|||
async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
|
||||
match expecting_status:
|
||||
case 'healthy':
|
||||
await wait_for_health_status(context, context.base_url, 200, 'ok',
|
||||
timeout=30)
|
||||
await wait_for_slots_status(context, context.base_url, 200,
|
||||
timeout=30)
|
||||
|
||||
case 'ready' | 'idle':
|
||||
await wait_for_health_status(context, context.base_url, 200, 'ok',
|
||||
timeout=30,
|
||||
params={'fail_on_no_slot': 0, 'include_slots': 0},
|
||||
slots_idle=context.n_slots,
|
||||
slots_processing=0,
|
||||
expected_slots=[{'id': slot_id, 'state': 0}
|
||||
for slot_id in
|
||||
range(context.n_slots if context.n_slots else 1)])
|
||||
await wait_for_slots_status(context, context.base_url, 200,
|
||||
timeout=30,
|
||||
params={'fail_on_no_slot': 1},
|
||||
slots_idle=context.n_slots,
|
||||
slots_processing=0)
|
||||
case 'busy':
|
||||
await wait_for_health_status(context, context.base_url, 503,
|
||||
'no slot available',
|
||||
params={'fail_on_no_slot': 0, 'include_slots': 0},
|
||||
slots_idle=0,
|
||||
slots_processing=context.n_slots,
|
||||
expected_slots=[{'id': slot_id, 'state': 1}
|
||||
for slot_id in
|
||||
range(context.n_slots if context.n_slots else 1)])
|
||||
await wait_for_slots_status(context, context.base_url, 503,
|
||||
params={'fail_on_no_slot': 1},
|
||||
slots_idle=0,
|
||||
slots_processing=context.n_slots)
|
||||
case _:
|
||||
assert False, "unknown status"
|
||||
|
||||
|
@ -1187,17 +1180,15 @@ async def gather_tasks_results(context):
|
|||
return n_completions
|
||||
|
||||
|
||||
async def wait_for_health_status(context,
|
||||
base_url,
|
||||
expected_http_status_code,
|
||||
expected_health_status,
|
||||
timeout=3,
|
||||
params=None,
|
||||
slots_idle=None,
|
||||
slots_processing=None,
|
||||
expected_slots=None):
|
||||
async def wait_for_slots_status(context,
|
||||
base_url,
|
||||
expected_http_status_code,
|
||||
timeout=3,
|
||||
params=None,
|
||||
slots_idle=None,
|
||||
slots_processing=None):
|
||||
if context.debug:
|
||||
print(f"Starting checking for health for expected_health_status={expected_health_status}")
|
||||
print(f"Starting checking for health for expected_http_status_code={expected_http_status_code}")
|
||||
interval = 0.5
|
||||
counter = 0
|
||||
if 'GITHUB_ACTIONS' in os.environ:
|
||||
|
@ -1205,26 +1196,19 @@ async def wait_for_health_status(context,
|
|||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
while True:
|
||||
async with await session.get(f'{base_url}/health', params=params) as health_response:
|
||||
status_code = health_response.status
|
||||
health = await health_response.json()
|
||||
async with await session.get(f'{base_url}/slots', params=params) as slots_response:
|
||||
status_code = slots_response.status
|
||||
slots = await slots_response.json()
|
||||
if context.debug:
|
||||
print(f"HEALTH - response for expected health status='{expected_health_status}' on "
|
||||
f"'{base_url}/health'?{params} is {health}\n")
|
||||
if (status_code == expected_http_status_code
|
||||
and health['status'] == expected_health_status
|
||||
and (slots_idle is None or health['slots_idle'] == slots_idle)
|
||||
and (slots_processing is None or health['slots_processing'] == slots_processing)):
|
||||
if expected_slots is not None:
|
||||
assert_slots_status(health['slots'], expected_slots)
|
||||
return
|
||||
if (status_code == expected_http_status_code
|
||||
and health['status'] == expected_health_status
|
||||
and (slots_idle is None or health['slots_idle'] == slots_idle)
|
||||
and (slots_processing is None or health['slots_processing'] == slots_processing)):
|
||||
if expected_slots is not None:
|
||||
assert_slots_status(health['slots'], expected_slots)
|
||||
print(f"slots responses {slots}\n")
|
||||
if status_code == 503 and status_code == expected_http_status_code:
|
||||
return
|
||||
if status_code == 200 and status_code == expected_http_status_code:
|
||||
n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
|
||||
n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
|
||||
if ((slots_idle is None or slots_idle == n_slots_idle)
|
||||
and (slots_processing is None or slots_processing == n_slots_processing)):
|
||||
return
|
||||
await asyncio.sleep(interval)
|
||||
|
||||
counter += interval
|
||||
|
@ -1238,7 +1222,7 @@ async def wait_for_health_status(context,
|
|||
if n_completions > 0:
|
||||
return
|
||||
|
||||
assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}'
|
||||
assert False, f'slots check timeout exceeded {counter}s>={timeout}'
|
||||
|
||||
|
||||
def assert_embeddings(embeddings):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue