server: tests: slots endpoint checks
This commit is contained in:
parent
11adf1d864
commit
c355f76427
2 changed files with 60 additions and 7 deletions
|
@ -8,6 +8,7 @@ Feature: llama.cpp server
|
||||||
Scenario: Health
|
Scenario: Health
|
||||||
When the server is healthy
|
When the server is healthy
|
||||||
Then the server is ready
|
Then the server is ready
|
||||||
|
And all slots are idle
|
||||||
|
|
||||||
Scenario Outline: Completion
|
Scenario Outline: Completion
|
||||||
Given a <prompt> completion request with maximum <n_predict> tokens
|
Given a <prompt> completion request with maximum <n_predict> tokens
|
||||||
|
@ -55,7 +56,9 @@ Feature: llama.cpp server
|
||||||
"""
|
"""
|
||||||
Given concurrent completion requests
|
Given concurrent completion requests
|
||||||
Then the server is busy
|
Then the server is busy
|
||||||
|
And all slots are busy
|
||||||
Then the server is idle
|
Then the server is idle
|
||||||
|
And all slots are idle
|
||||||
Then all prompts are predicted
|
Then all prompts are predicted
|
||||||
|
|
||||||
|
|
||||||
|
@ -78,5 +81,7 @@ Feature: llama.cpp server
|
||||||
"""
|
"""
|
||||||
Given concurrent OAI completions requests
|
Given concurrent OAI completions requests
|
||||||
Then the server is busy
|
Then the server is busy
|
||||||
|
And all slots are busy
|
||||||
Then the server is idle
|
Then the server is idle
|
||||||
|
And all slots are idle
|
||||||
Then all prompts are predicted
|
Then all prompts are predicted
|
|
@ -1,6 +1,5 @@
|
||||||
import socket
|
import socket
|
||||||
import threading
|
import threading
|
||||||
import time
|
|
||||||
from contextlib import closing
|
from contextlib import closing
|
||||||
|
|
||||||
import openai
|
import openai
|
||||||
|
@ -38,13 +37,46 @@ def step_wait_for_the_server_to_be_started(context, expecting_status):
|
||||||
case 'healthy':
|
case 'healthy':
|
||||||
wait_for_health_status(context, 200, 'ok')
|
wait_for_health_status(context, 200, 'ok')
|
||||||
case 'ready' | 'idle':
|
case 'ready' | 'idle':
|
||||||
wait_for_health_status(context, 200, 'ok', params={'fail_on_no_slot': True})
|
wait_for_health_status(context, 200, 'ok',
|
||||||
|
params={'fail_on_no_slot': True},
|
||||||
|
slots_idle=context.n_slots,
|
||||||
|
slots_processing=0)
|
||||||
|
request_slots_status(context, [
|
||||||
|
{'id': 0, 'state': 0},
|
||||||
|
{'id': 1, 'state': 0}
|
||||||
|
])
|
||||||
case 'busy':
|
case 'busy':
|
||||||
wait_for_health_status(context, 503, 'no slot available', params={'fail_on_no_slot': True})
|
wait_for_health_status(context, 503, 'no slot available',
|
||||||
|
params={'fail_on_no_slot': True},
|
||||||
|
slots_idle=0,
|
||||||
|
slots_processing=context.n_slots)
|
||||||
|
request_slots_status(context, [
|
||||||
|
{'id': 0, 'state': 1},
|
||||||
|
{'id': 1, 'state': 1}
|
||||||
|
])
|
||||||
case _:
|
case _:
|
||||||
assert False, "unknown status"
|
assert False, "unknown status"
|
||||||
|
|
||||||
|
|
||||||
|
@step(u'all slots are {expected_slot_status_string}')
|
||||||
|
def step_all_slots_status(context, expected_slot_status_string):
|
||||||
|
match expected_slot_status_string:
|
||||||
|
case 'idle':
|
||||||
|
expected_slot_status = 0
|
||||||
|
case 'busy':
|
||||||
|
expected_slot_status = 1
|
||||||
|
case _:
|
||||||
|
assert False, "unknown status"
|
||||||
|
|
||||||
|
expected_slots = []
|
||||||
|
for slot_id in range(context.n_slots):
|
||||||
|
expected_slots.append({
|
||||||
|
'id': slot_id,
|
||||||
|
'state': expected_slot_status
|
||||||
|
})
|
||||||
|
request_slots_status(context, expected_slots)
|
||||||
|
|
||||||
|
|
||||||
@step(u'a {prompt} completion request with maximum {n_predict} tokens')
def step_request_completion(context, prompt, n_predict):
    """Issue a single completion request against the server.

    Thin behave-step wrapper: ``prompt`` and ``n_predict`` arrive as the
    raw strings captured from the step text and are forwarded unchanged
    to ``request_completion`` (which performs the int conversion).
    """
    request_completion(context, prompt, n_predict)
|
||||||
|
@ -123,8 +155,7 @@ def request_completion(context, prompt, n_predict=None):
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
"n_predict": int(n_predict) if n_predict is not None else 4096,
|
"n_predict": int(n_predict) if n_predict is not None else 4096,
|
||||||
})
|
})
|
||||||
status_code = response.status_code
|
assert response.status_code == 200
|
||||||
assert status_code == 200
|
|
||||||
context.completions.append(response.json())
|
context.completions.append(response.json())
|
||||||
|
|
||||||
|
|
||||||
|
@ -177,10 +208,27 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None):
|
||||||
f' "{n_predicted}" <> "{expected_predicted_n}"')
|
f' "{n_predicted}" <> "{expected_predicted_n}"')
|
||||||
|
|
||||||
|
|
||||||
def wait_for_health_status(context, expected_http_status_code,
                           expected_health_status,
                           params=None,
                           slots_idle=None,
                           slots_processing=None):
    """Poll the server's /health endpoint until it reports the expected state.

    Loops until the HTTP status code and the JSON ``status`` field match,
    and — when ``slots_idle`` / ``slots_processing`` are given — until the
    corresponding slot counters match as well.

    :param context: behave context carrying ``base_url``
    :param expected_http_status_code: HTTP code to wait for (e.g. 200, 503)
    :param expected_health_status: value of the JSON ``status`` field
    :param params: optional query params forwarded to the /health request
    :param slots_idle: expected ``slots_idle`` counter, or None to ignore
    :param slots_processing: expected ``slots_processing`` counter, or None

    NOTE(review): polls in a tight loop with no sleep or timeout — presumably
    acceptable for a local test server; confirm this is intentional.
    """
    while True:
        response = requests.get(f'{context.base_url}/health', params)
        health = response.json()

        # Stage the checks: base HTTP/status match first, then the
        # optional slot-counter constraints when requested.
        matched = (response.status_code == expected_http_status_code
                   and health['status'] == expected_health_status)
        if matched and slots_idle is not None:
            matched = health['slots_idle'] == slots_idle
        if matched and slots_processing is not None:
            matched = health['slots_processing'] == slots_processing
        if matched:
            return
|
||||||
|
|
||||||
|
|
||||||
|
def request_slots_status(context, expected_slots):
    """Fetch /slots and assert the reported slots match the expected ones.

    Compares the response pairwise against ``expected_slots``: the list
    lengths must agree, and for every key present in an expected record
    the actual slot must hold an equal value (extra keys in the actual
    slot are ignored).

    :param context: behave context carrying ``base_url``
    :param expected_slots: list of dicts, e.g. ``[{'id': 0, 'state': 0}]``
    """
    response = requests.get(f'{context.base_url}/slots')
    assert response.status_code == 200

    actual_slots = response.json()
    assert len(actual_slots) == len(expected_slots)

    for expected, actual in zip(expected_slots, actual_slots):
        for key in expected:
            assert expected[key] == actual[key], f"expected[{key}] != slot[{key}]"
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue