server : simplify state machine for slot (#9283)
* server : simplify state machine for slot * add SLOT_STATE_DONE_PROMPT * pop_deferred_task * add missing notify_one * fix passkey test * metrics : add n_busy_slots_per_decode * fix test step * add test * maybe fix AddressSanitizer? * fix deque ? * missing lock * pop_deferred_task: also notify * Update examples/server/server.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
134bc38ecf
commit
9b2c24c099
4 changed files with 147 additions and 93 deletions
|
@ -77,6 +77,35 @@ Feature: Parallel
|
|||
| disabled | 128 |
|
||||
| enabled | 64 |
|
||||
|
||||
Scenario Outline: Multi users with number of prompts exceeding number of slots
|
||||
Given a system prompt You are a writer.
|
||||
And a model tinyllama-2
|
||||
Given a prompt:
|
||||
"""
|
||||
Write a very long book.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
Write another a poem.
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
What is LLM?
|
||||
"""
|
||||
And a prompt:
|
||||
"""
|
||||
The sky is blue and I love it.
|
||||
"""
|
||||
And <n_predict> max tokens to predict
|
||||
And streaming is <streaming>
|
||||
Given concurrent OAI completions requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all prompts are predicted with <n_predict> tokens
|
||||
Examples:
|
||||
| streaming | n_predict |
|
||||
| disabled | 128 |
|
||||
| enabled | 64 |
|
||||
|
||||
Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
|
||||
Given a prompt:
|
||||
|
|
|
@ -15,6 +15,7 @@ Feature: Passkey / Self-extend with context shift
|
|||
And <n_junk> as number of junk
|
||||
And <n_predicted> server max tokens to predict
|
||||
And 42 as seed
|
||||
And 0.0 temperature
|
||||
And <n_ctx> KV cache size
|
||||
And 1 slots
|
||||
And <n_ga> group attention factor to extend context size through self-extend
|
||||
|
@ -22,7 +23,8 @@ Feature: Passkey / Self-extend with context shift
|
|||
# Can be override with N_GPU_LAYERS
|
||||
And <ngl> GPU offloaded layers
|
||||
Then the server is starting
|
||||
Then the server is healthy
|
||||
# Higher timeout because the model may need to be downloaded from the internet
|
||||
Then the server is healthy with timeout 120 seconds
|
||||
Given available models
|
||||
Then model 0 is trained on <n_ctx_train> tokens context
|
||||
Given a prefix prompt:
|
||||
|
|
|
@ -202,17 +202,15 @@ def step_start_server(context):
|
|||
time.sleep(0.1)
|
||||
|
||||
|
||||
@step("the server is {expecting_status}")
|
||||
@async_run_until_complete
|
||||
async def step_wait_for_the_server_to_be_started(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
|
||||
async def wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
|
||||
match expecting_status:
|
||||
case 'healthy':
|
||||
await wait_for_slots_status(context, context.base_url, 200,
|
||||
timeout=30)
|
||||
timeout=timeout)
|
||||
|
||||
case 'ready' | 'idle':
|
||||
await wait_for_slots_status(context, context.base_url, 200,
|
||||
timeout=30,
|
||||
timeout=timeout,
|
||||
params={'fail_on_no_slot': 1},
|
||||
slots_idle=context.n_slots,
|
||||
slots_processing=0)
|
||||
|
@ -225,6 +223,18 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status: Lite
|
|||
assert False, "unknown status"
|
||||
|
||||
|
||||
@step("the server is {expecting_status} with timeout {timeout:d} seconds")
|
||||
@async_run_until_complete
|
||||
async def step_wait_for_server_status_with_timeout(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str, timeout: int):
|
||||
await wait_for_server_status_with_timeout(context, expecting_status, timeout)
|
||||
|
||||
|
||||
@step("the server is {expecting_status}")
|
||||
@async_run_until_complete
|
||||
async def step_wait_for_server_status(context, expecting_status: Literal['healthy', 'ready', 'idle', 'busy'] | str):
|
||||
await wait_for_server_status_with_timeout(context, expecting_status, 30)
|
||||
|
||||
|
||||
@step('all slots are {expected_slot_status_string}')
|
||||
@async_run_until_complete
|
||||
async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue