server: test request cancellation (WIP)

commit 5f00747a90
parent 4dcb3ea943

2 changed files with 69 additions and 2 deletions
examples/server/tests/features/cancel.feature (new file, 43 lines)
@@ -0,0 +1,43 @@
+@llama.cpp
+@server
+Feature: Cancellation of llama.cpp server requests
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+    And 500 milliseconds delay in sampler for testing
+    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And a model file test-model.gguf
+    And a model alias tinyllama-2
+    And BOS token is 1
+    And 42 as server seed
+    # KV Cache corresponds to the total amount of tokens
+    # that can be stored across all independent sequences: #4130
+    # see --ctx-size and #5568
+    And 256 KV cache size
+    And 32 as batch size
+    And 1 slots
+    And 64 server max tokens to predict
+    Then the server is starting
+    Then the server is healthy
+
+  # Scenario: Health
+  #   Then the server is ready
+  #   And all slots are idle
+
+  @wip
+  Scenario Outline: Cancelling completion request frees up slot
+    Given a prompt:
+    """
+    Once upon
+    """
+    And 256 max tokens to predict
+    And 256 server max tokens to predict
+    And streaming is <enable_streaming>
+    And a completion request cancelled after 100 milliseconds
+    # And wait for 50 milliseconds
+    Then all slots are idle
+
+    Examples: Prompts
+      | enable_streaming |
+      | disabled         |
+      | enabled          |
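The @wip scenario above boils down to: start a completion, drop the connection part-way through, then check that the server frees the slot. A minimal standalone sketch of that flow, assuming a llama.cpp server is already running on localhost:8080 (the /completion and /health routes are the ones these tests target; the prompt, n_predict and the 100 ms delay are illustrative only):

import asyncio
import aiohttp


async def cancel_completion_after(millis: int) -> None:
    base_url = "http://localhost:8080"
    async with aiohttp.ClientSession() as session:
        # Start a streaming completion, then abandon it before it finishes.
        async with session.post(f"{base_url}/completion",
                                json={"prompt": "Once upon", "n_predict": 256, "stream": True}):
            await asyncio.sleep(millis / 1000)
        # Leaving the block without reading the body drops the connection; the
        # server should notice the disconnect, stop generating and free the slot.
        async with session.get(f"{base_url}/health") as health:
            assert health.status == 200


asyncio.run(cancel_completion_after(100))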
examples/server/tests/features/steps/steps.py
@@ -291,6 +291,25 @@ async def step_request_completion(context, api_error: Literal['raised'] | str):
         api_error_code = int(api_error)
         assert completion == api_error_code, f"completion must be an {api_error_code} status code: {completion}"
 
 
+@step('wait for {millis:d} milliseconds')
+@async_run_until_complete
+async def step_request_completion(context, millis: int):
+    await asyncio.sleep(millis / 1000.0)
+
+
+@step('a completion request cancelled after {disconnect_after_millis:d} milliseconds')
+@async_run_until_complete
+async def step_request_completion(context, disconnect_after_millis: int):
+    seeds = await completions_seed(context, num_seeds=1)
+    await request_completion(context.prompts.pop(),
+                             seeds[0] if seeds is not None else seeds,
+                             context.base_url,
+                             debug=context.debug,
+                             n_predict=context.n_predict,
+                             cache_prompt=context.cache_prompt,
+                             id_slot=context.id_slot,
+                             disconnect_after_millis=disconnect_after_millis,
+                             user_api_key=context.user_api_key,
+                             temperature=context.temperature)
+
+
 @step('{predicted_n:d} tokens are predicted matching {re_content}')
 def step_n_tokens_predicted_with_content(context, predicted_n, re_content):
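The {millis:d} and {disconnect_after_millis:d} placeholders in the new steps rely on behave's default parse-based matcher; a quick sketch of the conversion, assuming the parse package that behave depends on:

import parse

# ':d' converts the captured text to int, so the step function receives 50, not "50".
result = parse.parse('wait for {millis:d} milliseconds', 'wait for 50 milliseconds')
assert result is not None
assert result['millis'] == 50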
@@ -982,9 +1001,10 @@ async def request_completion(prompt,
                              id_slot=None,
                              expect_api_error=None,
                              user_api_key=None,
+                             disconnect_after_millis=None,
                              temperature=None) -> int | dict[str, Any]:
     if debug:
-        print(f"Sending completion request: {prompt}")
+        print(f"Sending completion request: {prompt} with n_predict={n_predict}")
     origin = "my.super.domain"
     headers = {
         'Origin': origin
@@ -1008,6 +1028,10 @@ async def request_completion(prompt,
                                     "n_probs": 2,
                                 },
                                 headers=headers) as response:
+            if disconnect_after_millis is not None:
+                await asyncio.sleep(disconnect_after_millis / 1000)
+                return 0
+
             if expect_api_error is None or not expect_api_error:
                 assert response.status == 200
                 assert response.headers['Access-Control-Allow-Origin'] == origin
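The early `return 0` above works because leaving the `async with session.post(...)` block before the streamed body is read releases the connection, which is exactly the client disconnect the server has to handle. A sketch of the same idea with an explicit close (an illustration under that assumption, not what the commit does; abort_request and its arguments are made up for the example):

import asyncio
import aiohttp


async def abort_request(url: str, payload: dict, after_millis: int) -> None:
    async with aiohttp.ClientSession() as session:
        response = await session.post(url, json=payload)
        try:
            await asyncio.sleep(after_millis / 1000)
        finally:
            # Drop the connection without reading the streamed body.
            response.close()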
@@ -1352,7 +1376,7 @@ async def request_slots_status(context, expected_slots):
 
 
 def assert_slots_status(slots, expected_slots):
-    assert len(slots) == len(expected_slots)
+    assert len(slots) == len(expected_slots), f'invalid number of slots: {len(slots)} (actual) != {len(expected_slots)} (expected)'
     for slot_id, (expected, slot) in enumerate(zip(expected_slots, slots)):
         for key in expected:
             assert expected[key] == slot[key], (f"invalid slot {slot_id}"
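With the extended assertion, a slot-count mismatch now reports both values; a self-contained illustration of the message format (the slot data here is made up):

# Mirrors the message format added to assert_slots_status above.
slots = [{"id": 0, "state": 0}]
expected_slots = [{"id": 0, "state": 0}, {"id": 1, "state": 0}]
assert len(slots) == len(expected_slots), \
    f'invalid number of slots: {len(slots)} (actual) != {len(expected_slots)} (expected)'
# AssertionError: invalid number of slots: 1 (actual) != 2 (expected)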