From 8bb586bf066f1230e81a12cd9dcc812b9c4d75a4 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Tue, 20 Feb 2024 01:15:31 +0100
Subject: [PATCH] server: tests: add health check and concurrent request example

---
 examples/server/tests/features/server.feature |  22 ++-
 examples/server/tests/features/steps/steps.py | 142 ++++++++++++++++--
 examples/server/tests/tests.sh                |  16 +-
 3 files changed, 151 insertions(+), 29 deletions(-)

diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index 60d8de954..6ab35b2e7 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -1,5 +1,13 @@
 Feature: llama.cpp server
 
+  Background: The server is started and ready to accept prompts
+    When wait for the server to be started
+    Then wait for the server to be healthy
+
+  Scenario: Health endpoint
+    Given a health liveness probe
+    Then the server must be healthy
+
   Scenario Outline: run a completion request
     Given a prompt <prompt>
     When we request a completion
@@ -18,6 +26,14 @@ Feature: llama.cpp server
     Then the oai response contains completion tokens
 
     Examples: Prompts
-      | model | system_prompt | user_prompt |
-      | tinyllama-2 | You are ChatGPT. | Say hello |
-      | tinyllama-2 | You are a coding assistant. | Write the fibonacci function in c++ |
\ No newline at end of file
+      | model       | system_prompt               | user_prompt                         |
+      | tinyllama-2 | You are ChatGPT.            | Say hello                           |
+      | tinyllama-2 | You are a coding assistant. | Write the fibonacci function in c++ |
+
+
+  Scenario: Health endpoint during processing with concurrent requests
+    Given 2 slow concurrent prompts
+    Then wait for all slots processing
+    Then the server is overloaded
+    When wait for all slots idle
+    Then all prompts must be predicted
\ No newline at end of file
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index f2721097b..a9933a724 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -1,19 +1,77 @@
+import socket
+import threading
+import time
+from contextlib import closing
+
 import openai
 import requests
-from behave import *
+from behave import step
+from behave.api.async_step import async_run_until_complete
+
+base_fqdn = 'localhost'
+base_port = 8080
+base_url = f"http://{base_fqdn}:{base_port}"
 
 openai.api_key = 'llama.cpp'
-openai.api_base = "http://localhost:8080/v1/chat"
+openai.api_base = f"{base_url}/v1/chat"
+
+slow_prompt = 'say hello ' * 10
+fast_prompt = 'Write a joke'
+
+n_slots = 2  # must match --parallel in tests.sh
 
 
-@given(u'a prompt {prompt}')
+@step(u'wait for the server to be started')
+def step_wait_for_the_server_to_be_started(context):
+    server_started = False
+    while not server_started:
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+            result = sock.connect_ex((base_fqdn, base_port))
+            if result != 0:
+                print("server not ready: ", base_fqdn, base_port, result)
+                time.sleep(1)
+            else:
+                server_started = True
+
+
+@step(u'wait for the server to be healthy')
+def step_wait_for_the_server_to_be_healthy(context):
+    status_code = 500
+    while status_code != 200:
+        status_code = requests.get(f'{base_url}/health').status_code
+        if status_code != 200:
+            time.sleep(1)
+
+
+@step(u'a health liveness probe')
+def step_a_health_liveness_probe(context):
+    response = requests.get(f'{base_url}/health')
+    context.status_code = response.status_code
+    context.response_data = response.json()
+
+
+@step(u'the server must be healthy')
+def step_server_healthy(context):
+    assert context.status_code == 200
+    assert context.response_data['status'] == 'ok'
+
+
+@step(u'the server is overloaded')
+@async_run_until_complete()
+async def step_server_overloaded(context):
+    response = requests.get(f'{base_url}/health?fail_on_no_slot')
+    assert response.status_code == 503
+    assert response.json()['status'] == 'no slot available'
+
+
+@step(u'a prompt {prompt}')
 def step_prompt(context, prompt):
     context.prompt = prompt
 
 
-@when(u'we request a completion')
+@step(u'we request a completion')
 def step_request_completion(context):
-    response = requests.post('http://localhost:8080/completion', json={
+    response = requests.post(f'{base_url}/completion', json={
         "prompt": context.prompt
     })
     status_code = response.status_code
@@ -21,28 +79,27 @@ def step_request_completion(context):
     context.response_data = response.json()
 
 
-@then(u'tokens are predicted')
+@step(u'tokens are predicted')
 def step_request_completion(context):
-    assert len(context.response_data['content']) > 0
-    assert context.response_data['timings']['predicted_n'] > 0
+    assert_prompt_predicted(context.response_data)
 
 
-@given(u'a user prompt {user_prompt}')
+@step(u'a user prompt {user_prompt}')
 def step_user_prompt(context, user_prompt):
     context.user_prompt = user_prompt
 
 
-@given(u'a system prompt {system_prompt}')
+@step(u'a system prompt {system_prompt}')
 def step_system_prompt(context, system_prompt):
    context.system_prompt = system_prompt
 
 
-@given(u'a model {model}')
+@step(u'a model {model}')
 def step_model(context, model):
     context.model = model
 
 
-@when(u'we request the oai completions endpoint')
+@step(u'we request the oai completions endpoint')
 def step_oai_completions(context):
     context.chat_completion = openai.Completion.create(
         messages=[
@@ -59,8 +116,67 @@ def step_oai_completions(context):
     )
 
 
-@then(u'the oai response contains completion tokens')
+@step(u'the oai response contains completion tokens')
 def step_oai_response_has_completion_tokens(context):
     assert len(context.chat_completion.choices) == 1
     assert len(context.chat_completion.choices[0].message) > 0
     assert context.chat_completion.usage.completion_tokens > 0
+
+
+def concurrent_prompt(context, prompt):
+    response = requests.post(f'{base_url}/completion', json={
+        "prompt": prompt
+    })
+
+    context.concurrent_responses.append(response)
+
+
+@step(u'{n_prompt} {prompt_type} concurrent prompts')
+def step_n_concurrent_prompts(context, n_prompt, prompt_type):
+    prompt = fast_prompt
+    if prompt_type == 'slow':
+        prompt = slow_prompt
+    context.concurrent_responses = []
+    context.threads = []
+    for i in range(int(n_prompt)):
+        thread = threading.Thread(target=concurrent_prompt, args=(context, prompt))
+        thread.start()
+        context.threads.append(thread)
+
+
+def wait_for_slots_processing(context, expected_slots_processing):
+    while True:
+        health = requests.get(f'{base_url}/health').json()
+        if 'slots_processing' in health:  # FIXME when #5594 is merged
+            slots_processing = health['slots_processing']
+        else:
+            slots_processing = 0
+        if slots_processing == expected_slots_processing:
+            break
+        else:
+            time.sleep(0.2)
+
+
+@step(u'wait for all slots processing')
+def step_wait_for_all_slots_processing(context):
+    wait_for_slots_processing(context, n_slots)
+
+
+@step(u'wait for all slots idle')
+def step_wait_for_all_slots_idle(context):
+    wait_for_slots_processing(context, 0)
+
+
+@step(u'all prompts must be predicted')
+def step_all_prompts_must_be_predicted(context):
+    for thread in context.threads:
+        thread.join()
+    for response in context.concurrent_responses:
+        assert response.status_code == 200
+        response_data = response.json()
+        assert_prompt_predicted(response_data)
+
+
+def assert_prompt_predicted(response_data):
+    assert len(response_data['content']) > 0
+    assert response_data['timings']['predicted_n'] > 0
diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh
index d3d414cd3..01b2f5d4d 100755
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -6,6 +6,7 @@ then
   exit 1
 fi
 
+# kill the server at the end
 cleanup() {
   pkill -P $$
 }
@@ -20,9 +21,9 @@ set -eu
 ../../../build/bin/server \
   --model "$model_path" \
   --alias tinyllama-2 \
-  --ctx-size 64 \
+  --ctx-size 1024 \
   --parallel 2 \
-  --n-predict 32 \
+  --n-predict 1024 \
   --batch-size 32 \
   --threads 4 \
   --threads-batch 4 \
@@ -30,16 +31,5 @@ set -eu
   --cont-batching \
   "$@" &
 
-# Wait for the server to start
-max_attempts=30
-attempts=${max_attempts}
-until curl --silent --fail "http://localhost:8080/health" | jq -r '.status' | grep ok; do
-  attempts=$(( attempts - 1));
-  [ "${attempts}" -eq 0 ] && { echo "Server did not startup" >&2; exit 1; }
-  sleep_time=$(( (max_attempts - attempts) * 2 ))
-  echo "waiting for server to be ready ${sleep_time}s..."
-  sleep ${sleep_time}
-done
-
 # Start tests
 behave
\ No newline at end of file
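
A note on the startup wait: the new steps poll the TCP port and then /health with no upper bound, so a server that never comes up would hang the suite. For reviewers who want the same readiness check outside behave, here is a minimal standalone sketch with a timeout; BASE_URL, wait_until_healthy, and the 30 s budget are illustrative names and values, not part of this patch.

    import time

    import requests

    BASE_URL = "http://localhost:8080"  # matches base_url in steps.py


    def wait_until_healthy(timeout_s=30):
        # poll /health until the server reports {"status": "ok"}
        deadline = time.time() + timeout_s
        while time.time() < deadline:
            try:
                response = requests.get(f"{BASE_URL}/health")
                if response.status_code == 200 and response.json()["status"] == "ok":
                    return
            except requests.exceptions.ConnectionError:
                pass  # port not open yet
            time.sleep(1)
        raise TimeoutError("server did not become healthy in time")


    if __name__ == "__main__":
        wait_until_healthy()
        print("server is healthy")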
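
The concurrent scenario reduces to the pattern sketched below: occupy every slot with slow completions, observe /health?fail_on_no_slot answer 503 while they run, then check that each completion succeeded. This assumes the server started by tests.sh with --parallel 2; post_completion and N_SLOTS are illustrative names, and the short sleep is a crude stand-in for the slot polling the steps do via wait_for_slots_processing.

    import threading
    import time

    import requests

    BASE_URL = "http://localhost:8080"
    N_SLOTS = 2  # matches --parallel 2 in tests.sh

    responses = []


    def post_completion(prompt):
        # blocking POST issued from a worker thread; list.append is thread-safe
        responses.append(requests.post(f"{BASE_URL}/completion", json={"prompt": prompt}))


    threads = [threading.Thread(target=post_completion, args=("say hello " * 10,))
               for _ in range(N_SLOTS)]
    for thread in threads:
        thread.start()

    time.sleep(0.5)  # crude: give the requests time to occupy both slots

    # with every slot busy, the health endpoint reports no slot available
    health = requests.get(f"{BASE_URL}/health?fail_on_no_slot")
    assert health.status_code == 503

    for thread in threads:
        thread.join()
    for response in responses:
        assert response.status_code == 200
        assert response.json()["timings"]["predicted_n"] > 0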