diff --git a/.github/workflows/server-test.yml b/.github/workflows/server.yml similarity index 100% rename from .github/workflows/server-test.yml rename to .github/workflows/server.yml diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index 6ebff8674..8f708f8ab 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -12,8 +12,9 @@ Server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_ 3. Start the test: `./tests.sh` It's possible to override some scenario steps values with environment variables: - - `$PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080` - - `$LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server` + - `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080` + - `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server` + - `DEBUG` -> "ON" to enable server verbose mode `--verbose` ### Run @bug, @wip or @wrong_usage annotated scenario @@ -23,4 +24,4 @@ Feature or Scenario must be annotated with `@llama.cpp` to be included in the de - `@wip` to focus on a scenario working in progress To run a scenario annotated with `@bug`, start: -`./tests.sh --tags bug` \ No newline at end of file +`DEBUG=ON ./tests.sh --no-skipped --tags bug` \ No newline at end of file diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py index 690571d23..13cc84101 100644 --- a/examples/server/tests/features/environment.py +++ b/examples/server/tests/features/environment.py @@ -24,7 +24,7 @@ def after_scenario(context, scenario): for line in f: print(line) if not is_server_listening(context.server_fqdn, context.server_port): - print("ERROR: Server has crashed") + print("\x1b[33;101mERROR: Server stopped listening\x1b[0m") if not pid_exists(context.server_process.pid): assert False, f"Server not running pid={context.server_process.pid} ..." @@ -41,7 +41,7 @@ def after_scenario(context, scenario): time.sleep(0.1) attempts += 1 if attempts > 5: - print(f"Server dandling exits, killing all {context.server_path} ...") + print(f"Server dangling exits, killing all {context.server_path} ...") process = subprocess.run(['killall', '-9', context.server_path], stderr=subprocess.PIPE, universal_newlines=True) diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature new file mode 100644 index 000000000..542006d9a --- /dev/null +++ b/examples/server/tests/features/issues.feature @@ -0,0 +1,36 @@ +# List of ongoing issues +@bug +Feature: Issues + # Issue #5655 + Scenario: Multi users embeddings + Given a server listening on localhost:8080 + And a model file stories260K.gguf + And a model alias tinyllama-2 + And 42 as server seed + And 64 KV cache size + And 2 slots + And continuous batching + And embeddings extraction + Then the server is starting + Then the server is healthy + + Given a prompt: + """ + Write a very long story about AI. + """ + And a prompt: + """ + Write another very long music lyrics. + """ + And a prompt: + """ + Write a very long poem. + """ + And a prompt: + """ + Write a very long joke. + """ + Given concurrent embedding requests + Then the server is busy + Then the server is idle + Then all embeddings are generated diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index c8be94eb8..5e2b729eb 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -35,8 +35,8 @@ def step_server_config(context, server_fqdn, server_port): context.server_seed = None context.user_api_key = None - context.completions = [] - context.concurrent_completion_tasks = [] + context.tasks_result = [] + context.concurrent_tasks = [] context.prompts = [] @@ -149,7 +149,7 @@ async def step_request_completion(context, api_error): server_seed=context.server_seed, expect_api_error=expect_api_error, user_api_key=context.user_api_key) - context.completions.append(completion) + context.tasks_result.append(completion) print(f"Completion response: {completion}") if expect_api_error: assert completion == 401, f"completion must be an 401 status code: {completion}" @@ -157,12 +157,12 @@ async def step_request_completion(context, api_error): @step(u'{predicted_n} tokens are predicted matching {re_content}') def step_n_tokens_predicted_with_content(context, predicted_n, re_content): - assert_n_tokens_predicted(context.completions.pop(), int(predicted_n), re_content) + assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n), re_content) @step(u'{predicted_n} tokens are predicted') def step_n_tokens_predicted(context, predicted_n): - assert_n_tokens_predicted(context.completions.pop(), int(predicted_n)) + assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n)) @step(u'a user prompt {user_prompt}') @@ -195,13 +195,13 @@ def step_user_api_key(context, user_api_key): context.user_api_key = user_api_key -@step(u'a user api key ') +@step(u'no user api key') def step_no_user_api_key(context): context.user_api_key = None -@step(u'no user api key') -def step_no_user_api_key(context): +@step(u'a user api key ') +def step_no_user_api_key_space(context): context.user_api_key = None @@ -234,7 +234,7 @@ async def step_oai_chat_completions(context, api_error): if hasattr(context, 'user_api_key') else None, expect_api_error=expect_api_error) - context.completions.append(completion) + context.tasks_result.append(completion) print(f"Completion response: {completion}") if expect_api_error: assert completion == 401, f"completion must be an 401 status code: {completion}" @@ -285,47 +285,38 @@ async def step_oai_chat_completions(context): if hasattr(context, 'user_api_key') else None) -@async_run_until_complete @step(u'all prompts are predicted') -async def step_impl(context): +@async_run_until_complete +async def step_all_prompts_are_predicted(context): await all_prompts_are_predicted(context) @step(u'all prompts are predicted with {n_predict} tokens') @async_run_until_complete -async def step_all_prompts_are_predicted(context, n_predict): +async def step_all_prompts_are_predicted_with_n_tokens(context, n_predict): expected_predicted_n = int(n_predict) await all_prompts_are_predicted(context, expected_predicted_n) async def all_prompts_are_predicted(context, expected_predicted_n=None): - n_completions = await gather_concurrent_completions_tasks(context) + n_completions = await gather_tasks_results(context) assert n_completions > 0 for i in range(n_completions): - assert_n_tokens_predicted(context.completions.pop(), expected_predicted_n=expected_predicted_n) + assert_n_tokens_predicted(context.tasks_result.pop(), expected_predicted_n=expected_predicted_n) + assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests" @step(u'embeddings are computed for') @async_run_until_complete async def step_compute_embedding(context): - async with aiohttp.ClientSession() as session: - async with session.post(f'{context.base_url}/embedding', - json={ - "content": context.text, - }) as response: - assert response.status == 200 - response_json = await response.json() - context.embeddings = response_json['embedding'] + content = context.text + base_url = context.base_url + context.embeddings = await request_embedding(content, base_url) @step(u'embeddings are generated') -def step_compute_embeddings(context): - assert len(context.embeddings) > 0 - embeddings_computed = False - for emb in context.embeddings: - if emb != 0: - embeddings_computed = True - assert embeddings_computed, f"Embeddings: {context.embeddings}" +def step_assert_embeddings(context): + assert_embeddings(context.embeddings) @step(u'an OAI compatible embeddings computation request for') @@ -341,6 +332,24 @@ def step_oai_compute_embedding(context): context.embeddings = embeddings +@step(u'concurrent embedding requests') +@async_run_until_complete() +async def step_concurrent_embedding_requests(context): + await concurrent_completion_requests(context, + request_embedding, + # prompt is inserted automatically + context.base_url) + + +@step(u'all embeddings are generated') +@async_run_until_complete() +async def all_embeddings_are_generated(context): + n_embedding_requests = await gather_tasks_results(context) + assert n_embedding_requests > 0 + for i in range(n_embedding_requests): + assert_embeddings(context.tasks_result.pop()) + + @step(u'tokenizing') @async_run_until_complete async def step_tokenize(context): @@ -391,7 +400,7 @@ async def concurrent_completion_requests(context, f_completion, *args, **kwargs) assert n_prompts > 0 for prompt_no in range(n_prompts): shifted_args = [context.prompts.pop(), *args] - context.concurrent_completion_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs))) + context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs))) await asyncio.sleep(0.1) @@ -540,6 +549,17 @@ async def oai_chat_completions(user_prompt, return completion_response +async def request_embedding(content, base_url): + async with aiohttp.ClientSession() as session: + async with session.post(f'{base_url}/embedding', + json={ + "content": content, + }) as response: + assert response.status == 200 + response_json = await response.json() + return response_json['embedding'] + + def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None): content = completion_response['content'] n_predicted = completion_response['timings']['predicted_n'] @@ -554,12 +574,12 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re f' ```\n{content}\n``` do not match /{re_content}/') -async def gather_concurrent_completions_tasks(context): - n_completion_tasks = len(context.concurrent_completion_tasks) - print(f"Waiting for all {n_completion_tasks} completion responses...") - for task_no in range(n_completion_tasks): - context.completions.append(await context.concurrent_completion_tasks.pop()) - n_completions = len(context.completions) +async def gather_tasks_results(context): + n_tasks = len(context.concurrent_tasks) + print(f"Waiting for all {n_tasks} tasks results...") + for task_no in range(n_tasks): + context.tasks_result.append(await context.concurrent_tasks.pop()) + n_completions = len(context.tasks_result) return n_completions @@ -602,16 +622,25 @@ async def wait_for_health_status(context, if counter >= timeout: # Sometimes health requests are triggered after completions are predicted if expected_http_status_code == 503: - if len(context.completions) == 0: - print("\x1b[5;37;43mWARNING: forcing concurrents completions tasks," + if len(context.tasks_result) == 0: + print("\x1b[5;37;43mWARNING: forcing concurrent tasks," " busy health check missed, probably too fast inference\x1b[0m") - n_completions = await gather_concurrent_completions_tasks(context) + n_completions = await gather_tasks_results(context) if n_completions > 0: return assert False, 'timeout exceeded' +def assert_embeddings(embeddings): + assert len(embeddings) > 0 + embeddings_computed = False + for emb in embeddings: + if emb != 0: + embeddings_computed = True + assert embeddings_computed, f"Embeddings: {embeddings}" + + async def request_slots_status(context, expected_slots): async with aiohttp.ClientSession() as session: async with await session.get(f'{context.base_url}/slots') as slots_response: @@ -652,6 +681,8 @@ def start_server_background(context): server_args.extend(['--n-predict', context.n_server_predict]) if context.server_api_key is not None: server_args.extend(['--api-key', context.server_api_key]) + if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': + server_args.append('--verbose') print(f"starting server with: {context.server_path}", *server_args) context.server_process = subprocess.Popen( [str(arg) for arg in [context.server_path, *server_args]], diff --git a/examples/server/tests/features/wrong_usage.feature b/examples/server/tests/features/wrong_usages.feature similarity index 77% rename from examples/server/tests/features/wrong_usage.feature rename to examples/server/tests/features/wrong_usages.feature index 59098a290..e228b2371 100644 --- a/examples/server/tests/features/wrong_usage.feature +++ b/examples/server/tests/features/wrong_usages.feature @@ -4,12 +4,10 @@ Feature: Wrong usage of llama.cpp server #3969 The user must always set --n-predict option # to cap the number of tokens any completion request can generate - # or pass n_predict or max_tokens in the request. + # or pass n_predict/max_tokens in the request. Scenario: Infinite loop Given a server listening on localhost:8080 And a model file stories260K.gguf - And 1 slots - And 32 KV cache size # Uncomment below to fix the issue #And 64 server max tokens to predict Then the server is starting @@ -17,6 +15,7 @@ Feature: Wrong usage of llama.cpp server """ Go to: infinite loop """ + # Uncomment below to fix the issue + #And 128 max tokens to predict Given concurrent completion requests - - Then all prompts are predicted \ No newline at end of file + Then all prompts are predicted diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh index a2e29748d..17a4e6fc6 100755 --- a/examples/server/tests/tests.sh +++ b/examples/server/tests/tests.sh @@ -5,7 +5,7 @@ set -eu if [ $# -lt 1 ] then # Start @llama.cpp scenario - behave --summary --stop --no-capture --tags llama.cpp + behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp else behave "$@" fi