From 672d98f6f0acee9f93bf74e44a032eee5942ff5a Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Wed, 21 Feb 2024 01:49:39 +0100
Subject: [PATCH] server: tests: CORS and api key checks scenario

---
 examples/server/tests/features/server.feature |  40 ++++--
 examples/server/tests/features/steps/steps.py | 135 ++++++++++++------
 examples/server/tests/tests.sh                |   1 +
 3 files changed, 125 insertions(+), 51 deletions(-)

diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index d6894ae5f..44c676303 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -1,7 +1,7 @@
 Feature: llama.cpp server
 
   Background: Server startup
-    Given a server listening on localhost:8080 with 2 slots and 42 as seed
+    Given a server listening on localhost:8080 with 2 slots, 42 as seed and llama.cpp as api key
     Then the server is starting
     Then the server is healthy
 
@@ -13,13 +13,17 @@ Feature: llama.cpp server
 
   @llama.cpp
   Scenario Outline: Completion
-    Given a <prompt> completion request with maximum <n_predict> tokens
+    Given a prompt <prompt>
+    And a user api key <api_key>
+    And <n_predict> max tokens to predict
+    And a completion request
     Then <n_predict> tokens are predicted
 
     Examples: Prompts
-      | prompt                           | n_predict |
-      | I believe the meaning of life is | 128       |
-      | Write a joke about AI            | 512       |
+      | prompt                           | n_predict | api_key   |
+      | I believe the meaning of life is | 128       | llama.cpp |
+      | Write a joke about AI            | 512       | llama.cpp |
+      | say goodbye                      | 0         |           |
 
   @llama.cpp
   Scenario Outline: OAI Compatibility
@@ -28,13 +32,15 @@ Feature: llama.cpp server
     And a model <model>
     And <max_tokens> max tokens to predict
     And streaming is <enable_streaming>
-    Given an OAI compatible chat completions request
+    And a user api key <api_key>
+    Given an OAI compatible chat completions request with an api error <api_error>
     Then <max_tokens> tokens are predicted
 
     Examples: Prompts
-      | model        | system_prompt               | user_prompt                          | max_tokens | enable_streaming |
-      | llama-2      | You are ChatGPT.            | Say hello.                           | 64         | false            |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 512        | true             |
+      | model        | system_prompt               | user_prompt                          | max_tokens | enable_streaming | api_key   | api_error |
+      | llama-2      | You are ChatGPT.            | Say hello.                           | 64         | false            | llama.cpp | none      |
+      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 512        | true             | llama.cpp | none      |
+      | John-Doe     | You are a hacker.           | Write segfault code in rust.         | 0          | true             | hackme    | raised    |
 
   @llama.cpp
   Scenario: Multi users
@@ -47,6 +53,7 @@ Feature: llama.cpp server
       Write another very long music lyrics.
       """
     And 32 max tokens to predict
+    And a user api key llama.cpp
     Given concurrent completion requests
     Then the server is busy
     And all slots are busy
@@ -57,7 +64,7 @@ Feature: llama.cpp server
   @llama.cpp
   Scenario: Multi users OAI Compatibility
     Given a system prompt "You are an AI assistant."
-    And a model tinyllama-2
+    And a model tinyllama-2
     Given a prompt:
       """
       Write a very long story about AI.
@@ -68,6 +75,7 @@ Feature: llama.cpp server
       """
     And 32 max tokens to predict
     And streaming is enabled
+    And a user api key llama.cpp
     Given concurrent OAI completions requests
     Then the server is busy
     And all slots are busy
@@ -126,3 +134,15 @@
       """
     Then tokens can be detokenize
 
+  @llama.cpp
+  Scenario Outline: CORS Options
+    When an OPTIONS request is sent from <origin>
+    Then CORS header <cors_header> is set to <cors_header_value>
+
+    Examples: Headers
+      | origin          | cors_header                      | cors_header_value |
+      | localhost       | Access-Control-Allow-Origin      | localhost         |
+      | web.mydomain.fr | Access-Control-Allow-Origin      | web.mydomain.fr   |
+      | origin          | Access-Control-Allow-Credentials | true              |
+      | web.mydomain.fr | Access-Control-Allow-Methods     | POST              |
+      | web.mydomain.fr | Access-Control-Allow-Headers     | *                 |
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index e1c69c11b..edba61777 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -7,8 +7,9 @@ import requests
 from behave import step
 
 
-@step(u"a server listening on {server_fqdn}:{server_port} with {n_slots} slots and {seed} as seed")
-def step_server_config(context, server_fqdn, server_port, n_slots, seed):
+@step(
+    u"a server listening on {server_fqdn}:{server_port} with {n_slots} slots, {seed} as seed and {api_key} as api key")
+def step_server_config(context, server_fqdn, server_port, n_slots, seed, api_key):
     context.server_fqdn = server_fqdn
     context.server_port = int(server_port)
     context.n_slots = int(n_slots)
@@ -19,7 +20,8 @@ def step_server_config(context, server_fqdn, server_port, n_slots, seed):
     context.completion_threads = []
     context.prompts = []
 
-    openai.api_key = 'llama.cpp'
+    context.api_key = api_key
+    openai.api_key = context.api_key
 
 
 @step(u"the server is {expecting_status}")
@@ -77,14 +79,16 @@ def step_all_slots_status(context, expected_slot_status_string):
     request_slots_status(context, expected_slots)
 
 
-@step(u'a {prompt} completion request with maximum {n_predict} tokens')
-def step_request_completion(context, prompt, n_predict):
-    request_completion(context, prompt, n_predict)
+@step(u'a completion request')
+def step_request_completion(context):
+    request_completion(context, context.prompts.pop(), context.n_predict, context.user_api_key)
+    context.user_api_key = None
 
 
 @step(u'{predicted_n} tokens are predicted')
 def step_n_tokens_predicted(context, predicted_n):
-    assert_n_tokens_predicted(context.completions[0], int(predicted_n))
+    if int(predicted_n) > 0:
+        assert_n_tokens_predicted(context.completions[0], int(predicted_n))
 
 
 @step(u'a user prompt {user_prompt}')
@@ -112,9 +116,20 @@ def step_streaming(context, enable_streaming):
     context.enable_streaming = enable_streaming == 'enabled' or bool(enable_streaming)
 
 
-@step(u'an OAI compatible chat completions request')
-def step_oai_chat_completions(context):
-    oai_chat_completions(context, context.user_prompt)
+@step(u'a user api key {user_api_key}')
+def step_user_api_key(context, user_api_key):
+    context.user_api_key = user_api_key
+
+
+@step(u'a user api key ')
+def step_no_user_api_key(context):
+    context.user_api_key = None
+
+
+@step(u'an OAI compatible chat completions request with an api error {api_error}')
+def step_oai_chat_completions(context, api_error):
+    oai_chat_completions(context, context.user_prompt, api_error=api_error == 'raised')
+    context.user_api_key = None
 
 
 @step(u'a prompt')
@@ -122,14 +137,19 @@ def step_a_prompt(context):
     context.prompts.append(context.text)
 
 
+@step(u'a prompt {prompt}')
+def step_a_prompt_prompt(context, prompt):
+    context.prompts.append(prompt)
+
+
 @step(u'concurrent completion requests')
 def step_concurrent_completion_requests(context):
-    concurrent_requests(context, request_completion)
+    concurrent_requests(context, request_completion, context.n_predict, context.user_api_key)
 
 
 @step(u'concurrent OAI completions requests')
 def step_oai_chat_completions(context):
-    concurrent_requests(context, oai_chat_completions)
+    concurrent_requests(context, oai_chat_completions, context.user_api_key)
 
 
 @step(u'all prompts are predicted')
@@ -168,7 +188,7 @@ def step_oai_compute_embedding(context):
 def step_tokenize(context):
     context.tokenized_text = context.text
     response = requests.post(f'{context.base_url}/tokenize', json={
-        "content":context.tokenized_text,
+        "content": context.tokenized_text,
     })
     assert response.status_code == 200
     context.tokens = response.json()['tokens']
@@ -181,49 +201,82 @@ def step_detokenize(context):
         "tokens": context.tokens,
     })
     assert response.status_code == 200
-    print(response.json())
     # FIXME the detokenize answer contains a space prefix ? see #3287
     assert context.tokenized_text == response.json()['content'].strip()
 
 
-def concurrent_requests(context, f_completion):
+@step(u'an OPTIONS request is sent from {origin}')
+def step_options_request(context, origin):
+    options_response = requests.options(f'{context.base_url}/v1/chat/completions',
+                                        headers={"Origin": origin})
+    assert options_response.status_code == 200
+    context.options_response = options_response
+
+
+@step(u'CORS header {cors_header} is set to {cors_header_value}')
+def step_check_options_header_value(context, cors_header, cors_header_value):
+    assert context.options_response.headers[cors_header] == cors_header_value
+
+
+def concurrent_requests(context, f_completion, *argv):
     context.completions.clear()
     context.completion_threads.clear()
     for prompt in context.prompts:
-        completion_thread = threading.Thread(target=f_completion, args=(context, prompt))
+        completion_thread = threading.Thread(target=f_completion, args=(context, prompt, *argv))
         completion_thread.start()
         context.completion_threads.append(completion_thread)
     context.prompts.clear()
 
 
-def request_completion(context, prompt, n_predict=None):
-    response = requests.post(f'{context.base_url}/completion', json={
-        "prompt": prompt,
-        "n_predict": int(n_predict) if n_predict is not None else context.n_predict,
-        "seed": context.seed
-    })
-    assert response.status_code == 200
-    context.completions.append(response.json())
+def request_completion(context, prompt, n_predict=None, user_api_key=None):
+    origin = "my.super.domain"
+    headers = {
+        'Origin': origin
+    }
+    if user_api_key is not None:
+        headers['Authorization'] = f'Bearer {user_api_key}'
+
+    response = requests.post(f'{context.base_url}/completion',
+                             json={
+                                 "prompt": prompt,
+                                 "n_predict": int(n_predict) if n_predict is not None else context.n_predict,
+                                 "seed": context.seed
+                             },
+                             headers=headers)
+    if n_predict is not None and n_predict > 0:
+        assert response.status_code == 200
+        assert response.headers['Access-Control-Allow-Origin'] == origin
+        context.completions.append(response.json())
+    else:
+        assert response.status_code == 401
 
 
-def oai_chat_completions(context, user_prompt):
+
+def oai_chat_completions(context, user_prompt, api_error=None):
+    openai.api_key = context.user_api_key
     openai.api_base = f'{context.base_url}/v1/chat'
-    chat_completion = openai.Completion.create(
-        messages=[
-            {
-                "role": "system",
-                "content": context.system_prompt,
-            },
-            {
-                "role": "user",
-                "content": user_prompt,
-            }
-        ],
-        model=context.model,
-        max_tokens=context.n_predict,
-        stream=context.enable_streaming,
-        seed=context.seed
-    )
+    try:
+        chat_completion = openai.Completion.create(
+            messages=[
+                {
+                    "role": "system",
+                    "content": context.system_prompt,
+                },
+                {
+                    "role": "user",
+                    "content": user_prompt,
+                }
+            ],
+            model=context.model,
+            max_tokens=context.n_predict,
+            stream=context.enable_streaming,
+            seed=context.seed
+        )
+    except openai.error.APIError:
+        if api_error:
+            openai.api_key = context.api_key
+            return
+    openai.api_key = context.api_key
     if context.enable_streaming:
         completion_response = {
             'content': '',
diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh
index 52908b839..19cd7f17b 100755
--- a/examples/server/tests/tests.sh
+++ b/examples/server/tests/tests.sh
@@ -29,6 +29,7 @@ set -eu
   --threads-batch 4 \
   --embedding \
   --cont-batching \
+  --api-key llama.cpp \
   "$@" &
 
 # Start tests
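
The new scenarios boil down to three HTTP-level checks. Below is a minimal sketch (not part of the patch) that reproduces them by hand with requests, assuming the server has been started through examples/server/tests/tests.sh so that it listens on localhost:8080 with --api-key llama.cpp; the prompts and the web.mydomain.fr origin are arbitrary test values.

import requests

base_url = 'http://localhost:8080'

# 1. A completion sent with the configured api key as a Bearer token is accepted.
response = requests.post(f'{base_url}/completion',
                         json={"prompt": "I believe the meaning of life is", "n_predict": 16},
                         headers={"Authorization": "Bearer llama.cpp"})
assert response.status_code == 200

# 2. The same endpoint rejects a request carrying a wrong api key.
response = requests.post(f'{base_url}/completion',
                         json={"prompt": "say goodbye", "n_predict": 0},
                         headers={"Authorization": "Bearer hackme"})
assert response.status_code == 401

# 3. A CORS preflight echoes the requesting origin back in Access-Control-Allow-Origin.
response = requests.options(f'{base_url}/v1/chat/completions',
                            headers={"Origin": "web.mydomain.fr"})
assert response.status_code == 200
assert response.headers['Access-Control-Allow-Origin'] == 'web.mydomain.fr'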