server: tests: CORS and api key checks scenario

Author: Pierrick HYMBERT, 2024-02-21 01:49:39 +01:00
parent 6dcbcfe6ba
commit 672d98f6f0
3 changed files with 125 additions and 51 deletions


@@ -1,7 +1,7 @@
Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080 with 2 slots and 42 as seed
Given a server listening on localhost:8080 with 2 slots, 42 as seed and llama.cpp as api key
Then the server is starting
Then the server is healthy
@@ -13,13 +13,17 @@ Feature: llama.cpp server
@llama.cpp
Scenario Outline: Completion
Given a <prompt> completion request with maximum <n_predict> tokens
Given a prompt <prompt>
And a user api key <api_key>
And <n_predict> max tokens to predict
And a completion request
Then <n_predict> tokens are predicted
Examples: Prompts
| prompt | n_predict |
| I believe the meaning of life is | 128 |
| Write a joke about AI | 512 |
| prompt | n_predict | api_key |
| I believe the meaning of life is | 128 | llama.cpp |
| Write a joke about AI | 512 | llama.cpp |
| say goodbye | 0 | |
@llama.cpp
Scenario Outline: OAI Compatibility
@@ -28,13 +32,15 @@ Feature: llama.cpp server
And a model <model>
And <max_tokens> max tokens to predict
And streaming is <enable_streaming>
Given an OAI compatible chat completions request
And a user api key <api_key>
Given an OAI compatible chat completions request with an api error <api_error>
Then <max_tokens> tokens are predicted
Examples: Prompts
| model | system_prompt | user_prompt | max_tokens | enable_streaming |
| llama-2 | You are ChatGPT. | Say hello. | 64 | false |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 512 | true |
| model | system_prompt | user_prompt | max_tokens | enable_streaming | api_key | api_error |
| llama-2 | You are ChatGPT. | Say hello. | 64 | false | llama.cpp | none |
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 512 | true | llama.cpp | none |
| John-Doe | You are an hacker. | Write segfault code in rust. | 0 | true | hackme | raised |
@llama.cpp
Scenario: Multi users
@@ -47,6 +53,7 @@ Feature: llama.cpp server
Write another very long music lyrics.
"""
And 32 max tokens to predict
And a user api key llama.cpp
Given concurrent completion requests
Then the server is busy
And all slots are busy
@@ -57,7 +64,7 @@ Feature: llama.cpp server
@llama.cpp
Scenario: Multi users OAI Compatibility
Given a system prompt "You are an AI assistant."
And a model tinyllama-2
And a model tinyllama-2
Given a prompt:
"""
Write a very long story about AI.
@@ -68,6 +75,7 @@ Feature: llama.cpp server
"""
And 32 max tokens to predict
And streaming is enabled
And a user api key llama.cpp
Given concurrent OAI completions requests
Then the server is busy
And all slots are busy
@@ -126,3 +134,15 @@ Feature: llama.cpp server
"""
Then tokens can be detokenize
@llama.cpp
Scenario Outline: CORS Options
When an OPTIONS request is sent from <origin>
Then CORS header <cors_header> is set to <cors_header_value>
Examples: Headers
| origin | cors_header | cors_header_value |
| localhost | Access-Control-Allow-Origin | localhost |
| web.mydomain.fr | Access-Control-Allow-Origin | web.mydomain.fr |
| origin | Access-Control-Allow-Credentials | true |
| web.mydomain.fr | Access-Control-Allow-Methods | POST |
| web.mydomain.fr | Access-Control-Allow-Headers | * |
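
For reference, the preflight behaviour that the CORS scenario above asserts can be reproduced by hand against a running server. The snippet below is only an illustrative sketch, assuming a server already listening on localhost:8080 and the third-party requests package; the endpoint, origins and header names are the ones the scenario uses.

import requests

# A browser sends an OPTIONS preflight with an Origin header before the
# actual cross-origin POST; the server is expected to echo the origin back
# and advertise the allowed methods and headers.
response = requests.options('http://localhost:8080/v1/chat/completions',
                            headers={'Origin': 'web.mydomain.fr'})
assert response.status_code == 200
assert response.headers['Access-Control-Allow-Origin'] == 'web.mydomain.fr'
assert response.headers['Access-Control-Allow-Methods'] == 'POST'
assert response.headers['Access-Control-Allow-Headers'] == '*'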


@@ -7,8 +7,9 @@ import requests
from behave import step
@step(u"a server listening on {server_fqdn}:{server_port} with {n_slots} slots and {seed} as seed")
def step_server_config(context, server_fqdn, server_port, n_slots, seed):
@step(
u"a server listening on {server_fqdn}:{server_port} with {n_slots} slots, {seed} as seed and {api_key} as api key")
def step_server_config(context, server_fqdn, server_port, n_slots, seed, api_key):
context.server_fqdn = server_fqdn
context.server_port = int(server_port)
context.n_slots = int(n_slots)
@@ -19,7 +20,8 @@ def step_server_config(context, server_fqdn, server_port, n_slots, seed):
context.completion_threads = []
context.prompts = []
openai.api_key = 'llama.cpp'
context.api_key = api_key
openai.api_key = context.api_key
@step(u"the server is {expecting_status}")
@@ -77,14 +79,16 @@ def step_all_slots_status(context, expected_slot_status_string):
request_slots_status(context, expected_slots)
@step(u'a {prompt} completion request with maximum {n_predict} tokens')
def step_request_completion(context, prompt, n_predict):
request_completion(context, prompt, n_predict)
@step(u'a completion request')
def step_request_completion(context):
request_completion(context, context.prompts.pop(), context.n_predict, context.user_api_key)
context.user_api_key = None
@step(u'{predicted_n} tokens are predicted')
def step_n_tokens_predicted(context, predicted_n):
assert_n_tokens_predicted(context.completions[0], int(predicted_n))
if int(predicted_n) > 0:
assert_n_tokens_predicted(context.completions[0], int(predicted_n))
@step(u'a user prompt {user_prompt}')
@@ -112,9 +116,20 @@ def step_streaming(context, enable_streaming):
context.enable_streaming = enable_streaming == 'enabled' or bool(enable_streaming)
@step(u'an OAI compatible chat completions request')
def step_oai_chat_completions(context):
oai_chat_completions(context, context.user_prompt)
@step(u'a user api key {user_api_key}')
def step_user_api_key(context, user_api_key):
context.user_api_key = user_api_key
@step(u'a user api key ')
def step_user_api_key(context):
context.user_api_key = None
@step(u'an OAI compatible chat completions request with an api error {api_error}')
def step_oai_chat_completions(context, api_error):
oai_chat_completions(context, context.user_prompt, api_error=api_error == 'raised')
context.user_api_key = None
@step(u'a prompt')
@@ -122,14 +137,19 @@ def step_a_prompt(context):
context.prompts.append(context.text)
@step(u'a prompt {prompt}')
def step_a_prompt_prompt(context, prompt):
context.prompts.append(prompt)
@step(u'concurrent completion requests')
def step_concurrent_completion_requests(context):
concurrent_requests(context, request_completion)
concurrent_requests(context, request_completion, context.n_predict, context.user_api_key)
@step(u'concurrent OAI completions requests')
def step_oai_chat_completions(context):
concurrent_requests(context, oai_chat_completions)
concurrent_requests(context, oai_chat_completions, context.user_api_key)
@step(u'all prompts are predicted')
@@ -168,7 +188,7 @@ def step_oai_compute_embedding(context):
def step_tokenize(context):
context.tokenized_text = context.text
response = requests.post(f'{context.base_url}/tokenize', json={
"content":context.tokenized_text,
"content": context.tokenized_text,
})
assert response.status_code == 200
context.tokens = response.json()['tokens']
@@ -181,49 +201,82 @@ def step_detokenize(context):
"tokens": context.tokens,
})
assert response.status_code == 200
print(response.json())
# FIXME the detokenize answer contains a space prefix ? see #3287
assert context.tokenized_text == response.json()['content'].strip()
def concurrent_requests(context, f_completion):
@step(u'an OPTIONS request is sent from {origin}')
def step_options_request(context, origin):
options_response = requests.options(f'{context.base_url}/v1/chat/completions',
headers={"Origin": origin})
assert options_response.status_code == 200
context.options_response = options_response
@step(u'CORS header {cors_header} is set to {cors_header_value}')
def step_check_options_header_value(context, cors_header, cors_header_value):
assert context.options_response.headers[cors_header] == cors_header_value
def concurrent_requests(context, f_completion, *argv):
context.completions.clear()
context.completion_threads.clear()
for prompt in context.prompts:
completion_thread = threading.Thread(target=f_completion, args=(context, prompt))
completion_thread = threading.Thread(target=f_completion, args=(context, prompt, *argv))
completion_thread.start()
context.completion_threads.append(completion_thread)
context.prompts.clear()
def request_completion(context, prompt, n_predict=None):
response = requests.post(f'{context.base_url}/completion', json={
"prompt": prompt,
"n_predict": int(n_predict) if n_predict is not None else context.n_predict,
"seed": context.seed
})
assert response.status_code == 200
context.completions.append(response.json())
def request_completion(context, prompt, n_predict=None, user_api_key=None):
origin = "my.super.domain"
headers = {
'Origin': origin
}
if 'user_api_key' in context:
headers['Authorization'] = f'Bearer {user_api_key}'
response = requests.post(f'{context.base_url}/completion',
json={
"prompt": prompt,
"n_predict": int(n_predict) if n_predict is not None else context.n_predict,
"seed": context.seed
},
headers=headers)
if n_predict is not None and n_predict > 0:
assert response.status_code == 200
assert response.headers['Access-Control-Allow-Origin'] == origin
context.completions.append(response.json())
else:
assert response.status_code == 401
def oai_chat_completions(context, user_prompt):
def oai_chat_completions(context, user_prompt, api_error=None):
openai.api_key = context.user_api_key
openai.api_base = f'{context.base_url}/v1/chat'
chat_completion = openai.Completion.create(
messages=[
{
"role": "system",
"content": context.system_prompt,
},
{
"role": "user",
"content": user_prompt,
}
],
model=context.model,
max_tokens=context.n_predict,
stream=context.enable_streaming,
seed=context.seed
)
try:
chat_completion = openai.Completion.create(
messages=[
{
"role": "system",
"content": context.system_prompt,
},
{
"role": "user",
"content": user_prompt,
}
],
model=context.model,
max_tokens=context.n_predict,
stream=context.enable_streaming,
seed=context.seed
)
except openai.error.APIError:
if api_error:
openai.api_key = context.api_key
return
openai.api_key = context.api_key
if context.enable_streaming:
completion_response = {
'content': '',


@@ -29,6 +29,7 @@ set -eu
--threads-batch 4 \
--embedding \
--cont-batching \
--api-key llama.cpp \
"$@" &
# Start tests
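
With --api-key set on the server command line as above, the completion endpoints expect the key as a Bearer token in the Authorization header and reject unauthenticated calls, which is what the new scenarios assert. A minimal sketch, assuming the server started by this script is reachable on localhost:8080 with the key llama.cpp and that the requests package is available:

import requests

base_url = 'http://localhost:8080'
payload = {"prompt": "I believe the meaning of life is", "n_predict": 8}

# Without an Authorization header the request is expected to be rejected.
denied = requests.post(f'{base_url}/completion', json=payload)
assert denied.status_code == 401

# With the configured key passed as a Bearer token the request succeeds.
allowed = requests.post(f'{base_url}/completion', json=payload,
                        headers={'Authorization': 'Bearer llama.cpp'})
assert allowed.status_code == 200
print(allowed.json()['content'])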