server: tests: add OAI stream test, fix file end of line, fast fail behave
This commit is contained in:
parent
56583bee41
commit
9b7ea97979
5 changed files with 35 additions and 14 deletions
2
.github/workflows/server-test.yml
vendored
2
.github/workflows/server-test.yml
vendored
|
@ -47,5 +47,3 @@ jobs:
|
|||
run: |
|
||||
cd examples/server/tests
|
||||
./tests.sh ../../../stories260K.gguf
|
||||
|
||||
|
||||
|
|
|
@ -23,13 +23,14 @@ Feature: llama.cpp server
|
|||
And a user prompt <user_prompt>
|
||||
And a model <model>
|
||||
And <max_tokens> max tokens to predict
|
||||
And streaming is <enable_streaming>
|
||||
Given an OAI compatible chat completions request
|
||||
Then <predicted_n> tokens are predicted
|
||||
|
||||
Examples: Prompts
|
||||
| model | system_prompt | user_prompt | max_tokens | predicted_n |
|
||||
| llama-2 | You are ChatGPT. | Say hello. | 64 | 64 |
|
||||
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 512 | 512 |
|
||||
| model | system_prompt | user_prompt | max_tokens | enable_streaming | predicted_n |
|
||||
| llama-2 | You are ChatGPT. | Say hello. | 64 | false | 64 |
|
||||
| codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 512 | true | 512 |
|
||||
|
||||
Scenario: Multi users
|
||||
Given a prompt:
|
||||
|
@ -55,4 +56,4 @@ Feature: llama.cpp server
|
|||
Given concurrent completion requests
|
||||
Then the server is busy
|
||||
Then the server is idle
|
||||
Then all prompts are predicted
|
||||
Then all prompts are predicted
|
||||
|
|
|
@ -74,6 +74,11 @@ def step_max_tokens(context, max_tokens):
|
|||
context.max_tokens = int(max_tokens)
|
||||
|
||||
|
||||
@step(u'streaming is {enable_streaming}')
def step_streaming(context, enable_streaming):
    """Record whether the upcoming OAI chat-completions request should stream.

    Bug fix: behave injects step parameters as strings, so the original
    ``bool(enable_streaming)`` was True for BOTH ``true`` and ``false``
    (any non-empty string is truthy) — the ``enable_streaming`` column in
    the Examples table could never actually disable streaming. Compare
    against the literal ``'true'`` instead.

    :param context: behave scenario context; gains ``enable_streaming``.
    :param enable_streaming: the raw step parameter, expected ``'true'``
        or ``'false'`` (any other value is treated as disabled).
    """
    context.enable_streaming = enable_streaming == 'true'
|
||||
|
||||
|
||||
@step(u'an OAI compatible chat completions request')
|
||||
def step_oai_chat_completions(context):
|
||||
chat_completion = openai.Completion.create(
|
||||
|
@ -88,14 +93,31 @@ def step_oai_chat_completions(context):
|
|||
}
|
||||
],
|
||||
model=context.model,
|
||||
max_tokens=context.max_tokens
|
||||
max_tokens=context.max_tokens,
|
||||
stream=context.enable_streaming
|
||||
)
|
||||
context.completions.append({
|
||||
'content': chat_completion.choices[0].message,
|
||||
'timings': {
|
||||
'predicted_n': chat_completion.usage.completion_tokens
|
||||
if context.enable_streaming:
|
||||
completion_response = {
|
||||
'content': '',
|
||||
'timings': {
|
||||
'predicted_n': 0
|
||||
}
|
||||
}
|
||||
})
|
||||
for chunk in chat_completion:
|
||||
assert len(chunk.choices) == 1
|
||||
delta = chunk.choices[0].delta
|
||||
if 'content' in delta:
|
||||
completion_response['content'] += delta['content']
|
||||
completion_response['timings']['predicted_n'] += 1
|
||||
context.completions.append(completion_response)
|
||||
else:
|
||||
assert len(chat_completion.choices) == 1
|
||||
context.completions.append({
|
||||
'content': chat_completion.choices[0].message,
|
||||
'timings': {
|
||||
'predicted_n': chat_completion.usage.completion_tokens
|
||||
}
|
||||
})
|
||||
|
||||
|
||||
@step(u'a prompt')
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
behave~=1.2.6
|
||||
openai~=0.25.0
|
||||
openai~=0.25.0
|
||||
|
|
|
@ -32,4 +32,4 @@ set -eu
|
|||
"$@" &
|
||||
|
||||
# Start tests
|
||||
behave
|
||||
behave --summary --stop
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue