server: tests: add infinite loop

commit 6b9dc4f291
parent 0772884b06
2 changed files with 30 additions and 19 deletions

Changed file 1 of 2 (Gherkin feature file):

@@ -36,24 +36,13 @@ Feature: llama.cpp server
   Scenario: Multi users
     Given a prompt:
       """
-      Write a formal complaint email to Air France about my delayed
-      baggage from my flight on Tuesday, January 17th, from Paris to Toulouse. Be verbose.
+      Write a very long story about AI.
       """
     And a prompt:
       """
-      Translate the following War & Peace chapter into Russian: WELL, PRINCE,
-      Genoa and Lucca are now no more than private estates of the Bonaparte
-      family. No, I warn you, that if you do not tell me we are at war,
-      if you again allow yourself to palliate all the infamies and atrocities
-      of this Antichrist (upon my word, I believe he is), I don’t know you
-      in future, you are no longer my friend, no longer my faithful slave,
-      as you say. There, how do you do, how do you do? I see I’m scaring you,
-      sit down and talk to me.” These words were uttered in July 1805 by
-      Anna Pavlovna Scherer, a distinguished lady of the court,
-      and confidential maid-of-honour to the Empress Marya Fyodorovna.
-      It was her greeting to Prince Vassily, a man high in rank
-      and office, who was the first to arrive at her soirée.
+      Write another very long music lyrics.
       """
+    And 512 max tokens to predict
     Given concurrent completion requests
     Then the server is busy
     And all slots are busy
@@ -65,8 +54,6 @@ Feature: llama.cpp server
   Scenario: Multi users OAI Compatibility
     Given a system prompt "You are an AI assistant."
     And a model tinyllama-2
-    And 1024 max tokens to predict
-    And streaming is enabled
     Given a prompt:
       """
       Write a very long story about AI.
@@ -75,6 +62,8 @@ Feature: llama.cpp server
       """
       Write another very long music lyrics.
       """
+    And 512 max tokens to predict
+    And streaming is enabled
     Given concurrent OAI completions requests
     Then the server is busy
     And all slots are busy
@@ -82,3 +71,25 @@ Feature: llama.cpp server
     And all slots are idle
     Then all prompts are predicted

+  # FIXME: infinite loop on the CI, not locally, if n_prompt * n_predict > kv_size
+  Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size
+    Given a prompt:
+      """
+      Write a very long story about AI.
+      """
+    And a prompt:
+      """
+      Write another very long music lyrics.
+      """
+    And a prompt:
+      """
+      Write a very long poem.
+      """
+    And 1024 max tokens to predict
+    Given concurrent completion requests
+    Then the server is busy
+    And all slots are busy
+    Then the server is idle
+    And all slots are idle
+    Then all prompts are predicted
+
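The FIXME in the new scenario comes down to a token budget: three prompts run in parallel with 1024 tokens to predict each, i.e. 3 * 1024 = 3072 tokens of generation, which can exceed the server's KV cache and leave the slots spinning on CI. The sketch below only illustrates that arithmetic; the kv_size value is an assumption for illustration, not a value taken from the test harness.

```python
# Hypothetical sanity check mirroring the FIXME above; kv_size is an assumption.
n_prompts = 3      # the three "Given a prompt:" blocks in the new scenario
n_predict = 1024   # "And 1024 max tokens to predict"
kv_size = 2048     # assumed KV cache size of the server under test

total_budget = n_prompts * n_predict  # 3072 tokens requested concurrently
if total_budget > kv_size:
    # This is the condition the FIXME flags: on CI the scenario can loop forever.
    print(f"generation budget {total_budget} exceeds kv_size {kv_size}")
```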
Changed file 2 of 2 (Python step definitions):

@@ -105,7 +105,7 @@ def step_model(context, model):

 @step(u'{max_tokens} max tokens to predict')
 def step_max_tokens(context, max_tokens):
-    context.max_tokens = int(max_tokens)
+    context.n_predict = int(max_tokens)


 @step(u'streaming is {enable_streaming}')
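For context on the step above: behave's default matcher uses the parse library, so a scenario line such as "And 512 max tokens to predict" binds the leading number to the {max_tokens} placeholder as a string, which the step then stores as an int on context.n_predict. A tiny sketch of that matching, assuming the parse package is installed:

```python
# Illustrative only: behave performs this matching internally via the parse library.
import parse

result = parse.parse('{max_tokens} max tokens to predict',
                     '512 max tokens to predict')
assert result['max_tokens'] == '512'  # the step definition converts this with int(...)
```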

@@ -154,7 +154,7 @@ def concurrent_requests(context, f_completion):
 def request_completion(context, prompt, n_predict=None):
     response = requests.post(f'{context.base_url}/completion', json={
         "prompt": prompt,
-        "n_predict": int(n_predict) if n_predict is not None else 4096,
+        "n_predict": int(n_predict) if n_predict is not None else context.n_predict,
         "seed": context.seed
     })
     assert response.status_code == 200

@@ -174,7 +174,7 @@ def oai_chat_completions(context, user_prompt):
             }
         ],
         model=context.model,
-        max_tokens=context.max_tokens,
+        max_tokens=context.n_predict,
         stream=context.enable_streaming,
         seed = context.seed
     )
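Taken together, the rename means one scenario step now sets the prediction budget for both the raw /completion requests and the OAI-compatible chat requests. Below is a small, standalone sketch of that wiring; the SimpleNamespace context and the build_completion_payload helper are illustrative stand-ins for behave's context object and the real request code, and no server is contacted.

```python
# Standalone illustration of the context.n_predict wiring; nothing here talks to a server.
from types import SimpleNamespace

context = SimpleNamespace(seed=42, n_predict=None)

def step_max_tokens(context, max_tokens):
    # Mirrors the step definition: "And 512 max tokens to predict"
    context.n_predict = int(max_tokens)

def build_completion_payload(context, prompt, n_predict=None):
    # Same fallback as request_completion: an explicit n_predict wins,
    # otherwise the scenario-wide budget stored on the context is used.
    return {
        "prompt": prompt,
        "n_predict": int(n_predict) if n_predict is not None else context.n_predict,
        "seed": context.seed,
    }

step_max_tokens(context, "512")
payload = build_completion_payload(context, "Write a very long story about AI.")
assert payload["n_predict"] == 512
```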