diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index 968e288d0..6e54395b6 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -36,24 +36,13 @@ Feature: llama.cpp server
   Scenario: Multi users
     Given a prompt:
       """
-      Write a formal complaint email to Air France about my delayed
-      baggage from my flight on Tuesday, January 17th, from Paris to Toulouse. Be verbose.
+      Write a very long story about AI.
       """
     And a prompt:
       """
-      Translate the following War & Peace chapter into Russian: WELL, PRINCE,
-      Genoa and Lucca are now no more than private estates of the Bonaparte
-      family. No, I warn you, that if you do not tell me we are at war,
-      if you again allow yourself to palliate all the infamies and atrocities
-      of this Antichrist (upon my word, I believe he is), I don’t know you
-      in future, you are no longer my friend, no longer my faithful slave,
-      as you say. There, how do you do, how do you do? I see I’m scaring you,
-      sit down and talk to me.” These words were uttered in July 1805 by
-      Anna Pavlovna Scherer, a distinguished lady of the court,
-      and confidential maid-of-honour to the Empress Marya Fyodorovna.
-      It was her greeting to Prince Vassily, a man high in rank
-      and office, who was the first to arrive at her soirée.
+      Write another very long music lyrics.
       """
+    And 512 max tokens to predict
     Given concurrent completion requests
     Then the server is busy
     And all slots are busy
@@ -65,8 +54,6 @@ Feature: llama.cpp server
   Scenario: Multi users OAI Compatibility
     Given a system prompt "You are an AI assistant."
     And a model tinyllama-2
-    And 1024 max tokens to predict
-    And streaming is enabled
     Given a prompt:
       """
       Write a very long story about AI.
@@ -75,6 +62,8 @@ Feature: llama.cpp server
       """
       Write another very long music lyrics.
       """
+    And 512 max tokens to predict
+    And streaming is enabled
     Given concurrent OAI completions requests
     Then the server is busy
     And all slots are busy
@@ -82,3 +71,25 @@ Feature: llama.cpp server
     And all slots are idle
     Then all prompts are predicted
 
+  # FIXME: infinite loop on the CI, not locally, if n_prompt * n_predict > kv_size
+  Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size
+    Given a prompt:
+      """
+      Write a very long story about AI.
+      """
+    And a prompt:
+      """
+      Write another very long music lyrics.
+      """
+    And a prompt:
+      """
+      Write a very long poem.
+      """
+    And 1024 max tokens to predict
+    Given concurrent completion requests
+    Then the server is busy
+    And all slots are busy
+    Then the server is idle
+    And all slots are idle
+    Then all prompts are predicted
+
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 400b3c126..896d8e32d 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -105,7 +105,7 @@ def step_model(context, model):
 
 @step(u'{max_tokens} max tokens to predict')
 def step_max_tokens(context, max_tokens):
-    context.max_tokens = int(max_tokens)
+    context.n_predict = int(max_tokens)
 
 
 @step(u'streaming is {enable_streaming}')
@@ -154,7 +154,7 @@ def concurrent_requests(context, f_completion):
 def request_completion(context, prompt, n_predict=None):
     response = requests.post(f'{context.base_url}/completion', json={
         "prompt": prompt,
-        "n_predict": int(n_predict) if n_predict is not None else 4096,
+        "n_predict": int(n_predict) if n_predict is not None else context.n_predict,
         "seed": context.seed
     })
     assert response.status_code == 200
@@ -174,7 +174,7 @@ def oai_chat_completions(context, user_prompt):
             }
         ],
         model=context.model,
-        max_tokens=context.max_tokens,
+        max_tokens=context.n_predict,
         stream=context.enable_streaming,
         seed = context.seed
     )