Paulo 2024-05-02 21:30:53 -03:00
parent 4a471b12d6
commit a772cde9dc
3 changed files with 91 additions and 2 deletions

View file

@@ -47,6 +47,8 @@ def step_server_config(context, server_fqdn, server_port):
    context.n_ga = None
    context.n_ga_w = None
    context.n_predict = None
    context.n_keep = 0
    context.n_truncate = 0
    context.n_prompts = 0
    context.n_server_predict = None
    context.slot_save_path = None
@@ -66,6 +68,7 @@ def step_server_config(context, server_fqdn, server_port):
    context.user_api_key = None
    context.response_format = None
    context.stop_string = []
    context.tasks_result = []
    context.concurrent_tasks = []
    context.prompts = []
@@ -168,6 +171,7 @@ def step_start_server(context):
    addrs = socket.getaddrinfo(context.server_fqdn, context.server_port, type=socket.SOCK_STREAM)
    family, typ, proto, _, sockaddr = addrs[0]
    print(sockaddr)

    while True:
        with closing(socket.socket(family, typ, proto)) as sock:
@@ -231,17 +235,33 @@ async def step_all_slots_status(context, expected_slot_status_string):
@step('a completion request with {api_error} api error')
@async_run_until_complete
async def step_request_completion(context, api_error):
    await make_completion_request(context, api_error)


@step('an ongoing completion request')
@async_run_until_complete
async def step_request_ongoing_completion(context):
    await make_completion_request(context, '', True)


async def make_completion_request(context, api_error, ongoing=False):
    expect_api_error = api_error == 'raised'
    completion = await request_completion(context.prompts.pop(),
    prompt = context.prompts[-1] if ongoing else context.prompts.pop()
    completion = await request_completion(prompt,
                                          context.base_url,
                                          debug=context.debug,
                                          n_predict=context.n_predict,
                                          n_keep=context.n_keep,
                                          n_truncate=context.n_truncate,
                                          cache_prompt=context.cache_prompt,
                                          stop_string=context.stop_string,
                                          id_slot=context.id_slot,
                                          seed=await completions_seed(context),
                                          expect_api_error=expect_api_error,
                                          user_api_key=context.user_api_key)
    context.tasks_result.append(completion)
    if ongoing and not expect_api_error:
        context.prompts[-1] += completion['content']
    if context.debug:
        print(f"Completion response: {completion}")
    if expect_api_error:
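The `ongoing` flag is what lets the new truncation scenario chain two requests over the same conversation: the helper peeks at the last prompt instead of popping it, then folds the generated text back into that prompt so a later step can extend it. Below is a minimal, self-contained sketch of that pattern; the `fake_completion` stub and its canned reply are illustrative stand-ins for the real server call, not part of the test suite.

# Illustrative sketch of the "ongoing" request pattern used above;
# fake_completion stands in for the real HTTP call to the server.
def fake_completion(prompt: str) -> dict:
    return {"content": "You: I'm doing well, thanks for asking!"}  # pretend server output

prompts = ["Continue the chat below.\nMe: Hey there, how's it going?\n"]

# ongoing=True: peek at the last prompt and grow it with the reply,
# so the next step can append more user text to the same conversation.
reply = fake_completion(prompts[-1])
prompts[-1] += reply["content"]
print(prompts[-1])

# ongoing=False (the default) would consume the prompt instead:
# completion = fake_completion(prompts.pop())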
@@ -306,6 +326,16 @@ def step_max_tokens(context, max_tokens):
    context.n_predict = max_tokens


@step('{n_keep:d} tokens to keep')
def step_keep_tokens(context, n_keep):
    context.n_keep = n_keep


@step('{n_truncate:d} tokens to truncate')
def step_truncate_tokens(context, n_truncate):
    context.n_truncate = n_truncate


@step('a response format {response_format}')
def step_response_format(context, response_format):
    context.response_format = json.loads(response_format)
@@ -355,6 +385,10 @@ def step_n_ubatch(context, n_ubatch):
def step_seed(context, seed):
    context.seed = seed


@step('a list of stop strings {stop_list}')
def step_stop_string(context, stop_list):
    context.stop_string = json.loads(stop_list)


@step('a prefix prompt')
def step_prompt_prefix(context):
@@ -457,6 +491,11 @@ def step_a_prompt_prompt(context, prompt):
    context.n_prompts = len(context.prompts)


@step('an ongoing prompt')
def step_a_ongoing_prompt(context):
    context.prompts[-1] += context_text(context)


@step('concurrent completion requests')
@async_run_until_complete()
async def step_concurrent_completion_requests(context):
@@ -786,7 +825,10 @@ async def request_completion(prompt,
                             prompt_prefix=None,
                             prompt_suffix=None,
                             n_predict=None,
                             n_keep=0,
                             n_truncate=0,
                             cache_prompt=False,
                             stop_string=None,
                             id_slot=None,
                             seed=None,
                             expect_api_error=None,
@@ -809,7 +851,10 @@ async def request_completion(prompt,
                                    "prompt": prompt,
                                    "input_suffix": prompt_suffix,
                                    "n_predict": n_predict if n_predict is not None else -1,
                                    "n_keep": n_keep,
                                    "n_truncate": n_truncate,
                                    "cache_prompt": cache_prompt,
                                    "stop": stop_string if stop_string is not None else [],
                                    "id_slot": id_slot,
                                    "seed": seed if seed is not None else 42
                                },
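For reference, the request body the updated helper ends up posting to the server's /completion endpoint looks roughly like the sketch below. The URL and values are placeholders taken from the new feature file, and n_truncate is the field this change introduces rather than an established server parameter.

# Rough sketch of the payload sent to /completion with the new fields;
# values mirror the truncation feature file and are placeholders only.
import asyncio

import aiohttp


async def main():
    payload = {
        "prompt": "Continue the chat below.\nMe: Hey there, how's it going?\n",
        "n_predict": 32,
        "n_keep": 82,       # tokens preserved at the start of the context
        "n_truncate": 52,   # tokens to drop on overflow (introduced by this change)
        "cache_prompt": True,
        "stop": ["\n"],
        "seed": 42,
    }
    async with aiohttp.ClientSession() as session:
        async with session.post("http://localhost:8080/completion", json=payload) as response:
            print(await response.json())


asyncio.run(main())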

View file

@@ -0,0 +1,44 @@
# run with: ./tests.sh --no-skipped --tags truncation
@truncation
@slow
Feature: Chat truncation

  Background: Server startup
    Given a server listening on localhost:8080
    And a model file mistral-7b-v0.2-iq3_s-imat.gguf from HF repo ggml-org/models
    And prompt caching is enabled
    And a list of stop strings ["\n"]
    And 82 tokens to keep
    And 256 KV cache size
    And 32 server max tokens to predict
    Then the server is starting
    Then the server is healthy
  Scenario: Correctly truncate the prompt when the prompt exceeds the context size
    Given a prompt:
    """
    Continue the chat below.
    Me: Hey there, how's it going?
    You: I'm doing well, thanks for asking! How are you?
    Me: I'm doing good, just trying to get some work done. How's your day?
    You: My day has been pretty productive so far. I've been working on a new project.
    Me: That's great to hear! What's the new project you're working on?
    You: It's a web application that's designed to help people manage their personal finances. I'm really excited about it.
    Me: That sounds really useful, I'd be interested to hear more about it. Do you have a timeframe for when you expect to have it ready to launch?
    You: I'm aiming to have the initial version ready within the next few months. I want to ensure it's robust before launching it.
    Me: That's really nice, are you happy with the progress so far?
    """
    And an ongoing completion request
    Then -1 tokens are predicted matching You:

    Given an ongoing prompt:
    """
    Me: I have one more question for you my friend. What's the most value thing you learned during your development journey?
    """
    And 52 tokens to truncate
    And a completion request with no api error
    Then -1 tokens are predicted matching You:
    # 28 because the '\n' stop string is not pushed to the context
    And 28 prompt tokens are processed
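The closing assertion relies on a step that reads how many prompt tokens the server actually processed. That step is defined elsewhere in steps.py and is not part of this diff; the sketch below only illustrates how such a check could be wired up, and the timings/prompt_n field names are assumptions rather than something taken from this commit.

# Hypothetical step definition for "And {n} prompt tokens are processed";
# the real suite defines its own version, and the response field names
# ("timings"/"prompt_n") are assumptions here.
from behave import step


@step('{prompt_n:d} prompt tokens are processed')
def step_prompt_tokens_processed(context, prompt_n):
    completion = context.tasks_result.pop()
    observed = completion.get('timings', {}).get('prompt_n')
    assert observed == prompt_n, f"expected {prompt_n} prompt tokens, got {observed}"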

View file

@@ -5,7 +5,7 @@ set -eu
if [ $# -lt 1 ]
then
  # Start @llama.cpp scenario
  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
  behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey|truncation' --tags llama.cpp
else
  behave "$@"
fi