server: tests: reduce number of files, all in one tests shell script
commit 9b63d7057a (parent 157bcf2286)
7 changed files with 85 additions and 60 deletions
.github/workflows/server-test.yml (vendored): 23 lines changed

@@ -45,26 +45,7 @@ jobs:
       - name: Server Integration Tests
         id: server_integration_test
         run: |
-          ./build/bin/server \
-            -m tinyllama-2-1b-miniguanaco.Q2_K.gguf \
-            --ctx-size 512 \
-            --parallel 4 \
-            --n-predict 512 \
-            --batch-size 128 \
-            --threads 4 \
-            --threads-batch 128 \
-            --alias phi-2 \
-            --embedding \
-            --cont-batching &
-          sh -c '\
-            max_attempts=30; \
-            attempts=${max_attempts}; \
-            echo "waiting for server to be ready..."; \
-            until curl --silent --show-error --fail "http://localhost:8080/health" | jq -r '.status' | grep ok; do \
-              attempts=$(( attempts - 1)); \
-              [ "${attempts}" -eq 0 ] && { echo "Server did not startup" >&2; exit 1; }; \
-              sleep $(( (max_attempts - attempts) * 2 )); \
-            done;'
           cd examples/server/tests
-          behave
+          ./tests.sh
@@ -6,4 +6,6 @@ Functional server tests suite.
 `pip install -r requirements.txt`
 
 ### Run tests
-`python -m behave`
+1. Build the server
+2. download a GGUF model: `../../../scripts/hf.sh --repo TheBloke/Tinyllama-2-1b-miniguanaco-GGUF --file tinyllama-2-1b-miniguanaco.Q2_K.gguf`
+3. Start the test: `./tests.sh tinyllama-2-1b-miniguanaco.Q2_K.gguf -ngl 23 --log-disable`
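For reference, those three README steps correspond to roughly the following shell session. The build command is an assumption (use whatever make/cmake invocation you normally build `server` with); the download and test steps run from `examples/server/tests` so the GGUF file ends up next to `tests.sh`:

```sh
# 1. Build the server binary (assumed cmake invocation; adjust to your setup)
cmake -B build && cmake --build build --target server

# 2. Download a small GGUF model next to the test suite
cd examples/server/tests
../../../scripts/hf.sh --repo TheBloke/Tinyllama-2-1b-miniguanaco-GGUF \
    --file tinyllama-2-1b-miniguanaco.Q2_K.gguf

# 3. Run the suite; everything after the model path is passed to the server
./tests.sh tinyllama-2-1b-miniguanaco.Q2_K.gguf -ngl 23 --log-disable
```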
@ -1,11 +0,0 @@
|
||||||
Feature: Completion request
|
|
||||||
|
|
||||||
Scenario Outline: run a completion request
|
|
||||||
Given a prompt <prompt>
|
|
||||||
When we request a completion
|
|
||||||
Then tokens are predicted
|
|
||||||
|
|
||||||
Examples: Prompts
|
|
||||||
| prompt |
|
|
||||||
| I believe the meaning of life is |
|
|
||||||
| Write a detailed analogy between mathematics and a lighthouse. |
|
|
|
@@ -1,4 +1,14 @@
-Feature: OpenAI compatible completions request
+Feature: llama.cpp server
 
+  Scenario Outline: run a completion request
+    Given a prompt <prompt>
+    When we request a completion
+    Then tokens are predicted
+
+    Examples: Prompts
+      | prompt |
+      | I believe |
+      | Write a joke |
+
   Scenario Outline: run a completion on the OAI endpoint
     Given a system prompt <system_prompt>
@@ -9,5 +19,5 @@ Feature: OpenAI compatible completions request
 
     Examples: Prompts
       | model | system_prompt | user_prompt |
-      | tinyllama-2 | You are ChatGPT. | I believe the meaning of life is |
+      | tinyllama-2 | You are ChatGPT. | Say hello |
       | tinyllama-2 | You are a coding assistant. | Write the fibonacci function in c++ |
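Since both scenarios now live in one feature file, a single scenario can still be run in isolation with behave's `--name` filter (a usage sketch; it assumes the server is already up and the working directory is `examples/server/tests`):

```sh
# Run only the plain completion scenario
behave --name "run a completion request"

# Run only the OpenAI-compatible scenario
behave --name "run a completion on the OAI endpoint"
```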
@@ -1,24 +0,0 @@
-from behave import *
-import requests
-
-
-@given(u'a prompt {prompt}')
-def step_prompt(context, prompt):
-    context.prompt = prompt
-
-
-@when(u'we request a completion')
-def step_request_completion(context):
-    response = requests.post('http://localhost:8080/completion', json={
-        "prompt": context.prompt
-    })
-    status_code = response.status_code
-    assert status_code == 200
-    context.response_data = response.json()
-
-
-@then(u'tokens are predicted')
-def step_request_completion(context):
-    assert len(context.response_data['content']) > 0
-    assert context.response_data['timings']['predicted_n'] > 0
-
@@ -1,10 +1,32 @@
-from behave import *
 import openai
+import requests
+from behave import *
 
 openai.api_key = 'llama.cpp'
 openai.api_base = "http://localhost:8080/v1/chat"
 
 
+@given(u'a prompt {prompt}')
+def step_prompt(context, prompt):
+    context.prompt = prompt
+
+
+@when(u'we request a completion')
+def step_request_completion(context):
+    response = requests.post('http://localhost:8080/completion', json={
+        "prompt": context.prompt
+    })
+    status_code = response.status_code
+    assert status_code == 200
+    context.response_data = response.json()
+
+
+@then(u'tokens are predicted')
+def step_request_completion(context):
+    assert len(context.response_data['content']) > 0
+    assert context.response_data['timings']['predicted_n'] > 0
+
+
 @given(u'a user prompt {user_prompt}')
 def step_user_prompt(context, user_prompt):
     context.user_prompt = user_prompt
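For debugging outside behave, the two kinds of requests the steps issue can be reproduced with curl. This is a sketch: the first call mirrors the `requests.post` in the completion step, the second hits the server's OpenAI-compatible chat route directly (rather than going through the `openai` client), with a payload mirroring the Examples table above:

```sh
# Plain completion, as posted by the "we request a completion" step
curl -s http://localhost:8080/completion \
    -H "Content-Type: application/json" \
    -d '{"prompt": "I believe"}' | jq

# Chat completion via the server's OpenAI-compatible endpoint
curl -s http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "tinyllama-2",
         "messages": [{"role": "system", "content": "You are ChatGPT."},
                      {"role": "user", "content": "Say hello"}]}' | jq
```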
examples/server/tests/tests.sh (new executable file): 45 lines added

@@ -0,0 +1,45 @@
+#!/bin/bash
+
+if [ $# -lt 1 ]
+then
+  >&2 echo "Usage: $0 model_path [server_args...]"
+  exit 1
+fi
+
+cleanup() {
+  pkill -P $$
+}
+trap cleanup EXIT
+
+model_path="$1"
+shift 1
+
+set -eu
+
+# Start the server in background
+../../../build/bin/server \
+  --model "$model_path" \
+  --alias tinyllama-2 \
+  --ctx-size 64 \
+  --parallel 2 \
+  --n-predict 32 \
+  --batch-size 32 \
+  --threads 4 \
+  --threads-batch 4 \
+  --embedding \
+  --cont-batching \
+  "$@" &
+
+# Wait for the server to start
+max_attempts=30
+attempts=${max_attempts}
+until curl --silent --fail "http://localhost:8080/health" | jq -r '.status' | grep ok; do
+  attempts=$(( attempts - 1));
+  [ "${attempts}" -eq 0 ] && { echo "Server did not startup" >&2; exit 1; }
+  sleep_time=$(( (max_attempts - attempts) * 2 ))
+  echo "waiting for server to be ready ${sleep_time}s..."
+  sleep ${sleep_time}
+done
+
+# Start tests
+behave
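Usage note: everything after the model path is forwarded to the server through `"$@"`, so per-machine options can be appended without editing the script, e.g. (matching the README above):

```sh
cd examples/server/tests
./tests.sh tinyllama-2-1b-miniguanaco.Q2_K.gguf -ngl 23 --log-disable
```

The readiness loop backs off linearly: the sleep grows by 2 s per failed `/health` probe (2 s, 4 s, ... up to 58 s), roughly 870 s in total before the script reports "Server did not startup" and exits.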