server: tests: reduce number of files, all in one tests shell script

Pierrick HYMBERT 2024-02-19 21:50:56 +01:00
parent 157bcf2286
commit 9b63d7057a
7 changed files with 85 additions and 60 deletions


@@ -45,26 +45,7 @@ jobs:
- name: Server Integration Tests
id: server_integration_test
run: |
./build/bin/server \
-m tinyllama-2-1b-miniguanaco.Q2_K.gguf \
--ctx-size 512 \
--parallel 4 \
--n-predict 512 \
--batch-size 128 \
--threads 4 \
--threads-batch 128 \
--alias phi-2 \
--embedding \
--cont-batching &
sh -c '\
max_attempts=30; \
attempts=${max_attempts}; \
echo "waiting for server to be ready..."; \
until curl --silent --show-error --fail "http://localhost:8080/health" | jq -r '.status' | grep ok; do \
attempts=$(( attempts - 1)); \
[ "${attempts}" -eq 0 ] && { echo "Server did not startup" >&2; exit 1; }; \
sleep $(( (max_attempts - attempts) * 2 )); \
done;'
cd examples/server/tests
behave
./tests.sh


@@ -6,4 +6,6 @@ Functional server tests suite.
`pip install -r requirements.txt`
### Run tests
`python -m behave`
1. Build the server
2. Download a GGUF model: `../../../scripts/hf.sh --repo TheBloke/Tinyllama-2-1b-miniguanaco-GGUF --file tinyllama-2-1b-miniguanaco.Q2_K.gguf`
3. Start the test: `./tests.sh tinyllama-2-1b-miniguanaco.Q2_K.gguf -ngl 23 --log-disable`
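Taken together, a typical local run looks roughly like the sketch below (illustrative only; it assumes the server binary has already been built into `build/bin/` at the repository root and that `hf.sh` downloads the model into the current directory):

```sh
# Illustrative end-to-end run (assumptions: build/bin/server already exists,
# hf.sh places the downloaded GGUF file in the current directory).
cd examples/server/tests
pip install -r requirements.txt
../../../scripts/hf.sh --repo TheBloke/Tinyllama-2-1b-miniguanaco-GGUF --file tinyllama-2-1b-miniguanaco.Q2_K.gguf
./tests.sh tinyllama-2-1b-miniguanaco.Q2_K.gguf -ngl 23 --log-disable
```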


@@ -1,11 +0,0 @@
Feature: Completion request
Scenario Outline: run a completion request
Given a prompt <prompt>
When we request a completion
Then tokens are predicted
Examples: Prompts
| prompt |
| I believe the meaning of life is |
| Write a detailed analogy between mathematics and a lighthouse. |


@@ -1,4 +1,14 @@
Feature: OpenAI compatible completions request
Feature: llama.cpp server
Scenario Outline: run a completion request
Given a prompt <prompt>
When we request a completion
Then tokens are predicted
Examples: Prompts
| prompt |
| I believe |
| Write a joke |
Scenario Outline: run a completion on the OAI endpoint
Given a system prompt <system_prompt>
@@ -9,5 +19,5 @@ Feature: OpenAI compatible completions request
Examples: Prompts
| model | system_prompt | user_prompt |
| tinyllama-2 | You are ChatGPT. | I believe the meaning of life is |
| tinyllama-2 | You are ChatGPT. | Say hello |
| tinyllama-2 | You are a coding assistant. | Write the fibonacci function in c++ |
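The scenario above exercises the server's OpenAI-compatible chat endpoint. A roughly equivalent manual request, assuming the server started by `tests.sh` is listening on `localhost:8080` with the alias `tinyllama-2`, would be:

```sh
# Hypothetical manual equivalent of one Examples row; assumes the tests.sh
# server is running locally and serves the model under alias "tinyllama-2".
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "tinyllama-2",
        "messages": [
          {"role": "system", "content": "You are ChatGPT."},
          {"role": "user", "content": "Say hello"}
        ]
      }'
```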


@@ -1,24 +0,0 @@
from behave import *
import requests
@given(u'a prompt {prompt}')
def step_prompt(context, prompt):
context.prompt = prompt
@when(u'we request a completion')
def step_request_completion(context):
response = requests.post('http://localhost:8080/completion', json={
"prompt": context.prompt
})
status_code = response.status_code
assert status_code == 200
context.response_data = response.json()
@then(u'tokens are predicted')
def step_request_completion(context):
assert len(context.response_data['content']) > 0
assert context.response_data['timings']['predicted_n'] > 0


@@ -1,10 +1,32 @@
from behave import *
import openai
import requests
from behave import *
openai.api_key = 'llama.cpp'
openai.api_base = "http://localhost:8080/v1/chat"
@given(u'a prompt {prompt}')
def step_prompt(context, prompt):
context.prompt = prompt
@when(u'we request a completion')
def step_request_completion(context):
response = requests.post('http://localhost:8080/completion', json={
"prompt": context.prompt
})
status_code = response.status_code
assert status_code == 200
context.response_data = response.json()
@then(u'tokens are predicted')
def step_request_completion(context):
assert len(context.response_data['content']) > 0
assert context.response_data['timings']['predicted_n'] > 0
@given(u'a user prompt {user_prompt}')
def step_user_prompt(context, user_prompt):
context.user_prompt = user_prompt

examples/server/tests/tests.sh (new executable file, 45 lines added)

@@ -0,0 +1,45 @@
#!/bin/bash
if [ $# -lt 1 ]
then
>&2 echo "Usage: $0 model_path [server_args...]"
exit 1
fi
cleanup() {
pkill -P $$
}
trap cleanup EXIT
model_path="$1"
shift 1
set -eu
# Start the server in background
../../../build/bin/server \
--model "$model_path" \
--alias tinyllama-2 \
--ctx-size 64 \
--parallel 2 \
--n-predict 32 \
--batch-size 32 \
--threads 4 \
--threads-batch 4 \
--embedding \
--cont-batching \
"$@" &
# Wait for the server to start
max_attempts=30
attempts=${max_attempts}
until curl --silent --fail "http://localhost:8080/health" | jq -r '.status' | grep ok; do
attempts=$(( attempts - 1));
[ "${attempts}" -eq 0 ] && { echo "Server did not startup" >&2; exit 1; }
sleep_time=$(( (max_attempts - attempts) * 2 ))
echo "waiting for server to be ready ${sleep_time}s..."
sleep ${sleep_time}
done
# Start tests
behave
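Everything after the model path is forwarded verbatim to the server via `"$@"`, so callers can append GPU offload or logging flags without editing the script, for example (hypothetical invocations):

```sh
# Hypothetical invocations: arguments after the model path are passed
# straight through to build/bin/server.
./tests.sh tinyllama-2-1b-miniguanaco.Q2_K.gguf
./tests.sh tinyllama-2-1b-miniguanaco.Q2_K.gguf -ngl 23 --log-disable
```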