server: tests: reduce number of files, all in one tests shell script
parent 157bcf2286
commit 9b63d7057a
7 changed files with 85 additions and 60 deletions
.github/workflows/server-test.yml (vendored): 23 lines changed

@@ -45,26 +45,7 @@ jobs:
      - name: Server Integration Tests
        id: server_integration_test
        run: |
          ./build/bin/server \
            -m tinyllama-2-1b-miniguanaco.Q2_K.gguf \
            --ctx-size 512 \
            --parallel 4 \
            --n-predict 512 \
            --batch-size 128 \
            --threads 4 \
            --threads-batch 128 \
            --alias phi-2 \
            --embedding \
            --cont-batching &
          sh -c '\
            max_attempts=30; \
            attempts=${max_attempts}; \
            echo "waiting for server to be ready..."; \
            until curl --silent --show-error --fail "http://localhost:8080/health" | jq -r '.status' | grep ok; do \
              attempts=$(( attempts - 1)); \
              [ "${attempts}" -eq 0 ] && { echo "Server did not startup" >&2; exit 1; }; \
              sleep $(( (max_attempts - attempts) * 2 )); \
            done;'
          cd examples/server/tests
          behave
          ./tests.sh
@@ -6,4 +6,6 @@ Functional server tests suite.

`pip install -r requirements.txt`

### Run tests
`python -m behave`
1. Build the server
2. download a GGUF model: `../../../scripts/hf.sh --repo TheBloke/Tinyllama-2-1b-miniguanaco-GGUF --file tinyllama-2-1b-miniguanaco.Q2_K.gguf`
3. Start the test: `./tests.sh tinyllama-2-1b-miniguanaco.Q2_K.gguf -ngl 23 --log-disable`
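Putting the three README steps together, a local run might look like the following (a sketch only; the cmake build commands are an assumption and are not part of this commit):

```sh
# 1. Build the server binary (assumed cmake flow; adjust to your usual build setup)
cmake -B build
cmake --build build --target server

# 2. + 3. Fetch a small quantized model and run the suite from the tests directory
cd examples/server/tests
pip install -r requirements.txt
../../../scripts/hf.sh --repo TheBloke/Tinyllama-2-1b-miniguanaco-GGUF --file tinyllama-2-1b-miniguanaco.Q2_K.gguf
./tests.sh tinyllama-2-1b-miniguanaco.Q2_K.gguf -ngl 23 --log-disable
```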
@@ -1,11 +0,0 @@
Feature: Completion request

  Scenario Outline: run a completion request
    Given a prompt <prompt>
    When we request a completion
    Then tokens are predicted

    Examples: Prompts
      | prompt |
      | I believe the meaning of life is |
      | Write a detailed analogy between mathematics and a lighthouse. |
@@ -1,4 +1,14 @@
Feature: OpenAI compatible completions request
Feature: llama.cpp server

  Scenario Outline: run a completion request
    Given a prompt <prompt>
    When we request a completion
    Then tokens are predicted

    Examples: Prompts
      | prompt |
      | I believe |
      | Write a joke |

  Scenario Outline: run a completion on the OAI endpoint
    Given a system prompt <system_prompt>

@@ -9,5 +19,5 @@ Feature: OpenAI compatible completions request

    Examples: Prompts
      | model | system_prompt | user_prompt |
      | tinyllama-2 | You are ChatGPT. | I believe the meaning of life is |
      | tinyllama-2 | You are ChatGPT. | Say hello |
      | tinyllama-2 | You are a coding assistant. | Write the fibonacci function in c++ |
@@ -1,24 +0,0 @@
from behave import *
import requests


@given(u'a prompt {prompt}')
def step_prompt(context, prompt):
    context.prompt = prompt


@when(u'we request a completion')
def step_request_completion(context):
    response = requests.post('http://localhost:8080/completion', json={
        "prompt": context.prompt
    })
    status_code = response.status_code
    assert status_code == 200
    context.response_data = response.json()


@then(u'tokens are predicted')
def step_request_completion(context):
    assert len(context.response_data['content']) > 0
    assert context.response_data['timings']['predicted_n'] > 0
@@ -1,10 +1,32 @@
from behave import *
import openai
import requests
from behave import *

openai.api_key = 'llama.cpp'
openai.api_base = "http://localhost:8080/v1/chat"


@given(u'a prompt {prompt}')
def step_prompt(context, prompt):
    context.prompt = prompt


@when(u'we request a completion')
def step_request_completion(context):
    response = requests.post('http://localhost:8080/completion', json={
        "prompt": context.prompt
    })
    status_code = response.status_code
    assert status_code == 200
    context.response_data = response.json()


@then(u'tokens are predicted')
def step_request_completion(context):
    assert len(context.response_data['content']) > 0
    assert context.response_data['timings']['predicted_n'] > 0


@given(u'a user prompt {user_prompt}')
def step_user_prompt(context, user_prompt):
    context.user_prompt = user_prompt
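The step definitions point the openai client at the server's OpenAI-compatible base URL. Outside of behave, the same endpoint can be exercised directly; a sketch (the `/completions` suffix is an assumption based on how the openai client builds request URLs on top of the `api_base` shown above, and the payload mirrors the feature table):

```sh
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "tinyllama-2",
        "messages": [
          {"role": "system", "content": "You are ChatGPT."},
          {"role": "user", "content": "Say hello"}
        ]
      }'
```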
examples/server/tests/tests.sh (new executable file): 45 lines added

@@ -0,0 +1,45 @@
#!/bin/bash

if [ $# -lt 1 ]
then
  >&2 echo "Usage: $0 model_path [server_args...]"
  exit 1
fi

cleanup() {
  pkill -P $$
}
trap cleanup EXIT

model_path="$1"
shift 1

set -eu

# Start the server in background
../../../build/bin/server \
  --model "$model_path" \
  --alias tinyllama-2 \
  --ctx-size 64 \
  --parallel 2 \
  --n-predict 32 \
  --batch-size 32 \
  --threads 4 \
  --threads-batch 4 \
  --embedding \
  --cont-batching \
  "$@" &

# Wait for the server to start
max_attempts=30
attempts=${max_attempts}
until curl --silent --fail "http://localhost:8080/health" | jq -r '.status' | grep ok; do
  attempts=$(( attempts - 1));
  [ "${attempts}" -eq 0 ] && { echo "Server did not startup" >&2; exit 1; }
  sleep_time=$(( (max_attempts - attempts) * 2 ))
  echo "waiting for server to be ready ${sleep_time}s..."
  sleep ${sleep_time}
done

# Start tests
behave
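One detail worth noting in the readiness loop above: the sleep grows linearly with the attempt number, so the total wait before giving up is bounded. A quick sketch of the arithmetic (not part of the commit):

```sh
# Attempt n sleeps (max_attempts - attempts) * 2 seconds, i.e. 2s, 4s, ..., 58s.
# Worst case before "Server did not startup": 2 + 4 + ... + 58 seconds.
total=0
for n in $(seq 1 29); do total=$(( total + n * 2 )); done
echo "worst-case wait: ${total}s"   # prints 870s
```

The `trap cleanup EXIT` line guarantees the backgrounded server is killed (`pkill -P $$`) whether behave passes, fails, or the readiness check times out.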