parent
76e868821a
commit
68d1d8fe28
2 changed files with 148 additions and 0 deletions
64
examples/server/bench/README.md
Normal file
@@ -0,0 +1,64 @@
### Server benchmark tools

The benchmark uses [k6](https://k6.io/).

##### Install k6 on Ubuntu

```shell
snap install k6
```

#### Download the ShareGPT dataset

```shell
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```

#### Download a model

Example for PHI-2:

```shell
../../../scripts/hf.sh --repo ggml-org/models --file phi-2/ggml-model-q4_0.gguf
```

#### Start the server

The server must listen on `localhost:8080`.

Example:

```shell
server --host localhost --port 8080 \
    --model ggml-model-q4_0.gguf \
    --cont-batching \
    --metrics \
    --parallel 8 \
    --batch-size 512 \
    --ctx-size 4096 \
    --log-format text \
    -ngl 33
```

`--parallel 8` matches the 8 virtual users that `script.js` ramps up to by default.

#### Run the bench

```shell
k6 run script.js
```

#### Change the number of concurrent users

In `script.js`, change the ramping period and target according to your number of slots, as in the sketch below.
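
For example, assuming the server was started with `--parallel 16` (the slot count here is an assumption; match it to your own setup), the scenario in `script.js` could be changed to:

```js
// Sketch of the script.js options for a server started with --parallel 16.
export const options = {
    scenarios: {
        completions: {
            executor: 'ramping-vus',
            startVUs: 1,
            stages: [
                {duration: '1m', target: 16}, // ramp up to one VU per slot
                {duration: '3m', target: 16}, // hold the plateau
                {duration: '1m', target: 0},  // ramp down
            ],
            gracefulRampDown: '30s',
        },
    },
};
```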

#### Metrics

The following metrics are available:

- `llamacpp_prompt_tokens` Gauge of OAI response `usage.prompt_tokens`
- `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens`
- `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens`
- `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens`
- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request duration in seconds
- `llamacpp_completions_truncated_rate` Rate of truncated completions, i.e. `finish_reason === 'length'`
- `llamacpp_completions_stop_rate` Rate of completions that stopped naturally, i.e. `finish_reason === 'stop'`

The script will fail if too many completions are truncated; see the `llamacpp_completions_truncated_rate` threshold.

K6 metrics can be compared against the [server metrics](../README.md) with:

```shell
curl http://localhost:8080/metrics
```
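
To keep k6's end-of-test summary around for that comparison, one option (a sketch, not part of the current script) is k6's `handleSummary()` hook, added to `script.js`:

```js
// Optional addition to script.js: write the end-of-test summary to a JSON
// file so it can be compared offline with the server's /metrics output.
// The output filename is arbitrary.
export function handleSummary(data) {
    return {
        'summary.json': JSON.stringify(data, null, 2),
        stdout: '\nbench finished, summary written to summary.json\n',
    };
}
```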
84
examples/server/bench/script.js
Normal file
@@ -0,0 +1,84 @@
import http from 'k6/http';
import { check, sleep } from 'k6';
import { SharedArray } from 'k6/data';
import { Counter, Gauge, Rate } from 'k6/metrics';

const data = new SharedArray('conversations', function () {
    return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json'))

        // Filter out the conversations with less than 2 turns.
        .filter(conversation => conversation["conversations"].length >= 2)
        // Only keep the first two turns of each conversation.
        .map(conversation => [conversation["conversations"][0]["value"], conversation["conversations"][1]["value"]]);
});

const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens');
const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens');

const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds');

const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter');
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter');

const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate');
const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate');

export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: [
            // more than 10% of truncated completions will abort the test
            { threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m' },
        ],
    },
    scenarios: {
        completions: {
            executor: 'ramping-vus',
            startVUs: 1,
            stages: [
                {duration: '1m', target: 8},
                {duration: '3m', target: 8},
                {duration: '1m', target: 0},
            ],
            gracefulRampDown: '30s',
        },
    },
};

export default function () {
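    // Note: every iteration replays the first conversation of the dataset,
    // so all virtual users send an identical request.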
    const conversation = data[0]
    const payload = {
        "messages": [
            {
                "role": "system",
                "content": conversation[0],
            },
            {
                "role": "user",
                "content": conversation[1],
            }
        ],
        "model": "model",
        "stream": false,
    }
    const res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), {
        headers: { 'Content-Type': 'application/json' },
    })

    check(res, {'success completion': (r) => r.status === 200})

    const completions = res.json()

    llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
    llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)

    llamacpp_completion_tokens.add(completions.usage.completion_tokens)
    llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)

    // tokens per second: completion tokens over the request duration (ms -> s)
    llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3)

    llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
    llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')

    sleep(0.3)
}