server: bench: PR feedback and improved k6 script configuration
commit 548bc9635a
parent 0b822b6a0f
2 changed files with 69 additions and 39 deletions
@@ -2,12 +2,18 @@
 Benchmark is using [k6](https://k6.io/).
 
-##### Install k6 - ubuntu
+##### Install k6
+
+Follow instruction from: https://k6.io/docs/get-started/installation/
+
+Example for ubuntu:
 
 ```shell
 snap install k6
 ```
 
-#### Downloading the ShareGPT dataset
+#### Download a dataset
+
+This dataset was originally proposed in [vLLM benchmarks](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md).
 
 ```shell
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
@@ -21,7 +27,7 @@ Example for PHI-2
 ```
 
 #### Start the server
-The server must listen on `localhost:8080`.
+The server must answer OAI Chat completion requests on `http://localhost:8080/v1` or according to the environment variable `SERVER_BENCH_URL`.
 
 Example:
 ```shell
@@ -36,13 +42,22 @@ server --host localhost --port 8080 \
     -ngl 33
 ```
 
-#### Run the bench
+#### Run the benchmark
 
 ```shell
 k6 run script.js
 ```
 
-#### Change the number of concurrent user
-in the `script.js`, change the ramping period according to your number of slots.
+The benchmark values can be overridden with:
+- `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1`
+- `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480`
+- `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
+
+Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
+
+```shell
+SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8
+```
 
 #### Metrics
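Putting the `SERVER_BENCH_*` overrides documented above together, a k6 invocation might look like the following sketch; the values shown simply repeat the documented defaults and can be changed freely:

```shell
# Sketch only: these values mirror the documented defaults.
SERVER_BENCH_URL=http://localhost:8080/v1 \
SERVER_BENCH_MODEL_ALIAS=my-model \
SERVER_BENCH_N_PROMPTS=480 \
k6 run script.js
```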
@@ -1,27 +1,44 @@
-import http from 'k6/http';
-import { check, sleep } from 'k6';
-import { SharedArray } from 'k6/data';
-import { Counter, Gauge, Rate } from 'k6/metrics';
+import http from 'k6/http'
+import {check, sleep} from 'k6'
+import {SharedArray} from 'k6/data'
+import {Counter, Gauge, Rate} from 'k6/metrics'
+
+// Server chat completions prefix
+const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1'
+
+// Number of total prompts in the dataset - default 10m / 10 seconds/request * number of users
+const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8
+
+// Model name to request
+const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'
+
+// Dataset path
+const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'
+
+export function setup() {
+    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path}`)
+}
 
 const data = new SharedArray('conversations', function () {
-    return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json'))
+    return JSON.parse(open(dataset_path))
 
         // Filter out the conversations with less than 2 turns.
         .filter(data => data["conversations"].length >= 2)
         // Only keep the first two turns of each conversation.
-        .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"]));
-});
+        .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"]))
+        // Keep only first n prompts
+        .slice(0, n_prompt)
+})
 
-const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens');
-const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens');
+const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens')
+const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens')
 
-const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds');
+const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds')
 
-const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter');
-const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter');
+const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
+const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
 
-const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate');
-const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate');
+const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate')
+const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')
 
 export const options = {
     thresholds: {
@@ -30,22 +47,12 @@ export const options = {
             {threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m'},
         ],
     },
-    scenarios: {
-        completions: {
-            executor: 'ramping-vus',
-            startVUs: 1,
-            stages: [
-                {duration: '1m', target: 8},
-                {duration: '3m', target: 8},
-                {duration: '1m', target: 0},
-            ],
-            gracefulRampDown: '30s',
-        },
-    },
-};
+    duration: '10m',
+    vus: 8,
+}
 
 export default function () {
-    const conversation = data[0]
+    const conversation = data[Math.floor(Math.random() * data.length)]
     const payload = {
         "messages": [
             {
@@ -57,15 +64,23 @@ export default function () {
                 "content": conversation[1],
             }
         ],
-        "model": "model",
+        "model": model,
         "stream": false,
     }
-    let res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), {
+
+    const body = JSON.stringify(payload)
+
+    console.debug(`request: ${body}`)
+
+    let res = http.post(`${server_url}/chat/completions`, body, {
         headers: {'Content-Type': 'application/json'},
+        timeout: '300s'
     })
 
     check(res, {'success completion': (r) => r.status === 200})
+
+    console.debug(`response: ${res.body}`)
 
     const completions = res.json()
 
     llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
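The script also reads `SERVER_BENCH_DATASET` for the dataset path, defaulting to `./ShareGPT_V3_unfiltered_cleaned_split.json`; this override is not listed in the README hunk above. A sketch of pointing the benchmark at a dataset stored elsewhere (the path below is hypothetical):

```shell
# Hypothetical location: adjust to wherever ShareGPT_V3_unfiltered_cleaned_split.json was saved.
SERVER_BENCH_DATASET=/data/ShareGPT_V3_unfiltered_cleaned_split.json k6 run script.js
```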