server: bench: remove llamacpp_completions_tokens_seconds as it includes prompt processing time and is misleading
server: bench: add max_tokens from SERVER_BENCH_MAX_TOKENS
server: bench: increase truncated rate threshold to 80% before failing
parent 548bc9635a · commit ab0a59d6d3
2 changed files with 22 additions and 18 deletions
Benchmark README:

````diff
@@ -44,14 +44,16 @@ server --host localhost --port 8080 \
 
 #### Run the benchmark
 
+For 500 chat completion requests with 8 concurrent users over a maximum of 10 minutes, run:
 ```shell
-k6 run script.js
+k6 run script.js --duration 10m --iterations 500 --vus 8
 ```
 
 The benchmark values can be overridden with:
 - `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1`
 - `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480`
 - `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
+- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default `1024`
 
 Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
 
@@ -66,7 +68,6 @@ Following metrics are available:
 - `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens`
 - `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens`
 - `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens`
-- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request time in seconds
 - `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'`
 - `llamacpp_completions_stop_rate` Rate of completions stopped, i.e. if `finish_reason === 'stop'`
 
````
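As a usage sketch of the overrides documented in the first hunk above (variable names and k6 flags come from the README; the specific values here are only illustrative), the environment variables combine with the k6 options on a single invocation:

```shell
# Override the prompt count and the new max_tokens cap, then shape the load
# with k6 options (example values only).
SERVER_BENCH_N_PROMPTS=100 \
SERVER_BENCH_MAX_TOKENS=256 \
k6 run script.js --duration 10m --iterations 500 --vus 8
```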
Benchmark script (`script.js`):

```diff
@@ -15,8 +15,11 @@ const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS :
 // Dataset path
 const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'
 
+// Max tokens to predict
+const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512
+
 export function setup() {
-    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path}`)
+    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
 }
 
 const data = new SharedArray('conversations', function () {
@@ -32,8 +35,6 @@ const data = new SharedArray('conversations', function () {
 const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens')
 
-const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds')
-
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
 
@@ -43,8 +44,8 @@ const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')
 export const options = {
     thresholds: {
         llamacpp_completions_truncated_rate: [
-            // more than 10% of truncated input will abort the test
-            {threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m'},
+            // more than 80% of truncated input will abort the test
+            {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
         ],
     },
     duration: '10m',
@@ -66,6 +67,7 @@ export default function () {
         ],
         "model": model,
         "stream": false,
+        "max_tokens": max_tokens
     }
 
     const body = JSON.stringify(payload)
@@ -79,6 +81,7 @@ export default function () {
 
     check(res, {'success completion': (r) => r.status === 200})
 
+    if (res.status === 200) {
     console.debug(`response: ${res.body}`)
 
     const completions = res.json()
@@ -89,11 +92,11 @@ export default function () {
     llamacpp_completion_tokens.add(completions.usage.completion_tokens)
     llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
 
-    llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3)
-
     llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
     llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
-
+    } else {
+        console.error(`response: ${res.body}`)
+    }
 
     sleep(0.3)
 }
```
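Since the script reads its configuration from `__ENV` (first hunk above), the same values can also be passed through k6's `-e` flag instead of the shell environment. A minimal sketch, with illustrative values:

```shell
# -e values are exposed to the script via __ENV, same as exported shell variables.
k6 run -e SERVER_BENCH_MAX_TOKENS=256 -e SERVER_BENCH_MODEL_ALIAS=my-model script.js --duration 10m --iterations 500 --vus 8
```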