server: bench: remove llamacpp_completions_tokens_seconds as it includes prompt processing time and is misleading

server: bench: add max_tokens from SERVER_BENCH_MAX_TOKENS

server: bench: increase truncated rate threshold to 80% before failing
Pierrick HYMBERT 2024-03-09 01:09:56 +01:00
parent 548bc9635a
commit ab0a59d6d3
2 changed files with 22 additions and 18 deletions
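
For context on the first change: the removed gauge divided `usage.completion_tokens` by `res.timings.duration`, essentially the full HTTP request time, which covers prompt processing (prefill) as well as token generation. A minimal sketch with made-up numbers (purely illustrative, not part of the commit) shows why that ratio can understate generation speed:

```js
// Hypothetical timings for one request: 4 s spent processing a long prompt (prefill),
// then 1 s spent generating 100 completion tokens.
const completion_tokens = 100
const request_seconds = 5                                  // what res.timings.duration covers (prefill + decode)
const decode_seconds = 1

const reported_rate = completion_tokens / request_seconds  // 20 tokens/s, dominated by prefill
const generation_rate = completion_tokens / decode_seconds // 100 tokens/s, the actual decode speed

console.log({reported_rate, generation_rate})
```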

README.md

@@ -44,14 +44,16 @@ server --host localhost --port 8080 \
#### Run the benchmark
+For 500 chat completion requests with 8 concurrent users during a maximum of 10 minutes, run:
```shell
-k6 run script.js
+k6 run script.js --duration 10m --iterations 500 --vus 8
```
The benchmark values can be overridden with:
- `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1`
- `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480`
- `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
+- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `1024`
Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
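
These overrides reach the script through k6's `__ENV` map; each `SERVER_BENCH_*` variable falls back to a default when unset, as the diff below shows for `SERVER_BENCH_MAX_TOKENS`. A short sketch of that pattern, with defaults taken from the script (note that the script's default for max tokens, `512`, differs from the `1024` documented above); values can be supplied from the shell environment or with k6's `-e NAME=VALUE` flag:

```js
// Sketch of the __ENV fallback pattern used throughout the script (see the diffs below).
const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1'
const n_prompt   = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 480
const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512
```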
@@ -66,7 +68,6 @@ Following metrics are available:
- `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens`
- `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens`
- `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens`
-- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request time in seconds
- `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'`
- `llamacpp_completions_stop_rate` Rate of completions stopped, i.e. if `finish_reason === 'stop'`
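
These are k6 custom metrics: Gauges record the latest per-request value, Counters accumulate totals, and Rates record the fraction of `true` samples. The truncated-completion rate is the metric the third change thresholds at 80%. A minimal, self-contained sketch (a simplification of the real script, not its actual code) of how a `Rate` metric drives an `abortOnFail` threshold:

```js
import {Rate} from 'k6/metrics'

// Rate metrics record booleans; their value is the fraction of `true` samples.
const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate')

export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: [
            // abort the run (after a 1 minute grace period) once more than 80%
            // of completions have finished with finish_reason === 'length'
            {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
        ],
    },
}

export default function () {
    // In the real script finish_reason comes from the chat completion response;
    // here it is hard-coded so the sketch stays self-contained.
    const finish_reason = Math.random() < 0.9 ? 'length' : 'stop'
    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
}
```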

script.js

@@ -15,8 +15,11 @@ const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS :
// Dataset path
const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'
+// Max tokens to predict
+const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512
export function setup() {
-    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path}`)
+    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
}
const data = new SharedArray('conversations', function () {
@@ -32,8 +35,6 @@ const data = new SharedArray('conversations', function () {
const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens')
const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens')
-const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds')
const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -43,8 +44,8 @@ const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate'
export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: [
-            // more than 10% of truncated input will abort the test
-            {threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m'},
+            // more than 80% of truncated input will abort the test
+            {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
        ],
    },
    duration: '10m',
@@ -66,6 +67,7 @@ export default function () {
        ],
        "model": model,
        "stream": false,
+        "max_tokens": max_tokens
    }
    const body = JSON.stringify(payload)
@@ -79,6 +81,7 @@ export default function () {
    check(res, {'success completion': (r) => r.status === 200})
+    if (res.status === 200) {
        console.debug(`response: ${res.body}`)
        const completions = res.json()
@@ -89,11 +92,11 @@
        llamacpp_completion_tokens.add(completions.usage.completion_tokens)
        llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
-        llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3)
        llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
        llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
+    } else {
+        console.error(`response: ${res.body}`)
+    }
    sleep(0.3)
}