server: bench: remove llamacpp_completions_tokens_seconds as it includes prompt processing time and is misleading
server: bench: add max_tokens from SERVER_BENCH_MAX_TOKENS
server: bench: increase truncated rate to 80% before failing

parent 548bc9635a
commit ab0a59d6d3
2 changed files with 22 additions and 18 deletions

@@ -44,14 +44,16 @@ server --host localhost --port 8080 \
 #### Run the benchmark

 For 500 chat completion requests with 8 concurrent users over a maximum of 10 minutes, run:

 ```shell
-k6 run script.js
+k6 run script.js --duration 10m --iterations 500 --vus 8
 ```

 The benchmark values can be overridden with:
 - `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1`
 - `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480`
 - `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
+- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `1024`

 Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
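
Combining the two override mechanisms, a run capped via the new `SERVER_BENCH_MAX_TOKENS` variable could be invoked as sketched below; the URL and token values are illustrative, not defaults taken from the script:

```shell
# Illustrative invocation: override the server URL and the new max-tokens cap
# via environment variables, and pass the load parameters as k6 options.
SERVER_BENCH_URL=http://localhost:8080/v1 \
SERVER_BENCH_MAX_TOKENS=256 \
k6 run script.js --duration 10m --iterations 500 --vus 8
```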

@@ -66,7 +68,6 @@ Following metrics are available:
 - `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens`
 - `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens`
 - `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens`
-- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request time in second
 - `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'`
 - `llamacpp_completions_stop_rate` Rate of completions stopped, i.e. if `finish_reason === 'stop'`

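
To inspect these metrics after a run, the end-of-test summary (which includes the custom metrics) can be written to disk with k6's standard `--summary-export` flag; this is plain k6 usage, not part of the change, and the output filename below is arbitrary:

```shell
# Dump the end-of-test summary, including the llamacpp_* custom metrics,
# to a JSON file for later inspection.
k6 run script.js --summary-export bench-summary.json
```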

@@ -15,8 +15,11 @@ const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'
 // Dataset path
 const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'

+// Max tokens to predict
+const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512
+
 export function setup() {
-    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path}`)
+    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
 }

 const data = new SharedArray('conversations', function () {

@@ -32,8 +35,6 @@ const data = new SharedArray('conversations', function () {
 const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens')

-const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds')
-
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')


@@ -43,8 +44,8 @@ const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')
 export const options = {
     thresholds: {
         llamacpp_completions_truncated_rate: [
-            // more than 10% of truncated input will abort the test
-            {threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m'},
+            // more than 80% of truncated input will abort the test
+            {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
         ],
     },
     duration: '10m',

@@ -66,6 +67,7 @@ export default function () {
         ],
         "model": model,
         "stream": false,
+        "max_tokens": max_tokens
     }

     const body = JSON.stringify(payload)

@@ -79,21 +81,22 @@ export default function () {
     check(res, {'success completion': (r) => r.status === 200})

-    console.debug(`response: ${res.body}`)
+    if (res.status === 200) {
+        console.debug(`response: ${res.body}`)

-    const completions = res.json()
+        const completions = res.json()

-    llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
-    llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
+        llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
+        llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)

-    llamacpp_completion_tokens.add(completions.usage.completion_tokens)
-    llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
-
-    llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3)
-
-    llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
-    llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
+        llamacpp_completion_tokens.add(completions.usage.completion_tokens)
+        llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
+
+        llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
+        llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
+    } else {
+        console.error(`response: ${res.body}`)
+    }

     sleep(0.3)
 }