server: bench: remove llamacpp_completions_tokens_seconds as it includes prompt processing time and is misleading
server: bench: add max_tokens from SERVER_BENCH_MAX_TOKENS
server: bench: increase truncated rate to 80% before failing

parent 548bc9635a
commit ab0a59d6d3
2 changed files with 22 additions and 18 deletions

@@ -44,14 +44,16 @@ server --host localhost --port 8080 \
 #### Run the benchmark

 For 500 chat completion requests with 8 concurrent users over a maximum of 10 minutes, run:

 ```shell
-k6 run script.js
+k6 run script.js --duration 10m --iterations 500 --vus 8
 ```

 The benchmark values can be overridden with:
 - `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1`
 - `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480`
 - `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
+- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `1024`

 Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
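
Combining the two override mechanisms, a run capped via the new `SERVER_BENCH_MAX_TOKENS` variable could be invoked as sketched below; the URL and token values are illustrative, not defaults taken from the script:

```shell
# Illustrative invocation: override the server URL and the new max-tokens cap
# via environment variables, and pass the load parameters as k6 options.
SERVER_BENCH_URL=http://localhost:8080/v1 \
SERVER_BENCH_MAX_TOKENS=256 \
k6 run script.js --duration 10m --iterations 500 --vus 8
```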

@@ -66,7 +68,6 @@ Following metrics are available:
 - `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens`
 - `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens`
 - `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens`
-- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request time in second
 - `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'`
 - `llamacpp_completions_stop_rate` Rate of completions stopped, i.e. if `finish_reason === 'stop'`

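
To inspect these metrics after a run, the end-of-test summary (which includes the custom metrics) can be written to disk with k6's standard `--summary-export` flag; this is plain k6 usage, not part of the change, and the output filename below is arbitrary:

```shell
# Dump the end-of-test summary, including the llamacpp_* custom metrics,
# to a JSON file for later inspection.
k6 run script.js --summary-export bench-summary.json
```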

@@ -15,8 +15,11 @@ const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'
 // Dataset path
 const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'

+// Max tokens to predict
+const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512
+
 export function setup() {
-    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path}`)
+    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
 }

 const data = new SharedArray('conversations', function () {

@@ -32,8 +35,6 @@ const data = new SharedArray('conversations', function () {
 const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens')

-const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds')
-
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')


@@ -43,8 +44,8 @@ const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')
 export const options = {
     thresholds: {
         llamacpp_completions_truncated_rate: [
-            // more than 10% of truncated input will abort the test
-            {threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m'},
+            // more than 80% of truncated input will abort the test
+            {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
         ],
     },
     duration: '10m',

@@ -66,6 +67,7 @@ export default function () {
         ],
         "model": model,
         "stream": false,
+        "max_tokens": max_tokens
     }

     const body = JSON.stringify(payload)

@@ -79,21 +81,22 @@ export default function () {
     check(res, {'success completion': (r) => r.status === 200})

-    console.debug(`response: ${res.body}`)
+    if (res.status === 200) {
+        console.debug(`response: ${res.body}`)

-    const completions = res.json()
+        const completions = res.json()

-    llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
-    llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
+        llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
+        llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)

-    llamacpp_completion_tokens.add(completions.usage.completion_tokens)
-    llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
-
-    llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3)
-
-    llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
-    llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
+        llamacpp_completion_tokens.add(completions.usage.completion_tokens)
+        llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
+
+        llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
+        llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
+    } else {
+        console.error(`response: ${res.body}`)
+    }

     sleep(0.3)
 }