server: bench: allow filtering out conversations in the dataset based on env variables

Pierrick HYMBERT 2024-03-09 10:57:14 +01:00
parent a4b0d107d3
commit 29c635b411
2 changed files with 14 additions and 4 deletions


@@ -55,6 +55,10 @@ The benchmark values can be overridden with:
- `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `512`
- `SERVER_BENCH_DATASET` path to the benchmark dataset file
+- `SERVER_BENCH_MAX_PROMPT_TOKENS` maximum number of prompt tokens, conversations with longer prompts are filtered out of the dataset, default `1024`
+- `SERVER_BENCH_MAX_CONTEXT` maximum context size (prompt + predicted tokens) of the completion request, larger conversations are filtered out of the dataset, default `2048`
+
+Note: the local tokenizer is a naive split on whitespace and punctuation, so the real number of tokens will differ.
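For example, the filters can be tightened for a smaller context window by setting the variables before the run (illustrative values, not part of the commit):

```
SERVER_BENCH_MAX_PROMPT_TOKENS=512 SERVER_BENCH_MAX_CONTEXT=1024 k6 run script.js
```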
Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
@@ -62,7 +66,7 @@ Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8
```
-To [debug http request](https://k6.io/docs/using-k6/http-debugging/) use `--http-debug="full"`
+To [debug http requests](https://k6.io/docs/using-k6/http-debugging/) use `--http-debug="full"`.
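For instance, combining the debug flag with the invocation above (illustrative, not part of the commit):

```
k6 run script.js --http-debug="full"
```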
#### Metrics


@@ -18,12 +18,18 @@ const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : '
// Max tokens to predict
const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512
+// Max prompt tokens
+const n_prompt_tokens = __ENV.SERVER_BENCH_MAX_PROMPT_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_PROMPT_TOKENS) : 1024
+
+// Max slot context
+const n_ctx_slot = __ENV.SERVER_BENCH_MAX_CONTEXT ? parseInt(__ENV.SERVER_BENCH_MAX_CONTEXT) : 2048
export function setup() {
console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
}
const data = new SharedArray('conversations', function () {
-const tokenizer = (message) => message.split(" ")
+const tokenizer = (message) => message.split(/[\s,'".?]/)
return JSON.parse(open(dataset_path))
// Filter out the conversations with less than 2 turns.
@@ -39,7 +45,7 @@ const data = new SharedArray('conversations', function () {
// Filter out too short sequences
.filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4)
// Filter out too long sequences.
-.filter(conv => conv.n_prompt_tokens <= 1024 && conv.n_prompt_tokens + conv.n_completion_tokens <= 2048)
+.filter(conv => conv.n_prompt_tokens <= n_prompt_tokens && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot)
// Keep only first n prompts
.slice(0, n_prompt)
})
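The combined effect of the naive tokenizer and the two limits may be easier to see outside of k6. Below is a minimal, hypothetical sketch in plain Node.js (not part of the commit; the real dataset entries have a different structure, and the names here only mirror the script): it estimates token counts with the same regex split and applies the same filters.

```js
// Minimal standalone sketch (plain Node.js, not the actual k6 script).
const n_prompt_tokens = parseInt(process.env.SERVER_BENCH_MAX_PROMPT_TOKENS || '1024')
const n_ctx_slot = parseInt(process.env.SERVER_BENCH_MAX_CONTEXT || '2048')

// Same naive tokenizer as script.js: split on whitespace and a few punctuation marks.
const tokenizer = (message) => message.split(/[\s,'".?]/)

// Hypothetical conversations, already reduced to one prompt and one completion string.
const conversations = [
    {prompt: "What is the capital of France and why is it famous?",
     completion: "Paris is the capital, known for art, food and history."},
    {prompt: "word ".repeat(2000), // prompt far above the token limit
     completion: "This completion is long enough to pass the minimum length filter."},
]

const kept = conversations
    .map(conv => ({
        ...conv,
        n_prompt_tokens: tokenizer(conv.prompt).length,
        n_completion_tokens: tokenizer(conv.completion).length,
    }))
    // Same filters as in script.js: drop too short, then too long, sequences.
    .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4)
    .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens
        && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot)

console.log(`kept ${kept.length} of ${conversations.length} conversations`) // -> kept 1 of 2
```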
@@ -106,7 +112,7 @@ export default function () {
llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
} else {
-console.error(`response: ${res.body}`)
+console.error(`response: ${res.body} request=${payload}`)
}
sleep(0.3)