diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md
index 108eb56ba..a53ad64d7 100644
--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -55,6 +55,10 @@ The benchmark values can be overridden with:
 - `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
 - `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `512`
 - `SERVER_BENCH_DATASET` path to the benchmark dataset file
+- `SERVER_BENCH_MAX_PROMPT_TOKENS` maximum prompt tokens; dataset conversations with longer prompts are filtered out, default `1024`
+- `SERVER_BENCH_MAX_CONTEXT` maximum context size (prompt + predicted tokens); dataset conversations exceeding it are filtered out, default `2048`
+
+Note: the local tokenizer is only a rough approximation (a simple split on whitespace and basic punctuation), so the real number of tokens will differ.
 
 Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
 
@@ -62,7 +66,7 @@ Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
 SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8
 ```
 
-To [debug http request](https://k6.io/docs/using-k6/http-debugging/) use `--http-debug="full"`
+To [debug http requests](https://k6.io/docs/using-k6/http-debugging/), use `--http-debug="full"`.
 
 #### Metrics
 
diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js
index 7cd44e070..3a0594e23 100644
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -18,12 +18,18 @@ const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : '
 // Max tokens to predict
 const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512
 
+// Max prompt tokens (longer prompts are filtered out of the dataset)
+const n_prompt_tokens = __ENV.SERVER_BENCH_MAX_PROMPT_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_PROMPT_TOKENS) : 1024
+
+// Max slot context (prompt + predicted tokens must fit within this limit)
+const n_ctx_slot = __ENV.SERVER_BENCH_MAX_CONTEXT ? parseInt(__ENV.SERVER_BENCH_MAX_CONTEXT) : 2048
+
 export function setup() {
     console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
 }
 
 const data = new SharedArray('conversations', function () {
-    const tokenizer = (message) => message.split(" ")
+    const tokenizer = (message) => message.split(/[\s,'".?]/)
 
     return JSON.parse(open(dataset_path))
         // Filter out the conversations with less than 2 turns.
@@ -39,7 +45,7 @@ const data = new SharedArray('conversations', function () {
         // Filter out too short sequences
         .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4)
         // Filter out too long sequences.
-        .filter(conv => conv.n_prompt_tokens <= 1024 && conv.n_prompt_tokens + conv.n_completion_tokens <= 2048)
+        .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot)
         // Keep only first n prompts
         .slice(0, n_prompt)
 })
@@ -106,7 +112,7 @@ export default function () {
 
         llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
     } else {
-        console.error(`response: ${res.body}`)
+        console.error(`response: ${res.body} request=${JSON.stringify(payload)}`)
     }
 
     sleep(0.3)
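
For context, here is a minimal sketch, runnable with plain Node.js rather than k6, of how the two new variables and the approximate tokenizer interact when the dataset is filtered. Only the defaults, the split regex, and the filter thresholds come from the patch above; the sample conversation object and its field names are hypothetical.

```js
// Sketch only: plain Node.js stand-in for the SharedArray filtering in script.js.
const n_prompt_tokens = parseInt(process.env.SERVER_BENCH_MAX_PROMPT_TOKENS || '1024')
const n_ctx_slot = parseInt(process.env.SERVER_BENCH_MAX_CONTEXT || '2048')

// Approximate tokenizer from the patch: split on whitespace and basic punctuation.
// It can produce empty strings and will not match a real tokenizer's counts.
const tokenizer = (message) => message.split(/[\s,'".?]/)

// Hypothetical conversations; the benchmark reads these from SERVER_BENCH_DATASET instead.
const conversations = [
    {prompt: 'What is the capital of France?', completion: 'The capital of France is Paris.'},
]

const kept = conversations
    .map(conv => ({
        ...conv,
        n_prompt_tokens: tokenizer(conv.prompt).length,
        n_completion_tokens: tokenizer(conv.completion).length,
    }))
    // Drop too-short sequences (same bound as script.js).
    .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4)
    // Drop sequences that exceed the prompt or context limits.
    .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens
        && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot)

console.log(`kept ${kept.length} of ${conversations.length} conversations`)
```

Running it with, say, `SERVER_BENCH_MAX_PROMPT_TOKENS=4` drops the sample conversation, mirroring how the k6 script trims the dataset before the load test starts.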