diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md
index 108eb56ba..a53ad64d7 100644
--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -55,6 +55,10 @@ The benchmark values can be overridden with:
 - `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
 - `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `512`
 - `SERVER_BENCH_DATASET` path to the benchmark dataset file
+- `SERVER_BENCH_MAX_PROMPT_TOKENS` maximum prompt tokens; dataset conversations with longer prompts are filtered out, default `1024`
+- `SERVER_BENCH_MAX_CONTEXT` maximum context size (prompt + predicted tokens); dataset conversations exceeding it are filtered out, default `2048`
+
+Note: the local tokenizer is only a rough approximation (a simple split on whitespace and basic punctuation), so the real number of tokens will differ.
 
 Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
 
@@ -62,7 +66,7 @@ Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
 SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8
 ```
 
-To [debug http request](https://k6.io/docs/using-k6/http-debugging/) use `--http-debug="full"`
+To [debug http requests](https://k6.io/docs/using-k6/http-debugging/), use `--http-debug="full"`.
 
 #### Metrics
 
diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js
index 7cd44e070..3a0594e23 100644
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -18,12 +18,18 @@ const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : '
 // Max tokens to predict
 const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512
 
+// Max prompt tokens (longer prompts are filtered out of the dataset)
+const n_prompt_tokens = __ENV.SERVER_BENCH_MAX_PROMPT_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_PROMPT_TOKENS) : 1024
+
+// Max slot context (prompt + predicted tokens must fit within this limit)
+const n_ctx_slot = __ENV.SERVER_BENCH_MAX_CONTEXT ? parseInt(__ENV.SERVER_BENCH_MAX_CONTEXT) : 2048
+
 export function setup() {
     console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
 }
 
 const data = new SharedArray('conversations', function () {
-    const tokenizer = (message) => message.split(" ")
+    const tokenizer = (message) => message.split(/[\s,'".?]/)
 
     return JSON.parse(open(dataset_path))
         // Filter out the conversations with less than 2 turns.
@@ -39,7 +45,7 @@ const data = new SharedArray('conversations', function () {
         // Filter out too short sequences
         .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4)
         // Filter out too long sequences.
-        .filter(conv => conv.n_prompt_tokens <= 1024 && conv.n_prompt_tokens + conv.n_completion_tokens <= 2048)
+        .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot)
         // Keep only first n prompts
         .slice(0, n_prompt)
 })
@@ -106,7 +112,7 @@ export default function () {
 
         llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
     } else {
-        console.error(`response: ${res.body}`)
+        console.error(`response: ${res.body} request=${JSON.stringify(payload)}`)
     }
 
     sleep(0.3)
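
For context, here is a minimal sketch, runnable with plain Node.js rather than k6, of how the two new variables and the approximate tokenizer interact when the dataset is filtered. Only the defaults, the split regex, and the filter thresholds come from the patch above; the sample conversation object and its field names are hypothetical.

```js
// Sketch only: plain Node.js stand-in for the SharedArray filtering in script.js.
const n_prompt_tokens = parseInt(process.env.SERVER_BENCH_MAX_PROMPT_TOKENS || '1024')
const n_ctx_slot = parseInt(process.env.SERVER_BENCH_MAX_CONTEXT || '2048')

// Approximate tokenizer from the patch: split on whitespace and basic punctuation.
// It can produce empty strings and will not match a real tokenizer's counts.
const tokenizer = (message) => message.split(/[\s,'".?]/)

// Hypothetical conversations; the benchmark reads these from SERVER_BENCH_DATASET instead.
const conversations = [
    {prompt: 'What is the capital of France?', completion: 'The capital of France is Paris.'},
]

const kept = conversations
    .map(conv => ({
        ...conv,
        n_prompt_tokens: tokenizer(conv.prompt).length,
        n_completion_tokens: tokenizer(conv.completion).length,
    }))
    // Drop too-short sequences (same bound as script.js).
    .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4)
    // Drop sequences that exceed the prompt or context limits.
    .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens
        && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot)

console.log(`kept ${kept.length} of ${conversations.length} conversations`)
```

Running it with, say, `SERVER_BENCH_MAX_PROMPT_TOKENS=4` drops the sample conversation, mirroring how the k6 script trims the dataset before the load test starts.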