From 548bc9635a8326406f6a0382731902e132a3532d Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Sat, 9 Mar 2024 00:13:54 +0100
Subject: [PATCH] server: bench: PR feedback and improved k6 script configuration

---
 examples/server/bench/README.md | 27 ++++++++---
 examples/server/bench/script.js | 81 +++++++++++++++++++--------------
 2 files changed, 69 insertions(+), 39 deletions(-)

diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md
index b8edc8587..67367b810 100644
--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -2,12 +2,18 @@
 
 Benchmark is using [k6](https://k6.io/).
 
-##### Install k6 - ubuntu
+##### Install k6
+
+Follow the instructions from: https://k6.io/docs/get-started/installation/
+
+Example for Ubuntu:
 ```shell
 snap install k6
 ```
 
-#### Downloading the ShareGPT dataset
+#### Download a dataset
+
+This dataset was originally proposed in the [vLLM benchmarks](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md).
 
 ```shell
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
@@ -21,7 +27,7 @@ Example for PHI-2
 ```
 
 #### Start the server
-The server must listen on `localhost:8080`.
+The server must answer OAI chat completion requests on `http://localhost:8080/v1`, or on the URL set in the `SERVER_BENCH_URL` environment variable.
 
 Example:
 ```shell
@@ -36,13 +42,22 @@ server --host localhost --port 8080 \
   -ngl 33
 ```
 
-#### Run the bench
+#### Run the benchmark
+
 ```shell
 k6 run script.js
 ```
 
-#### Change the number of concurrent user
-in the `script.js`, change the ramping period according to your number of slots.
+The benchmark values can be overridden with:
+- `SERVER_BENCH_URL` server URL prefix for chat completions, default `http://localhost:8080/v1`
+- `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480`
+- `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
+
+Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
+
+```shell
+SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8
+```
 
 #### Metrics
 
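As a usage sketch for the overrides documented above: the script in the next diff also honours a `SERVER_BENCH_DATASET` variable for the dataset path, so a run can combine all the variables with the k6 CLI options. The port `8081`, the `phi-2` alias and the prompt count below are illustrative values, not defaults from the patch:

```shell
# Point the benchmark at a server on a non-default port, with an explicit
# model alias, dataset path and prompt budget (illustrative values).
SERVER_BENCH_URL=http://localhost:8081/v1 \
SERVER_BENCH_MODEL_ALIAS=phi-2 \
SERVER_BENCH_DATASET=./ShareGPT_V3_unfiltered_cleaned_split.json \
SERVER_BENCH_N_PROMPTS=100 \
k6 run script.js --vus 4 --duration 5m --iterations 100
```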
diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js
index c52eb182a..e2068fb92 100644
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -1,51 +1,58 @@
-import http from 'k6/http';
-import { check, sleep } from 'k6';
-import { SharedArray } from 'k6/data';
-import { Counter, Gauge, Rate } from 'k6/metrics';
+import http from 'k6/http'
+import {check, sleep} from 'k6'
+import {SharedArray} from 'k6/data'
+import {Counter, Gauge, Rate} from 'k6/metrics'
+
+// Server chat completions prefix
+const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1'
+
+// Number of total prompts in the dataset - default 10m / 10 seconds/request * number of users
+const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8
+
+// Model name to request
+const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'
+
+// Dataset path
+const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'
+
+export function setup() {
+    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path}`)
+}
 
 const data = new SharedArray('conversations', function () {
-    return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json'))
-
+    return JSON.parse(open(dataset_path))
         // Filter out the conversations with less than 2 turns.
         .filter(data => data["conversations"].length >= 2)
         // Only keep the first two turns of each conversation.
-        .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"]));
-});
+        .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"]))
+        // Keep only first n prompts
+        .slice(0, n_prompt)
+})
 
-const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens');
-const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens');
+const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens')
+const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens')
 
-const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds');
+const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds')
 
-const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter');
-const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter');
+const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
+const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
 
-const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate');
-const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate');
+const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate')
+const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')
 
 export const options = {
     thresholds: {
         llamacpp_completions_truncated_rate: [
             // more than 10% of truncated input will abort the test
-            { threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m' },
+            {threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m'},
         ],
     },
-    scenarios: {
-        completions: {
-            executor: 'ramping-vus',
-            startVUs: 1,
-            stages: [
-                {duration: '1m', target: 8},
-                {duration: '3m', target: 8},
-                {duration: '1m', target: 0},
-            ],
-            gracefulRampDown: '30s',
-        },
-    },
-};
+    duration: '10m',
+    vus: 8,
+}
 
 export default function () {
-    const conversation = data[0]
+    const conversation = data[Math.floor(Math.random() * data.length)]
     const payload = {
         "messages": [
             {
@@ -57,15 +64,23 @@ export default function () {
                 "content": conversation[1],
             }
         ],
-        "model": "model",
+        "model": model,
         "stream": false,
     }
-    let res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), {
-        headers: { 'Content-Type': 'application/json' },
+
+    const body = JSON.stringify(payload)
+
+    console.debug(`request: ${body}`)
+
+    let res = http.post(`${server_url}/chat/completions`, body, {
+        headers: {'Content-Type': 'application/json'},
+        timeout: '300s'
     })
 
     check(res, {'success completion': (r) => r.status === 200})
 
+    console.debug(`response: ${res.body}`)
+
     const completions = res.json()
 
     llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
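For reference, the request issued by the k6 `default` function above is a plain OAI-style chat completion call, so the endpoint can be sanity-checked before a run. A minimal sketch with curl, assuming the default `http://localhost:8080/v1` prefix and the `my-model` alias; the message contents are illustrative placeholders:

```shell
# One-off request equivalent to what the benchmark script posts.
curl -s http://localhost:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "messages": [
          {"role": "system", "content": "You write short answers."},
          {"role": "user", "content": "Say hello."}
        ],
        "model": "my-model",
        "stream": false
      }'
```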