server: bench: PR feedback and improved k6 script configuration

2024-03-09 00:13:54 +01:00 · 2024-03-09 00:13:54 +01:00 · 548bc9635a
commit 548bc9635a
parent 0b822b6a0f
2 changed files with 69 additions and 39 deletions
--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@ -2,12 +2,18 @@

 Benchmark is using [k6](https://k6.io/).

-##### Install k6 - ubuntu
+##### Install k6
+
+Follow the instructions at https://k6.io/docs/get-started/installation/
+
+Example for Ubuntu:
 ```shell
 snap install k6
 ```

-#### Downloading the ShareGPT dataset
+#### Download a dataset
+
+This dataset was originally proposed in [vLLM benchmarks](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md).

 ```shell
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
@ -21,7 +27,7 @@ Example for PHI-2
 ```

 #### Start the server
-The server must listen on `localhost:8080`.
+The server must answer OpenAI-compatible chat completion requests at `http://localhost:8080/v1`, or at the URL prefix given by the environment variable `SERVER_BENCH_URL`.

 Example:
 ```shell
@ -36,13 +42,22 @@ server --host localhost --port 8080 \
  -ngl 33
 ```

-#### Run the bench
+#### Run the benchmark
+
 ```shell
 k6 run script.js
 ```

-#### Change the number of concurrent user
-in the `script.js`, change the ramping period according to your number of slots.
+The benchmark values can be overridden with the following environment variables:
+- `SERVER_BENCH_URL`: server URL prefix for chat completions, default `http://localhost:8080/v1`
+- `SERVER_BENCH_N_PROMPTS`: number of prompts kept from the dataset, from which each request randomly selects one, default `480`
+- `SERVER_BENCH_MODEL_ALIAS`: model alias to pass in the completion request, default `my-model`
+
+Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
+
+```shell
+SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8
+```

 #### Metrics

--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@ -1,51 +1,58 @@
-import http from 'k6/http';
-import { check, sleep } from 'k6';
-import { SharedArray } from 'k6/data';
-import { Counter, Gauge, Rate } from 'k6/metrics';
+import http from 'k6/http'
+import {check, sleep} from 'k6'
+import {SharedArray} from 'k6/data'
+import {Counter, Gauge, Rate} from 'k6/metrics'
+
+// URL prefix of the server's chat completions endpoint
+const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1'
+
+// Number of prompts to keep from the dataset - default: 10 min / (10 s per request) * 8 users = 480
+const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8
+
+// Model name to request
+const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'
+
+// Dataset path
+const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'
+
+export function setup() {
+    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path}`)
+}

 const data = new SharedArray('conversations', function () {
-    return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json'))
-
+    return JSON.parse(open(dataset_path))
        // Filter out the conversations with less than 2 turns.
        .filter(data => data["conversations"].length >= 2)
        // Only keep the first two turns of each conversation.
-        .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"]));
-});
+        .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"]))
+        // Keep only first n prompts
+        .slice(0, n_prompt)
+})

-const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens');
-const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens');
+const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens')
+const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens')

-const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds');
+const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds')

-const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter');
-const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter');
+const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
+const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')

-const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate');
-const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate');
+const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate')
+const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')

 export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: [
            // more than 10% of truncated input will abort the test
-            { threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m' },
+            {threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m'},
        ],
    },
-    scenarios: {
-        completions: {
-            executor: 'ramping-vus',
-            startVUs: 1,
-            stages: [
-                {duration: '1m', target: 8},
-                {duration: '3m', target: 8},
-                {duration: '1m', target: 0},
-            ],
-            gracefulRampDown: '30s',
-        },
-    },
-};
+    duration: '10m',
+    vus: 8,
+}

 export default function () {
-    const conversation = data[0]
+    const conversation = data[Math.floor(Math.random() * data.length)]
    const payload = {
        "messages": [
            {
@ -57,15 +64,23 @@ export default function () {
                "content": conversation[1],
            }
        ],
-        "model": "model",
+        "model": model,
        "stream": false,
    }
-    let res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), {
-        headers: { 'Content-Type': 'application/json' },
+
+    const body = JSON.stringify(payload)
+
+    console.debug(`request: ${body}`)
+
+    let res = http.post(`${server_url}/chat/completions`, body, {
+        headers: {'Content-Type': 'application/json'},
+        timeout: '300s'
    })

    check(res, {'success completion': (r) => r.status === 200})

+    console.debug(`response: ${res.body}`)
+
    const completions = res.json()

    llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)