server: bench: PR feedback and improved k6 script configuration

Pierrick HYMBERT 2024-03-09 00:13:54 +01:00
parent 0b822b6a0f
commit 548bc9635a
2 changed files with 69 additions and 39 deletions


@@ -2,12 +2,18 @@
 Benchmark is using [k6](https://k6.io/).
-##### Install k6 - ubuntu
+##### Install k6
+Follow the instructions from: https://k6.io/docs/get-started/installation/
+Example for ubuntu:
 ```shell
 snap install k6
 ```
-#### Downloading the ShareGPT dataset
+#### Download a dataset
+This dataset was originally proposed in [vLLM benchmarks](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md).
 ```shell
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
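For reference, the benchmark script further down in this commit only reads the `conversations[...]["value"]` fields of each dataset entry. Assuming `python3` is available, the shape of the first entry can be inspected with:

```shell
python3 -c "import json; d = json.load(open('ShareGPT_V3_unfiltered_cleaned_split.json')); print(json.dumps(d[0]['conversations'][:2], indent=2))"
```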
@@ -21,7 +27,7 @@ Example for PHI-2
 ```
 #### Start the server
-The server must listen on `localhost:8080`.
+The server must answer OAI chat completion requests on `http://localhost:8080/v1`, or on the URL set in the environment variable `SERVER_BENCH_URL`.
 Example:
 ```shell
@@ -36,13 +42,22 @@ server --host localhost --port 8080 \
   -ngl 33
 ```
-#### Run the bench
+#### Run the benchmark
 ```shell
 k6 run script.js
 ```
-#### Change the number of concurrent user
-in the `script.js`, change the ramping period according to your number of slots.
+The benchmark values can be overridden with:
+- `SERVER_BENCH_URL` server URL prefix for chat completions, default `http://localhost:8080/v1`
+- `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480`
+- `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
+Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
+```shell
+SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8
+```
 #### Metrics
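Before running the bench, it can help to confirm that the server really answers on the OAI-compatible chat completions route. A hand-written request of roughly the shape the script sends (the model alias and messages below are placeholder values) would be:

```shell
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "my-model",
          "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Say hello."}
          ],
          "stream": false
        }'
```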


@@ -1,27 +1,44 @@
-import http from 'k6/http';
-import { check, sleep } from 'k6';
-import { SharedArray } from 'k6/data';
-import { Counter, Gauge, Rate } from 'k6/metrics';
+import http from 'k6/http'
+import {check, sleep} from 'k6'
+import {SharedArray} from 'k6/data'
+import {Counter, Gauge, Rate} from 'k6/metrics'
+// Server chat completions prefix
+const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1'
+// Number of total prompts in the dataset - default 10m / 10 seconds/request * number of users
+const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8
+// Model name to request
+const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'
+// Dataset path
+const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'
+export function setup() {
+    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path}`)
+}
 const data = new SharedArray('conversations', function () {
-    return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json'))
+    return JSON.parse(open(dataset_path))
         // Filter out the conversations with less than 2 turns.
         .filter(data => data["conversations"].length >= 2)
         // Only keep the first two turns of each conversation.
-        .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"]));
-});
+        .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"]))
+        // Keep only first n prompts
+        .slice(0, n_prompt)
+})
-const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens');
-const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens');
-const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds');
-const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter');
-const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter');
-const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate');
-const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate');
+const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens')
+const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens')
+const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds')
+const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
+const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
+const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate')
+const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')
 export const options = {
     thresholds: {
@@ -30,22 +47,12 @@ export const options = {
             {threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m'},
         ],
     },
-    scenarios: {
-        completions: {
-            executor: 'ramping-vus',
-            startVUs: 1,
-            stages: [
-                {duration: '1m', target: 8},
-                {duration: '3m', target: 8},
-                {duration: '1m', target: 0},
-            ],
-            gracefulRampDown: '30s',
-        },
-    },
-};
+    duration: '10m',
+    vus: 8,
+}
 export default function () {
-    const conversation = data[0]
+    const conversation = data[Math.floor(Math.random() * data.length)]
     const payload = {
         "messages": [
             {
@@ -57,15 +64,23 @@ export default function () {
                 "content": conversation[1],
             }
         ],
-        "model": "model",
+        "model": model,
         "stream": false,
     }
-    let res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), {
+    const body = JSON.stringify(payload)
+    console.debug(`request: ${body}`)
+    let res = http.post(`${server_url}/chat/completions`, body, {
         headers: {'Content-Type': 'application/json'},
+        timeout: '300s'
     })
     check(res, {'success completion': (r) => r.status === 200})
+    console.debug(`response: ${res.body}`)
     const completions = res.json()
     llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
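The custom `llamacpp_*` metrics recorded here end up in the k6 end-of-test summary. Assuming a k6 build that supports the flag, they can also be exported for post-processing (the output file name is arbitrary):

```shell
k6 run script.js --summary-export=benchmark_summary.json
```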