parent
76e868821a
commit
68d1d8fe28
2 changed files with 148 additions and 0 deletions
64
examples/server/bench/README.md
Normal file
@@ -0,0 +1,64 @@
### Server benchmark tools

The benchmark uses [k6](https://k6.io/).

##### Install k6 on Ubuntu

```shell
snap install k6
```

#### Download the ShareGPT dataset

```shell
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```

#### Download a model

Example for PHI-2:

```shell
../../../scripts/hf.sh --repo ggml-org/models --file phi-2/ggml-model-q4_0.gguf
```

#### Start the server

The server must listen on `localhost:8080`.

Example:

```shell
server --host localhost --port 8080 \
    --model ggml-model-q4_0.gguf \
    --cont-batching \
    --metrics \
    --parallel 8 \
    --batch-size 512 \
    --ctx-size 4096 \
    --log-format text \
    -ngl 33
```

`--parallel 8` matches the 8 virtual users that `script.js` ramps up to by default.

#### Run the bench

```shell
k6 run script.js
```

#### Change the number of concurrent users

In `script.js`, change the ramping period and target according to your number of slots, as in the sketch below.
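
For example, assuming the server was started with `--parallel 16` (the slot count here is an assumption; match it to your own setup), the scenario in `script.js` could be changed to:

```js
// Sketch of the script.js options for a server started with --parallel 16.
export const options = {
    scenarios: {
        completions: {
            executor: 'ramping-vus',
            startVUs: 1,
            stages: [
                {duration: '1m', target: 16}, // ramp up to one VU per slot
                {duration: '3m', target: 16}, // hold the plateau
                {duration: '1m', target: 0},  // ramp down
            ],
            gracefulRampDown: '30s',
        },
    },
};
```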

#### Metrics

The following metrics are available:

- `llamacpp_prompt_tokens` Gauge of OAI response `usage.prompt_tokens`
- `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens`
- `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens`
- `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens`
- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request duration in seconds
- `llamacpp_completions_truncated_rate` Rate of truncated completions, i.e. `finish_reason === 'length'`
- `llamacpp_completions_stop_rate` Rate of completions that stopped naturally, i.e. `finish_reason === 'stop'`

The script will fail if too many completions are truncated; see the `llamacpp_completions_truncated_rate` threshold.

K6 metrics can be compared against the [server metrics](../README.md) with:

```shell
curl http://localhost:8080/metrics
```
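
To keep k6's end-of-test summary around for that comparison, one option (a sketch, not part of the current script) is k6's `handleSummary()` hook, added to `script.js`:

```js
// Optional addition to script.js: write the end-of-test summary to a JSON
// file so it can be compared offline with the server's /metrics output.
// The output filename is arbitrary.
export function handleSummary(data) {
    return {
        'summary.json': JSON.stringify(data, null, 2),
        stdout: '\nbench finished, summary written to summary.json\n',
    };
}
```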
84
examples/server/bench/script.js
Normal file
@@ -0,0 +1,84 @@
import http from 'k6/http';
import { check, sleep } from 'k6';
import { SharedArray } from 'k6/data';
import { Counter, Gauge, Rate } from 'k6/metrics';

const data = new SharedArray('conversations', function () {
    return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json'))

        // Filter out the conversations with less than 2 turns.
        .filter(conversation => conversation["conversations"].length >= 2)
        // Only keep the first two turns of each conversation.
        .map(conversation => [conversation["conversations"][0]["value"], conversation["conversations"][1]["value"]]);
});

const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens');
const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens');

const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds');

const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter');
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter');

const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate');
const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate');

export const options = {
    thresholds: {
        llamacpp_completions_truncated_rate: [
            // more than 10% of truncated completions will abort the test
            { threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m' },
        ],
    },
    scenarios: {
        completions: {
            executor: 'ramping-vus',
            startVUs: 1,
            stages: [
                {duration: '1m', target: 8},
                {duration: '3m', target: 8},
                {duration: '1m', target: 0},
            ],
            gracefulRampDown: '30s',
        },
    },
};

export default function () {
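    // Note: every iteration replays the first conversation of the dataset,
    // so all virtual users send an identical request.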
    const conversation = data[0]
    const payload = {
        "messages": [
            {
                "role": "system",
                "content": conversation[0],
            },
            {
                "role": "user",
                "content": conversation[1],
            }
        ],
        "model": "model",
        "stream": false,
    }
    const res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), {
        headers: { 'Content-Type': 'application/json' },
    })

    check(res, {'success completion': (r) => r.status === 200})

    const completions = res.json()

    llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
    llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)

    llamacpp_completion_tokens.add(completions.usage.completion_tokens)
    llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)

    // tokens per second: completion tokens over the request duration (ms -> s)
    llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3)

    llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
    llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')

    sleep(0.3)
}