From 68d1d8fe28e9887c7e3eff4f714d3faa7def3081 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Fri, 8 Mar 2024 13:16:16 +0100
Subject: [PATCH] server: bench: Init a bench scenario with K6

See #5827
---
 examples/server/bench/README.md | 64 +++++++++++++++++++++++++
 examples/server/bench/script.js | 84 +++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100644 examples/server/bench/README.md
 create mode 100644 examples/server/bench/script.js

diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md
new file mode 100644
index 000000000..049d23317
--- /dev/null
+++ b/examples/server/bench/README.md
@@ -0,0 +1,64 @@
+### Server benchmark tools
+
+The benchmark uses [k6](https://k6.io/).
+
+#### Install k6 - Ubuntu
+```shell
+snap install k6
+```
+
+#### Download the ShareGPT dataset
+
+```shell
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+```
+
+#### Download a model
+Example for PHI-2:
+
+```shell
+../../../scripts/hf.sh --repo ggml-org/models --file phi-2/ggml-model-q4_0.gguf
+```
+
+#### Start the server
+The server must listen on `localhost:8080`.
+
+Example:
+```shell
+server --host localhost --port 8080 \
+  --model ggml-model-q4_0.gguf \
+  --cont-batching \
+  --metrics \
+  --parallel 8 \
+  --batch-size 512 \
+  --ctx-size 4096 \
+  --log-format text \
+  -ngl 33
+```
+
+#### Run the bench
+```shell
+k6 run script.js
+```
+
+#### Change the number of concurrent users
+In `script.js`, adjust the ramping `stages` (durations and `target` values) to match the number of server slots (`--parallel`); see the sketch below.
+
+#### Metrics
+
+The following metrics are available:
+- `llamacpp_prompt_tokens` Gauge of OAI response `usage.prompt_tokens`
+- `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens`
+- `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens`
+- `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens`
+- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request duration in seconds
+- `llamacpp_completions_truncated_rate` Rate of truncated completions, i.e. `finish_reason === 'length'`
+- `llamacpp_completions_stop_rate` Rate of completions that stopped naturally, i.e. `finish_reason === 'stop'`
+
+The test aborts if more than 10% of completions are truncated; see the threshold on `llamacpp_completions_truncated_rate` in `script.js`.
+
+K6 metrics can be compared against the [server metrics](../README.md) endpoint:
+
+```shell
+curl http://localhost:8080/metrics
+```
\ No newline at end of file
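To illustrate the "Change the number of concurrent users" step above: a minimal sketch of the same `ramping-vus` scenario used in `script.js` (shown below), with the VU targets raised to match a server started with `--parallel 16`. The value 16 and the file name `concurrency-sketch.js` are assumptions for the example, not part of this patch:

```js
// concurrency-sketch.js (hypothetical): same ramping-vus scenario as script.js,
// scaled for a server started with `--parallel 16`.
import { sleep } from 'k6';

export const options = {
    scenarios: {
        completions: {
            executor: 'ramping-vus',
            startVUs: 1,
            stages: [
                {duration: '1m', target: 16}, // ramp up to one VU per server slot
                {duration: '3m', target: 16}, // hold the plateau
                {duration: '1m', target: 0},  // ramp down
            ],
            gracefulRampDown: '30s',
        },
    },
};

export default function () {
    sleep(1); // placeholder; reuse the request logic from script.js below
}
```

In practice you would simply edit the `target` values in `script.js` itself rather than keep a separate file.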
diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js
new file mode 100644
index 000000000..9d963e49d
--- /dev/null
+++ b/examples/server/bench/script.js
@@ -0,0 +1,84 @@
+import http from 'k6/http';
+import { check, sleep } from 'k6';
+import { SharedArray } from 'k6/data';
+import { Counter, Gauge, Rate } from 'k6/metrics';
+
+const data = new SharedArray('conversations', function () {
+    return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json'))
+
+        // Filter out the conversations with fewer than 2 turns.
+        .filter(data => data["conversations"].length >= 2)
+        // Only keep the first two turns of each conversation.
+        .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"]));
+});
+
+const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens');
+const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens');
+
+const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds');
+
+const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter');
+const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter');
+
+const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate');
+const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate');
+
+export const options = {
+    thresholds: {
+        llamacpp_completions_truncated_rate: [
+            // more than 10% of truncated completions will abort the test
+            { threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m' },
+        ],
+    },
+    scenarios: {
+        completions: {
+            executor: 'ramping-vus',
+            startVUs: 1,
+            stages: [
+                {duration: '1m', target: 8},
+                {duration: '3m', target: 8},
+                {duration: '1m', target: 0},
+            ],
+            gracefulRampDown: '30s',
+        },
+    },
+};
+
+export default function () {
+    const conversation = data[0] // note: every iteration reuses the first conversation of the dataset
+    const payload = {
+        "messages": [
+            {
+                "role": "system",
+                "content": conversation[0],
+            },
+            {
+                "role": "user",
+                "content": conversation[1],
+            }
+        ],
+        "model": "model",
+        "stream": false,
+    }
+    let res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), {
+        headers: { 'Content-Type': 'application/json' },
+    })
+
+    check(res, {'success completion': (r) => r.status === 200})
+
+    const completions = res.json()
+
+    llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
+    llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
+
+    llamacpp_completion_tokens.add(completions.usage.completion_tokens)
+    llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
+
+    llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3) // timings.duration is in ms, so this is tokens per second
+
+    llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
+    llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
+
+
+    sleep(0.3)
+}
\ No newline at end of file
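Before running the full bench, a single-iteration k6 run is a quick way to verify the endpoint that `script.js` exercises. The sketch below is a hypothetical helper, not part of the patch: the URL, headers and `model` field mirror `script.js`, while the prompt strings are placeholders:

```js
// smoke.js (hypothetical): one fixed request against the same endpoint as script.js,
// useful to confirm the server is reachable before a full benchmark run.
import http from 'k6/http';
import { check } from 'k6';

export const options = { vus: 1, iterations: 1 };

export default function () {
    const payload = {
        "messages": [
            { "role": "system", "content": "You are an assistant." },    // placeholder prompt
            { "role": "user", "content": "Say hello in one sentence." }, // placeholder prompt
        ],
        "model": "model",
        "stream": false,
    };
    const res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), {
        headers: { 'Content-Type': 'application/json' },
    });
    check(res, { 'success completion': (r) => r.status === 200 });
}
```

Run it with `k6 run smoke.js`; if the check fails, revisit the server flags in the README before starting the full scenario.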