From 548bc9635a8326406f6a0382731902e132a3532d Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Sat, 9 Mar 2024 00:13:54 +0100
Subject: [PATCH] server: bench: PR feedback and improved k6 script configuration

---
 examples/server/bench/README.md | 27 ++++++++---
 examples/server/bench/script.js | 81 +++++++++++++++++++--------------
 2 files changed, 69 insertions(+), 39 deletions(-)

diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md
index b8edc8587..67367b810 100644
--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -2,12 +2,18 @@
 
 Benchmark is using [k6](https://k6.io/).
 
-##### Install k6 - ubuntu
+##### Install k6
+
+Follow the instructions from: https://k6.io/docs/get-started/installation/
+
+Example for Ubuntu:
 ```shell
 snap install k6
 ```
 
-#### Downloading the ShareGPT dataset
+#### Download a dataset
+
+This dataset was originally proposed in the [vLLM benchmarks](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md).
 
 ```shell
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
@@ -21,7 +27,7 @@ Example for PHI-2
 ```
 
 #### Start the server
-The server must listen on `localhost:8080`.
+The server must answer OAI chat completion requests on `http://localhost:8080/v1`, or on the URL set in the `SERVER_BENCH_URL` environment variable.
 
 Example:
 ```shell
@@ -36,13 +42,22 @@ server --host localhost --port 8080 \
   -ngl 33
 ```
 
-#### Run the bench
+#### Run the benchmark
+
 ```shell
 k6 run script.js
 ```
 
-#### Change the number of concurrent user
-in the `script.js`, change the ramping period according to your number of slots.
+The benchmark values can be overridden with:
+- `SERVER_BENCH_URL` server URL prefix for chat completions, default `http://localhost:8080/v1`
+- `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480`
+- `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
+
+Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
+
+```shell
+SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8
+```
 
 #### Metrics
 
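As a usage sketch for the overrides documented above: the script in the next diff also honours a `SERVER_BENCH_DATASET` variable for the dataset path, so a run can combine all the variables with the k6 CLI options. The port `8081`, the `phi-2` alias and the prompt count below are illustrative values, not defaults from the patch:

```shell
# Point the benchmark at a server on a non-default port, with an explicit
# model alias, dataset path and prompt budget (illustrative values).
SERVER_BENCH_URL=http://localhost:8081/v1 \
SERVER_BENCH_MODEL_ALIAS=phi-2 \
SERVER_BENCH_DATASET=./ShareGPT_V3_unfiltered_cleaned_split.json \
SERVER_BENCH_N_PROMPTS=100 \
k6 run script.js --vus 4 --duration 5m --iterations 100
```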
diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js
index c52eb182a..e2068fb92 100644
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -1,51 +1,58 @@
-import http from 'k6/http';
-import { check, sleep } from 'k6';
-import { SharedArray } from 'k6/data';
-import { Counter, Gauge, Rate } from 'k6/metrics';
+import http from 'k6/http'
+import {check, sleep} from 'k6'
+import {SharedArray} from 'k6/data'
+import {Counter, Gauge, Rate} from 'k6/metrics'
+
+// Server chat completions prefix
+const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1'
+
+// Number of total prompts in the dataset - default 10m / 10 seconds/request * number of users
+const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8
+
+// Model name to request
+const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model'
+
+// Dataset path
+const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'
+
+export function setup() {
+    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path}`)
+}
 
 const data = new SharedArray('conversations', function () {
-    return JSON.parse(open('./ShareGPT_V3_unfiltered_cleaned_split.json'))
-
+    return JSON.parse(open(dataset_path))
         // Filter out the conversations with less than 2 turns.
         .filter(data => data["conversations"].length >= 2)
         // Only keep the first two turns of each conversation.
-        .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"]));
-});
+        .map(data => Array(data["conversations"][0]["value"], data["conversations"][1]["value"]))
+        // Keep only first n prompts
+        .slice(0, n_prompt)
+})
 
-const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens');
-const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens');
+const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens')
+const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens')
 
-const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds');
+const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds')
 
-const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter');
-const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter');
+const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
+const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
 
-const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate');
-const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate');
+const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate')
+const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate')
 
 export const options = {
     thresholds: {
         llamacpp_completions_truncated_rate: [
             // more than 10% of truncated input will abort the test
-            { threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m' },
+            {threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m'},
         ],
     },
-    scenarios: {
-        completions: {
-            executor: 'ramping-vus',
-            startVUs: 1,
-            stages: [
-                {duration: '1m', target: 8},
-                {duration: '3m', target: 8},
-                {duration: '1m', target: 0},
-            ],
-            gracefulRampDown: '30s',
-        },
-    },
-};
+    duration: '10m',
+    vus: 8,
+}
 
 export default function () {
-    const conversation = data[0]
+    const conversation = data[Math.floor(Math.random() * data.length)]
     const payload = {
         "messages": [
             {
@@ -57,15 +64,23 @@ export default function () {
                 "content": conversation[1],
             }
         ],
-        "model": "model",
+        "model": model,
         "stream": false,
     }
-    let res = http.post('http://localhost:8080/v1/chat/completions', JSON.stringify(payload), {
-        headers: { 'Content-Type': 'application/json' },
+
+    const body = JSON.stringify(payload)
+
+    console.debug(`request: ${body}`)
+
+    let res = http.post(`${server_url}/chat/completions`, body, {
+        headers: {'Content-Type': 'application/json'},
+        timeout: '300s'
     })
 
     check(res, {'success completion': (r) => r.status === 200})
 
+    console.debug(`response: ${res.body}`)
+
     const completions = res.json()
 
     llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
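For reference, the request issued by the k6 `default` function above is a plain OAI-style chat completion call, so the endpoint can be sanity-checked before a run. A minimal sketch with curl, assuming the default `http://localhost:8080/v1` prefix and the `my-model` alias; the message contents are illustrative placeholders:

```shell
# One-off request equivalent to what the benchmark script posts.
curl -s http://localhost:8080/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{
        "messages": [
          {"role": "system", "content": "You write short answers."},
          {"role": "user", "content": "Say hello."}
        ],
        "model": "my-model",
        "stream": false
      }'
```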