diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md
index 67367b810..0c8f6b516 100644
--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -44,14 +44,16 @@ server --host localhost --port 8080 \
 
 #### Run the benchmark
 
+For 500 chat completion requests with 8 concurrent users over a maximum of 10 minutes, run:
 ```shell
-k6 run script.js
+k6 run script.js --duration 10m --iterations 500 --vus 8
 ```
 
 The benchmark values can be overridden with:
 - `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1`
 - `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480`
 - `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model`
+- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `512`
 
 Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/):
 
@@ -66,7 +68,6 @@ Following metrics are available:
 - `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens`
 - `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens`
 - `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens`
-- `llamacpp_completions_tokens_seconds` Gauge of `usage.completion_tokens` divided by the request time in second
 - `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'`
 - `llamacpp_completions_stop_rate` Rate of completions truncated, i.e. if `finish_reason === 'stop'`
 
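As a usage sketch tying the README changes together (assuming a server is listening on the default URL; the override values are illustrative, not part of the patch):

```shell
# Override the documented variables inline; k6 exposes them via __ENV.
SERVER_BENCH_URL=http://localhost:8080/v1 \
SERVER_BENCH_MAX_TOKENS=512 \
k6 run script.js --duration 10m --iterations 500 --vus 8
```
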
diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js
index e2068fb92..fb942d6ab 100644
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -15,8 +15,11 @@ const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS :
 // Dataset path
 const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json'
 
+// Max tokens to predict
+const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512
+
 export function setup() {
-    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path}`)
+    console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`)
 }
 
 const data = new SharedArray('conversations', function () {
@@ -32,8 +35,6 @@ const data = new SharedArray('conversations', function () {
 const llamacpp_prompt_tokens = new Gauge('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Gauge('llamacpp_completion_tokens')
 
-const llamacpp_completions_tokens_seconds = new Gauge('llamacpp_completions_tokens_seconds')
-
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
 
@@ -43,8 +44,8 @@ const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate'
 export const options = {
     thresholds: {
         llamacpp_completions_truncated_rate: [
-            // more than 10% of truncated input will abort the test
-            {threshold: 'rate < 0.1', abortOnFail: true, delayAbortEval: '1m'},
+            // more than 80% of truncated responses will abort the test
+            {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'},
         ],
     },
     duration: '10m',
@@ -66,6 +67,7 @@ export default function () {
         ],
         "model": model,
         "stream": false,
+        "max_tokens": max_tokens
     }
 
     const body = JSON.stringify(payload)
@@ -79,21 +81,22 @@ export default function () {
 
     check(res, {'success completion': (r) => r.status === 200})
 
-    console.debug(`response: ${res.body}`)
+    if (res.status === 200) {
+        console.debug(`response: ${res.body}`)
 
-    const completions = res.json()
+        const completions = res.json()
 
-    llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
-    llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
+        llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
+        llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
 
-    llamacpp_completion_tokens.add(completions.usage.completion_tokens)
-    llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
-
-    llamacpp_completions_tokens_seconds.add(completions.usage.completion_tokens / res.timings.duration * 1e3)
-
-    llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
-    llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
+        llamacpp_completion_tokens.add(completions.usage.completion_tokens)
+        llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
+        llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
+        llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
+    } else {
+        console.error(`response: ${res.body}`)
+    }
 
     sleep(0.3)
 }
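To exercise the relaxed truncation threshold above (`rate < 0.8`), one could deliberately force truncations with a very low token cap (a sketch; the values are arbitrary):

```shell
# A tiny cap makes most completions finish with finish_reason === 'length';
# the run aborts once llamacpp_completions_truncated_rate exceeds 0.8,
# evaluated after the 1m delayAbortEval grace period.
SERVER_BENCH_MAX_TOKENS=16 k6 run script.js --iterations 100 --vus 4
```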