ci: bench: support sse and fix prompt processing time / server: add tokens usage in stream OAI response (#6495)
* ci: bench: support sse and fix prompt processing time server: add tokens usage in stream mode * ci: bench: README.md EOL * ci: bench: remove total pp and tg as it is not accurate * ci: bench: fix case when there is no token generated * ci: bench: change to the 95 percentile for pp and tg as it is closer to what the server exports in metrics * ci: bench: fix finish reason rate
This commit is contained in:
parent
a8bd14d557
commit
75cd4c7729
5 changed files with 112 additions and 38 deletions
|
@ -2,13 +2,15 @@
|
|||
|
||||
Benchmark is using [k6](https://k6.io/).
|
||||
|
||||
##### Install k6
|
||||
##### Install k6 and sse extension
|
||||
|
||||
Follow instruction from: https://k6.io/docs/get-started/installation/
|
||||
SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
|
||||
|
||||
Example for ubuntu:
|
||||
Example:
|
||||
```shell
|
||||
snap install k6
|
||||
go install go.k6.io/xk6/cmd/xk6@latest
|
||||
xk6 build master \
|
||||
--with github.com/phymbert/xk6-sse
|
||||
```
|
||||
|
||||
#### Download a dataset
|
||||
|
@ -46,7 +48,7 @@ server --host localhost --port 8080 \
|
|||
|
||||
For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run:
|
||||
```shell
|
||||
k6 run script.js --duration 10m --iterations 500 --vus 8
|
||||
./k6 run script.js --duration 10m --iterations 500 --vus 8
|
||||
```
|
||||
|
||||
The benchmark values can be overridden with:
|
||||
|
@ -86,3 +88,33 @@ K6 metrics might be compared against [server metrics](../README.md), with:
|
|||
```shell
|
||||
curl http://localhost:8080/metrics
|
||||
```
|
||||
|
||||
### Using the CI python script
|
||||
The `bench.py` script does several steps:
|
||||
- start the server
|
||||
- define good variable for k6
|
||||
- run k6 script
|
||||
- extract metrics from prometheus
|
||||
|
||||
It aims to be used in the CI, but you can run it manually:
|
||||
|
||||
```shell
|
||||
LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
|
||||
--runner-label local \
|
||||
--name local \
|
||||
--branch `git rev-parse --abbrev-ref HEAD` \
|
||||
--commit `git rev-parse HEAD` \
|
||||
--scenario script.js \
|
||||
--duration 5m \
|
||||
--hf-repo ggml-org/models \
|
||||
--hf-file phi-2/ggml-model-q4_0.gguf \
|
||||
--model-path-prefix models \
|
||||
--parallel 4 \
|
||||
-ngl 33 \
|
||||
--batch-size 2048 \
|
||||
--ubatch-size 256 \
|
||||
--ctx-size 4096 \
|
||||
--n-prompts 200 \
|
||||
--max-prompt-tokens 256 \
|
||||
--max-tokens 256
|
||||
```
|
||||
|
|
|
@ -76,7 +76,6 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
data['metrics'][metric_name][metric_metric]=value
|
||||
github_env.write(
|
||||
f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
|
||||
token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
|
||||
iterations = data['root_group']['checks']['success completion']['passes']
|
||||
|
||||
except Exception:
|
||||
|
@ -181,16 +180,16 @@ xychart-beta
|
|||
bench_results = {
|
||||
"i": iterations,
|
||||
"req": {
|
||||
"p90": round(data['metrics']["http_req_duration"]["p(90)"], 2),
|
||||
"p95": round(data['metrics']["http_req_duration"]["p(95)"], 2),
|
||||
"avg": round(data['metrics']["http_req_duration"]["avg"], 2),
|
||||
},
|
||||
"pp": {
|
||||
"p90": round(data['metrics']["llamacpp_prompt_tokens"]["p(90)"], 2),
|
||||
"avg": round(data['metrics']["llamacpp_prompt_tokens"]["avg"], 2),
|
||||
"p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
|
||||
"avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
|
||||
"0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
|
||||
},
|
||||
"tg": {
|
||||
"p90": round(data['metrics']["llamacpp_tokens_second"]["p(90)"], 2),
|
||||
"p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
|
||||
"avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
|
||||
"0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
|
||||
},
|
||||
|
@ -206,7 +205,7 @@ xychart-beta
|
|||
|
||||
|
||||
def start_benchmark(args):
|
||||
k6_path = 'k6'
|
||||
k6_path = './k6'
|
||||
if 'BENCH_K6_BIN_PATH' in os.environ:
|
||||
k6_path = os.environ['BENCH_K6_BIN_PATH']
|
||||
k6_args = [
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
import http from 'k6/http'
|
||||
import sse from 'k6/x/sse'
|
||||
import {check, sleep} from 'k6'
|
||||
import {SharedArray} from 'k6/data'
|
||||
import {Counter, Rate, Trend} from 'k6/metrics'
|
||||
|
@ -53,7 +53,9 @@ const data = new SharedArray('conversations', function () {
|
|||
|
||||
const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
|
||||
const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
|
||||
|
||||
const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
|
||||
const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
|
||||
|
||||
const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
|
||||
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
|
||||
|
@ -86,36 +88,62 @@ export default function () {
|
|||
}
|
||||
],
|
||||
"model": model,
|
||||
"stream": false,
|
||||
"stream": true,
|
||||
"seed": 42,
|
||||
"max_tokens": max_tokens
|
||||
}
|
||||
|
||||
const body = JSON.stringify(payload)
|
||||
const params = {method: 'POST', body: JSON.stringify(payload)};
|
||||
|
||||
let res = http.post(`${server_url}/chat/completions`, body, {
|
||||
headers: {'Content-Type': 'application/json'},
|
||||
timeout: '300s'
|
||||
const startTime = new Date()
|
||||
let promptEvalEndTime = null
|
||||
let prompt_tokens = 0
|
||||
let completions_tokens = 0
|
||||
let finish_reason = null
|
||||
const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
|
||||
client.on('event', function (event) {
|
||||
if (promptEvalEndTime == null) {
|
||||
promptEvalEndTime = new Date()
|
||||
}
|
||||
|
||||
let chunk = JSON.parse(event.data)
|
||||
let choice = chunk.choices[0]
|
||||
if (choice.finish_reason) {
|
||||
finish_reason = choice.finish_reason
|
||||
}
|
||||
|
||||
if (chunk.usage) {
|
||||
prompt_tokens = chunk.usage.prompt_tokens
|
||||
llamacpp_prompt_tokens.add(prompt_tokens)
|
||||
llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
|
||||
|
||||
completions_tokens = chunk.usage.completion_tokens
|
||||
llamacpp_completion_tokens.add(completions_tokens)
|
||||
llamacpp_completion_tokens_total_counter.add(completions_tokens)
|
||||
}
|
||||
})
|
||||
|
||||
client.on('error', function (e) {
|
||||
console.log('An unexpected error occurred: ', e.error());
|
||||
throw e;
|
||||
})
|
||||
})
|
||||
|
||||
check(res, {'success completion': (r) => r.status === 200})
|
||||
|
||||
if (res.status === 200) {
|
||||
const completions = res.json()
|
||||
const endTime = new Date()
|
||||
|
||||
llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
|
||||
llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
|
||||
|
||||
llamacpp_completion_tokens.add(completions.usage.completion_tokens)
|
||||
llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
|
||||
|
||||
llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
|
||||
llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
|
||||
|
||||
llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
|
||||
} else {
|
||||
console.error(`response: ${res.body} request=${payload}`)
|
||||
const promptEvalTime = promptEvalEndTime - startTime
|
||||
if (promptEvalTime > 0) {
|
||||
llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3)
|
||||
}
|
||||
|
||||
const completion_time = endTime - promptEvalEndTime
|
||||
if (completions_tokens > 0 && completion_time > 0) {
|
||||
llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
|
||||
}
|
||||
llamacpp_completions_truncated_rate.add(finish_reason === 'length')
|
||||
llamacpp_completions_stop_rate.add(finish_reason === 'stop')
|
||||
|
||||
sleep(0.3)
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue