From 572758a665e9435ec235ae6c788ed2e3f099d8cc Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Sat, 9 Mar 2024 09:15:15 +0100
Subject: [PATCH] server: bench: change gauge custom metrics to trend
 server: bench: add trend custom metrics for total tokens per second average

---
 examples/server/bench/README.md | 13 +++++++------
 examples/server/bench/script.js |  3 +++
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md
index 481dc5c4d..6e1709ee5 100644
--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -64,13 +64,14 @@ SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vu
 
 #### Metrics
 
-Following metrics are available:
-- `llamacpp_prompt_tokens` Gauge of OAI response `usage.prompt_tokens`
-- `llamacpp_prompt_tokens_total_counter` Counter of OAI response `usage.prompt_tokens`
-- `llamacpp_completion_tokens` Gauge of OAI response `usage.completion_tokens`
-- `llamacpp_completion_tokens_total_counter` Counter of OAI response `usage.completion_tokens`
+Following metrics are available computed from the OAI chat completions response `usage`:
+- `llamacpp_tokens_second` Trend of `usage.total_tokens / request duration`
+- `llamacpp_prompt_tokens` Trend of `usage.prompt_tokens`
+- `llamacpp_prompt_tokens_total_counter` Counter of `usage.prompt_tokens`
+- `llamacpp_completion_tokens` Trend of `usage.completion_tokens`
+- `llamacpp_completion_tokens_total_counter` Counter of `usage.completion_tokens`
 - `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'`
-- `llamacpp_completions_stop_rate` Rate of completions truncated, i.e. if `finish_reason === 'stop'`
+- `llamacpp_completions_stop_rate` Rate of completions stopped by the model, i.e. if `finish_reason === 'stop'`
 
 The script will fail if too many completions are truncated, see `llamacpp_completions_truncated_rate`.
 
diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js
index d076b8c34..94b8aa94a 100644
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -34,6 +34,7 @@ const data = new SharedArray('conversations', function () {
 
 const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
+const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
 
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -94,6 +95,8 @@ export default function () {
 
         llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
         llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
+
+        llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
     } else {
         console.error(`response: ${res.body}`)
     }
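
For reviewers less familiar with k6 custom metrics, below is a minimal, self-contained sketch (not part of the patch) of how the new `llamacpp_tokens_second` Trend is fed per request; the endpoint, model name and prompt are placeholders. k6 reports `res.timings.duration` in milliseconds, hence the `* 1.e3` factor to convert `usage.total_tokens` per millisecond into tokens per second.

    import http from 'k6/http'
    import { Trend } from 'k6/metrics'

    // A Trend records a distribution (avg/min/med/max/p(90)/p(95)) of all samples,
    // unlike a Gauge, which only keeps the last value seen.
    const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')

    export default function () {
        // Placeholder endpoint and payload, for illustration only.
        const payload = JSON.stringify({
            messages: [{role: 'user', content: 'Say hello.'}],
            model: 'model',
            max_tokens: 64,
        })
        const res = http.post('http://localhost:8080/v1/chat/completions', payload,
            {headers: {'Content-Type': 'application/json'}})

        if (res.status === 200) {
            const completions = JSON.parse(res.body)
            // res.timings.duration is the request duration in ms, so scale to seconds.
            llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
        }
    }

Switching these metrics from Gauge to Trend is what makes an average tokens-per-second figure appear in the k6 end-of-test summary, since a Trend aggregates every sample instead of reporting only the most recent one.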