From 1b496a745c315022df2d919374052e6004ced8d3 Mon Sep 17 00:00:00 2001
From: Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
Date: Fri, 5 Apr 2024 14:35:06 +0100
Subject: [PATCH 1/5] [SYCL] Fixed minor bug when enabling FP16 for non intel
 targets (#6464)

* moved INTEL_MKL guard from gemm_impl to gemm (wrapper)

* Update ggml-sycl.cpp

Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>

---------

Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
---
 ggml-sycl.cpp | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 2b0e5f548..db3c24f60 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -1664,24 +1664,6 @@ namespace dpct
                               const void *alpha, const void *a, int lda, const void *b,
                               int ldb, const void *beta, void *c, int ldc)
         {
-#ifndef __INTEL_MKL__
-            GGML_UNUSED(q);
-            GGML_UNUSED(a_trans);
-            GGML_UNUSED(b_trans);
-            GGML_UNUSED(m);
-            GGML_UNUSED(n);
-            GGML_UNUSED(k);
-            GGML_UNUSED(alpha);
-            GGML_UNUSED(a);
-            GGML_UNUSED(lda);
-            GGML_UNUSED(b);
-            GGML_UNUSED(ldb);
-            GGML_UNUSED(beta);
-            GGML_UNUSED(c);
-            GGML_UNUSED(ldc);
-            throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                                     "Project does not support this API.");
-#else
             Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
             Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
             auto data_a = get_memory<const Ta>(a);
@@ -1690,7 +1672,6 @@ namespace dpct
             oneapi::mkl::blas::column_major::gemm(
                 q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
                 data_b, ldb, beta_value, data_c, ldc);
-#endif
         }
 
         template <typename VecT, class BinaryOperation, class = void>
@@ -2330,6 +2311,7 @@ namespace dpct
                                           lda, b, ldb, beta, c, ldc);
             break;
         }
+#ifdef __INTEL_MKL__
         case detail::get_type_combination_id(
             library_data_t::real_bfloat16, library_data_t::real_bfloat16,
             library_data_t::real_float, library_data_t::real_float):
@@ -2391,6 +2373,7 @@ namespace dpct
                 q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
             break;
         }
+#endif // __INTEL_MKL__
         default:
             throw std::runtime_error("the combination of data type is unsupported");
         }

From 87e21bbacd830437ab653cf03b6f26d45c15395d Mon Sep 17 00:00:00 2001
From: Ting Sun <suntcrick@gmail.com>
Date: Sat, 6 Apr 2024 01:34:53 +0700
Subject: [PATCH 2/5] bench : make n_batch and n_ubatch configurable in Batched
 bench (#6500)

* bench: make n_batch and n_ubatch configurable

* bench: update doc for batched bench
---
 examples/batched-bench/README.md         |  8 +++----
 examples/batched-bench/batched-bench.cpp | 29 ++++++++++++++++--------
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md
index 34b343f66..bf951baf7 100644
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -10,16 +10,16 @@ There are 2 modes of operation:
 - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
 
 ```bash
-./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
+./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>
 
 # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
+./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
 
 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
 
 # custom set of batches
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
+./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 128,256,512 128,256 1,2,4,8,16,32
 ```
 
 ## Sample results
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 19674dfd3..1e34de620 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -32,13 +32,15 @@ int main(int argc, char ** argv) {
     gpt_params params;
 
     if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
         printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
-        printf("  example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
+        printf("  example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
         return 1 ;
     }
 
     int n_kv_max     = 2048;
+    int n_batch      = 2048;
+    int n_ubatch     = 512;
     int is_pp_shared = 0;
     int n_gpu_layers = 0;
 
@@ -56,23 +58,31 @@ int main(int argc, char ** argv) {
     }
 
     if (argc >= 4) {
-        is_pp_shared = std::atoi(argv[3]);
+        n_batch = std::atoi(argv[3]);
     }
 
     if (argc >= 5) {
-        n_gpu_layers = std::atoi(argv[4]);
+        n_ubatch = std::atoi(argv[4]);
     }
 
     if (argc >= 6) {
-        n_pp = parse_list(argv[5]);
+        is_pp_shared = std::atoi(argv[5]);
     }
 
     if (argc >= 7) {
-        n_tg = parse_list(argv[6]);
+        n_gpu_layers = std::atoi(argv[6]);
     }
 
     if (argc >= 8) {
-        n_pl = parse_list(argv[7]);
+        n_pp = parse_list(argv[7]);
+    }
+
+    if (argc >= 9) {
+        n_tg = parse_list(argv[8]);
+    }
+
+    if (argc >= 10) {
+        n_pl = parse_list(argv[9]);
     }
 
     // init LLM
@@ -100,7 +110,8 @@ int main(int argc, char ** argv) {
 
     ctx_params.seed      = 1234;
     ctx_params.n_ctx     = n_kv_max;
-    ctx_params.n_batch   = 512;
+    ctx_params.n_batch   = n_batch;
+    ctx_params.n_ubatch  = n_ubatch;
 
     ctx_params.n_threads       = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -158,7 +169,7 @@ int main(int argc, char ** argv) {
     }
 
     LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");
 
     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");

From d0f5deebf898f8186a10148a03a56909ba05fc0b Mon Sep 17 00:00:00 2001
From: Hoang Nguyen <hugo53@users.noreply.github.com>
Date: Fri, 5 Apr 2024 11:39:43 -0700
Subject: [PATCH 3/5] readme : update UI list (#6503)

* Add MindMac to UI list

* Update proprietary description

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index bb66b6c68..bd3f9cff5 100644
--- a/README.md
+++ b/README.md
@@ -181,6 +181,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)
 - [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later)
 - [Dot](https://github.com/alexpinel/Dot) (GPL)
+- [MindMac](https://mindmac.app) (proprietary)
 
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
 

From a8bd14d55717754a1f48313a846a2b16fa998ad2 Mon Sep 17 00:00:00 2001
From: Brian <mofosyne@gmail.com>
Date: Sat, 6 Apr 2024 05:41:38 +1100
Subject: [PATCH 4/5] gguf.py : add licence and version to gguf writer (#6504)

---
 gguf-py/gguf/constants.py   | 1 +
 gguf-py/gguf/gguf_writer.py | 6 ++++++
 llama.cpp                   | 2 ++
 3 files changed, 9 insertions(+)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 5214764a9..c44d8abeb 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -24,6 +24,7 @@ class Keys:
         ALIGNMENT            = "general.alignment"
         NAME                 = "general.name"
         AUTHOR               = "general.author"
+        VERSION              = "general.version"
         URL                  = "general.url"
         DESCRIPTION          = "general.description"
         LICENSE              = "general.license"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 2ae6c814b..f4c440766 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -296,6 +296,9 @@ class GGUFWriter:
     def add_author(self, author: str) -> None:
         self.add_string(Keys.General.AUTHOR, author)
 
+    def add_version(self, version: str) -> None:
+        self.add_string(Keys.General.VERSION, version)
+
     def add_tensor_data_layout(self, layout: str) -> None:
         self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
 
@@ -305,6 +308,9 @@ class GGUFWriter:
     def add_description(self, description: str) -> None:
         self.add_string(Keys.General.DESCRIPTION, description)
 
+    def add_licence(self, licence: str) -> None:
+        self.add_string(Keys.General.LICENSE, licence)
+
     def add_source_url(self, url: str) -> None:
         self.add_string(Keys.General.SOURCE_URL, url)
 
diff --git a/llama.cpp b/llama.cpp
index 9a1c11043..217726184 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -261,6 +261,7 @@ enum llm_kv {
     LLM_KV_GENERAL_ALIGNMENT,
     LLM_KV_GENERAL_NAME,
     LLM_KV_GENERAL_AUTHOR,
+    LLM_KV_GENERAL_VERSION,
     LLM_KV_GENERAL_URL,
     LLM_KV_GENERAL_DESCRIPTION,
     LLM_KV_GENERAL_LICENSE,
@@ -330,6 +331,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_GENERAL_ALIGNMENT,             "general.alignment"                     },
     { LLM_KV_GENERAL_NAME,                  "general.name"                          },
     { LLM_KV_GENERAL_AUTHOR,                "general.author"                        },
+    { LLM_KV_GENERAL_VERSION,               "general.version"                       },
     { LLM_KV_GENERAL_URL,                   "general.url"                           },
     { LLM_KV_GENERAL_DESCRIPTION,           "general.description"                   },
     { LLM_KV_GENERAL_LICENSE,               "general.license"                       },

From 75cd4c77292034ecec587ecb401366f57338f7c0 Mon Sep 17 00:00:00 2001
From: Pierrick Hymbert <pierrick.hymbert@gmail.com>
Date: Sat, 6 Apr 2024 05:40:47 +0200
Subject: [PATCH 5/5] ci: bench: support sse and fix prompt processing time /
 server: add tokens usage in stream OAI response (#6495)

* ci: bench: support sse and fix prompt processing time
server: add tokens usage in stream mode

* ci: bench: README.md EOL

* ci: bench: remove total pp and tg as it is not accurate

* ci: bench: fix case when there is no token generated

* ci: bench: change to the 95 percentile for pp and tg as it is closer to what the server exports in metrics

* ci: bench: fix finish reason rate
---
 .github/workflows/bench.yml     | 20 ++++++----
 examples/server/bench/README.md | 42 +++++++++++++++++---
 examples/server/bench/bench.py  | 11 +++---
 examples/server/bench/script.js | 68 +++++++++++++++++++++++----------
 examples/server/utils.hpp       |  9 +++++
 5 files changed, 112 insertions(+), 38 deletions(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index b974e7fac..758796632 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -79,12 +79,18 @@ jobs:
             sleep 0.1
           done
 
-      - name: Install k6
+      - name: Set up Go
+        uses: actions/setup-go@v5
+        with:
+          go-version: '1.21'
+
+      - name: Install k6 and xk6-sse
         id: k6_installation
         run: |
           cd examples/server/bench
-          wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
-          tar xzf k6*.tar.gz --strip-components=1
+          go install go.k6.io/xk6/cmd/xk6@latest
+          xk6 build master \
+              --with github.com/phymbert/xk6-sse
 
       - name: Build
         id: cmake_build
@@ -118,7 +124,7 @@ jobs:
 
           cd examples/server/bench
           source venv/bin/activate
-          BENCH_K6_BIN_PATH=./k6 python bench.py \
+          python bench.py \
               --runner-label ${{ env.RUNNER_LABEL }} \
               --name ${{ github.job }} \
               --branch ${{ github.head_ref || github.ref_name }} \
@@ -228,9 +234,9 @@ jobs:
             <summary>Expand details for performance related PR only</summary>
 
             - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
-            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
-            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
-            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
+            - HTTP request          : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms        p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
+            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
+            - Token generation  (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
             - ${{ env.BENCH_GRAPH_XLABEL }}
 
 
diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md
index a53ad64d7..23a3ec975 100644
--- a/examples/server/bench/README.md
+++ b/examples/server/bench/README.md
@@ -2,13 +2,15 @@
 
 Benchmark is using [k6](https://k6.io/).
 
-##### Install k6
+##### Install k6 and sse extension
 
-Follow instruction from: https://k6.io/docs/get-started/installation/
+SSE is not supported by default in k6; you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.
 
-Example for ubuntu:
+Example:
 ```shell
-snap install k6
+go install go.k6.io/xk6/cmd/xk6@latest
+xk6 build master \
+--with github.com/phymbert/xk6-sse
 ```
 
 #### Download a dataset
@@ -46,7 +48,7 @@ server --host localhost --port 8080 \
 
 For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run:
 ```shell
-k6 run script.js --duration 10m --iterations 500 --vus 8
+./k6 run script.js --duration 10m --iterations 500 --vus 8
 ```
 
 The benchmark values can be overridden with:
@@ -86,3 +88,33 @@ K6 metrics might be compared against [server metrics](../README.md), with:
 ```shell
 curl http://localhost:8080/metrics
 ```
+
+### Using the CI python script
+The `bench.py` script performs several steps:
+- start the server
+- define the variables needed by k6
+- run the k6 script
+- extract metrics from Prometheus
+
+It aims to be used in the CI, but you can run it manually:
+
+```shell
+LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
+              --runner-label local \
+              --name local \
+              --branch `git rev-parse --abbrev-ref HEAD` \
+              --commit `git rev-parse HEAD` \
+              --scenario script.js \
+              --duration 5m \
+              --hf-repo ggml-org/models \
+              --hf-file phi-2/ggml-model-q4_0.gguf \
+              --model-path-prefix models \
+              --parallel 4 \
+              -ngl 33 \
+              --batch-size 2048 \
+              --ubatch-size 256 \
+              --ctx-size 4096 \
+              --n-prompts 200 \
+              --max-prompt-tokens 256 \
+              --max-tokens 256
+```
diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py
index 86eeeccf8..6ca637bdd 100644
--- a/examples/server/bench/bench.py
+++ b/examples/server/bench/bench.py
@@ -76,7 +76,6 @@ def main(args_in: list[str] | None = None) -> None:
                             data['metrics'][metric_name][metric_metric]=value
                             github_env.write(
                                 f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n")
-                token_seconds = data['metrics']['llamacpp_tokens_second']['avg']
                 iterations = data['root_group']['checks']['success completion']['passes']
 
     except Exception:
@@ -181,16 +180,16 @@ xychart-beta
     bench_results = {
         "i": iterations,
         "req": {
-            "p90": round(data['metrics']["http_req_duration"]["p(90)"], 2),
+            "p95": round(data['metrics']["http_req_duration"]["p(95)"], 2),
             "avg": round(data['metrics']["http_req_duration"]["avg"], 2),
         },
         "pp": {
-            "p90": round(data['metrics']["llamacpp_prompt_tokens"]["p(90)"], 2),
-            "avg": round(data['metrics']["llamacpp_prompt_tokens"]["avg"], 2),
+            "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
+            "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
             "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
         },
         "tg": {
-            "p90": round(data['metrics']["llamacpp_tokens_second"]["p(90)"], 2),
+            "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
             "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
             "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
         },
@@ -206,7 +205,7 @@ xychart-beta
 
 
 def start_benchmark(args):
-    k6_path = 'k6'
+    k6_path = './k6'
     if 'BENCH_K6_BIN_PATH' in os.environ:
         k6_path = os.environ['BENCH_K6_BIN_PATH']
     k6_args = [
diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js
index dc41e8d93..c4c486cdf 100644
--- a/examples/server/bench/script.js
+++ b/examples/server/bench/script.js
@@ -1,4 +1,4 @@
-import http from 'k6/http'
+import sse from 'k6/x/sse'
 import {check, sleep} from 'k6'
 import {SharedArray} from 'k6/data'
 import {Counter, Rate, Trend} from 'k6/metrics'
@@ -53,7 +53,9 @@ const data = new SharedArray('conversations', function () {
 
 const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens')
 const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')
+
 const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
+const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
 
 const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
 const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -86,36 +88,62 @@ export default function () {
             }
         ],
         "model": model,
-        "stream": false,
+        "stream": true,
         "seed": 42,
         "max_tokens": max_tokens
     }
 
-    const body = JSON.stringify(payload)
+    const params = {method: 'POST', body: JSON.stringify(payload)};
 
-    let res = http.post(`${server_url}/chat/completions`, body, {
-        headers: {'Content-Type': 'application/json'},
-        timeout: '300s'
+    const startTime = new Date()
+    let promptEvalEndTime = null
+    let prompt_tokens = 0
+    let completions_tokens = 0
+    let finish_reason = null
+    const res = sse.open(`${server_url}/chat/completions`, params, function (client) {
+        client.on('event', function (event) {
+            if (promptEvalEndTime == null) {
+                promptEvalEndTime = new Date()
+            }
+
+            let chunk = JSON.parse(event.data)
+            let choice = chunk.choices[0]
+            if (choice.finish_reason) {
+                finish_reason = choice.finish_reason
+            }
+
+            if (chunk.usage) {
+                prompt_tokens = chunk.usage.prompt_tokens
+                llamacpp_prompt_tokens.add(prompt_tokens)
+                llamacpp_prompt_tokens_total_counter.add(prompt_tokens)
+
+                completions_tokens = chunk.usage.completion_tokens
+                llamacpp_completion_tokens.add(completions_tokens)
+                llamacpp_completion_tokens_total_counter.add(completions_tokens)
+            }
+        })
+
+        client.on('error', function (e) {
+            console.log('An unexpected error occurred: ', e.error());
+            throw e;
+        })
     })
 
     check(res, {'success completion': (r) => r.status === 200})
 
-    if (res.status === 200) {
-        const completions = res.json()
+    const endTime = new Date()
 
-        llamacpp_prompt_tokens.add(completions.usage.prompt_tokens)
-        llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens)
-
-        llamacpp_completion_tokens.add(completions.usage.completion_tokens)
-        llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens)
-
-        llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length')
-        llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop')
-
-        llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3)
-    } else {
-        console.error(`response: ${res.body} request=${payload}`)
+    const promptEvalTime = promptEvalEndTime - startTime
+    if (promptEvalTime > 0) {
+        llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3)
     }
 
+    const completion_time = endTime - promptEvalEndTime
+    if (completions_tokens > 0 && completion_time > 0) {
+        llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3)
+    }
+    llamacpp_completions_truncated_rate.add(finish_reason === 'length')
+    llamacpp_completions_stop_rate.add(finish_reason === 'stop')
+
     sleep(0.3)
 }
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 47cc53c27..a8d43ac63 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -567,6 +567,15 @@ static std::vector<json> format_partial_response_oaicompat(json result, const st
         {"model",   modelname},
         {"object",  "chat.completion.chunk"}
     };
+    if (!finish_reason.empty()) {
+        int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+        int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
+        ret.push_back({"usage", json {
+            {"completion_tokens", num_tokens_predicted},
+            {"prompt_tokens",     num_prompt_tokens},
+            {"total_tokens",      num_tokens_predicted + num_prompt_tokens}
+        }});
+    }
 
     return std::vector<json>({ret});
 }