From b2e89a327457179a34eae4d7de0d412ed945679c Mon Sep 17 00:00:00 2001 From: Dan Johansson <164997844+eddnjjn@users.noreply.github.com> Date: Mon, 9 Sep 2024 09:02:45 +0200 Subject: [PATCH 01/65] Arm AArch64: Documentation updates (#9321) * Arm AArch64: Documentation updates * Update docs/build.md to include information on how to enable the Arm optimized gemm/gemv kernels * Update examples/quantize/README.md with information on the Q4_0_4_4, Q4_0_4_8 and Q4_0_8_8 formats * Add newline to the end of docs/build.md --- docs/build.md | 6 ++++++ examples/quantize/README.md | 2 ++ 2 files changed, 8 insertions(+) diff --git a/docs/build.md b/docs/build.md index 152d46d6f..faa0ecfa4 100644 --- a/docs/build.md +++ b/docs/build.md @@ -380,3 +380,9 @@ For detailed info, such as model/device supports, CANN install, please refer to ### Android To read documentation for how to build on Android, [click here](./android.md) + +### Arm CPU optimized mulmat kernels + +Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats. + +To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`). diff --git a/examples/quantize/README.md b/examples/quantize/README.md index 5d1e11c67..704f0d56b 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis Several quantization methods are supported. They differ in the resulting model disk size and inference speed. +The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format. 
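To make the format requirement above concrete, here is a minimal sketch (not part of the patch) of producing a `Q4_0_4_8` model through the C API rather than the `llama-quantize` tool; it assumes this revision exposes the corresponding ftype as `LLAMA_FTYPE_MOSTLY_Q4_0_4_8` in `llama.h`, and the file names are placeholders:

```cpp
// Minimal sketch: produce a Q4_0_4_8 model via the C API instead of the llama-quantize tool.
// Assumes this revision defines LLAMA_FTYPE_MOSTLY_Q4_0_4_8 in llama.h; file names are placeholders.
#include "llama.h"

#include <cstdio>

int main() {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; // Q4_0_4_4 (Neon) and Q4_0_8_8 (256-bit SVE) are the other interleaved variants

    // returns 0 on success
    if (llama_model_quantize("model-f16.gguf", "model-q4_0_4_8.gguf", &qparams) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }

    return 0;
}
```

Since the interleaved formats only rearrange the `Q4_0` block layout, the output file has the same size as a plain `Q4_0` quantization.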
+ *(outdated)* | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | From 54f376d0b92c6ff6feb1fa2ef8ed2022348100ba Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Mon, 9 Sep 2024 11:04:39 +0300 Subject: [PATCH 02/65] rpc : update README [no ci] (#9320) Update README with instructions how to offload model layers to both local and remote devices --- examples/rpc/README.md | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/examples/rpc/README.md b/examples/rpc/README.md index adedc8909..36ff4a330 100644 --- a/examples/rpc/README.md +++ b/examples/rpc/README.md @@ -10,20 +10,21 @@ This can be used for distributed LLM inference with `llama.cpp` in the following ```mermaid flowchart TD - rpcb---|TCP|srva - rpcb---|TCP|srvb - rpcb-.-|TCP|srvn + rpcb<-->|TCP|srva + rpcb<-->|TCP|srvb + rpcb<-.->|TCP|srvn subgraph hostn[Host N] - srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"] + srvn[rpc-server]<-.->backend3["Backend (CUDA,Metal,etc.)"] end subgraph hostb[Host B] - srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"] + srvb[rpc-server]<-->backend2["Backend (CUDA,Metal,etc.)"] end subgraph hosta[Host A] - srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"] + srva[rpc-server]<-->backend["Backend (CUDA,Metal,etc.)"] end subgraph host[Main Host] - ggml[llama.cpp]---rpcb[RPC backend] + local["Backend (CUDA,Metal,etc.)"]<-->ggml[llama-cli] + ggml[llama-cli]<-->rpcb[RPC backend] end style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5 ``` @@ -62,17 +63,11 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device. -On the main host build `llama.cpp` only with `-DGGML_RPC=ON`: - -```bash -mkdir build-rpc -cd build-rpc -cmake .. -DGGML_RPC=ON -cmake --build . --config Release -``` - -Finally, use the `--rpc` option to specify the host and port of each `rpc-server`: +On the main host build `llama.cpp` for the local backend and add `-DGGML_RPC=ON` to the build options. +Finally, when running `llama-cli`, use the `--rpc` option to specify the host and port of each `rpc-server`: ```bash $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99 ``` + +This way you can offload model layers to both local and remote devices. 
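For reference, the same local-plus-remote split can be requested programmatically; a minimal sketch (not taken from the patch), assuming this revision still exposes the `rpc_servers` field on `llama_model_params`, with the endpoint and model path as placeholders:

```cpp
// Minimal sketch: offload layers across the local backend and remote rpc-server instances,
// mirroring `llama-cli ... --rpc 192.168.88.10:50052 -ngl 99`.
// Assumes this revision exposes the `rpc_servers` field on llama_model_params;
// the endpoint and model path are placeholders.
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                      // offload as many layers as possible
    mparams.rpc_servers  = "192.168.88.10:50052";   // comma-separated list of rpc-server endpoints

    llama_model * model = llama_load_model_from_file("ggml-model-f16.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // ... create a context and generate as usual; layers are scheduled on local and RPC devices ...

    llama_free_model(model);
    llama_backend_free();

    return 0;
}
```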
\ No newline at end of file From 5ed087573e1f326cfa70e29c1895d074a7a1a00c Mon Sep 17 00:00:00 2001 From: Antonis Makropoulos Date: Mon, 9 Sep 2024 14:21:38 +0300 Subject: [PATCH 03/65] readme : add LLMUnity to UI projects (#9381) * add LLMUnity to UI projects * add newline to examples/rpc/README.md to fix editorconfig-checker unit test --- README.md | 1 + examples/rpc/README.md | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e30ab0c8c..8cbfdda6b 100644 --- a/README.md +++ b/README.md @@ -163,6 +163,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) - [AIKit](https://github.com/sozercan/aikit) (MIT) - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL) +- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* diff --git a/examples/rpc/README.md b/examples/rpc/README.md index 36ff4a330..312bb634d 100644 --- a/examples/rpc/README.md +++ b/examples/rpc/README.md @@ -70,4 +70,5 @@ Finally, when running `llama-cli`, use the `--rpc` option to specify the host an $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99 ``` -This way you can offload model layers to both local and remote devices. \ No newline at end of file +This way you can offload model layers to both local and remote devices. + From 8e6e2fbe1458ac91387266241262294a964d6b95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 9 Sep 2024 14:22:53 +0200 Subject: [PATCH 04/65] CUDA: fix variable name conflict for Windows build (#9382) --- ggml/src/ggml-cuda/sum.cu | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu index 0d5e953ee..21da63509 100644 --- a/ggml/src/ggml-cuda/sum.cu +++ b/ggml/src/ggml-cuda/sum.cu @@ -1,13 +1,15 @@ +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) +// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh. +// For this reason CUB must be included BEFORE anything else. 
+#include +using namespace cub; +#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) + #include "sumrows.cuh" #include "sum.cuh" #include -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) -#include -using namespace cub; -#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) - void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) { #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) size_t tmp_size = 0; From 38ca6f644bd48301e9caa80f9913c22e70a8fd1b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 9 Sep 2024 15:51:37 +0300 Subject: [PATCH 05/65] readme : update hot topics --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8cbfdda6b..c945e125c 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics -- *add hot topics here* +- Huggingface GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor) ---- From 5fb5e24811cb01d48b482c15a974bfbd9f433e1d Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 9 Sep 2024 17:10:46 +0200 Subject: [PATCH 06/65] llama : minor sampling refactor (2) (#9386) --- examples/batched.swift/Sources/main.swift | 2 - examples/batched/batched.cpp | 2 - examples/gritlm/gritlm.cpp | 1 - .../llama/src/main/cpp/llama-android.cpp | 2 - .../llama.cpp.swift/LibLlama.swift | 2 - examples/passkey/passkey.cpp | 2 - examples/save-load-state/save-load-state.cpp | 6 - examples/server/server.cpp | 2 +- examples/simple/simple.cpp | 2 - include/llama.h | 11 +- src/llama-sampling.cpp | 194 ++++++++++-------- tests/test-sampling.cpp | 2 +- 12 files changed, 115 insertions(+), 113 deletions(-) diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 4bc2bbf2c..9f7c49492 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -140,8 +140,6 @@ while n_cur <= n_len { let new_token_id = llama_sampler_sample(smpl, context, i_batch[i]) - llama_sampler_accept(smpl, new_token_id) - // is it an end of stream? -> mark the stream as finished if llama_token_is_eog(model, new_token_id) || n_cur == n_len { i_batch[i] = -1 diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index f5f309022..615d6f0f5 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -172,8 +172,6 @@ int main(int argc, char ** argv) { const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]); - llama_sampler_accept(smpl, new_token_id); - // is it an end of generation? 
-> mark the stream as finished if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { i_batch[i] = -1; diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index e1efbf573..6f060e2dc 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -121,7 +121,6 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std llama_decode(ctx, bat); llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1); - llama_sampler_accept(smpl, token); if (token == eos_token) { break; diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 06ec160c2..f611809c6 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -414,8 +414,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop( // sample the most likely token const auto new_token_id = llama_sampler_sample(sampler, context, -1); - llama_sampler_accept(sampler, new_token_id); - const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { return nullptr; diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index 92f61fe83..dcd9803a2 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -152,8 +152,6 @@ actor LlamaContext { new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1) - llama_sampler_accept(sampling, new_token_id) - if llama_token_is_eog(model, new_token_id) || n_cur == n_len { print("\n") is_done = true diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 76d235c2c..271ef3a98 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -220,8 +220,6 @@ int main(int argc, char ** argv) { { const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); - llama_sampler_accept(smpl, new_token_id); - // is it an end of generation? 
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { LOG_TEE("\n"); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index b54ec3bd8..e17ab0ed0 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -74,8 +74,6 @@ int main(int argc, char ** argv) { auto next_token = llama_sampler_sample(smpl, ctx, -1); auto next_token_str = llama_token_to_piece(ctx, next_token); - llama_sampler_accept(smpl, next_token); - printf("%s", next_token_str.c_str()); result0 += next_token_str; @@ -132,8 +130,6 @@ int main(int argc, char ** argv) { auto next_token = llama_sampler_sample(smpl2, ctx2, -1); auto next_token_str = llama_token_to_piece(ctx2, next_token); - llama_sampler_accept(smpl2, next_token); - printf("%s", next_token_str.c_str()); result1 += next_token_str; @@ -222,8 +218,6 @@ int main(int argc, char ** argv) { auto next_token = llama_sampler_sample(smpl3, ctx3, -1); auto next_token_str = llama_token_to_piece(ctx3, next_token); - llama_sampler_accept(smpl3, next_token); - printf("%s", next_token_str.c_str()); result2 += next_token_str; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9ab8f8ca6..de3ea313c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -613,7 +613,7 @@ struct server_context { gpt_params params; - llama_batch batch; + llama_batch batch = {}; bool clean_kv_cache = true; bool add_bos_token = true; diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index a53cef547..d040172a5 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -118,8 +118,6 @@ int main(int argc, char ** argv) { { const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); - llama_sampler_accept(smpl, new_token_id); - // is it an end of generation? if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { LOG_TEE("\n"); diff --git a/include/llama.h b/include/llama.h index 6334fc30d..93b3e6e85 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1127,15 +1127,16 @@ extern "C" { int32_t n_logit_bias, const llama_logit_bias * logit_bias); - // Shorthand for: + /// @details Sample and accept a token from the idx-th output of the last evaluation // + // Shorthand for: // const auto * logits = llama_get_logits_ith(ctx, idx); // llama_token_data_array cur_p = { ... init from logits ... }; // llama_sampler_apply(smpl, &cur_p); - // return cur_p.data[cur_p.selected].id; - // - // At this point, this is mostly a convenience function. 
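To make the new contract concrete: after this refactor `llama_sampler_sample()` applies the sampler chain, picks the selected token, and accepts it in a single call, which is why the explicit `llama_sampler_accept()` calls are removed above. A minimal caller-side sketch (not taken from the patch; `model` and `ctx` are assumed to be created elsewhere with the prompt already decoded):

```cpp
// Illustrative sketch of the post-refactor usage: llama_sampler_sample() samples and
// accepts the token internally, so no separate llama_sampler_accept() call is needed.
// `model` and `ctx` are assumed to exist and the prompt to be decoded already.
#include "llama.h"

#include <vector>

static std::vector<llama_token> sample_some(llama_model * model, llama_context * ctx, int n_predict) {
    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    std::vector<llama_token> out;

    for (int i = 0; i < n_predict; ++i) {
        // sample from the last set of logits; the chain is applied and the token accepted internally
        const llama_token id = llama_sampler_sample(smpl, ctx, -1);

        if (llama_token_is_eog(model, id)) {
            break;
        }
        out.push_back(id);

        // ... decode `id` back into `ctx` here before the next iteration (omitted) ...
    }

    llama_sampler_free(smpl);
    return out;
}
```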
- // + // auto token = cur_p.data[cur_p.selected].id; + // llama_sampler_accept(smpl, token); + // return token; + // Returns the sampled token LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx); // TODO: extend in the future diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 41f48ec28..6f448b80c 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -8,49 +8,44 @@ #include #include #include +#include #include #include #include -static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng, std::vector & probs) { -#if 1 - probs.resize(cur_p->size); - for (size_t i = 0; i < cur_p->size; ++i) { - probs[i] = cur_p->data[i].p; - } - - std::discrete_distribution dist(probs.begin(), probs.end()); -#else - // avoid the copy with a custom iterator +static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) { + // iterator for the probabilities +#ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-local-typedefs" +#endif struct probs_iterator { typedef std::input_iterator_tag iterator_category; typedef float value_type; typedef float * pointer; typedef float & reference; - typedef size_t difference_type; + typedef ptrdiff_t difference_type; - const llama_token_data_array * data; - size_t i; + const llama_token_data * data; - bool operator==(const probs_iterator & other) const { return data + i == other.data + other.i; } - bool operator!=(const probs_iterator & other) const { return data + i != other.data + other.i; } - float operator*() const { return data->data[i].p; } - probs_iterator & operator++() { ++i; return *this; } - probs_iterator operator++(int) { probs_iterator tmp = *this; ++i; return tmp; } + bool operator==(const probs_iterator & other) const { return data == other.data; } + bool operator!=(const probs_iterator & other) const { return data != other.data; } + const float & operator*() const { return data->p; } + probs_iterator & operator++() { ++data; return *this; } + probs_iterator operator++(int) { probs_iterator tmp = *this; ++data; return tmp; } }; + +#ifdef __GNUC__ #pragma GCC diagnostic pop - - std::discrete_distribution dist(probs_iterator{cur_p, 0}, probs_iterator{cur_p, cur_p->size}); - - GGML_UNUSED(probs); #endif + std::discrete_distribution dist(probs_iterator{cur_p->data}, probs_iterator{cur_p->data + cur_p->size}); + return dist(rng); } +/* static void llama_log_softmax(float * array, size_t size) { float max_l = *std::max_element(array, array + size); float sum = 0.f; @@ -64,6 +59,7 @@ static void llama_log_softmax(float * array, size_t size) { array[i] = logf(array[i] / sum); } } +*/ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) { GGML_ASSERT(cur_p->size > 0); @@ -231,67 +227,92 @@ llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_conte cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; } - llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false }; + llama_token_data_array cur_p = { + /* .data = */ cur.data(), + /* .size = */ cur.size(), + /* .selected = */ -1, + /* .sorted = */ false, + }; llama_sampler_apply(smpl, &cur_p); - return cur_p.data[cur_p.selected].id; + GGML_ASSERT(cur_p.selected >= 0 && cur_p.selected < (int32_t) cur_p.size); + + auto token = cur_p.data[cur_p.selected].id; + + llama_sampler_accept(smpl, token); + + return token; } // sampler chain +static const char * llama_sampler_chain_name(const struct 
llama_sampler * /*smpl*/) { + return "chain"; +} + +static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token token) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + time_meas tm(chain->t_sample_us, chain->params.no_perf); + + for (auto * smpl : chain->samplers) { + llama_sampler_accept(smpl, token); + } + + chain->n_sample++; +} + +static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + time_meas tm(chain->t_sample_us, chain->params.no_perf); + + for (auto * smpl : chain->samplers) { + llama_sampler_apply(smpl, cur_p); + } +} + +static void llama_sampler_chain_reset(struct llama_sampler * smpl) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + for (auto * smpl : chain->samplers) { + llama_sampler_reset(smpl); + } + + chain->t_sample_us = 0; + chain->n_sample = 0; +} + +static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) { + const auto * chain_src = (const llama_sampler_chain *) smpl->ctx; + + auto * result = llama_sampler_chain_init(chain_src->params); + + for (auto * smpl : chain_src->samplers) { + llama_sampler_chain_add(result, llama_sampler_clone(smpl)); + } + + return result; +} + +static void llama_sampler_chain_free(struct llama_sampler * smpl) { + auto * chain = (llama_sampler_chain *) smpl->ctx; + + for (auto * smpl : chain->samplers) { + llama_sampler_free(smpl); + } + + delete chain; +} + static struct llama_sampler_i llama_sampler_chain_i = { - /* .name = */ [](const struct llama_sampler * /*smpl*/) { return "chain"; }, - /* .accept = */ [](struct llama_sampler * smpl, llama_token token) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - time_meas tm(chain->t_sample_us, chain->params.no_perf); - - for (auto * smpl : chain->samplers) { - llama_sampler_accept(smpl, token); - } - - chain->n_sample++; - }, - /* .apply = */ [](struct llama_sampler * smpl, llama_token_data_array * cur_p) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - time_meas tm(chain->t_sample_us, chain->params.no_perf); - - for (auto * smpl : chain->samplers) { - llama_sampler_apply(smpl, cur_p); - } - }, - /* .reset = */ [](struct llama_sampler * smpl) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - for (auto * smpl : chain->samplers) { - llama_sampler_reset(smpl); - } - - chain->t_sample_us = 0; - chain->n_sample = 0; - }, - /* .clone = */ [](const struct llama_sampler * smpl) { - const auto * chain_src = (const llama_sampler_chain *) smpl->ctx; - - auto * result = llama_sampler_chain_init(chain_src->params); - - for (auto * smpl : chain_src->samplers) { - llama_sampler_chain_add(result, llama_sampler_clone(smpl)); - } - - return result; - }, - /* .free = */ [](struct llama_sampler * smpl) { - auto * chain = (llama_sampler_chain *) smpl->ctx; - - for (auto * smpl : chain->samplers) { - llama_sampler_free(smpl); - } - - delete chain; - }, + /* .name = */ llama_sampler_chain_name, + /* .accept = */ llama_sampler_chain_accept, + /* .apply = */ llama_sampler_chain_apply, + /* .reset = */ llama_sampler_chain_reset, + /* .clone = */ llama_sampler_chain_clone, + /* .free = */ llama_sampler_chain_free, }; struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params) { @@ -368,8 +389,6 @@ struct llama_sampler_dist { const uint32_t seed; std::mt19937 rng; - - std::vector probs; // work array }; static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*/) { @@ -378,7 
+397,7 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl* static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_dist *) smpl->ctx; - cur_p->selected = llama_sample_dist(cur_p, ctx->rng, ctx->probs); + cur_p->selected = llama_sample_dist(cur_p, ctx->rng); } static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sampler * smpl) { @@ -419,7 +438,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { /* .ctx = */ new llama_sampler_dist { /* .seed = */ seed, /* .rng = */ std::mt19937(seed), - /* .probs = */ {}, }, }; } @@ -1023,8 +1041,6 @@ struct llama_sampler_mirostat { float mu; std::mt19937 rng; - - std::vector probs; }; static const char * llama_sampler_mirostat_name(const struct llama_sampler * /*smpl*/) { @@ -1055,7 +1071,7 @@ static void llama_sampler_mirostat_apply(struct llama_sampler * smpl, llama_toke llama_sampler_top_k_impl(cur_p, std::max(int(k), 1)); llama_sampler_softmax_impl(cur_p); - const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs); + const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; @@ -1111,7 +1127,6 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see /* .m = */ m, /* .mu = */ 2.0f*tau, /* .rng = */ std::mt19937(seed), - /* .probs = */ {}, }, }; } @@ -1127,8 +1142,6 @@ struct llama_sampler_mirostat_v2 { float mu; std::mt19937 rng; - - std::vector probs; }; static const char * llama_sampler_mirostat_v2_name(const struct llama_sampler * /*smpl*/) { @@ -1152,7 +1165,7 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t // Normalize the probabilities of the remaining words llama_sampler_softmax_impl(cur_p); - const int idx = llama_sample_dist(cur_p, ctx->rng, ctx->probs); + const int idx = llama_sample_dist(cur_p, ctx->rng); cur_p->selected = idx; @@ -1207,7 +1220,6 @@ struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, /* .eta = */ eta, /* .mu = */ 2.0f*tau, /* .rng = */ std::mt19937(seed), - /* .probs = */ {}, }, }; } @@ -1527,6 +1539,10 @@ static const char * llama_sampler_logit_bias_name(const struct llama_sampler * / static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_logit_bias *) smpl->ctx; + if (ctx->logit_bias.empty()) { + return; + } + ctx->to_search.clear(); // update the candidates that have not been shuffled in the vocabulary (i.e. 
idx == id) @@ -1538,6 +1554,10 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to } } + if (ctx->to_search.empty()) { + return; + } + // search for the remaining candidates that were not found in the previous step for (size_t i = 0; i < cur_p->size; ++i) { for (const auto & lb : ctx->to_search) { diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 37400c179..d738b7a45 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -245,7 +245,7 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler } } - printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f min_p=%f\n", + printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%05d top_p=%f min_p=%f\n", samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p); } From 5fac4d57643b1de8e9ab746f14d2fc4e319ae0c2 Mon Sep 17 00:00:00 2001 From: Prashant Vithule <119530321+Vithulep@users.noreply.github.com> Date: Mon, 9 Sep 2024 21:07:18 +0530 Subject: [PATCH 07/65] ggml : vector length agnostic SVE support (#9290) * Implemented vector length agnostic SVE using switch case for 512-bit, 256-bit, 128-bit vector lengths * Implemented vector length agnostic SVE using switch case for 512-bit, 256-bit, 128-bit vector lengths * Removed WhiteSpaces * ggml : style changes + fix 512-bit nb loop check - fix local scope in switch cases - consistent predicate names - empty lines when necessary - opening braces, spaces - const-correctness - add asserts * Update ggml/src/ggml-quants.c Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- ggml/src/ggml-quants.c | 284 ++++++++++++++++++++++++++++++++++------- 1 file changed, 239 insertions(+), 45 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 8c31e2cca..322c85d2a 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4003,42 +4003,141 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (ggml_sve_cnt_b == QK8_0) { - const svbool_t ptrueh = svptrue_pat_b8(SV_VL16); - const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh); + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); + const int vector_length = ggml_sve_cnt_b*8; - for (; ib + 1 < nb; ib += 2) { - const block_q4_0 * restrict x0 = &x[ib + 0]; - const block_q4_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + // VLA Implementation using switch case + switch (vector_length) { + case 128: + { + // predicate for activating higher lanes for 4 float32 elements + const svbool_t ph4 = svptrue_pat_b32(SV_VL4); - // load x - const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); - const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * restrict x0 = &x[ib + 0]; + const block_q4_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; - // 4-bit -> 8-bit - const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04)); - const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04)); + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); - // sub 8 - const svint8_t 
qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); - const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); + // 4-bit -> 8-bit + const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F)); + const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04)); + const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F)); + const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04)); - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + // sub 8 + const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8); + const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8); + const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8); + const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8); - // dot product - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } + // load y + const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16); + const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs); + const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16); - sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + // dot product + sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4, + svdot_s32(svdup_n_s32(0), qx0ls, qy0l), + svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4, + svdot_s32(svdup_n_s32(0), qx1ls, qy1l), + svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 256: + { + // predicate for activating higher lanes for 16 int8 elements + const svbool_t ph16 = svptrue_pat_b8(SV_VL16); + // predicate for activating lower lanes for 16 int8 elements + const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * restrict x0 = &x[ib + 0]; + const block_q4_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs); + const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); + + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + // dot product + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), 
svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 512: + { + // predicate for activating higher lanes for 32 int8 elements + const svbool_t ph32 = svptrue_pat_b8(SV_VL32); + + // predicate for activating higher lanes for 16 int8 elements + const svbool_t ph16 = svptrue_pat_b8(SV_VL16); + // predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes + const svbool_t pl16 = svnot_b_z(ph32, ph16); + + for (; ib + 1 < nb; ib += 2) { + const block_q4_0 * restrict x0 = &x[ib + 0]; + const block_q4_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + // load x + const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs); + const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs); + + // 4-bit -> 8-bit + const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04)); + const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04)); + + // sub 8 + const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8); + const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8); + + // load y + const svint8_t qy0 = svld1_s8(ph32, y0->qs); + const svint8_t qy1 = svld1_s8(ph32, y1->qs); + + // dot product + sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32, + svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32, + svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1)); + } break; + default: + assert(false && "Unsupported vector length"); + break; } + #elif defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); float32x4_t sumv1 = vdupq_n_f32(0.0f); @@ -5488,29 +5587,124 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r float sumf = 0; #if defined(__ARM_FEATURE_SVE) - if (ggml_sve_cnt_b == QK8_0) { - svfloat32_t sumv0 = svdup_n_f32(0.0f); - svfloat32_t sumv1 = svdup_n_f32(0.0f); + svfloat32_t sumv0 = svdup_n_f32(0.0f); + svfloat32_t sumv1 = svdup_n_f32(0.0f); - for (; ib + 1 < nb; ib += 2) { - const block_q8_0 * restrict x0 = &x[ib + 0]; - const block_q8_0 * restrict x1 = &x[ib + 1]; - const block_q8_0 * restrict y0 = &y[ib + 0]; - const block_q8_0 * restrict y1 = &y[ib + 1]; + const int vector_length = ggml_sve_cnt_b*8; - // load x - const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); - const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); + //VLA Implemenation for SVE + switch (vector_length) { + case 128: + { + // predicate for activating lanes for 16 Int8 elements + const svbool_t ph16 = svptrue_pat_b8 (SV_VL16); + const svbool_t pl16 = svptrue_pat_b32(SV_VL4); - // load y - const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); - const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * restrict x0 = &x[ib + 0]; + const block_q8_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; - sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); - sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); - } + // load x + const svint8_t qx0_0 = svld1_s8(ph16, x0->qs); + const svint8_t qx0_1 = 
svld1_s8(ph16, x0->qs+16); + const svint8_t qx1_0 = svld1_s8(ph16, x1->qs); + const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16); - sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + // load y + const svint8_t qy0_0 = svld1_s8(ph16, y0->qs); + const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16); + const svint8_t qy1_0 = svld1_s8(ph16, y1->qs); + const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16); + + sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16, + svdot_s32(svdup_n_s32(0), qx0_0, qy0_0), + svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16, + svdot_s32(svdup_n_s32(0), qx1_0, qy1_0), + svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1)); + } break; + case 256: + { + //printf("sve256"); + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * restrict x0 = &x[ib + 0]; + const block_q8_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + // load x + const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); + const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); + + // load y + const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); + const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); + + sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); + sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), + svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); + } + + sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); + } break; + case 512: + { + // predicate for activating high 256 bit + const svbool_t ph32 = svptrue_pat_b8(SV_VL32); + // predicate for activating low 256 bit + const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32); + + // predicate for activating high lanes for 8 float32 elements + const svbool_t ph8 = svptrue_pat_b32(SV_VL8); + // predicate for activating low lanes for 8 float32 elements + const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8); + + svfloat32_t sumv00 = svdup_n_f32(0.0f); + + for (; ib + 1 < nb; ib += 2) { + const block_q8_0 * restrict x0 = &x[ib + 0]; + const block_q8_0 * restrict x1 = &x[ib + 1]; + const block_q8_0 * restrict y0 = &y[ib + 0]; + const block_q8_0 * restrict y1 = &y[ib + 1]; + + //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits + // and add them to make one 64 element vector + // load x + const svint8_t qx_32 = svld1_s8(ph32, x0->qs); + svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2); + + qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64); + + // load y + const svint8_t qy_32 = svld1_s8(ph32, y0->qs); + svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2); + + qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64); + + // scale creation + const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d); + const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d); + + // duplicate deq1 in first half of vector and deq2 in second half of vector + const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2); + + const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64)); + + sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp); + } + + sumf = 
svaddv_f32(svptrue_b32(), sumv00); + break; + } + default: + assert(false && "Unsupported vector length"); + break; } #elif defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); From 293bebe0773c907c0c866213856eeba41b035df1 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Mon, 9 Sep 2024 18:40:10 +0300 Subject: [PATCH 08/65] rpc : fix segfault with nkvo (#9389) * rpc : fix nkvo * rpc : buf_size must not be static ref: #9337 --------- Co-authored-by: slaren --- ggml/src/ggml-cuda.cu | 6 +++++- ggml/src/ggml-rpc.cpp | 18 ++++++++++-------- ggml/src/ggml.c | 2 +- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 982316f56..d53de4edd 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2552,7 +2552,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + + if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture #ifndef NDEBUG GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__); diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index 8f9d0a460..9c600c7ca 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -883,15 +883,17 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp } result->buffer = reinterpret_cast(tensor->buffer); if (result->buffer && buffers.find(result->buffer) == buffers.end()) { - return nullptr; + result->buffer = nullptr; } - // require that the tensor data does not go beyond the buffer end - uint64_t tensor_size = (uint64_t) ggml_nbytes(result); - uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); - uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); - GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow - GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + if (result->buffer) { + // require that the tensor data does not go beyond the buffer end + uint64_t tensor_size = (uint64_t) ggml_nbytes(result); + uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer); + uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer); + GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow + GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size); + } result->op = (ggml_op) tensor->op; for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) { @@ -1060,7 +1062,7 @@ bool rpc_server::graph_compute(const std::vector & input, std::vector ctx->mem_size) { GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + size_needed, ctx->mem_size); + __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size); assert(false); return NULL; } From bfe76d4a17228bfd1565761f203123bc4914771b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 9 Sep 2024 23:36:09 +0200 Subject: [PATCH 09/65] common : 
move arg parser code to `arg.cpp` (#9388) * common : move arg parser to arg.cpp * better categorize args * add cmake * missing climits * missing cstdarg * common : more explicit includes * fix build * refactor gpt_params_parse * update server readme * fix test --------- Co-authored-by: Georgi Gerganov --- Makefile | 6 + common/CMakeLists.txt | 2 + common/arg.cpp | 1994 +++++++++++++++++ common/arg.h | 77 + common/common.cpp | 1929 ---------------- common/common.h | 214 +- common/sampling.cpp | 5 +- common/sampling.h | 54 +- examples/batched-bench/batched-bench.cpp | 4 +- examples/batched/batched.cpp | 4 +- .../cvector-generator/cvector-generator.cpp | 4 +- examples/cvector-generator/pca.hpp | 5 +- examples/embedding/embedding.cpp | 4 +- examples/eval-callback/eval-callback.cpp | 4 +- examples/export-lora/export-lora.cpp | 4 +- examples/gen-docs/gen-docs.cpp | 5 +- examples/gritlm/gritlm.cpp | 4 +- examples/imatrix/imatrix.cpp | 4 +- examples/infill/infill.cpp | 6 +- examples/llava/llava-cli.cpp | 10 +- examples/llava/minicpmv-cli.cpp | 7 +- examples/lookahead/lookahead.cpp | 5 +- examples/lookup/lookup-create.cpp | 10 +- examples/lookup/lookup-stats.cpp | 8 +- examples/lookup/lookup.cpp | 7 +- examples/main/main.cpp | 7 +- examples/parallel/parallel.cpp | 5 +- examples/passkey/passkey.cpp | 4 +- examples/perplexity/perplexity.cpp | 14 +- examples/retrieval/retrieval.cpp | 4 +- examples/save-load-state/save-load-state.cpp | 4 +- examples/server/README.md | 19 +- examples/server/server.cpp | 5 +- examples/simple/simple.cpp | 4 +- examples/speculative/speculative.cpp | 7 +- tests/test-arg-parser.cpp | 42 +- 36 files changed, 2281 insertions(+), 2210 deletions(-) create mode 100644 common/arg.cpp create mode 100644 common/arg.h diff --git a/Makefile b/Makefile index 6053bc17b..97ef37c0e 100644 --- a/Makefile +++ b/Makefile @@ -925,6 +925,7 @@ OBJ_LLAMA = \ OBJ_COMMON = \ common/common.o \ + common/arg.o \ common/console.o \ common/ngram-cache.o \ common/sampling.o \ @@ -1157,6 +1158,11 @@ common/common.o: \ include/llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ +common/arg.o: \ + common/arg.cpp \ + common/arg.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + common/sampling.o: \ common/sampling.cpp \ common/sampling.h \ diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 2c72793b8..22fd99689 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -54,6 +54,8 @@ add_library(${TARGET} STATIC base64.hpp common.h common.cpp + arg.h + arg.cpp sampling.h sampling.cpp console.h diff --git a/common/arg.cpp b/common/arg.cpp new file mode 100644 index 000000000..c5134be51 --- /dev/null +++ b/common/arg.cpp @@ -0,0 +1,1994 @@ +#include "arg.h" + +#include "sampling.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "json-schema-to-grammar.h" + +using json = nlohmann::ordered_json; + +llama_arg & llama_arg::set_examples(std::initializer_list examples) { + this->examples = std::move(examples); + return *this; +} + +llama_arg & llama_arg::set_env(const char * env) { + help = help + "\n(env: " + env + ")"; + this->env = env; + return *this; +} + +llama_arg & llama_arg::set_sparam() { + is_sparam = true; + return *this; +} + +bool llama_arg::in_example(enum llama_example ex) { + return examples.find(ex) != examples.end(); +} + +bool llama_arg::get_value_from_env(std::string & output) { + if (env == nullptr) return false; + char * value = std::getenv(env); + if (value) { + output = value; + return true; + } + return false; +} + +bool 
llama_arg::has_value_from_env() { + return env != nullptr && std::getenv(env); +} + +static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { + std::vector result; + std::istringstream iss(input); + std::string line; + auto add_line = [&](const std::string& l) { + if (l.length() <= max_char_per_line) { + result.push_back(l); + } else { + std::istringstream line_stream(l); + std::string word, current_line; + while (line_stream >> word) { + if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) { + if (!current_line.empty()) result.push_back(current_line); + current_line = word; + } else { + current_line += (!current_line.empty() ? " " : "") + word; + } + } + if (!current_line.empty()) result.push_back(current_line); + } + }; + while (std::getline(iss, line)) { + add_line(line); + } + return result; +} + +std::string llama_arg::to_string() { + // params for printing to console + const static int n_leading_spaces = 40; + const static int n_char_per_line_help = 70; // TODO: detect this based on current console + std::string leading_spaces(n_leading_spaces, ' '); + + std::ostringstream ss; + for (const auto arg : args) { + if (arg == args.front()) { + if (args.size() == 1) { + ss << arg; + } else { + // first arg is usually abbreviation, we need padding to make it more beautiful + auto tmp = std::string(arg) + ", "; + auto spaces = std::string(std::max(0, 7 - (int)tmp.size()), ' '); + ss << tmp << spaces; + } + } else { + ss << arg << (arg != args.back() ? ", " : ""); + } + } + if (value_hint) ss << " " << value_hint; + if (value_hint_2) ss << " " << value_hint_2; + if (ss.tellp() > n_leading_spaces - 3) { + // current line is too long, add new line + ss << "\n" << leading_spaces; + } else { + // padding between arg and help, same line + ss << std::string(leading_spaces.size() - ss.tellp(), ' '); + } + const auto help_lines = break_str_into_lines(help, n_char_per_line_help); + for (const auto & line : help_lines) { + ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; + } + return ss.str(); +} + +// +// utils +// + +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#endif + +LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) +static std::string format(const char * fmt, ...) 
{ + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::vector buf(size + 1); + int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + return std::string(buf.data(), size); +} + +static void gpt_params_handle_model_default(gpt_params & params) { + if (!params.hf_repo.empty()) { + // short-hand to avoid specifying --hf-file -> default it to --model + if (params.hf_file.empty()) { + if (params.model.empty()) { + throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); + } + params.hf_file = params.model; + } else if (params.model.empty()) { + params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); + } + } else if (!params.model_url.empty()) { + if (params.model.empty()) { + auto f = string_split(params.model_url, '#').front(); + f = string_split(f, '?').front(); + params.model = fs_get_cache_file(string_split(f, '/').back()); + } + } else if (params.model.empty()) { + params.model = DEFAULT_MODEL_PATH; + } +} + +// +// CLI argument parsing functions +// + +static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx_arg) { + std::string arg; + const std::string arg_prefix = "--"; + gpt_params & params = ctx_arg.params; + gpt_sampler_params & sparams = params.sparams; + + std::unordered_map arg_to_options; + for (auto & opt : ctx_arg.options) { + for (const auto & arg : opt.args) { + arg_to_options[arg] = &opt; + } + } + + // handle environment variables + for (auto & opt : ctx_arg.options) { + std::string value; + if (opt.get_value_from_env(value)) { + try { + if (opt.handler_void && (value == "1" || value == "true")) { + opt.handler_void(params); + } + if (opt.handler_int) { + opt.handler_int(params, std::stoi(value)); + } + if (opt.handler_string) { + opt.handler_string(params, value); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(format( + "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); + } + } + } + + // handle command line arguments + auto check_arg = [&](int i) { + if (i+1 >= argc) { + throw std::invalid_argument("expected value for argument"); + } + }; + + for (int i = 1; i < argc; i++) { + const std::string arg_prefix = "--"; + + std::string arg = argv[i]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + if (arg_to_options.find(arg) == arg_to_options.end()) { + throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); + } + auto opt = *arg_to_options[arg]; + if (opt.has_value_from_env()) { + fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); + } + try { + if (opt.handler_void) { + opt.handler_void(params); + continue; + } + + // arg with single value + check_arg(i); + std::string val = argv[++i]; + if (opt.handler_int) { + opt.handler_int(params, std::stoi(val)); + continue; + } + if (opt.handler_string) { + opt.handler_string(params, val); + continue; + } + + // arg with 2 values + check_arg(i); + std::string val2 = argv[++i]; + if (opt.handler_str_str) { + opt.handler_str_str(params, val, val2); + continue; + } + } catch (std::exception & e) { + throw std::invalid_argument(format( + "error while handling argument \"%s\": %s\n\n" + "usage:\n%s\n\nto show complete usage, run with -h", + arg.c_str(), 
e.what(), arg_to_options[arg]->to_string().c_str())); + } + } + + postprocess_cpu_params(params.cpuparams, nullptr); + postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams); + postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch); + + if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { + throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); + } + + gpt_params_handle_model_default(params); + + if (params.escape) { + string_process_escapes(params.prompt); + string_process_escapes(params.input_prefix); + string_process_escapes(params.input_suffix); + for (auto & antiprompt : params.antiprompt) { + string_process_escapes(antiprompt); + } + } + + if (!params.kv_overrides.empty()) { + params.kv_overrides.emplace_back(); + params.kv_overrides.back().key[0] = 0; + } + + if (sparams.seed == LLAMA_DEFAULT_SEED) { + sparams.seed = time(NULL); + } + + return true; +} + +static void gpt_params_print_usage(gpt_params_context & ctx_arg) { + auto print_options = [](std::vector & options) { + for (llama_arg * opt : options) { + printf("%s", opt->to_string().c_str()); + } + }; + + std::vector common_options; + std::vector sparam_options; + std::vector specific_options; + for (auto & opt : ctx_arg.options) { + // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example + if (opt.is_sparam) { + sparam_options.push_back(&opt); + } else if (opt.in_example(ctx_arg.ex)) { + specific_options.push_back(&opt); + } else { + common_options.push_back(&opt); + } + } + printf("----- common params -----\n\n"); + print_options(common_options); + printf("\n\n----- sampling params -----\n\n"); + print_options(sparam_options); + // TODO: maybe convert enum llama_example to string + printf("\n\n----- example-specific params -----\n\n"); + print_options(specific_options); +} + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { + auto ctx_arg = gpt_params_parser_init(params, ex, print_usage); + const gpt_params params_org = ctx_arg.params; // the example can modify the default params + + try { + if (!gpt_params_parse_ex(argc, argv, ctx_arg)) { + ctx_arg.params = params_org; + return false; + } + if (ctx_arg.params.usage) { + gpt_params_print_usage(ctx_arg); + if (ctx_arg.print_usage) { + ctx_arg.print_usage(argc, argv); + } + exit(0); + } + } catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + ctx_arg.params = params_org; + return false; + } + + return true; +} + +gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { + gpt_params_context ctx_arg(params); + ctx_arg.print_usage = print_usage; + ctx_arg.ex = ex; + + std::string sampler_type_chars; + std::string sampler_type_names; + for (const auto & sampler : params.sparams.samplers) { + sampler_type_chars += gpt_sampler_type_to_chr(sampler); + sampler_type_names += gpt_sampler_type_to_str(sampler) + ";"; + } + sampler_type_names.pop_back(); + + + /** + * filter options by example + * rules: + * - all examples inherit options from LLAMA_EXAMPLE_COMMON + * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example + * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example + */ + auto add_opt = 
[&](llama_arg arg) { + if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { + ctx_arg.options.push_back(std::move(arg)); + } + }; + + + add_opt(llama_arg( + {"-h", "--help", "--usage"}, + "print usage and exit", + [](gpt_params & params) { + params.usage = true; + } + )); + add_opt(llama_arg( + {"--version"}, + "show version and build info", + [](gpt_params &) { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + )); + add_opt(llama_arg( + {"-v", "--verbose"}, + "print verbose information", + [](gpt_params & params) { + params.verbosity = 1; + } + )); + add_opt(llama_arg( + {"--verbosity"}, "N", + format("set specific verbosity level (default: %d)", params.verbosity), + [](gpt_params & params, int value) { + params.verbosity = value; + } + )); + add_opt(llama_arg( + {"--verbose-prompt"}, + format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), + [](gpt_params & params) { + params.verbose_prompt = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--no-display-prompt"}, + format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), + [](gpt_params & params) { + params.display_prompt = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-co", "--color"}, + format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), + [](gpt_params & params) { + params.use_color = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"-t", "--threads"}, "N", + format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), + [](gpt_params & params, int value) { + params.cpuparams.n_threads = value; + if (params.cpuparams.n_threads <= 0) { + params.cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_env("LLAMA_ARG_THREADS")); + add_opt(llama_arg( + {"-tb", "--threads-batch"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads)", + [](gpt_params & params, int value) { + params.cpuparams_batch.n_threads = value; + if (params.cpuparams_batch.n_threads <= 0) { + params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + )); + add_opt(llama_arg( + {"-td", "--threads-draft"}, "N", + "number of threads to use during generation (default: same as --threads)", + [](gpt_params & params, int value) { + params.draft_cpuparams.n_threads = value; + if (params.draft_cpuparams.n_threads <= 0) { + params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-tbd", "--threads-batch-draft"}, "N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)", + [](gpt_params & params, int value) { + params.draft_cpuparams_batch.n_threads = value; + if (params.draft_cpuparams_batch.n_threads <= 0) { + params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-C", "--cpu-mask"}, "M", + "CPU affinity mask: arbitrarily long hex. 
Complements cpu-range (default: \"\")", + [](gpt_params & params, const std::string & mask) { + params.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + )); + add_opt(llama_arg( + {"-Cr", "--cpu-range"}, "lo-hi", + "range of CPUs for affinity. Complements --cpu-mask", + [](gpt_params & params, const std::string & range) { + params.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + )); + add_opt(llama_arg( + {"--cpu-strict"}, "<0|1>", + format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), + [](gpt_params & params, const std::string & value) { + params.cpuparams.strict_cpu = std::stoul(value); + } + )); + add_opt(llama_arg( + {"--prio"}, "N", + format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.cpuparams.priority = (enum ggml_sched_priority) prio; + } + )); + add_opt(llama_arg( + {"--poll"}, "<0...100>", + format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), + [](gpt_params & params, const std::string & value) { + params.cpuparams.poll = std::stoul(value); + } + )); + add_opt(llama_arg( + {"-Cb", "--cpu-mask-batch"}, "M", + "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", + [](gpt_params & params, const std::string & mask) { + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + )); + add_opt(llama_arg( + {"-Crb", "--cpu-range-batch"}, "lo-hi", + "ranges of CPUs for affinity. Complements --cpu-mask-batch", + [](gpt_params & params, const std::string & range) { + params.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { + throw std::invalid_argument("invalid range"); + } + } + )); + add_opt(llama_arg( + {"--cpu-strict-batch"}, "<0|1>", + "use strict CPU placement (default: same as --cpu-strict)", + [](gpt_params & params, int value) { + params.cpuparams_batch.strict_cpu = value; + } + )); + add_opt(llama_arg( + {"--prio-batch"}, "N", + format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), + [](gpt_params & params, int prio) { + if (prio < 0 || prio > 3) { + throw std::invalid_argument("invalid value"); + } + params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + } + )); + add_opt(llama_arg( + {"--poll-batch"}, "<0|1>", + "use polling to wait for work (default: same as --poll)", + [](gpt_params & params, int value) { + params.cpuparams_batch.poll = value; + } + )); + add_opt(llama_arg( + {"-Cd", "--cpu-mask-draft"}, "M", + "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", + [](gpt_params & params, const std::string & mask) { + params.draft_cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { + throw std::invalid_argument("invalid cpumask"); + } + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-Crd", "--cpu-range-draft"}, "lo-hi", + "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft",
+        [](gpt_params & params, const std::string & range) {
+            params.draft_cpuparams.mask_valid = true;
+            if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid range");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--cpu-strict-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: same as --cpu-strict)",
+        [](gpt_params & params, int value) {
+            params.draft_cpuparams.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--prio-draft"}, "N",
+        format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
+        [](gpt_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--poll-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: same as --poll)",
+        [](gpt_params & params, int value) {
+            params.draft_cpuparams.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"-Cbd", "--cpu-mask-batch-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](gpt_params & params, const std::string & mask) {
+            params.draft_cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
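The mask and range spellings populate the same affinity set. A rough sketch of the inclusive `lo-hi` form (hypothetical helper, not the actual `parse_cpu_range` from common.cpp; the mask size is assumed):

```cpp
#include <string>

// hypothetical stand-in for the real parser: fills an inclusive CPU range,
// e.g. "0-7" enables logical CPUs 0..7, just like --cpu-mask ff
static bool sketch_parse_cpu_range(const std::string & range, bool (&mask)[512]) {
    const size_t dash = range.find('-');
    if (dash == std::string::npos) {
        return false;
    }
    const int lo = std::stoi(range.substr(0, dash));
    const int hi = std::stoi(range.substr(dash + 1));
    if (lo < 0 || hi >= 512 || lo > hi) {
        return false;
    }
    for (int i = lo; i <= hi; ++i) {
        mask[i] = true; // inclusive on both ends
    }
    return true;
}
```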
+    add_opt(llama_arg(
+        {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-batch-draft",
+        [](gpt_params & params, const std::string & range) {
+            params.draft_cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid range");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--cpu-strict-batch-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: same as --cpu-strict-draft)",
+        [](gpt_params & params, int value) {
+            params.draft_cpuparams_batch.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--prio-batch-draft"}, "N",
+        format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
+        [](gpt_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--poll-batch-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: same as --poll-draft)",
+        [](gpt_params & params, int value) {
+            params.draft_cpuparams_batch.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"--draft"}, "N",
+        format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
+        [](gpt_params & params, int value) {
+            params.n_draft = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+    add_opt(llama_arg(
+        {"-ps", "--p-split"}, "N",
+        format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
+        [](gpt_params & params, const std::string & value) {
+            params.p_split = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"-lcs", "--lookup-cache-static"}, "FNAME",
+        "path to static lookup cache to use for lookup decoding (not updated by generation)",
+        [](gpt_params & params, const std::string & value) {
+            params.lookup_cache_static = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
+    add_opt(llama_arg(
+        {"-lcd", "--lookup-cache-dynamic"}, "FNAME",
+        "path to dynamic lookup cache to use for lookup decoding (updated by generation)",
+        [](gpt_params & params, const std::string & value) {
+            params.lookup_cache_dynamic = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_LOOKUP}));
+    add_opt(llama_arg(
+        {"-c", "--ctx-size"}, "N",
+        format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
+        [](gpt_params & params, int value) {
+            params.n_ctx = value;
+        }
+    ).set_env("LLAMA_ARG_CTX_SIZE"));
+    add_opt(llama_arg(
+        {"-n", "--predict", "--n-predict"}, "N",
+        format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict),
+        [](gpt_params & params, int value) {
+            params.n_predict = value;
+        }
+    ).set_env("LLAMA_ARG_N_PREDICT"));
+    add_opt(llama_arg(
+        {"-b", "--batch-size"}, "N",
+        format("logical maximum batch size (default: %d)", params.n_batch),
+        [](gpt_params & params, int value) {
+            params.n_batch = value;
+        }
+    ).set_env("LLAMA_ARG_BATCH"));
+    add_opt(llama_arg(
+        {"-ub", "--ubatch-size"}, "N",
+        format("physical maximum batch size (default: %d)", params.n_ubatch),
+        [](gpt_params & params, int value) {
+            params.n_ubatch = value;
+        }
+    ).set_env("LLAMA_ARG_UBATCH"));
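`-b` is the logical batch (how many tokens a caller may submit at once) while `-ub` is the physical micro-batch actually executed. A sketch of the clamping relationship (an assumption drawn from the two help strings, not code from this patch):

```cpp
#include <algorithm>

// the physical micro-batch can never usefully exceed the logical batch;
// a value of 0 falls back to the logical batch size
static int effective_ubatch(int n_batch, int n_ubatch) {
    return std::min(n_batch, n_ubatch == 0 ? n_batch : n_ubatch);
}
```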
= all)", params.n_keep), + [](gpt_params & params, int value) { + params.n_keep = value; + } + )); + add_opt(llama_arg( + {"--chunks"}, "N", + format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), + [](gpt_params & params, int value) { + params.n_chunks = value; + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"-fa", "--flash-attn"}, + format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), + [](gpt_params & params) { + params.flash_attn = true; + } + ).set_env("LLAMA_ARG_FLASH_ATTN")); + add_opt(llama_arg( + {"-p", "--prompt"}, "PROMPT", + ex == LLAMA_EXAMPLE_MAIN + ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" + : "prompt to start generation with", + [](gpt_params & params, const std::string & value) { + params.prompt = value; + } + )); + add_opt(llama_arg( + {"-f", "--file"}, "FNAME", + "a file containing the prompt (default: none)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); + if (!params.prompt.empty() && params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + } + )); + add_opt(llama_arg( + {"--in-file"}, "FNAME", + "an input file (repeat to specify multiple files)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + params.in_files.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_IMATRIX})); + add_opt(llama_arg( + {"-bf", "--binary-file"}, "FNAME", + "binary file containing the prompt (default: none)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + // store the external file name in params + params.prompt_file = value; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); + } + )); + add_opt(llama_arg( + {"-e", "--escape"}, + format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false"), + [](gpt_params & params) { + params.escape = true; + } + )); + add_opt(llama_arg( + {"--no-escape"}, + "do not process escape sequences", + [](gpt_params & params) { + params.escape = false; + } + )); + add_opt(llama_arg( + {"-ptc", "--print-token-count"}, "N", + format("print token count every N tokens (default: %d)", params.n_print), + [](gpt_params & params, int value) { + params.n_print = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache"}, "FNAME", + "file to cache prompt state for faster startup (default: none)", + [](gpt_params & params, const std::string & value) { + params.path_prompt_cache = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache-all"}, + "if specified, saves user input and generations to cache as well\n", + [](gpt_params & params) { + params.prompt_cache_all = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--prompt-cache-ro"}, + "if specified, uses the prompt cache but does not update it", + [](gpt_params & params) { + params.prompt_cache_ro = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-r", "--reverse-prompt"}, "PROMPT", + "halt generation at PROMPT, return control in interactive mode\n", + [](gpt_params & params, const std::string & value) { + params.antiprompt.emplace_back(value); + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-sp", "--special"}, + format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), + [](gpt_params & params) { + params.special = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-cnv", "--conversation"}, + format( + "run in conversation mode:\n" + "- does not print special tokens and suffix/prefix\n" + "- interactive mode is also enabled\n" + "(default: %s)", + params.conversation ? "true" : "false" + ), + [](gpt_params & params) { + params.conversation = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-i", "--interactive"}, + format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), + [](gpt_params & params) { + params.interactive = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-if", "--interactive-first"}, + format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false"), + [](gpt_params & params) { + params.interactive_first = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"-mli", "--multiline-input"}, + "allows you to write or paste multiple lines without ending each in '\\'", + [](gpt_params & params) { + params.multiline_input = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--in-prefix-bos"}, + "prefix BOS to user inputs, preceding the `--in-prefix` string", + [](gpt_params & params) { + params.input_prefix_bos = true; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--in-prefix"}, "STRING", + "string to prefix user inputs with (default: empty)", + [](gpt_params & params, const std::string & value) { + params.input_prefix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--in-suffix"}, "STRING", + "string to suffix after user inputs with (default: empty)", + [](gpt_params & params, const std::string & value) { + params.input_suffix = value; + params.enable_chat_template = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--no-warmup"}, + "skip warming up the model with an empty run", + [](gpt_params & params) { + params.warmup = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); + add_opt(llama_arg( + {"--spm-infill"}, + format( + "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", + params.spm_infill ? "enabled" : "disabled" + ), + [](gpt_params & params) { + params.spm_infill = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"--samplers"}, "SAMPLERS", + format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), + [](gpt_params & params, const std::string & value) { + const auto sampler_names = string_split(value, ';'); + params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true); + } + ).set_sparam()); + add_opt(llama_arg( + {"-s", "--seed"}, "SEED", + format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed), + [](gpt_params & params, const std::string & value) { + params.sparams.seed = std::stoul(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--sampling-seq"}, "SEQUENCE", + format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), + [](gpt_params & params, const std::string & value) { + params.sparams.samplers = gpt_sampler_types_from_chars(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--ignore-eos"}, + "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", + [](gpt_params & params) { + params.sparams.ignore_eos = true; + } + ).set_sparam()); + add_opt(llama_arg( + {"--penalize-nl"}, + format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? 
"true" : "false"), + [](gpt_params & params) { + params.sparams.penalize_nl = true; + } + ).set_sparam()); + add_opt(llama_arg( + {"--temp"}, "N", + format("temperature (default: %.1f)", (double)params.sparams.temp), + [](gpt_params & params, const std::string & value) { + params.sparams.temp = std::stof(value); + params.sparams.temp = std::max(params.sparams.temp, 0.0f); + } + ).set_sparam()); + add_opt(llama_arg( + {"--top-k"}, "N", + format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), + [](gpt_params & params, int value) { + params.sparams.top_k = value; + } + ).set_sparam()); + add_opt(llama_arg( + {"--top-p"}, "N", + format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), + [](gpt_params & params, const std::string & value) { + params.sparams.top_p = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--min-p"}, "N", + format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), + [](gpt_params & params, const std::string & value) { + params.sparams.min_p = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--tfs"}, "N", + format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), + [](gpt_params & params, const std::string & value) { + params.sparams.tfs_z = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--typical"}, "N", + format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), + [](gpt_params & params, const std::string & value) { + params.sparams.typ_p = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--repeat-last-n"}, "N", + format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), + [](gpt_params & params, int value) { + params.sparams.penalty_last_n = value; + params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); + } + ).set_sparam()); + add_opt(llama_arg( + {"--repeat-penalty"}, "N", + format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_repeat = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--presence-penalty"}, "N", + format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_present = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--frequency-penalty"}, "N", + format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), + [](gpt_params & params, const std::string & value) { + params.sparams.penalty_freq = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--dynatemp-range"}, "N", + format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), + [](gpt_params & params, const std::string & value) { + params.sparams.dynatemp_range = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--dynatemp-exp"}, "N", + format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), + [](gpt_params & params, const std::string & value) { + params.sparams.dynatemp_exponent = std::stof(value); + } + ).set_sparam()); + add_opt(llama_arg( + {"--mirostat"}, "N", + format("use 
+    add_opt(llama_arg(
+        {"--mirostat"}, "N",
+        format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
+               "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
+        [](gpt_params & params, int value) {
+            params.sparams.mirostat = value;
+        }
+    ).set_sparam());
+    add_opt(llama_arg(
+        {"--mirostat-lr"}, "N",
+        format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.mirostat_eta = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(llama_arg(
+        {"--mirostat-ent"}, "N",
+        format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.mirostat_tau = std::stof(value);
+        }
+    ).set_sparam());
+    add_opt(llama_arg(
+        {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS",
+        "modifies the likelihood of token appearing in the completion,\n"
+        "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
+        "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'",
+        [](gpt_params & params, const std::string & value) {
+            std::stringstream ss(value);
+            llama_token key;
+            char sign;
+            std::string value_str;
+            try {
+                if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
+                    const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                    params.sparams.logit_bias.push_back({key, bias});
+                } else {
+                    throw std::invalid_argument("invalid input format");
+                }
+            } catch (const std::exception&) {
+                throw std::invalid_argument("invalid input format");
+            }
+        }
+    ).set_sparam());
+    add_opt(llama_arg(
+        {"--grammar"}, "GRAMMAR",
+        format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.grammar = value;
+        }
+    ).set_sparam());
+    add_opt(llama_arg(
+        {"--grammar-file"}, "FNAME",
+        "file to read grammar from",
+        [](gpt_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::copy(
+                std::istreambuf_iterator<char>(file),
+                std::istreambuf_iterator<char>(),
+                std::back_inserter(params.sparams.grammar)
+            );
+        }
+    ).set_sparam());
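A grammar can be given inline or from a file; for instance (illustrative command lines, assuming a model at `model.gguf` and the stock `grammars/json.gbnf` shipped in the repository):

```cpp
// constrain the reply to a literal yes/no:
//   llama-cli -m model.gguf -p "Is water wet? " --grammar 'root ::= "yes" | "no"'
// constrain the reply to valid JSON using a grammar file:
//   llama-cli -m model.gguf -p "List 3 fruits: " --grammar-file grammars/json.gbnf
```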
+    add_opt(llama_arg(
+        {"-j", "--json-schema"}, "SCHEMA",
+        "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + examples/json_schema_to_grammar.py instead",
+        [](gpt_params & params, const std::string & value) {
+            params.sparams.grammar = json_schema_to_grammar(json::parse(value));
+        }
+    ).set_sparam());
+    add_opt(llama_arg(
+        {"--pooling"}, "{none,mean,cls,last}",
+        "pooling type for embeddings, use model default if unspecified",
+        [](gpt_params & params, const std::string & value) {
+            /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
+            else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
+            else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+            else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(llama_arg(
+        {"--attention"}, "{causal,non-causal}",
+        "attention type for embeddings, use model default if unspecified",
+        [](gpt_params & params, const std::string & value) {
+            /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
+            else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(llama_arg(
+        {"--rope-scaling"}, "{none,linear,yarn}",
+        "RoPE frequency scaling method, defaults to linear unless specified by the model",
+        [](gpt_params & params, const std::string & value) {
+            /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
+            else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
+            else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ));
+    add_opt(llama_arg(
+        {"--rope-scale"}, "N",
+        "RoPE context scaling factor, expands context by a factor of N",
+        [](gpt_params & params, const std::string & value) {
+            params.rope_freq_scale = 1.0f / std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--rope-freq-base"}, "N",
+        "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)",
+        [](gpt_params & params, const std::string & value) {
+            params.rope_freq_base = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--rope-freq-scale"}, "N",
+        "RoPE frequency scaling factor, expands context by a factor of 1/N",
+        [](gpt_params & params, const std::string & value) {
+            params.rope_freq_scale = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--yarn-orig-ctx"}, "N",
+        format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx),
+        [](gpt_params & params, int value) {
+            params.yarn_orig_ctx = value;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--yarn-ext-factor"}, "N",
+        format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor),
+        [](gpt_params & params, const std::string & value) {
+            params.yarn_ext_factor = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--yarn-attn-factor"}, "N",
+        format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor),
+        [](gpt_params & params, const std::string & value) {
+            params.yarn_attn_factor = std::stof(value);
+        }
+    ));
+    add_opt(llama_arg(
+        {"--yarn-beta-slow"}, "N",
+        format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow),
+        [](gpt_params & 
params, const std::string & value) { + params.yarn_beta_slow = std::stof(value); + } + )); + add_opt(llama_arg( + {"--yarn-beta-fast"}, "N", + format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), + [](gpt_params & params, const std::string & value) { + params.yarn_beta_fast = std::stof(value); + } + )); + add_opt(llama_arg( + {"-gan", "--grp-attn-n"}, "N", + format("group-attention factor (default: %d)", params.grp_attn_n), + [](gpt_params & params, int value) { + params.grp_attn_n = value; + } + )); + add_opt(llama_arg( + {"-gaw", "--grp-attn-w"}, "N", + format("group-attention width (default: %.1f)", (double)params.grp_attn_w), + [](gpt_params & params, int value) { + params.grp_attn_w = value; + } + )); + add_opt(llama_arg( + {"-dkvc", "--dump-kv-cache"}, + "verbose print of the KV cache", + [](gpt_params & params) { + params.dump_kv_cache = true; + } + )); + add_opt(llama_arg( + {"-nkvo", "--no-kv-offload"}, + "disable KV offload", + [](gpt_params & params) { + params.no_kv_offload = true; + } + )); + add_opt(llama_arg( + {"-ctk", "--cache-type-k"}, "TYPE", + format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), + [](gpt_params & params, const std::string & value) { + // TODO: get the type right here + params.cache_type_k = value; + } + )); + add_opt(llama_arg( + {"-ctv", "--cache-type-v"}, "TYPE", + format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), + [](gpt_params & params, const std::string & value) { + // TODO: get the type right here + params.cache_type_v = value; + } + )); + add_opt(llama_arg( + {"--perplexity", "--all-logits"}, + format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), + [](gpt_params & params) { + params.logits_all = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--hellaswag"}, + "compute HellaSwag score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.hellaswag = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--hellaswag-tasks"}, "N", + format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), + [](gpt_params & params, int value) { + params.hellaswag_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--winogrande"}, + "compute Winogrande score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.winogrande = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--winogrande-tasks"}, "N", + format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), + [](gpt_params & params, int value) { + params.winogrande_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--multiple-choice"}, + "compute multiple choice score over random tasks from datafile supplied with -f", + [](gpt_params & params) { + params.multiple_choice = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--multiple-choice-tasks"}, "N", + format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), + [](gpt_params & params, int value) { + params.multiple_choice_tasks = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--kl-divergence"}, + "computes KL-divergence to logits provided via 
--kl-divergence-base", + [](gpt_params & params) { + params.kl_divergence = true; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--save-all-logits", "--kl-divergence-base"}, "FNAME", + "set logits file", + [](gpt_params & params, const std::string & value) { + params.logits_file = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--ppl-stride"}, "N", + format("stride for perplexity calculation (default: %d)", params.ppl_stride), + [](gpt_params & params, int value) { + params.ppl_stride = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"--ppl-output-type"}, "<0|1>", + format("output type for perplexity calculation (default: %d)", params.ppl_output_type), + [](gpt_params & params, int value) { + params.ppl_output_type = value; + } + ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); + add_opt(llama_arg( + {"-dt", "--defrag-thold"}, "N", + format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), + [](gpt_params & params, const std::string & value) { + params.defrag_thold = std::stof(value); + } + ).set_env("LLAMA_ARG_DEFRAG_THOLD")); + add_opt(llama_arg( + {"-np", "--parallel"}, "N", + format("number of parallel sequences to decode (default: %d)", params.n_parallel), + [](gpt_params & params, int value) { + params.n_parallel = value; + } + )); + add_opt(llama_arg( + {"-ns", "--sequences"}, "N", + format("number of sequences to decode (default: %d)", params.n_sequences), + [](gpt_params & params, int value) { + params.n_sequences = value; + } + ).set_examples({LLAMA_EXAMPLE_PARALLEL})); + add_opt(llama_arg( + {"-cb", "--cont-batching"}, + format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), + [](gpt_params & params) { + params.cont_batching = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); + add_opt(llama_arg( + {"-nocb", "--no-cont-batching"}, + "disable continuous batching", + [](gpt_params & params) { + params.cont_batching = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); + add_opt(llama_arg( + {"--mmproj"}, "FILE", + "path to a multimodal projector file for LLaVA. see examples/llava/README.md", + [](gpt_params & params, const std::string & value) { + params.mmproj = value; + } + ).set_examples({LLAMA_EXAMPLE_LLAVA})); + add_opt(llama_arg( + {"--image"}, "FILE", + "path to an image file. use with multimodal models. 
Specify multiple times for batching",
+        [](gpt_params & params, const std::string & value) {
+            params.image.emplace_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+#ifdef GGML_USE_RPC
+    add_opt(llama_arg(
+        {"--rpc"}, "SERVERS",
+        "comma separated list of RPC servers",
+        [](gpt_params & params, const std::string & value) {
+            params.rpc_servers = value;
+        }
+    ));
+#endif
+    add_opt(llama_arg(
+        {"--mlock"},
+        "force system to keep model in RAM rather than swapping or compressing",
+        [](gpt_params & params) {
+            params.use_mlock = true;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--no-mmap"},
+        "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
+        [](gpt_params & params) {
+            params.use_mmap = false;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--numa"}, "TYPE",
+        "attempt optimizations that help on some NUMA systems\n"
+        "- distribute: spread execution evenly over all nodes\n"
+        "- isolate: only spawn threads on CPUs on the node that execution started on\n"
+        "- numactl: use the CPU map provided by numactl\n"
+        "if run without this previously, it is recommended to drop the system page cache before using this\n"
+        "see https://github.com/ggerganov/llama.cpp/issues/1437",
+        [](gpt_params & params, const std::string & value) {
+            /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+            else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+            else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+            else { throw std::invalid_argument("invalid value"); }
+        }
+    ));
+    add_opt(llama_arg(
+        {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
+        "number of layers to store in VRAM",
+        [](gpt_params & params, int value) {
+            params.n_gpu_layers = value;
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            }
+        }
+    ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
+    add_opt(llama_arg(
+        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
+        "number of layers to store in VRAM for the draft model",
+        [](gpt_params & params, int value) {
+            params.n_gpu_layers_draft = value;
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(llama_arg(
+        {"-sm", "--split-mode"}, "{none,layer,row}",
+        "how to split the model across multiple GPUs, one of:\n"
+        "- none: use one GPU only\n"
+        "- layer (default): split layers and KV across GPUs\n"
+        "- row: split rows across GPUs",
+        [](gpt_params & params, const std::string & value) {
+            std::string arg_next = value;
+            if (arg_next == "none") {
+                params.split_mode = LLAMA_SPLIT_MODE_NONE;
+            } else if (arg_next == "layer") {
+                params.split_mode = LLAMA_SPLIT_MODE_LAYER;
+            } else if (arg_next == "row") {
+#ifdef GGML_USE_SYCL
+                fprintf(stderr, "warning: the split mode value [row] is not supported by llama.cpp with SYCL yet. Exiting.\n");
+                exit(1);
+#endif // GGML_USE_SYCL
+                params.split_mode = LLAMA_SPLIT_MODE_ROW;
+            } else {
+                throw std::invalid_argument("invalid value");
+            }
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
+        }
+    ));
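The proportions given to `-ts` are relative, not absolute; a small sketch (hypothetical helper, mirroring the "fraction of the model" wording in the help text below) of how they normalize:

```cpp
#include <vector>

// "-ts 3,1" -> {0.75f, 0.25f}: device 0 gets three quarters of the work
static std::vector<float> sketch_split_fractions(const std::vector<float> & ts) {
    float sum = 0.0f;
    for (const float v : ts) {
        sum += v;
    }
    std::vector<float> out;
    for (const float v : ts) {
        out.push_back(sum > 0.0f ? v / sum : 0.0f);
    }
    return out;
}
```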
+    add_opt(llama_arg(
+        {"-ts", "--tensor-split"}, "N0,N1,N2,...",
+        "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1",
+        [](gpt_params & params, const std::string & value) {
+            std::string arg_next = value;
+
+            // split string by , and /
+            const std::regex regex{ R"([,/]+)" };
+            std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 };
+            std::vector<std::string> split_arg{ it, {} };
+            if (split_arg.size() >= llama_max_devices()) {
+                throw std::invalid_argument(
+                    format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices())
+                );
+            }
+            for (size_t i = 0; i < llama_max_devices(); ++i) {
+                if (i < split_arg.size()) {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                } else {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
+        }
+    ));
+    add_opt(llama_arg(
+        {"-mg", "--main-gpu"}, "INDEX",
+        format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu),
+        [](gpt_params & params, int value) {
+            params.main_gpu = value;
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+            fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
+        }
+    ));
+    add_opt(llama_arg(
+        {"--check-tensors"},
+        format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
+        [](gpt_params & params) {
+            params.check_tensors = true;
+        }
+    ));
+    add_opt(llama_arg(
+        {"--override-kv"}, "KEY=TYPE:VALUE",
+        "advanced option to override model metadata by key. may be specified multiple times.\n"
+        "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", + [](gpt_params & params, const std::string & value) { + if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { + throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); + } + } + )); + add_opt(llama_arg( + {"--lora"}, "FNAME", + "path to LoRA adapter (can be repeated to use multiple adapters)", + [](gpt_params & params, const std::string & value) { + params.lora_adapters.push_back({ std::string(value), 1.0 }); + } + // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"--lora-scaled"}, "FNAME", "SCALE", + "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", + [](gpt_params & params, const std::string & fname, const std::string & scale) { + params.lora_adapters.push_back({ fname, std::stof(scale) }); + } + // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); + add_opt(llama_arg( + {"--control-vector"}, "FNAME", + "add a control vector\nnote: this argument can be repeated to add multiple control vectors", + [](gpt_params & params, const std::string & value) { + params.control_vectors.push_back({ 1.0f, value, }); + } + )); + add_opt(llama_arg( + {"--control-vector-scaled"}, "FNAME", "SCALE", + "add a control vector with user defined scaling SCALE\n" + "note: this argument can be repeated to add multiple scaled control vectors", + [](gpt_params & params, const std::string & fname, const std::string & scale) { + params.control_vectors.push_back({ std::stof(scale), fname }); + } + )); + add_opt(llama_arg( + {"--control-vector-layer-range"}, "START", "END", + "layer range to apply the control vector(s) to, start and end inclusive", + [](gpt_params & params, const std::string & start, const std::string & end) { + params.control_vector_layer_start = std::stoi(start); + params.control_vector_layer_end = std::stoi(end); + } + )); + add_opt(llama_arg( + {"-a", "--alias"}, "STRING", + "set alias for model name (to be used by REST API)", + [](gpt_params & params, const std::string & value) { + params.model_alias = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"-m", "--model"}, "FNAME", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? 
std::string("model path from which to load base model") + : format( + "model path (default: `models/$filename` with filename from `--hf-file` " + "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH + ), + [](gpt_params & params, const std::string & value) { + params.model = value; + } + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); + add_opt(llama_arg( + {"-md", "--model-draft"}, "FNAME", + "draft model for speculative decoding (default: unused)", + [](gpt_params & params, const std::string & value) { + params.model_draft = value; + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); + add_opt(llama_arg( + {"-mu", "--model-url"}, "MODEL_URL", + "model download url (default: unused)", + [](gpt_params & params, const std::string & value) { + params.model_url = value; + } + ).set_env("LLAMA_ARG_MODEL_URL")); + add_opt(llama_arg( + {"-hfr", "--hf-repo"}, "REPO", + "Hugging Face model repository (default: unused)", + [](gpt_params & params, const std::string & value) { + params.hf_repo = value; + } + ).set_env("LLAMA_ARG_HF_REPO")); + add_opt(llama_arg( + {"-hff", "--hf-file"}, "FILE", + "Hugging Face model file (default: unused)", + [](gpt_params & params, const std::string & value) { + params.hf_file = value; + } + ).set_env("LLAMA_ARG_HF_FILE")); + add_opt(llama_arg( + {"-hft", "--hf-token"}, "TOKEN", + "Hugging Face access token (default: value from HF_TOKEN environment variable)", + [](gpt_params & params, const std::string & value) { + params.hf_token = value; + } + ).set_env("HF_TOKEN")); + add_opt(llama_arg( + {"--context-file"}, "FNAME", + "file to load context from (repeat to specify multiple files)", + [](gpt_params & params, const std::string & value) { + std::ifstream file(value, std::ios::binary); + if (!file) { + throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); + } + params.context_files.push_back(value); + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--chunk-size"}, "N", + format("minimum length of embedded text chunks (default: %d)", params.chunk_size), + [](gpt_params & params, int value) { + params.chunk_size = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--chunk-separator"}, "STRING", + format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), + [](gpt_params & params, const std::string & value) { + params.chunk_separator = value; + } + ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); + add_opt(llama_arg( + {"--junk"}, "N", + format("number of times to repeat the junk text (default: %d)", params.n_junk), + [](gpt_params & params, int value) { + params.n_junk = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(llama_arg( + {"--pos"}, "N", + format("position of the passkey in the junk text (default: %d)", params.i_pos), + [](gpt_params & params, int value) { + params.i_pos = value; + } + ).set_examples({LLAMA_EXAMPLE_PASSKEY})); + add_opt(llama_arg( + {"-o", "--output", "--output-file"}, "FNAME", + format("output file (default: '%s')", + ex == LLAMA_EXAMPLE_EXPORT_LORA + ? params.lora_outfile.c_str() + : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR + ? 
params.cvector_outfile.c_str()
+                : params.out_file.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            params.out_file = value;
+            params.cvector_outfile = value;
+            params.lora_outfile = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA}));
+    add_opt(llama_arg(
+        {"-ofreq", "--output-frequency"}, "N",
+        format("output the imatrix every N iterations (default: %d)", params.n_out_freq),
+        [](gpt_params & params, int value) {
+            params.n_out_freq = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(llama_arg(
+        {"--save-frequency"}, "N",
+        format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),
+        [](gpt_params & params, int value) {
+            params.n_save_freq = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(llama_arg(
+        {"--process-output"},
+        format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"),
+        [](gpt_params & params) {
+            params.process_output = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(llama_arg(
+        {"--no-ppl"},
+        format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+        [](gpt_params & params) {
+            params.compute_ppl = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(llama_arg(
+        {"--chunk", "--from-chunk"}, "N",
+        format("start processing the input from chunk N (default: %d)", params.i_chunk),
+        [](gpt_params & params, int value) {
+            params.i_chunk = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(llama_arg(
+        {"-pps"},
+        format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
+        [](gpt_params & params) {
+            params.is_pp_shared = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    add_opt(llama_arg(
+        {"-npp"}, "n0,n1,...",
+        "number of prompt tokens",
+        [](gpt_params & params, const std::string & value) {
+            auto p = string_split<int>(value, ',');
+            params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
+        }
+    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    add_opt(llama_arg(
+        {"-ntg"}, "n0,n1,...",
+        "number of text generation tokens",
+        [](gpt_params & params, const std::string & value) {
+            auto p = string_split<int>(value, ',');
+            params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
+        }
+    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    add_opt(llama_arg(
+        {"-npl"}, "n0,n1,...",
+        "number of parallel prompts",
+        [](gpt_params & params, const std::string & value) {
+            auto p = string_split<int>(value, ',');
+            params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
+        }
+    ).set_examples({LLAMA_EXAMPLE_BENCH}));
+    add_opt(llama_arg(
+        {"--embd-normalize"}, "N",
+        format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize),
+        [](gpt_params & params, int value) {
+            params.embd_normalize = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(llama_arg(
+        {"--embd-output-format"}, "FORMAT",
+        "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
+        [](gpt_params & params, const std::string & value) {
+            params.embd_out = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(llama_arg(
+        {"--embd-separator"}, "STRING",
+        "separator of embeddings (default \\n) for example \"<#sep#>\"",
+        [](gpt_params & params, const std::string & value) {
+            params.embd_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
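For reference, `--embd-normalize 2` corresponds to ordinary Euclidean normalization; a sketch of that case (hypothetical helper, the real implementation lives elsewhere in common.cpp):

```cpp
#include <cmath>
#include <vector>

// Euclidean (p = 2) normalization as selected by --embd-normalize 2
static void sketch_normalize_l2(std::vector<float> & emb) {
    double sum = 0.0;
    for (const float v : emb) {
        sum += (double) v * v;
    }
    if (sum <= 0.0) {
        return; // leave the zero vector untouched
    }
    const float norm = (float) std::sqrt(sum);
    for (float & v : emb) {
        v /= norm;
    }
}
```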
+    add_opt(llama_arg(
+        {"--host"}, "HOST",
+        format("ip address to listen (default: %s)", params.hostname.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            params.hostname = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST"));
+    add_opt(llama_arg(
+        {"--port"}, "PORT",
+        format("port to listen (default: %d)", params.port),
+        [](gpt_params & params, int value) {
+            params.port = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
+    add_opt(llama_arg(
+        {"--path"}, "PATH",
+        format("path to serve static files from (default: %s)", params.public_path.c_str()),
+        [](gpt_params & params, const std::string & value) {
+            params.public_path = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--embedding", "--embeddings"},
+        format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
+        [](gpt_params & params) {
+            params.embedding = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS"));
+    add_opt(llama_arg(
+        {"--api-key"}, "KEY",
+        "API key to use for authentication (default: none)",
+        [](gpt_params & params, const std::string & value) {
+            params.api_keys.push_back(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY"));
+    add_opt(llama_arg(
+        {"--api-key-file"}, "FNAME",
+        "path to file containing API keys (default: none)",
+        [](gpt_params & params, const std::string & value) {
+            std::ifstream key_file(value);
+            if (!key_file) {
+                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::string key;
+            while (std::getline(key_file, key)) {
+                if (!key.empty()) {
+                    params.api_keys.push_back(key);
+                }
+            }
+            key_file.close();
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--ssl-key-file"}, "FNAME",
+        "path to file containing a PEM-encoded SSL private key",
+        [](gpt_params & params, const std::string & value) {
+            params.ssl_file_key = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--ssl-cert-file"}, "FNAME",
+        "path to file containing a PEM-encoded SSL certificate",
+        [](gpt_params & params, const std::string & value) {
+            params.ssl_file_cert = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"-to", "--timeout"}, "N",
+        format("server read/write timeout in seconds (default: %d)", params.timeout_read),
+        [](gpt_params & params, int value) {
+            params.timeout_read = value;
+            params.timeout_write = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
+        {"--threads-http"}, "N",
+        format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http),
+        [](gpt_params & params, int value) {
+            params.n_threads_http = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP"));
+    add_opt(llama_arg(
+        {"-spf", "--system-prompt-file"}, "FNAME",
+        "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications",
+        [](gpt_params & params, const std::string & value) {
+            std::ifstream file(value);
+            if (!file) {
+                throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str()));
+            }
+            std::string system_prompt;
+            std::copy(
+                std::istreambuf_iterator<char>(file),
+                std::istreambuf_iterator<char>(),
+                std::back_inserter(system_prompt)
+            );
+            params.system_prompt = system_prompt;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    add_opt(llama_arg(
{"--log-format"}, "{text, json}", + "log output format: json or text (default: json)", + [](gpt_params & params, const std::string & value) { + if (value == "json") { + params.log_json = true; + } else if (value == "text") { + params.log_json = false; + } else { + throw std::invalid_argument("invalid value"); + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--metrics"}, + format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_metrics = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); + add_opt(llama_arg( + {"--no-slots"}, + format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + [](gpt_params & params) { + params.endpoint_slots = false; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); + add_opt(llama_arg( + {"--slot-save-path"}, "PATH", + "path to save slot kv cache (default: disabled)", + [](gpt_params & params, const std::string & value) { + params.slot_save_path = value; + // if doesn't end with DIRECTORY_SEPARATOR, add it + if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + params.slot_save_path += DIRECTORY_SEPARATOR; + } + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--chat-template"}, "JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "if suffix/prefix are specified, template will be disabled\n" + "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", + [](gpt_params & params, const std::string & value) { + if (!llama_chat_verify_template(value)) { + throw std::runtime_error(format( + "error: the supplied chat template is not supported: %s\n" + "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", + value.c_str() + )); + } + params.chat_template = value; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + add_opt(llama_arg( + {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", + format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), + [](gpt_params & params, const std::string & value) { + params.slot_prompt_similarity = std::stof(value); + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--lora-init-without-apply"}, + format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"), + [](gpt_params & params) { + params.lora_init_without_apply = true; + } + ).set_examples({LLAMA_EXAMPLE_SERVER})); + add_opt(llama_arg( + {"--simple-io"}, + "use basic IO for better compatibility in subprocesses and limited consoles", + [](gpt_params & params) { + params.simple_io = true; + } + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + add_opt(llama_arg( + {"-ld", "--logdir"}, "LOGDIR", + "path under which to save YAML logs (no logging if unset)", + [](gpt_params & params, const std::string & value) { + params.logdir = value; + + if (params.logdir.back() != DIRECTORY_SEPARATOR) { + params.logdir += DIRECTORY_SEPARATOR; + } + } + )); + add_opt(llama_arg( + {"--positive-file"}, "FNAME", + format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), + [](gpt_params & params, const std::string & value) { + params.cvector_positive_file = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--negative-file"}, "FNAME", + format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), + [](gpt_params & params, const std::string & value) { + params.cvector_negative_file = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--pca-batch"}, "N", + format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), + [](gpt_params & params, int value) { + params.n_pca_batch = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--pca-iter"}, "N", + format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), + [](gpt_params & params, int value) { + params.n_pca_iterations = value; + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--method"}, "{pca, mean}", + "dimensionality reduction method to be used (default: pca)", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); + add_opt(llama_arg( + {"--output-format"}, "{md,jsonl}", + "output format for batched-bench results (default: md)", + [](gpt_params & params, const std::string & value) { + /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } + else if (value == "md") { params.batched_bench_output_jsonl = false; } + else { std::invalid_argument("invalid value"); } + } + ).set_examples({LLAMA_EXAMPLE_BENCH})); +#ifndef LOG_DISABLE_LOGS + // TODO: make this looks less weird + add_opt(llama_arg( + {"--log-test"}, + "Log test", + [](gpt_params &) { log_param_single_parse("--log-test"); } + )); + add_opt(llama_arg( + {"--log-disable"}, + "Log disable", + [](gpt_params &) { log_param_single_parse("--log-disable"); } + )); + add_opt(llama_arg( + {"--log-enable"}, + "Log enable", + [](gpt_params &) { log_param_single_parse("--log-enable"); } + )); + add_opt(llama_arg( + {"--log-new"}, + "Log new", + [](gpt_params &) { log_param_single_parse("--log-new"); } + )); + add_opt(llama_arg( + {"--log-append"}, + "Log append", + [](gpt_params &) { log_param_single_parse("--log-append"); } + )); + add_opt(llama_arg( + {"--log-file"}, "FNAME", + "Log file", + [](gpt_params &, const std::string & value) { log_param_pair_parse(false, 
"--log-file", value); } + )); +#endif // LOG_DISABLE_LOGS + + return ctx_arg; +} + diff --git a/common/arg.h b/common/arg.h new file mode 100644 index 000000000..413de2c88 --- /dev/null +++ b/common/arg.h @@ -0,0 +1,77 @@ +#pragma once + +#include "common.h" + +#include +#include +#include + +// +// CLI argument parsing +// + +struct llama_arg { + std::set examples = {LLAMA_EXAMPLE_COMMON}; + std::vector args; + const char * value_hint = nullptr; // help text or example for arg value + const char * value_hint_2 = nullptr; // for second arg value + const char * env = nullptr; + std::string help; + bool is_sparam = false; // is current arg a sampling param? + void (*handler_void) (gpt_params & params) = nullptr; + void (*handler_string) (gpt_params & params, const std::string &) = nullptr; + void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr; + void (*handler_int) (gpt_params & params, int) = nullptr; + + llama_arg( + const std::initializer_list & args, + const char * value_hint, + const std::string & help, + void (*handler)(gpt_params & params, const std::string &) + ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} + + llama_arg( + const std::initializer_list & args, + const char * value_hint, + const std::string & help, + void (*handler)(gpt_params & params, int) + ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} + + llama_arg( + const std::initializer_list & args, + const std::string & help, + void (*handler)(gpt_params & params) + ) : args(args), help(help), handler_void(handler) {} + + // support 2 values for arg + llama_arg( + const std::initializer_list & args, + const char * value_hint, + const char * value_hint_2, + const std::string & help, + void (*handler)(gpt_params & params, const std::string &, const std::string &) + ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} + + llama_arg & set_examples(std::initializer_list examples); + llama_arg & set_env(const char * env); + llama_arg & set_sparam(); + bool in_example(enum llama_example ex); + bool get_value_from_env(std::string & output); + bool has_value_from_env(); + std::string to_string(); +}; + +struct gpt_params_context { + enum llama_example ex = LLAMA_EXAMPLE_COMMON; + gpt_params & params; + std::vector options; + void(*print_usage)(int, char **) = nullptr; + gpt_params_context(gpt_params & params) : params(params) {} +}; + +// parse input arguments from CLI +// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) +bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); + +// function to be used by test-arg-parser +gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); diff --git a/common/common.cpp b/common/common.cpp index 916b1731e..5395eaa0e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -25,7 +25,6 @@ #include #include #include -#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -272,53 +271,6 @@ bool set_process_priority(enum ggml_sched_priority prio) { // CLI argument parsing // -#ifdef __GNUC__ -#ifdef __MINGW32__ -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) 
__attribute__((format(printf, __VA_ARGS__))) -#endif -#else -#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) -#endif - -LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2) -static std::string format(const char * fmt, ...) { - va_list ap; - va_list ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - -static void gpt_params_handle_model_default(gpt_params & params) { - if (!params.hf_repo.empty()) { - // short-hand to avoid specifying --hf-file -> default it to --model - if (params.hf_file.empty()) { - if (params.model.empty()) { - throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); - } - params.hf_file = params.model; - } else if (params.model.empty()) { - params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); - } - } else if (!params.model_url.empty()) { - if (params.model.empty()) { - auto f = string_split(params.model_url, '#').front(); - f = string_split(f, '?').front(); - params.model = fs_get_cache_file(string_split(f, '/').back()); - } - } else if (params.model.empty()) { - params.model = DEFAULT_MODEL_PATH; - } -} void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) { int32_t n_set = 0; @@ -344,150 +296,6 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) } } -bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector & options) { - std::string arg; - const std::string arg_prefix = "--"; - gpt_sampler_params & sparams = params.sparams; - - std::unordered_map arg_to_options; - for (auto & opt : options) { - for (const auto & arg : opt.args) { - arg_to_options[arg] = &opt; - } - } - - // handle environment variables - for (auto & opt : options) { - std::string value; - if (opt.get_value_from_env(value)) { - try { - if (opt.handler_void && (value == "1" || value == "true")) { - opt.handler_void(params); - } - if (opt.handler_int) { - opt.handler_int(params, std::stoi(value)); - } - if (opt.handler_string) { - opt.handler_string(params, value); - continue; - } - } catch (std::exception & e) { - throw std::invalid_argument(format( - "error while handling environment variable \"%s\": %s\n\n", opt.env, e.what())); - } - } - } - - // handle command line arguments - auto check_arg = [&](int i) { - if (i+1 >= argc) { - throw std::invalid_argument("expected value for argument"); - } - }; - - for (int i = 1; i < argc; i++) { - const std::string arg_prefix = "--"; - - std::string arg = argv[i]; - if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { - std::replace(arg.begin(), arg.end(), '_', '-'); - } - if (arg_to_options.find(arg) == arg_to_options.end()) { - throw std::invalid_argument(format("error: invalid argument: %s", arg.c_str())); - } - auto opt = *arg_to_options[arg]; - if (opt.has_value_from_env()) { - fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); - } - try { - if (opt.handler_void) { - opt.handler_void(params); - continue; - } - - // arg with single value - check_arg(i); - std::string val = argv[++i]; - if (opt.handler_int) { - opt.handler_int(params, std::stoi(val)); - continue; - } - if (opt.handler_string) { - opt.handler_string(params, val); - continue; - } - - // arg with 2 values - 
check_arg(i); - std::string val2 = argv[++i]; - if (opt.handler_str_str) { - opt.handler_str_str(params, val, val2); - continue; - } - } catch (std::exception & e) { - throw std::invalid_argument(format( - "error while handling argument \"%s\": %s\n\n" - "usage:\n%s\n\nto show complete usage, run with -h", - arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); - } - } - - postprocess_cpu_params(params.cpuparams, nullptr); - postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); - postprocess_cpu_params(params.draft_cpuparams, ¶ms.cpuparams); - postprocess_cpu_params(params.draft_cpuparams_batch, ¶ms.cpuparams_batch); - - if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { - throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); - } - - gpt_params_handle_model_default(params); - - if (params.escape) { - string_process_escapes(params.prompt); - string_process_escapes(params.input_prefix); - string_process_escapes(params.input_suffix); - for (auto & antiprompt : params.antiprompt) { - string_process_escapes(antiprompt); - } - } - - if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(); - params.kv_overrides.back().key[0] = 0; - } - - if (sparams.seed == LLAMA_DEFAULT_SEED) { - sparams.seed = time(NULL); - } - - return true; -} - -bool gpt_params_parse(int argc, char ** argv, gpt_params & params, std::vector & options) { - const auto params_org = params; // the example can modify the default params - - try { - if (!gpt_params_parse_ex(argc, argv, params, options)) { - params = params_org; - return false; - } - if (params.usage) { - gpt_params_print_usage(params, options); - if (params.print_usage) { - params.print_usage(argc, argv); - } - exit(0); - } - } catch (const std::invalid_argument & ex) { - fprintf(stderr, "%s\n", ex.what()); - params = params_org; - return false; - } - - return true; -} - bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { size_t dash_loc = range.find('-'); if (dash_loc == std::string::npos) { @@ -561,1743 +369,6 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD return true; } -static std::vector break_str_into_lines(std::string input, size_t max_char_per_line) { - std::vector result; - std::istringstream iss(input); - std::string line; - auto add_line = [&](const std::string& l) { - if (l.length() <= max_char_per_line) { - result.push_back(l); - } else { - std::istringstream line_stream(l); - std::string word, current_line; - while (line_stream >> word) { - if (current_line.length() + !current_line.empty() + word.length() > max_char_per_line) { - if (!current_line.empty()) result.push_back(current_line); - current_line = word; - } else { - current_line += (!current_line.empty() ? 
" " : "") + word; - } - } - if (!current_line.empty()) result.push_back(current_line); - } - }; - while (std::getline(iss, line)) { - add_line(line); - } - return result; -} - -std::string llama_arg::to_string() { - // params for printing to console - const static int n_leading_spaces = 40; - const static int n_char_per_line_help = 70; // TODO: detect this based on current console - std::string leading_spaces(n_leading_spaces, ' '); - - std::ostringstream ss; - for (const auto arg : args) { - if (arg == args.front()) { - if (args.size() == 1) { - ss << arg; - } else { - // first arg is usually abbreviation, we need padding to make it more beautiful - auto tmp = std::string(arg) + ", "; - ss << format("%-7s", tmp.c_str()); - } - } else { - ss << arg << (arg != args.back() ? ", " : ""); - } - } - if (value_hint) ss << " " << value_hint; - if (value_hint_2) ss << " " << value_hint_2; - if (ss.tellp() > n_leading_spaces - 3) { - // current line is too long, add new line - ss << "\n" << leading_spaces; - } else { - // padding between arg and help, same line - ss << std::string(leading_spaces.size() - ss.tellp(), ' '); - } - const auto help_lines = break_str_into_lines(help, n_char_per_line_help); - for (const auto & line : help_lines) { - ss << (&line == &help_lines.front() ? "" : leading_spaces) << line << "\n"; - } - return ss.str(); -} - -void gpt_params_print_usage(gpt_params & params, std::vector & options) { - auto print_options = [](std::vector & options) { - for (llama_arg * opt : options) { - printf("%s", opt->to_string().c_str()); - } - }; - - std::vector common_options; - std::vector specific_options; - for (auto & opt : options) { - // in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example - if (opt.in_example(params.curr_ex)) { - specific_options.push_back(&opt); - } else { - common_options.push_back(&opt); - } - } - printf("----- common options -----\n\n"); - print_options(common_options); - // TODO: maybe convert enum llama_example to string - printf("\n\n----- example-specific options -----\n\n"); - print_options(specific_options); -} - -std::vector gpt_params_parser_init(gpt_params & params, llama_example ex) { - return gpt_params_parser_init(params, ex, nullptr); -} - -std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage) { - std::vector options; - params.print_usage = print_usage; - params.curr_ex = ex; - - std::string sampler_type_chars; - std::string sampler_type_names; - for (const auto & sampler : params.sparams.samplers) { - sampler_type_chars += gpt_sampler_type_to_chr(sampler); - sampler_type_names += gpt_sampler_type_to_str(sampler) + ";"; - } - sampler_type_names.pop_back(); - - - /** - * filter options by example - * rules: - * - all examples inherit options from LLAMA_EXAMPLE_COMMON - * - if LLAMA_EXAMPLE_* is set (other than COMMON), we only show the option in the corresponding example - * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example - */ - auto add_opt = [&](llama_arg arg) { - if (arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) { - options.push_back(std::move(arg)); - } - }; - - - add_opt(llama_arg( - {"-h", "--help", "--usage"}, - "print usage and exit", - [](gpt_params & params) { - params.usage = true; - } - )); - add_opt(llama_arg( - {"--version"}, - "show version and build info", - [](gpt_params &) { - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - 
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); - exit(0); - } - )); - add_opt(llama_arg( - {"-v", "--verbose"}, - "print verbose information", - [](gpt_params & params) { - params.verbosity = 1; - } - )); - add_opt(llama_arg( - {"--verbosity"}, "N", - format("set specific verbosity level (default: %d)", params.verbosity), - [](gpt_params & params, int value) { - params.verbosity = value; - } - )); - add_opt(llama_arg( - {"--verbose-prompt"}, - format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), - [](gpt_params & params) { - params.verbose_prompt = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--no-display-prompt"}, - format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [](gpt_params & params) { - params.display_prompt = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-co", "--color"}, - format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), - [](gpt_params & params) { - params.use_color = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( - {"-s", "--seed"}, "SEED", - format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed), - [](gpt_params & params, const std::string & value) { - params.sparams.seed = std::stoul(value); - } - )); - add_opt(llama_arg( - {"-t", "--threads"}, "N", - format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), - [](gpt_params & params, int value) { - params.cpuparams.n_threads = value; - if (params.cpuparams.n_threads <= 0) { - params.cpuparams.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_env("LLAMA_ARG_THREADS")); - add_opt(llama_arg( - {"-tb", "--threads-batch"}, "N", - "number of threads to use during batch and prompt processing (default: same as --threads)", - [](gpt_params & params, int value) { - params.cpuparams_batch.n_threads = value; - if (params.cpuparams_batch.n_threads <= 0) { - params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - } - )); - add_opt(llama_arg( - {"-td", "--threads-draft"}, "N", - "number of threads to use during generation (default: same as --threads)", - [](gpt_params & params, int value) { - params.draft_cpuparams.n_threads = value; - if (params.draft_cpuparams.n_threads <= 0) { - params.draft_cpuparams.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-tbd", "--threads-batch-draft"}, "N", - "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](gpt_params & params, int value) { - params.draft_cpuparams_batch.n_threads = value; - if (params.draft_cpuparams_batch.n_threads <= 0) { - params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency(); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-C", "--cpu-mask"}, "M", - "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", - [](gpt_params & params, const std::string & mask) { - params.cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - )); - add_opt(llama_arg( - {"-Cr", "--cpu-range"}, "lo-hi", - "range of CPUs for affinity. 
Complements --cpu-mask", - [](gpt_params & params, const std::string & range) { - params.cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.cpuparams.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - )); - add_opt(llama_arg( - {"--cpu-strict"}, "<0|1>", - format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), - [](gpt_params & params, const std::string & value) { - params.cpuparams.strict_cpu = std::stoul(value); - } - )); - add_opt(llama_arg( - {"--prio"}, "N", - format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), - [](gpt_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.cpuparams.priority = (enum ggml_sched_priority) prio; - } - )); - add_opt(llama_arg( - {"--poll"}, "<0...100>", - format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), - [](gpt_params & params, const std::string & value) { - params.cpuparams.poll = std::stoul(value); - } - )); - add_opt(llama_arg( - {"-Cb", "--cpu-mask-batch"}, "M", - "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { - params.cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - )); - add_opt(llama_arg( - {"-Crb", "--cpu-range-batch"}, "lo-hi", - "ranges of CPUs for affinity. Complements --cpu-mask-batch", - [](gpt_params & params, const std::string & range) { - params.cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - )); - add_opt(llama_arg( - {"--cpu-strict-batch"}, "<0|1>", - "use strict CPU placement (default: same as --cpu-strict)", - [](gpt_params & params, int value) { - params.cpuparams_batch.strict_cpu = value; - } - )); - add_opt(llama_arg( - {"--prio-batch"}, "N", - format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), - [](gpt_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; - } - )); - add_opt(llama_arg( - {"--poll-batch"}, "<0|1>", - "use polling to wait for work (default: same as --poll)", - [](gpt_params & params, int value) { - params.cpuparams_batch.poll = value; - } - )); - add_opt(llama_arg( - {"-Cd", "--cpu-mask-draft"}, "M", - "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { - params.draft_cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-Crd", "--cpu-range-draft"}, "lo-hi", - "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", - [](gpt_params & params, const std::string & range) { - params.draft_cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) { - throw std::invalid_argument("invalid range"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--cpu-strict-draft"}, "<0|1>", - "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](gpt_params & params, int value) { - params.draft_cpuparams.strict_cpu = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--prio-draft"}, "N", - format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority), - [](gpt_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.draft_cpuparams.priority = (enum ggml_sched_priority) prio; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--poll-draft"}, "<0|1>", - "Use polling to wait for draft model work (default: same as --poll])", - [](gpt_params & params, int value) { - params.draft_cpuparams.poll = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-Cbd", "--cpu-mask-batch-draft"}, "M", - "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](gpt_params & params, const std::string & mask) { - params.draft_cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", - "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", - [](gpt_params & params, const std::string & range) { - params.draft_cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) { - throw std::invalid_argument("invalid cpumask"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--cpu-strict-batch-draft"}, "<0|1>", - "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](gpt_params & params, int value) { - params.draft_cpuparams_batch.strict_cpu = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--prio-batch-draft"}, "N", - format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority), - [](gpt_params & params, int prio) { - if (prio < 0 || prio > 3) { - throw std::invalid_argument("invalid value"); - } - params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--poll-batch-draft"}, "<0|1>", - "Use polling to wait for draft model work (default: --poll-draft)", - [](gpt_params & params, int value) { - params.draft_cpuparams_batch.poll = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"--draft"}, "N", - format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft), - [](gpt_params & params, int value) { - params.n_draft = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-ps", "--p-split"}, "N", - format("speculative decoding split probability (default: %.1f)", (double)params.p_split), - [](gpt_params & params, const std::string & value) { - params.p_split = std::stof(value); - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-lcs", "--lookup-cache-static"}, "FNAME", - "path to static lookup cache to use for lookup decoding (not updated by generation)", - [](gpt_params & params, const std::string & value) { - params.lookup_cache_static = value; - } - )); - add_opt(llama_arg( - {"-lcd", "--lookup-cache-dynamic"}, "FNAME", - "path to dynamic lookup cache to use for lookup decoding (updated by generation)", - [](gpt_params & params, const std::string & value) { - params.lookup_cache_dynamic = value; - } - )); - add_opt(llama_arg( - {"-c", "--ctx-size"}, "N", - format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), - [](gpt_params & params, int value) { - params.n_ctx = value; - } - ).set_env("LLAMA_ARG_CTX_SIZE")); - add_opt(llama_arg( - {"-n", "--predict", "--n-predict"}, "N", - format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), - [](gpt_params & params, int value) { - params.n_predict = value; - } - ).set_env("LLAMA_ARG_N_PREDICT")); - add_opt(llama_arg( - {"-b", "--batch-size"}, "N", - format("logical maximum batch size (default: %d)", params.n_batch), - [](gpt_params & params, int value) { - params.n_batch = value; - } - ).set_env("LLAMA_ARG_BATCH")); - add_opt(llama_arg( - {"-ub", "--ubatch-size"}, "N", - format("physical maximum batch size (default: %d)", params.n_ubatch), - [](gpt_params & params, int value) { - params.n_ubatch = value; - } - ).set_env("LLAMA_ARG_UBATCH")); - add_opt(llama_arg( - {"--keep"}, "N", - format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), - [](gpt_params & params, int value) { - params.n_keep = value; - } - 
)); - add_opt(llama_arg( - {"--chunks"}, "N", - format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), - [](gpt_params & params, int value) { - params.n_chunks = value; - } - )); - add_opt(llama_arg( - {"-fa", "--flash-attn"}, - format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), - [](gpt_params & params) { - params.flash_attn = true; - } - ).set_env("LLAMA_ARG_FLASH_ATTN")); - add_opt(llama_arg( - {"-p", "--prompt"}, "PROMPT", - ex == LLAMA_EXAMPLE_MAIN - ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" - : "prompt to start generation with", - [](gpt_params & params, const std::string & value) { - params.prompt = value; - } - )); - add_opt(llama_arg( - {"-f", "--file"}, "FNAME", - "a file containing the prompt (default: none)", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - // store the external file name in params - params.prompt_file = value; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } - )); - add_opt(llama_arg( - {"--in-file"}, "FNAME", - "an input file (repeat to specify multiple files)", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - params.in_files.push_back(value); - } - )); - add_opt(llama_arg( - {"-bf", "--binary-file"}, "FNAME", - "binary file containing the prompt (default: none)", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value, std::ios::binary); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - // store the external file name in params - params.prompt_file = value; - std::ostringstream ss; - ss << file.rdbuf(); - params.prompt = ss.str(); - fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); - } - )); - add_opt(llama_arg( - {"-e", "--escape"}, - format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false"), - [](gpt_params & params) { - params.escape = true; - } - )); - add_opt(llama_arg( - {"--no-escape"}, - "do not process escape sequences", - [](gpt_params & params) { - params.escape = false; - } - )); - add_opt(llama_arg( - {"-ptc", "--print-token-count"}, "N", - format("print token count every N tokens (default: %d)", params.n_print), - [](gpt_params & params, int value) { - params.n_print = value; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--prompt-cache"}, "FNAME", - "file to cache prompt state for faster startup (default: none)", - [](gpt_params & params, const std::string & value) { - params.path_prompt_cache = value; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--prompt-cache-all"}, - "if specified, saves user input and generations to cache as well\n", - [](gpt_params & params) { - params.prompt_cache_all = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--prompt-cache-ro"}, - "if specified, uses the prompt cache but does not update it", - [](gpt_params & params) { - params.prompt_cache_ro = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-r", "--reverse-prompt"}, "PROMPT", - "halt generation at PROMPT, return control in interactive mode\n", - [](gpt_params & params, const std::string & value) { - params.antiprompt.emplace_back(value); - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-sp", "--special"}, - format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), - [](gpt_params & params) { - params.special = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-cnv", "--conversation"}, - format( - "run in conversation mode:\n" - "- does not print special tokens and suffix/prefix\n" - "- interactive mode is also enabled\n" - "(default: %s)", - params.conversation ? "true" : "false" - ), - [](gpt_params & params) { - params.conversation = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-i", "--interactive"}, - format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), - [](gpt_params & params) { - params.interactive = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-if", "--interactive-first"}, - format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false"), - [](gpt_params & params) { - params.interactive_first = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"-mli", "--multiline-input"}, - "allows you to write or paste multiple lines without ending each in '\\'", - [](gpt_params & params) { - params.multiline_input = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--in-prefix-bos"}, - "prefix BOS to user inputs, preceding the `--in-prefix` string", - [](gpt_params & params) { - params.input_prefix_bos = true; - params.enable_chat_template = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--in-prefix"}, "STRING", - "string to prefix user inputs with (default: empty)", - [](gpt_params & params, const std::string & value) { - params.input_prefix = value; - params.enable_chat_template = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--in-suffix"}, "STRING", - "string to suffix after user inputs with (default: empty)", - [](gpt_params & params, const std::string & value) { - params.input_suffix = value; - params.enable_chat_template = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--no-warmup"}, - "skip warming up the model with an empty run", - [](gpt_params & params) { - params.warmup = false; - } - ).set_examples({LLAMA_EXAMPLE_MAIN})); - add_opt(llama_arg( - {"--spm-infill"}, - format( - "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", - params.spm_infill ? "enabled" : "disabled" - ), - [](gpt_params & params) { - params.spm_infill = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( - {"--samplers"}, "SAMPLERS", - format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), - [](gpt_params & params, const std::string & value) { - const auto sampler_names = string_split(value, ';'); - params.sparams.samplers = gpt_sampler_types_from_names(sampler_names, true); - } - )); - add_opt(llama_arg( - {"--sampling-seq"}, "SEQUENCE", - format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), - [](gpt_params & params, const std::string & value) { - params.sparams.samplers = gpt_sampler_types_from_chars(value); - } - )); - add_opt(llama_arg( - {"--ignore-eos"}, - "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", - [](gpt_params & params) { - params.sparams.ignore_eos = true; - } - )); - add_opt(llama_arg( - {"--penalize-nl"}, - format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? 
"true" : "false"), - [](gpt_params & params) { - params.sparams.penalize_nl = true; - } - )); - add_opt(llama_arg( - {"--temp"}, "N", - format("temperature (default: %.1f)", (double)params.sparams.temp), - [](gpt_params & params, const std::string & value) { - params.sparams.temp = std::stof(value); - params.sparams.temp = std::max(params.sparams.temp, 0.0f); - } - )); - add_opt(llama_arg( - {"--top-k"}, "N", - format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k), - [](gpt_params & params, int value) { - params.sparams.top_k = value; - } - )); - add_opt(llama_arg( - {"--top-p"}, "N", - format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p), - [](gpt_params & params, const std::string & value) { - params.sparams.top_p = std::stof(value); - } - )); - add_opt(llama_arg( - {"--min-p"}, "N", - format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p), - [](gpt_params & params, const std::string & value) { - params.sparams.min_p = std::stof(value); - } - )); - add_opt(llama_arg( - {"--tfs"}, "N", - format("tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)params.sparams.tfs_z), - [](gpt_params & params, const std::string & value) { - params.sparams.tfs_z = std::stof(value); - } - )); - add_opt(llama_arg( - {"--typical"}, "N", - format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p), - [](gpt_params & params, const std::string & value) { - params.sparams.typ_p = std::stof(value); - } - )); - add_opt(llama_arg( - {"--repeat-last-n"}, "N", - format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n), - [](gpt_params & params, int value) { - params.sparams.penalty_last_n = value; - params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n); - } - )); - add_opt(llama_arg( - {"--repeat-penalty"}, "N", - format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat), - [](gpt_params & params, const std::string & value) { - params.sparams.penalty_repeat = std::stof(value); - } - )); - add_opt(llama_arg( - {"--presence-penalty"}, "N", - format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present), - [](gpt_params & params, const std::string & value) { - params.sparams.penalty_present = std::stof(value); - } - )); - add_opt(llama_arg( - {"--frequency-penalty"}, "N", - format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq), - [](gpt_params & params, const std::string & value) { - params.sparams.penalty_freq = std::stof(value); - } - )); - add_opt(llama_arg( - {"--dynatemp-range"}, "N", - format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range), - [](gpt_params & params, const std::string & value) { - params.sparams.dynatemp_range = std::stof(value); - } - )); - add_opt(llama_arg( - {"--dynatemp-exp"}, "N", - format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent), - [](gpt_params & params, const std::string & value) { - params.sparams.dynatemp_exponent = std::stof(value); - } - )); - add_opt(llama_arg( - {"--mirostat"}, "N", - format("use Mirostat sampling.\nTop K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" - "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", 
params.sparams.mirostat), - [](gpt_params & params, int value) { - params.sparams.mirostat = value; - } - )); - add_opt(llama_arg( - {"--mirostat-lr"}, "N", - format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta), - [](gpt_params & params, const std::string & value) { - params.sparams.mirostat_eta = std::stof(value); - } - )); - add_opt(llama_arg( - {"--mirostat-ent"}, "N", - format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau), - [](gpt_params & params, const std::string & value) { - params.sparams.mirostat_tau = std::stof(value); - } - )); - add_opt(llama_arg( - {"-l", "--logit-bias"}, "TOKEN_ID(+/-)BIAS", - "modifies the likelihood of token appearing in the completion,\n" - "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" - "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", - [](gpt_params & params, const std::string & value) { - std::stringstream ss(value); - llama_token key; - char sign; - std::string value_str; - try { - if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - params.sparams.logit_bias.push_back({key, bias}); - } else { - throw std::invalid_argument("invalid input format"); - } - } catch (const std::exception&) { - throw std::invalid_argument("invalid input format"); - } - } - )); - add_opt(llama_arg( - {"--grammar"}, "GRAMMAR", - format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()), - [](gpt_params & params, const std::string & value) { - params.sparams.grammar = value; - } - )); - add_opt(llama_arg( - {"--grammar-file"}, "FNAME", - "file to read grammar from", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(params.sparams.grammar) - ); - } - )); - add_opt(llama_arg( - {"-j", "--json-schema"}, "SCHEMA", - "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", - [](gpt_params & params, const std::string & value) { - params.sparams.grammar = json_schema_to_grammar(json::parse(value)); - } - )); - add_opt(llama_arg( - {"--pooling"}, "{none,mean,cls,last}", - "pooling type for embeddings, use model default if unspecified", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } - else { throw std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( - {"--attention"}, "{causal,non,causal}", - "attention type for embeddings, use model default if unspecified", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } - else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } - else { throw std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( - {"--rope-scaling"}, "{none,linear,yarn}", - "RoPE frequency scaling method, defaults to linear unless specified by the model", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } - else { throw std::invalid_argument("invalid value"); } - } - )); - add_opt(llama_arg( - {"--rope-scale"}, "N", - "RoPE context scaling factor, expands context by a factor of N", - [](gpt_params & params, const std::string & value) { - params.rope_freq_scale = 1.0f / std::stof(value); - } - )); - add_opt(llama_arg( - {"--rope-freq-base"}, "N", - "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", - [](gpt_params & params, const std::string & value) { - params.rope_freq_base = std::stof(value); - } - )); - add_opt(llama_arg( - {"--rope-freq-scale"}, "N", - "RoPE frequency scaling factor, expands context by a factor of 1/N", - [](gpt_params & params, const std::string & value) { - params.rope_freq_scale = std::stof(value); - } - )); - add_opt(llama_arg( - {"--yarn-orig-ctx"}, "N", - format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), - [](gpt_params & params, int value) { - params.yarn_orig_ctx = value; - } - )); - add_opt(llama_arg( - {"--yarn-ext-factor"}, "N", - format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), - [](gpt_params & params, const std::string & value) { - params.yarn_ext_factor = std::stof(value); - } - )); - add_opt(llama_arg( - {"--yarn-attn-factor"}, "N", - format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), - [](gpt_params & params, const std::string & value) { - params.yarn_attn_factor = std::stof(value); - } - )); - add_opt(llama_arg( - {"--yarn-beta-slow"}, "N", - format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), - [](gpt_params & params, const 
std::string & value) { - params.yarn_beta_slow = std::stof(value); - } - )); - add_opt(llama_arg( - {"--yarn-beta-fast"}, "N", - format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), - [](gpt_params & params, const std::string & value) { - params.yarn_beta_fast = std::stof(value); - } - )); - add_opt(llama_arg( - {"-gan", "--grp-attn-n"}, "N", - format("group-attention factor (default: %d)", params.grp_attn_n), - [](gpt_params & params, int value) { - params.grp_attn_n = value; - } - )); - add_opt(llama_arg( - {"-gaw", "--grp-attn-w"}, "N", - format("group-attention width (default: %.1f)", (double)params.grp_attn_w), - [](gpt_params & params, int value) { - params.grp_attn_w = value; - } - )); - add_opt(llama_arg( - {"-dkvc", "--dump-kv-cache"}, - "verbose print of the KV cache", - [](gpt_params & params) { - params.dump_kv_cache = true; - } - )); - add_opt(llama_arg( - {"-nkvo", "--no-kv-offload"}, - "disable KV offload", - [](gpt_params & params) { - params.no_kv_offload = true; - } - )); - add_opt(llama_arg( - {"-ctk", "--cache-type-k"}, "TYPE", - format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()), - [](gpt_params & params, const std::string & value) { - // TODO: get the type right here - params.cache_type_k = value; - } - )); - add_opt(llama_arg( - {"-ctv", "--cache-type-v"}, "TYPE", - format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()), - [](gpt_params & params, const std::string & value) { - // TODO: get the type right here - params.cache_type_v = value; - } - )); - add_opt(llama_arg( - {"--perplexity", "--all-logits"}, - format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), - [](gpt_params & params) { - params.logits_all = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--hellaswag"}, - "compute HellaSwag score over random tasks from datafile supplied with -f", - [](gpt_params & params) { - params.hellaswag = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--hellaswag-tasks"}, "N", - format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), - [](gpt_params & params, int value) { - params.hellaswag_tasks = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--winogrande"}, - "compute Winogrande score over random tasks from datafile supplied with -f", - [](gpt_params & params) { - params.winogrande = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--winogrande-tasks"}, "N", - format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), - [](gpt_params & params, int value) { - params.winogrande_tasks = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--multiple-choice"}, - "compute multiple choice score over random tasks from datafile supplied with -f", - [](gpt_params & params) { - params.multiple_choice = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--multiple-choice-tasks"}, "N", - format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), - [](gpt_params & params, int value) { - params.multiple_choice_tasks = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--kl-divergence"}, - "computes KL-divergence to logits provided via --kl-divergence-base", - 
[](gpt_params & params) { - params.kl_divergence = true; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--save-all-logits", "--kl-divergence-base"}, "FNAME", - "set logits file", - [](gpt_params & params, const std::string & value) { - params.logits_file = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--ppl-stride"}, "N", - format("stride for perplexity calculation (default: %d)", params.ppl_stride), - [](gpt_params & params, int value) { - params.ppl_stride = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"--ppl-output-type"}, "<0|1>", - format("output type for perplexity calculation (default: %d)", params.ppl_output_type), - [](gpt_params & params, int value) { - params.ppl_output_type = value; - } - ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); - add_opt(llama_arg( - {"-dt", "--defrag-thold"}, "N", - format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), - [](gpt_params & params, const std::string & value) { - params.defrag_thold = std::stof(value); - } - ).set_env("LLAMA_ARG_DEFRAG_THOLD")); - add_opt(llama_arg( - {"-np", "--parallel"}, "N", - format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [](gpt_params & params, int value) { - params.n_parallel = value; - } - )); - add_opt(llama_arg( - {"-ns", "--sequences"}, "N", - format("number of sequences to decode (default: %d)", params.n_sequences), - [](gpt_params & params, int value) { - params.n_sequences = value; - } - )); - add_opt(llama_arg( - {"-cb", "--cont-batching"}, - format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [](gpt_params & params) { - params.cont_batching = true; - } - ).set_env("LLAMA_ARG_CONT_BATCHING")); - add_opt(llama_arg( - {"-nocb", "--no-cont-batching"}, - "disable continuous batching", - [](gpt_params & params) { - params.cont_batching = false; - } - ).set_env("LLAMA_ARG_NO_CONT_BATCHING")); - add_opt(llama_arg( - {"--mmproj"}, "FILE", - "path to a multimodal projector file for LLaVA. see examples/llava/README.md", - [](gpt_params & params, const std::string & value) { - params.mmproj = value; - } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); - add_opt(llama_arg( - {"--image"}, "FILE", - "path to an image file. use with multimodal models. 
Specify multiple times for batching", - [](gpt_params & params, const std::string & value) { - params.image.emplace_back(value); - } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); -#ifdef GGML_USE_RPC - add_opt(llama_arg( - {"--rpc"}, "SERVERS", - "comma separated list of RPC servers", - [](gpt_params & params, const std::string & value) { - params.rpc_servers = value; - } - )); -#endif - add_opt(llama_arg( - {"--mlock"}, - "force system to keep model in RAM rather than swapping or compressing", - [](gpt_params & params) { - params.use_mlock = true; - } - )); - add_opt(llama_arg( - {"--no-mmap"}, - "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](gpt_params & params) { - params.use_mmap = false; - } - )); - add_opt(llama_arg( - {"--numa"}, "TYPE", - "attempt optimizations that help on some NUMA systems\n" - "- distribute: spread execution evenly over all nodes\n" - "- isolate: only spawn threads on CPUs on the node that execution started on\n" - "- numactl: use the CPU map provided by numactl\n" - "if run without this previously, it is recommended to drop the system page cache before using this\n" - "see https://github.com/ggerganov/llama.cpp/issues/1437", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { throw std::invalid_argument("invalid value"); } - } - )); - add_opt(llama_arg( - {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", - "number of layers to store in VRAM", - [](gpt_params & params, int value) { - params.n_gpu_layers = value; - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - } - ).set_env("LLAMA_ARG_N_GPU_LAYERS")); - add_opt(llama_arg( - {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", - "number of layers to store in VRAM for the draft model", - [](gpt_params & params, int value) { - params.n_gpu_layers_draft = value; - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-sm", "--split-mode"}, "{none,layer,row}", - "how to split the model across multiple GPUs, one of:\n" - "- none: use one GPU only\n" - "- layer (default): split layers and KV across GPUs\n" - "- row: split rows across GPUs", - [](gpt_params & params, const std::string & value) { - std::string arg_next = value; - if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; - } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; - } - else if (arg_next == "row") { -#ifdef GGML_USE_SYCL - fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); - exit(1); -#endif // GGML_USE_SYCL - params.split_mode = LLAMA_SPLIT_MODE_ROW; - } - else { - throw std::invalid_argument("invalid value"); - } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. 
Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - } - )); - add_opt(llama_arg( - {"-ts", "--tensor-split"}, "N0,N1,N2,...", - "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", - [](gpt_params & params, const std::string & value) { - std::string arg_next = value; - - // split string by , and / - const std::regex regex{ R"([,/]+)" }; - std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; - std::vector split_arg{ it, {} }; - if (split_arg.size() >= llama_max_devices()) { - throw std::invalid_argument( - format("got %d input configs, but system only has %d devices", (int)split_arg.size(), (int)llama_max_devices()) - ); - } - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); - } else { - params.tensor_split[i] = 0.0f; - } - } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - } - )); - add_opt(llama_arg( - {"-mg", "--main-gpu"}, "INDEX", - format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), - [](gpt_params & params, int value) { - params.main_gpu = value; -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN - } - )); - add_opt(llama_arg( - {"--check-tensors"}, - format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), - [](gpt_params & params) { - params.check_tensors = true; - } - )); - add_opt(llama_arg( - {"--override-kv"}, "KEY=TYPE:VALUE", - "advanced option to override model metadata by key. may be specified multiple times.\n" - "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", - [](gpt_params & params, const std::string & value) { - if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { - throw std::runtime_error(format("error: Invalid type for KV override: %s\n", value.c_str())); - } - } - )); - add_opt(llama_arg( - {"--lora"}, "FNAME", - "path to LoRA adapter (can be repeated to use multiple adapters)", - [](gpt_params & params, const std::string & value) { - params.lora_adapters.push_back({ std::string(value), 1.0 }); - } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( - {"--lora-scaled"}, "FNAME", "SCALE", - "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [](gpt_params & params, const std::string & fname, const std::string & scale) { - params.lora_adapters.push_back({ fname, std::stof(scale) }); - } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( - {"--control-vector"}, "FNAME", - "add a control vector\nnote: this argument can be repeated to add multiple control vectors", - [](gpt_params & params, const std::string & value) { - params.control_vectors.push_back({ 1.0f, value, }); - } - )); - add_opt(llama_arg( - {"--control-vector-scaled"}, "FNAME", "SCALE", - "add a control vector with user defined scaling SCALE\n" - "note: this argument can be repeated to add multiple scaled control vectors", - [](gpt_params & params, const std::string & fname, const std::string & scale) { - params.control_vectors.push_back({ std::stof(scale), fname }); - } - )); - add_opt(llama_arg( - {"--control-vector-layer-range"}, "START", "END", - "layer range to apply the control vector(s) to, start and end inclusive", - [](gpt_params & params, const std::string & start, const std::string & end) { - params.control_vector_layer_start = std::stoi(start); - params.control_vector_layer_end = std::stoi(end); - } - )); - add_opt(llama_arg( - {"-a", "--alias"}, "STRING", - "set alias for model name (to be used by REST API)", - [](gpt_params & params, const std::string & value) { - params.model_alias = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"-m", "--model"}, "FNAME", - ex == LLAMA_EXAMPLE_EXPORT_LORA - ? 
std::string("model path from which to load base model") - : format( - "model path (default: `models/$filename` with filename from `--hf-file` " - "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH - ), - [](gpt_params & params, const std::string & value) { - params.model = value; - } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); - add_opt(llama_arg( - {"-md", "--model-draft"}, "FNAME", - "draft model for speculative decoding (default: unused)", - [](gpt_params & params, const std::string & value) { - params.model_draft = value; - } - ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); - add_opt(llama_arg( - {"-mu", "--model-url"}, "MODEL_URL", - "model download url (default: unused)", - [](gpt_params & params, const std::string & value) { - params.model_url = value; - } - ).set_env("LLAMA_ARG_MODEL_URL")); - add_opt(llama_arg( - {"-hfr", "--hf-repo"}, "REPO", - "Hugging Face model repository (default: unused)", - [](gpt_params & params, const std::string & value) { - params.hf_repo = value; - } - ).set_env("LLAMA_ARG_HF_REPO")); - add_opt(llama_arg( - {"-hff", "--hf-file"}, "FILE", - "Hugging Face model file (default: unused)", - [](gpt_params & params, const std::string & value) { - params.hf_file = value; - } - ).set_env("LLAMA_ARG_HF_FILE")); - add_opt(llama_arg( - {"-hft", "--hf-token"}, "TOKEN", - "Hugging Face access token (default: value from HF_TOKEN environment variable)", - [](gpt_params & params, const std::string & value) { - params.hf_token = value; - } - ).set_env("HF_TOKEN")); - add_opt(llama_arg( - {"--context-file"}, "FNAME", - "file to load context from (repeat to specify multiple files)", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value, std::ios::binary); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - params.context_files.push_back(value); - } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( - {"--chunk-size"}, "N", - format("minimum length of embedded text chunks (default: %d)", params.chunk_size), - [](gpt_params & params, int value) { - params.chunk_size = value; - } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( - {"--chunk-separator"}, "STRING", - format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), - [](gpt_params & params, const std::string & value) { - params.chunk_separator = value; - } - ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); - add_opt(llama_arg( - {"--junk"}, "N", - format("number of times to repeat the junk text (default: %d)", params.n_junk), - [](gpt_params & params, int value) { - params.n_junk = value; - } - ).set_examples({LLAMA_EXAMPLE_PASSKEY})); - add_opt(llama_arg( - {"--pos"}, "N", - format("position of the passkey in the junk text (default: %d)", params.i_pos), - [](gpt_params & params, int value) { - params.i_pos = value; - } - ).set_examples({LLAMA_EXAMPLE_PASSKEY})); - add_opt(llama_arg( - {"-o", "--output", "--output-file"}, "FNAME", - format("output file (default: '%s')", - ex == LLAMA_EXAMPLE_EXPORT_LORA - ? params.lora_outfile.c_str() - : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR - ? 
params.cvector_outfile.c_str() - : params.out_file.c_str()), - [](gpt_params & params, const std::string & value) { - params.out_file = value; - params.cvector_outfile = value; - params.lora_outfile = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); - add_opt(llama_arg( - {"-ofreq", "--output-frequency"}, "N", - format("output the imatrix every N iterations (default: %d)", params.n_out_freq), - [](gpt_params & params, int value) { - params.n_out_freq = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"--save-frequency"}, "N", - format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), - [](gpt_params & params, int value) { - params.n_save_freq = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"--process-output"}, - format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), - [](gpt_params & params) { - params.process_output = true; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"--no-ppl"}, - format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](gpt_params & params) { - params.compute_ppl = false; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"--chunk", "--from-chunk"}, "N", - format("start processing the input from chunk N (default: %d)", params.i_chunk), - [](gpt_params & params, int value) { - params.i_chunk = value; - } - ).set_examples({LLAMA_EXAMPLE_IMATRIX})); - add_opt(llama_arg( - {"-pps"}, - format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"), - [](gpt_params & params) { - params.is_pp_shared = true; - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( - {"-npp"}, "n0,n1,...", - "number of prompt tokens", - [](gpt_params & params, const std::string & value) { - auto p = string_split(value, ','); - params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( - {"-ntg"}, "n0,n1,...", - "number of text generation tokens", - [](gpt_params & params, const std::string & value) { - auto p = string_split(value, ','); - params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( - {"-npl"}, "n0,n1,...", - "number of parallel prompts", - [](gpt_params & params, const std::string & value) { - auto p = string_split(value, ','); - params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); - add_opt(llama_arg( - {"--embd-normalize"}, "N", - format("normalisation for embendings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), - [](gpt_params & params, int value) { - params.embd_normalize = value; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( - {"--embd-output-format"}, "FORMAT", - "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", - [](gpt_params & params, const std::string & value) { - params.embd_out = value; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - add_opt(llama_arg( - {"--embd-separator"}, "STRING", - "separator of embendings (default \\n) for example \"<#sep#>\"", - [](gpt_params & params, const std::string & value) { - params.embd_sep = value; - } - ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); - 
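    // The option groups above and below all follow the llama_arg registration pattern that this
    // patch series relocates into arg.cpp. A minimal sketch of one registration is given here for
    // orientation; the flag name, params field and env var are placeholders, not options defined
    // by this patch:
    //
    //   add_opt(llama_arg(
    //       {"--example-flag"}, "N",                            // flag aliases and value hint
    //       format("what it does (default: %d)", params.n_x),   // help text showing the current default
    //       [](gpt_params & params, int value) {                // typed handler writes into params
    //           params.n_x = value;
    //       }
    //   ).set_examples({LLAMA_EXAMPLE_SERVER})                  // restrict the flag to given examples
    //    .set_env("LLAMA_ARG_EXAMPLE"));                        // optional environment-variable override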
add_opt(llama_arg( - {"--host"}, "HOST", - format("ip address to listen (default: %s)", params.hostname.c_str()), - [](gpt_params & params, const std::string & value) { - params.hostname = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); - add_opt(llama_arg( - {"--port"}, "PORT", - format("port to listen (default: %d)", params.port), - [](gpt_params & params, int value) { - params.port = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); - add_opt(llama_arg( - {"--path"}, "PATH", - format("path to serve static files from (default: %s)", params.public_path.c_str()), - [](gpt_params & params, const std::string & value) { - params.public_path = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--embedding", "--embeddings"}, - format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), - [](gpt_params & params) { - params.embedding = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); - add_opt(llama_arg( - {"--api-key"}, "KEY", - "API key to use for authentication (default: none)", - [](gpt_params & params, const std::string & value) { - params.api_keys.push_back(value); - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); - add_opt(llama_arg( - {"--api-key-file"}, "FNAME", - "path to file containing API keys (default: none)", - [](gpt_params & params, const std::string & value) { - std::ifstream key_file(value); - if (!key_file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - std::string key; - while (std::getline(key_file, key)) { - if (!key.empty()) { - params.api_keys.push_back(key); - } - } - key_file.close(); - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--ssl-key-file"}, "FNAME", - "path to file a PEM-encoded SSL private key", - [](gpt_params & params, const std::string & value) { - params.ssl_file_key = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--ssl-cert-file"}, "FNAME", - "path to file a PEM-encoded SSL certificate", - [](gpt_params & params, const std::string & value) { - params.ssl_file_cert = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"-to", "--timeout"}, "N", - format("server read/write timeout in seconds (default: %d)", params.timeout_read), - [](gpt_params & params, int value) { - params.timeout_read = value; - params.timeout_write = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--threads-http"}, "N", - format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), - [](gpt_params & params, int value) { - params.n_threads_http = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); - add_opt(llama_arg( - {"-spf", "--system-prompt-file"}, "FNAME", - "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications", - [](gpt_params & params, const std::string & value) { - std::ifstream file(value); - if (!file) { - throw std::runtime_error(format("error: failed to open file '%s'\n", value.c_str())); - } - std::string system_prompt; - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(system_prompt) - ); - params.system_prompt = system_prompt; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - 
{"--log-format"}, "{text, json}", - "log output format: json or text (default: json)", - [](gpt_params & params, const std::string & value) { - if (value == "json") { - params.log_json = true; - } else if (value == "text") { - params.log_json = false; - } else { - throw std::invalid_argument("invalid value"); - } - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--metrics"}, - format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), - [](gpt_params & params) { - params.endpoint_metrics = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); - add_opt(llama_arg( - {"--no-slots"}, - format("disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [](gpt_params & params) { - params.endpoint_slots = false; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); - add_opt(llama_arg( - {"--slot-save-path"}, "PATH", - "path to save slot kv cache (default: disabled)", - [](gpt_params & params, const std::string & value) { - params.slot_save_path = value; - // if doesn't end with DIRECTORY_SEPARATOR, add it - if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { - params.slot_save_path += DIRECTORY_SEPARATOR; - } - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--chat-template"}, "JINJA_TEMPLATE", - "set custom jinja chat template (default: template taken from model's metadata)\n" - "if suffix/prefix are specified, template will be disabled\n" - "only commonly used templates are accepted:\nhttps://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template", - [](gpt_params & params, const std::string & value) { - if (!llama_chat_verify_template(value)) { - throw std::runtime_error(format( - "error: the supplied chat template is not supported: %s\n" - "note: llama.cpp does not use jinja parser, we only support commonly used templates\n", - value.c_str() - )); - } - params.chat_template = value; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); - add_opt(llama_arg( - {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", - format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), - [](gpt_params & params, const std::string & value) { - params.slot_prompt_similarity = std::stof(value); - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--lora-init-without-apply"}, - format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? 
"enabled" : "disabled"), - [](gpt_params & params) { - params.lora_init_without_apply = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--simple-io"}, - "use basic IO for better compatibility in subprocesses and limited consoles", - [](gpt_params & params) { - params.simple_io = true; - } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); - add_opt(llama_arg( - {"-ld", "--logdir"}, "LOGDIR", - "path under which to save YAML logs (no logging if unset)", - [](gpt_params & params, const std::string & value) { - params.logdir = value; - - if (params.logdir.back() != DIRECTORY_SEPARATOR) { - params.logdir += DIRECTORY_SEPARATOR; - } - } - )); - add_opt(llama_arg( - {"--positive-file"}, "FNAME", - format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), - [](gpt_params & params, const std::string & value) { - params.cvector_positive_file = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--negative-file"}, "FNAME", - format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), - [](gpt_params & params, const std::string & value) { - params.cvector_negative_file = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--pca-batch"}, "N", - format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), - [](gpt_params & params, int value) { - params.n_pca_batch = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--pca-iter"}, "N", - format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), - [](gpt_params & params, int value) { - params.n_pca_iterations = value; - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--method"}, "{pca, mean}", - "dimensionality reduction method to be used (default: pca)", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } - else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } - else { throw std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); - add_opt(llama_arg( - {"--output-format"}, "{md,jsonl}", - "output format for batched-bench results (default: md)", - [](gpt_params & params, const std::string & value) { - /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } - else if (value == "md") { params.batched_bench_output_jsonl = false; } - else { std::invalid_argument("invalid value"); } - } - ).set_examples({LLAMA_EXAMPLE_BENCH})); -#ifndef LOG_DISABLE_LOGS - // TODO: make this looks less weird - add_opt(llama_arg( - {"--log-test"}, - "Log test", - [](gpt_params &) { log_param_single_parse("--log-test"); } - )); - add_opt(llama_arg( - {"--log-disable"}, - "Log disable", - [](gpt_params &) { log_param_single_parse("--log-disable"); } - )); - add_opt(llama_arg( - {"--log-enable"}, - "Log enable", - [](gpt_params &) { log_param_single_parse("--log-enable"); } - )); - add_opt(llama_arg( - {"--log-new"}, - "Log new", - [](gpt_params &) { log_param_single_parse("--log-new"); } - )); - add_opt(llama_arg( - {"--log-append"}, - "Log append", - [](gpt_params &) { log_param_single_parse("--log-append"); } - )); - add_opt(llama_arg( - {"--log-file"}, "FNAME", - "Log file", - [](gpt_params &, const std::string & value) { log_param_pair_parse(false, 
"--log-file", value); } - )); -#endif // LOG_DISABLE_LOGS - - return options; -} - std::string gpt_params_get_system_info(const gpt_params & params) { std::ostringstream os; diff --git a/common/common.h b/common/common.h index 5b945ac02..23babdd09 100644 --- a/common/common.h +++ b/common/common.h @@ -4,20 +4,11 @@ #include "llama.h" -#include "sampling.h" - #define LOG_NO_FILE_LINE_FUNCTION #include "log.h" -#include #include #include -#include -#include -#include -#include -#include -#include #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' @@ -56,11 +47,20 @@ struct llama_control_vector_load_info; // CPU utils // +struct cpu_params { + int n_threads = -1; + bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. + bool mask_valid = false; // Default: any CPU + enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) + bool strict_cpu = false; // Use strict CPU placement + uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) +}; + int32_t cpu_get_num_physical_cores(); int32_t cpu_get_num_math(); // -// CLI argument parsing +// Common params // enum llama_example { @@ -78,28 +78,71 @@ enum llama_example { LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_LLAVA, + LLAMA_EXAMPLE_LOOKUP, + LLAMA_EXAMPLE_PARALLEL, LLAMA_EXAMPLE_COUNT, }; +enum gpt_sampler_type { + GPT_SAMPLER_TYPE_NONE = 0, + GPT_SAMPLER_TYPE_TOP_K = 1, + GPT_SAMPLER_TYPE_TOP_P = 2, + GPT_SAMPLER_TYPE_MIN_P = 3, + GPT_SAMPLER_TYPE_TFS_Z = 4, + GPT_SAMPLER_TYPE_TYPICAL_P = 5, + GPT_SAMPLER_TYPE_TEMPERATURE = 6, +}; + // dimensionality reduction methods, used by cvector-generator enum dimre_method { DIMRE_METHOD_PCA, DIMRE_METHOD_MEAN, }; -struct cpu_params { - int n_threads = -1; - bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. - bool mask_valid = false; // Default: any CPU - enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) - bool strict_cpu = false; // Use strict CPU placement - uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) +// sampler parameters +struct gpt_sampler_params { + uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler + + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
+ int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typ_p = 1.00f; // typical_p, 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = false; // consider newlines as a repeatable token + bool ignore_eos = false; + + std::vector samplers = { + GPT_SAMPLER_TYPE_TOP_K, + GPT_SAMPLER_TYPE_TFS_Z, + GPT_SAMPLER_TYPE_TYPICAL_P, + GPT_SAMPLER_TYPE_TOP_P, + GPT_SAMPLER_TYPE_MIN_P, + GPT_SAMPLER_TYPE_TEMPERATURE + }; + + std::string grammar; // optional BNF-like grammar to constrain sampling + + std::vector logit_bias; // logit biases to apply + + // print the parameters into a string + std::string print() const; }; struct gpt_params { - enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON; - int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 0; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) @@ -143,23 +186,23 @@ struct gpt_params { struct gpt_sampler_params sparams; - std::string model = ""; // model path - std::string model_draft = ""; // draft model for speculative decoding - std::string model_alias = "unknown"; // model alias - std::string model_url = ""; // model url to download - std::string hf_token = ""; // HF token - std::string hf_repo = ""; // HF repo - std::string hf_file = ""; // HF file - std::string prompt = ""; - std::string prompt_file = ""; // store the external prompt file name - std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state - std::string input_prefix = ""; // string to prefix user inputs with - std::string input_suffix = ""; // string to suffix user inputs with - std::string logdir = ""; // directory in which to save YAML log files - std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding - std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding - std::string logits_file = ""; // file for saving *all* logits - std::string rpc_servers = ""; // comma separated list of RPC servers + std::string model = ""; // model path // NOLINT + std::string model_draft = ""; // draft model for speculative decoding // NOLINT + std::string model_alias = "unknown"; // model alias // NOLINT + std::string model_url = ""; // model url to download // NOLINT + std::string hf_token = ""; // HF token // NOLINT + std::string hf_repo = ""; // HF repo // NOLINT + std::string hf_file = ""; // HF file // NOLINT + std::string prompt = ""; // NOLINT + std::string prompt_file = ""; // store the external prompt file name // NOLINT + std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT + std::string input_prefix = ""; // string 
to prefix user inputs with // NOLINT + std::string input_suffix = ""; // string to suffix user inputs with // NOLINT + std::string logdir = ""; // directory in which to save YAML log files // NOLINT + std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT + std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT + std::string logits_file = ""; // file for saving *all* logits // NOLINT + std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) @@ -189,7 +232,6 @@ struct gpt_params { bool kl_divergence = false; // compute KL divergence - std::function print_usage = nullptr; // print example-specific usage and example bool usage = false; // print usage bool use_color = false; // use color to distinguish generations and inputs bool special = false; // enable special token output @@ -220,7 +262,7 @@ struct gpt_params { std::string cache_type_v = "f16"; // KV cache data type for the V // multimodal models (see examples/llava) - std::string mmproj = ""; // path to multimodal projector + std::string mmproj = ""; // path to multimodal projector // NOLINT std::vector image; // path to image file(s) // embedding @@ -236,15 +278,15 @@ struct gpt_params { int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) std::string hostname = "127.0.0.1"; - std::string public_path = ""; - std::string chat_template = ""; - std::string system_prompt = ""; + std::string public_path = ""; // NOLINT + std::string chat_template = ""; // NOLINT + std::string system_prompt = ""; // NOLINT bool enable_chat_template = true; std::vector api_keys; - std::string ssl_file_key = ""; - std::string ssl_file_cert = ""; + std::string ssl_file_key = ""; // NOLINT + std::string ssl_file_cert = ""; // NOLINT bool endpoint_slots = true; bool endpoint_metrics = false; @@ -299,92 +341,6 @@ struct gpt_params { bool batched_bench_output_jsonl = false; }; -struct llama_arg { - std::set examples = {LLAMA_EXAMPLE_COMMON}; - std::vector args; - const char * value_hint = nullptr; // help text or example for arg value - const char * value_hint_2 = nullptr; // for second arg value - const char * env = nullptr; - std::string help; - void (*handler_void) (gpt_params & params) = nullptr; - void (*handler_string) (gpt_params & params, const std::string &) = nullptr; - void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr; - void (*handler_int) (gpt_params & params, int) = nullptr; - - llama_arg( - const std::initializer_list & args, - const char * value_hint, - const std::string & help, - void (*handler)(gpt_params & params, const std::string &) - ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} - - llama_arg( - const std::initializer_list & args, - const char * value_hint, - const std::string & help, - void (*handler)(gpt_params & params, int) - ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} - - llama_arg( - const std::initializer_list & args, - const std::string & help, - void (*handler)(gpt_params & params) - ) : args(args), help(help), handler_void(handler) {} - - // support 2 values for arg - llama_arg( - const std::initializer_list & args, - const char * value_hint, - const char * value_hint_2, - const std::string & help, - void 
(*handler)(gpt_params & params, const std::string &, const std::string &) - ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} - - llama_arg & set_examples(std::initializer_list examples) { - this->examples = std::move(examples); - return *this; - } - - llama_arg & set_env(const char * env) { - help = help + "\n(env: " + env + ")"; - this->env = env; - return *this; - } - - bool in_example(enum llama_example ex) { - return examples.find(ex) != examples.end(); - } - - bool get_value_from_env(std::string & output) const { - if (env == nullptr) return false; - char * value = std::getenv(env); - if (value) { - output = value; - return true; - } - return false; - } - - bool has_value_from_env() const { - return env != nullptr && std::getenv(env); - } - - std::string to_string(); -}; - -// initialize list of options (arguments) that can be used by the current example -std::vector gpt_params_parser_init(gpt_params & params, llama_example ex); -// optionally, we can provide "print_usage" to print example usage -std::vector gpt_params_parser_init(gpt_params & params, llama_example ex, std::function print_usage); - -// parse input arguments from CLI -// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) -bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector & options); -bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector & options); - -// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set -void gpt_params_print_usage(gpt_params & params, std::vector & options); - std::string gpt_params_get_system_info(const gpt_params & params); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); diff --git a/common/sampling.cpp b/common/sampling.cpp index 7806b77e0..21b956462 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -2,6 +2,9 @@ #include "common.h" +#include +#include + // the ring buffer works similarly to std::deque, but with a fixed capacity // TODO: deduplicate with llama-impl.h template @@ -420,7 +423,7 @@ std::vector gpt_sampler_types_from_names(const std::vector gpt_sampler_types_from_chars(const std::string & chars) { - std::unordered_map sampler_name_map { + std::unordered_map sampler_name_map = { { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K }, { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z }, { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P }, diff --git a/common/sampling.h b/common/sampling.h index 654e0c513..0a4461fab 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -2,61 +2,11 @@ #include "llama.h" +#include "common.h" + #include #include -enum gpt_sampler_type { - GPT_SAMPLER_TYPE_NONE = 0, - GPT_SAMPLER_TYPE_TOP_K = 1, - GPT_SAMPLER_TYPE_TOP_P = 2, - GPT_SAMPLER_TYPE_MIN_P = 3, - GPT_SAMPLER_TYPE_TFS_Z = 4, - GPT_SAMPLER_TYPE_TYPICAL_P = 5, - GPT_SAMPLER_TYPE_TEMPERATURE = 6, -}; - -// sampling parameters -struct gpt_sampler_params { - uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler - - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. 
- int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typ_p = 1.00f; // typical_p, 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = false; // consider newlines as a repeatable token - bool ignore_eos = false; - - std::vector samplers = { - GPT_SAMPLER_TYPE_TOP_K, - GPT_SAMPLER_TYPE_TFS_Z, - GPT_SAMPLER_TYPE_TYPICAL_P, - GPT_SAMPLER_TYPE_TOP_P, - GPT_SAMPLER_TYPE_MIN_P, - GPT_SAMPLER_TYPE_TEMPERATURE - }; - - std::string grammar; // optional BNF-like grammar to constrain sampling - - std::vector logit_bias; // logit biases to apply - - // print the parameters into a string - std::string print() const; -}; - // gpt_sampler extends llama_sampler with additional functionality: // // - grammar support diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index f3b0c433b..a91e7f4bd 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -37,8 +38,7 @@ static void print_usage(int, char ** argv) { int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) { return 1; } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 615d6f0f5..5d32153fe 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -18,8 +19,7 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { return 1; } diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 0795175a1..569b6c38f 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" #include "ggml.h" @@ -388,8 +389,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) { int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, 
print_usage)) { return 1; } diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index 6ec3141af..05c66856c 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -12,12 +12,9 @@ #include #include +#include #include -#include #include -#include -#include -#include #define DEBUG_POS 5 diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 630f7c1c7..da7c79253 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -79,8 +80,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) { return 1; } diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 881111ffd..bc7203143 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" #include "ggml.h" @@ -144,8 +145,7 @@ int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 544e7fff6..ff324926a 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "ggml.h" #include "ggml-alloc.h" @@ -401,8 +402,7 @@ static void print_usage(int, char ** argv) { int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) { return 1; } diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp index 8b1dafd63..b6d4725fd 100644 --- a/examples/gen-docs/gen-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include @@ -9,11 +10,11 @@ static void export_md(std::string fname, llama_example ex) { std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc); gpt_params params; - auto options = gpt_params_parser_init(params, ex); + auto ctx_arg = gpt_params_parser_init(params, ex); file << "| Argument | Explanation |\n"; file << "| -------- | ----------- |\n"; - for (auto & opt : options) { + for (auto & opt : ctx_arg.options) { file << "| `"; // args for (const auto & arg : opt.args) { diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 6f060e2dc..14c715202 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -153,8 +154,7 @@ static std::string gritlm_instruction(const std::string & instruction) { int main(int argc, char * argv[]) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, 
LLAMA_EXAMPLE_COMMON)) { return 1; } diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 15a3f0d14..032a90136 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -577,8 +578,7 @@ int main(int argc, char ** argv) { params.logits_all = true; params.verbosity = 1; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { return 1; } diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 87abb761f..9a527e244 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -1,6 +1,7 @@ +#include "arg.h" #include "common.h" - #include "console.h" +#include "sampling.h" #include "llama.h" #include @@ -105,8 +106,7 @@ int main(int argc, char ** argv) { gpt_params params; g_params = ¶ms; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_INFILL); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) { return 1; } diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 5845d0106..e9108a9bd 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -1,11 +1,12 @@ -#include "ggml.h" +#include "arg.h" +#include "base64.hpp" #include "log.h" #include "common.h" +#include "sampling.h" #include "clip.h" #include "llava.h" #include "llama.h" - -#include "base64.hpp" +#include "ggml.h" #include #include @@ -278,8 +279,7 @@ int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { return 1; } diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 57e7d42c5..3475bbce5 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -1,9 +1,11 @@ -#include "ggml.h" +#include "arg.h" #include "log.h" #include "common.h" +#include "sampling.h" #include "clip.h" #include "llava.h" #include "llama.h" +#include "ggml.h" #include #include @@ -253,8 +255,7 @@ int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) { return 1; } diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 5027a483a..de8b792f2 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -1,4 +1,6 @@ +#include "arg.h" #include "common.h" +#include "sampling.h" #include "llama.h" #include @@ -36,8 +38,7 @@ struct ngram_container { int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 795b06c88..33287c02c 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -1,7 +1,8 @@ -#include "ggml.h" -#include "llama.h" +#include "arg.h" 
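// The same entry-point pattern recurs in every example touched by this patch: include arg.h and
// pass argv, the example's LLAMA_EXAMPLE_* id and an optional print_usage callback straight to
// gpt_params_parse. A minimal sketch of that calling pattern (the body comment is a placeholder):
//
//   #include "arg.h"
//   #include "common.h"
//
//   int main(int argc, char ** argv) {
//       gpt_params params;
//       if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
//           return 1;
//       }
//       // ... example-specific work using params ...
//   }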
#include "common.h" #include "ngram-cache.h" +#include "ggml.h" +#include "llama.h" #include #include @@ -13,8 +14,7 @@ int main(int argc, char ** argv){ gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } @@ -40,4 +40,6 @@ int main(int argc, char ** argv){ fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str()); llama_ngram_cache_save(ngram_cache, params.lookup_cache_static); + + return 0; } diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 93299ef8b..f299d68a9 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -1,8 +1,9 @@ -#include "ggml.h" +#include "arg.h" #include "common.h" -#include "llama.h" #include "log.h" #include "ngram-cache.h" +#include "llama.h" +#include "ggml.h" #include #include @@ -15,8 +16,7 @@ int main(int argc, char ** argv){ gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 9ac7f6b47..fff44a499 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -1,7 +1,9 @@ +#include "arg.h" #include "ggml.h" -#include "llama.h" #include "common.h" #include "ngram-cache.h" +#include "sampling.h" +#include "llama.h" #include #include @@ -12,8 +14,7 @@ int main(int argc, char ** argv){ gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) { return 1; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index ef2158842..b986a865a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -1,6 +1,7 @@ +#include "arg.h" #include "common.h" - #include "console.h" +#include "sampling.h" #include "llama.h" #include @@ -138,9 +139,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector @@ -100,8 +102,7 @@ int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { return 1; } diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 271ef3a98..d3d5ab46f 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -19,8 +20,7 @@ int main(int argc, char ** argv) { params.n_keep = 32; params.i_pos = -1; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) { return 1; } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 570ee8aeb..c7d617988 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1,18 +1,19 @@ +#include "arg.h" #include "common.h" #include "llama.h" +#include +#include #include #include #include #include +#include +#include +#include #include #include -#include -#include 
#include -#include -#include -#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -1967,8 +1968,7 @@ int main(int argc, char ** argv) { params.n_ctx = 512; params.logits_all = true; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) { return 1; } diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index dd8a82e6e..7a360b731 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -111,8 +112,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) { return 1; } diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index e17ab0ed0..0117d9357 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -10,8 +11,7 @@ int main(int argc, char ** argv) { params.prompt = "The quick brown fox"; params.sparams.seed = 1234; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) { return 1; } diff --git a/examples/server/README.md b/examples/server/README.md index ed1201ba8..79196e9c1 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -23,36 +23,32 @@ The project is under active development, and we are [looking for feedback and co | `--version` | show version and build info | | `-v, --verbose` | print verbose information | | `--verbosity N` | set specific verbosity level (default: 0) | -| `--verbose-prompt` | print a verbose prompt before generation (default: false) | -| `--no-display-prompt` | don't print prompt at generation (default: false) | -| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) | | `-t, --threads N` | number of threads to use during generation (default: -1)
(env: LLAMA_ARG_THREADS) | | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) | | `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") | | `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask | | `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)
| +| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)
| | `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)
| | `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) | | `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch | | `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) | +| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)
| | `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) | -| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) | -| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) | | `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_CTX_SIZE) | | `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)
(env: LLAMA_ARG_N_PREDICT) | | `-b, --batch-size N` | logical maximum batch size (default: 2048)
(env: LLAMA_ARG_BATCH) | | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)
(env: LLAMA_ARG_UBATCH) | | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) | -| `--chunks N` | max number of chunks to process (default: -1, -1 = all) | | `-fa, --flash-attn` | enable Flash Attention (default: disabled)
(env: LLAMA_ARG_FLASH_ATTN) | | `-p, --prompt PROMPT` | prompt to start generation with | | `-f, --file FNAME` | a file containing the prompt (default: none) | -| `--in-file FNAME` | an input file (repeat to specify multiple files) | | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) | | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | | `--no-escape` | do not process escape sequences | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | -| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'
(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) | +| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'
(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) | +| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) | | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | | `--penalize-nl` | penalize newline tokens (default: false) | @@ -92,13 +88,12 @@ The project is under active development, and we are [looking for feedback and co | `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1) | -| `-ns, --sequences N` | number of sequences to decode (default: 1) | | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | | `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing | | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggerganov/llama.cpp/issues/1437 | -| `-ngl, --gpu-layers N` | number of layers to store in VRAM
(env: LLAMA_ARG_N_GPU_LAYERS) | +| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM
(env: LLAMA_ARG_N_GPU_LAYERS) | | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:
- none: use one GPU only
- layer (default): split layers and KV across GPUs
- row: split rows across GPUs | | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 | | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) | @@ -109,7 +104,7 @@ The project is under active development, and we are [looking for feedback and co | `--control-vector FNAME` | add a control vector
note: this argument can be repeated to add multiple control vectors | | `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE
note: this argument can be repeated to add multiple scaled control vectors | | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive | -| `-a, --alias STRING` | set alias for model name (to be used by REST API)
(env: LLAMA_ARG_MODEL) | +| `-a, --alias STRING` | set alias for model name (to be used by REST API) | | `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)
(env: LLAMA_ARG_MODEL) | | `-mu, --model-url MODEL_URL` | model download url (default: unused)
(env: LLAMA_ARG_MODEL_URL) | | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)
(env: LLAMA_ARG_HF_REPO) | @@ -123,7 +118,7 @@ The project is under active development, and we are [looking for feedback and co | `--api-key-file FNAME` | path to file containing API keys (default: none) | | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key | | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate | -| `--timeout N` | server read/write timeout in seconds (default: 600) | +| `-to, --timeout N` | server read/write timeout in seconds (default: 600) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications | | `--log-format {text, json}` | log output format: json or text (default: json) | diff --git a/examples/server/server.cpp b/examples/server/server.cpp index de3ea313c..7495821f9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,6 +1,8 @@ #include "utils.hpp" +#include "arg.h" #include "common.h" +#include "sampling.h" #include "json-schema-to-grammar.h" #include "llama.h" @@ -2423,8 +2425,7 @@ int main(int argc, char ** argv) { // own arguments required by this example gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { return 1; } diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index d040172a5..3fdc04394 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -1,3 +1,4 @@ +#include "arg.h" #include "common.h" #include "llama.h" @@ -18,8 +19,7 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; params.n_predict = 32; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) { return 1; } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 8f29b5a2c..214e4932b 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -1,11 +1,13 @@ +#include "arg.h" #include "common.h" +#include "sampling.h" #include "llama.h" -#include #include #include #include #include +#include #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 @@ -27,8 +29,7 @@ struct seq_draft { int main(int argc, char ** argv) { gpt_params params; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE); - if (!gpt_params_parse(argc, argv, params, options)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) { return 1; } diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 9ad91acc0..f26707910 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -1,3 +1,6 @@ +#include "arg.h" +#include "common.h" + #include #include #include @@ -6,18 +9,16 @@ #undef NDEBUG #include -#include "common.h" - int main(void) { gpt_params params; printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n"); for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) { try { - auto options = gpt_params_parser_init(params, (enum llama_example)ex); + auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex); std::unordered_set seen_args; std::unordered_set seen_env_vars; - for (const auto & opt : options) { + for (const auto & opt : ctx_arg.options) { // check for args duplications for (const auto & arg : opt.args) { if (seen_args.find(arg) == seen_args.end()) { @@ -52,40 +53,51 @@ int main(void) { }; std::vector argv; - auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON); printf("test-arg-parser: test invalid usage\n\n"); + // missing value argv = {"binary_name", "-m"}; - assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(false == gpt_params_parse(argv.size(), 
list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + // wrong value (int) argv = {"binary_name", "-ngl", "hello"}; - assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + // wrong value (enum) argv = {"binary_name", "-sm", "hello"}; - assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + + // non-existence arg in specific example (--draft cannot be used outside llama-speculative) + argv = {"binary_name", "--draft", "123"}; + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER)); printf("test-arg-parser: test valid usage\n\n"); argv = {"binary_name", "-m", "model_file.gguf"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.model == "model_file.gguf"); argv = {"binary_name", "-t", "1234"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.cpuparams.n_threads == 1234); argv = {"binary_name", "--verbose"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.verbosity == 1); argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.model == "abc.gguf"); assert(params.n_predict == 6789); assert(params.n_batch == 9090); + // --draft cannot be used outside llama-speculative + argv = {"binary_name", "--draft", "123"}; + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE)); + assert(params.n_draft == 123); + // skip this part on windows, because setenv is not supported #ifdef _WIN32 printf("test-arg-parser: skip on windows build\n"); @@ -94,12 +106,12 @@ int main(void) { setenv("LLAMA_ARG_THREADS", "blah", true); argv = {"binary_name"}; - assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); setenv("LLAMA_ARG_MODEL", "blah.gguf", true); setenv("LLAMA_ARG_THREADS", "1010", true); argv = {"binary_name"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.model == "blah.gguf"); assert(params.cpuparams.n_threads == 1010); @@ -109,7 +121,7 @@ int main(void) { setenv("LLAMA_ARG_MODEL", "blah.gguf", true); setenv("LLAMA_ARG_THREADS", "1010", true); argv = {"binary_name", "-m", "overwritten.gguf"}; - assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options)); + assert(true == 
gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); assert(params.model == "overwritten.gguf"); assert(params.cpuparams.n_threads == 1010); #endif // _WIN32 From fb3f2498156b3140e2050ec9c7bf61372f63ff56 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 10 Sep 2024 08:23:33 +0200 Subject: [PATCH 10/65] make : do not run llama-gen-docs when building (#9399) --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index 97ef37c0e..c12bc61f4 100644 --- a/Makefile +++ b/Makefile @@ -1454,7 +1454,6 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - ./llama-gen-docs libllava.a: examples/llava/llava.cpp \ examples/llava/llava.h \ From 0b4ac75772b744bb0a0d674927587621d1057884 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Tue, 10 Sep 2024 15:02:30 +0800 Subject: [PATCH 11/65] RWKV v6: Add time_mix_decay_w1/w2 in quant exclusion list (#9387) Signed-off-by: Molly Sophia --- convert_hf_to_gguf.py | 2 ++ src/llama.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0a9bbc829..ca473244e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -302,6 +302,8 @@ class Model: gguf.MODEL_TENSOR.TIME_MIX_FIRST, gguf.MODEL_TENSOR.TIME_MIX_W1, gguf.MODEL_TENSOR.TIME_MIX_W2, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, + gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, ) ) or not new_name.endswith(".weight") diff --git a/src/llama.cpp b/src/llama.cpp index 39e20440e..ee27cbd1c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -17530,6 +17530,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= name.find("time_mix_first.weight") == std::string::npos; quantize &= name.find("time_mix_w1.weight") == std::string::npos; quantize &= name.find("time_mix_w2.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; // do not quantize relative position bias (T5) quantize &= name.find("attn_rel_b.weight") == std::string::npos; From 83008b7cfe90ad89d0c0ed2c2424fd75edc25ac1 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 10 Sep 2024 09:03:21 +0200 Subject: [PATCH 12/65] llama : update llm_build_copy_mask_state comment [no ci] (#9385) This commit updates a comment in the copy_mask_state function that seems to contain a typo or to be outdated, changing the variable n_rs to n_kv. I believe this change is correct: what the comment wants to convey is to copy the states that are not going to be used in the upcoming processing, i.e. the token states from n_seqs up to the number of possible token states n_kv. --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index ee27cbd1c..40db03517 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9258,7 +9258,7 @@ static struct ggml_tensor * llm_build_copy_mask_state( // FIXME: zero-out NANs? 
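// Note: state_mask is expected to hold a 0/1 entry per state slot (an assumption
// from the surrounding code, not stated in this patch), so the element-wise multiply
// below zeroes the slots of cleared sequences and keeps stale values out of the graph.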
states = ggml_mul(ctx, states, state_mask); - // copy states which won't be changed further (between n_seqs and n_rs) + // copy states which won't be changed further (between n_seqs and n_kv) ggml_build_forward_expand(graph, ggml_cpy(ctx, ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)), From 00ba2ff78100e187ae17987bacd1c916211718b2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 10 Sep 2024 10:17:03 +0300 Subject: [PATCH 13/65] metal : fix compile warning with GGML_METAL_NDEBUG (#0) --- ggml/src/ggml-metal.m | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index f04e5af71..6d8a7c898 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -17,8 +17,8 @@ #define GGML_METAL_LOG_WARN(...) #define GGML_METAL_LOG_ERROR(...) #else -#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) -#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) +#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) +#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) #define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) #endif @@ -3039,8 +3039,7 @@ static enum ggml_status ggml_metal_graph_compute( if (status != MTLCommandBufferStatusCompleted) { GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); if (status == MTLCommandBufferStatusError) { - NSString * error_code = [command_buffer error].localizedDescription; - GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]); + GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; From 49006c67b4c6cc2e7c75a875b4d6e161ebae287c Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 10 Sep 2024 18:04:25 +0200 Subject: [PATCH 14/65] llama : move random seed generation to the samplers (#9398) * llama_sampler_penalties : clamp penalty_last_n to zero --- common/arg.cpp | 7 +-- common/sampling.cpp | 4 ++ common/sampling.h | 2 + examples/embedding/embedding.cpp | 2 - examples/infill/infill.cpp | 7 +-- examples/main/main.cpp | 6 +- examples/perplexity/perplexity.cpp | 2 - examples/server/server.cpp | 1 + include/llama.h | 4 ++ src/llama-sampling.cpp | 91 ++++++++++++++++++++++++------ 10 files changed, 92 insertions(+), 34 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index c5134be51..ca569494f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -173,7 +173,6 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx std::string arg; const std::string arg_prefix = "--"; gpt_params & params = ctx_arg.params; - gpt_sampler_params & sparams = params.sparams; std::unordered_map arg_to_options; for (auto & opt : ctx_arg.options) { @@ -283,10 +282,6 @@ static bool gpt_params_parse_ex(int argc, char ** argv, gpt_params_context & ctx params.kv_overrides.back().key[0] = 0; } - if (sparams.seed == LLAMA_DEFAULT_SEED) { - sparams.seed = time(NULL); - } - return true; } @@ -909,7 +904,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, ).set_sparam()); add_opt(llama_arg( {"-s", "--seed"}, "SEED", - format("RNG seed (default: %d, use random seed for < 0)", params.sparams.seed), + format("RNG seed (default: %u, use random seed for %u)", params.sparams.seed, LLAMA_DEFAULT_SEED), [](gpt_params & params, const std::string & value) { params.sparams.seed = 
std::stoul(value); } diff --git a/common/sampling.cpp b/common/sampling.cpp index 21b956462..4498feb11 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -310,6 +310,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context return cur_p.data[cur_p.selected].id; } +uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) { + return llama_sampler_get_seed(gsmpl->chain); +} + // helpers llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) { diff --git a/common/sampling.h b/common/sampling.h index 0a4461fab..d0e1a9203 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -60,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false); +uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl); + // helpers // access the internal list of current candidate tokens diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index da7c79253..db00c6363 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -90,8 +90,6 @@ int main(int argc, char ** argv) { print_build_info(); - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 9a527e244..7e252ce09 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -159,8 +159,6 @@ int main(int argc, char ** argv) { print_build_info(); - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - LOG("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); @@ -301,6 +299,9 @@ int main(int argc, char ** argv) { LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); } } + smpl = gpt_sampler_init(model, sparams); + + LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl)); LOG_TEE("sampling: \n%s\n", sparams.print().c_str()); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); @@ -340,8 +341,6 @@ int main(int argc, char ** argv) { std::vector embd; - smpl = gpt_sampler_init(model, sparams); - while (n_remain != 0 || params.interactive) { // predict if (!embd.empty()) { diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b986a865a..f41be5308 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -191,8 +191,6 @@ int main(int argc, char ** argv) { print_build_info(); - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - LOG("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); @@ -470,8 +468,10 @@ int main(int argc, char ** argv) { exit(1); } + LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl)); LOG_TEE("sampling params: \n%s\n", sparams.print().c_str()); - LOG_TEE(" sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str()); + LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str()); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); // group-attention state diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index c7d617988..04df65b0a 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2007,8 +2007,6 @@ int main(int argc, char ** 
argv) { print_build_info(); - LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed); - llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7495821f9..5b263f646 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1266,6 +1266,7 @@ struct server_context { {"n_predict", slot.n_predict}, // Server configured n_predict {"model", params.model_alias}, {"seed", slot.sparams.seed}, + {"seed_cur", slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0}, {"temperature", slot.sparams.temp}, {"dynatemp_range", slot.sparams.dynatemp_range}, {"dynatemp_exponent", slot.sparams.dynatemp_exponent}, diff --git a/include/llama.h b/include/llama.h index 93b3e6e85..405af912c 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1127,6 +1127,10 @@ extern "C" { int32_t n_logit_bias, const llama_logit_bias * logit_bias); + + // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise + LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl); + /// @details Sample and accept a token from the idx-th output of the last evaluation // // Shorthand for: diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 6f448b80c..fd1b7f919 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -162,6 +163,19 @@ static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) cur_p->size = k; } +static uint32_t get_rng_seed(uint32_t seed) { + if (seed == LLAMA_DEFAULT_SEED) { + // use system clock if std::random_device is not a true RNG + static bool is_rd_prng = std::random_device().entropy() == 0; + if (is_rd_prng) { + return (uint32_t) std::chrono::system_clock::now().time_since_epoch().count(); + } + std::random_device rd; + return rd(); + } + return seed; +} + // llama_sampler API const char * llama_sampler_name(const struct llama_sampler * smpl) { @@ -387,6 +401,7 @@ struct llama_sampler * llama_sampler_init_greedy() { struct llama_sampler_dist { const uint32_t seed; + uint32_t seed_cur; std::mt19937 rng; }; @@ -416,7 +431,8 @@ static struct llama_sampler * llama_sampler_dist_clone(const struct llama_sample static void llama_sampler_dist_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_dist *) smpl->ctx; - ctx->rng = std::mt19937(ctx->seed); + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); } static void llama_sampler_dist_free(struct llama_sampler * smpl) { @@ -433,11 +449,13 @@ static struct llama_sampler_i llama_sampler_dist_i = { }; struct llama_sampler * llama_sampler_init_dist(uint32_t seed) { + auto seed_cur = get_rng_seed(seed); return new llama_sampler { /* .iface = */ &llama_sampler_dist_i, /* .ctx = */ new llama_sampler_dist { - /* .seed = */ seed, - /* .rng = */ std::mt19937(seed), + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .rng = */ std::mt19937(seed_cur), }, }; } @@ -1032,6 +1050,7 @@ struct llama_sampler_mirostat { const int32_t n_vocab; const uint32_t seed; + uint32_t seed_cur; const float tau; const float eta; @@ -1100,7 +1119,8 @@ static struct llama_sampler * llama_sampler_mirostat_clone(const struct llama_sa static void llama_sampler_mirostat_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_mirostat *) smpl->ctx; ctx->mu = 2.0f*ctx->tau; - ctx->rng = std::mt19937(ctx->seed); + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); } static void 
llama_sampler_mirostat_free(struct llama_sampler * smpl) { @@ -1117,16 +1137,18 @@ static struct llama_sampler_i llama_sampler_mirostat_i = { }; struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t seed, float tau, float eta, int32_t m) { + auto seed_cur = get_rng_seed(seed); return new llama_sampler { /* .iface = */ &llama_sampler_mirostat_i, /* .ctx = */ new llama_sampler_mirostat { - /* .n_vocab = */ n_vocab, - /* .seed = */ seed, - /* .tau = */ tau, - /* .eta = */ eta, - /* .m = */ m, - /* .mu = */ 2.0f*tau, - /* .rng = */ std::mt19937(seed), + /* .n_vocab = */ n_vocab, + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .tau = */ tau, + /* .eta = */ eta, + /* .m = */ m, + /* .mu = */ 2.0f*tau, + /* .rng = */ std::mt19937(seed_cur), }, }; } @@ -1135,6 +1157,7 @@ struct llama_sampler * llama_sampler_init_mirostat(int32_t n_vocab, uint32_t see struct llama_sampler_mirostat_v2 { const uint32_t seed; + uint32_t seed_cur; const float tau; const float eta; @@ -1179,7 +1202,8 @@ static void llama_sampler_mirostat_v2_apply(struct llama_sampler * smpl, llama_t static void llama_sampler_mirostat_v2_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_mirostat_v2 *) smpl->ctx; ctx->mu = 2.0f*ctx->tau; - ctx->rng = std::mt19937(ctx->seed); + ctx->seed_cur = get_rng_seed(ctx->seed); + ctx->rng.seed(ctx->seed_cur); } static struct llama_sampler * llama_sampler_mirostat_v2_clone(const struct llama_sampler * smpl) { @@ -1212,14 +1236,16 @@ static struct llama_sampler_i llama_sampler_mirostat_v2_i = { }; struct llama_sampler * llama_sampler_init_mirostat_v2(uint32_t seed, float tau, float eta) { + auto seed_cur = get_rng_seed(seed); return new llama_sampler { /* .iface = */ &llama_sampler_mirostat_v2_i, /* .ctx = */ new llama_sampler_mirostat_v2 { - /* .seed = */ seed, - /* .tau = */ tau, - /* .eta = */ eta, - /* .mu = */ 2.0f*tau, - /* .rng = */ std::mt19937(seed), + /* .seed = */ seed, + /* .seed_cur = */ seed_cur, + /* .tau = */ tau, + /* .eta = */ eta, + /* .mu = */ 2.0f*tau, + /* .rng = */ std::mt19937(seed_cur), }, }; } @@ -1505,6 +1531,8 @@ struct llama_sampler * llama_sampler_init_penalties( ignore_eos = false; } + penalty_last_n = std::max(penalty_last_n, 0); + return new llama_sampler { /* .iface = */ &llama_sampler_penalties_i, /* .ctx = */ new llama_sampler_penalties { @@ -1568,6 +1596,7 @@ static void llama_sampler_logit_bias_apply(struct llama_sampler * smpl, llama_to } } } + static struct llama_sampler * llama_sampler_logit_bias_clone(const struct llama_sampler * smpl) { const auto * ctx = (const llama_sampler_logit_bias *) smpl->ctx; return llama_sampler_init_logit_bias(ctx->n_vocab, ctx->logit_bias.size(), ctx->logit_bias.data()); @@ -1599,3 +1628,31 @@ struct llama_sampler * llama_sampler_init_logit_bias( }, }; } + +// utils + +uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { + if (smpl->iface == &llama_sampler_dist_i) { + return ((const llama_sampler_dist *) smpl->ctx)->seed_cur; + } + + if (smpl->iface == &llama_sampler_mirostat_i) { + return ((const llama_sampler_mirostat *) smpl->ctx)->seed_cur; + } + + if (smpl->iface == &llama_sampler_mirostat_v2_i) { + return ((const llama_sampler_mirostat_v2 *) smpl->ctx)->seed_cur; + } + + if (smpl->iface == &llama_sampler_chain_i) { + const auto * ctx = (const llama_sampler_chain *) smpl->ctx; + for (auto it = ctx->samplers.rbegin(); it != ctx->samplers.rend(); ++it) { + const uint32_t seed = llama_sampler_get_seed(*it); + if (seed != LLAMA_DEFAULT_SEED) { + return seed; + } + } + 
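// this entry is not a seeded sampler (it reports LLAMA_DEFAULT_SEED), so keep
// scanning toward the front of the chain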
} + + return LLAMA_DEFAULT_SEED; +} From 8d300bd35fbe23b35a4e1ece0cf0fe8f43331029 Mon Sep 17 00:00:00 2001 From: matteo Date: Tue, 10 Sep 2024 22:40:59 +0200 Subject: [PATCH 15/65] enable --special arg for llama-server (#9419) Co-authored-by: matteo serva --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index ca569494f..588571f1b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -818,7 +818,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params) { params.special = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"-cnv", "--conversation"}, format( From 6cd4e034442f71718563e600070c2b6fc389e100 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 10 Sep 2024 22:41:29 +0200 Subject: [PATCH 16/65] arg : bring back missing ifdef (#9411) * arg : bring back missing ifdef * replace with llama_supports_gpu_offload --- common/arg.cpp | 28 +++++++++++++--------------- common/common.cpp | 8 -------- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 588571f1b..ce6a27614 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1417,20 +1417,18 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { params.split_mode = LLAMA_SPLIT_MODE_LAYER; - } - else if (arg_next == "row") { + } else if (arg_next == "row") { #ifdef GGML_USE_SYCL fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); exit(1); #endif // GGML_USE_SYCL params.split_mode = LLAMA_SPLIT_MODE_ROW; - } - else { + } else { throw std::invalid_argument("invalid value"); } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the split mode has no effect.\n"); + } } )); add_opt(llama_arg( @@ -1450,14 +1448,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, } for (size_t i = 0; i < llama_max_devices(); ++i) { if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); + params.tensor_split[i] = std::stof(split_arg[i]); } else { - params.tensor_split[i] = 0.0f; + params.tensor_split[i] = 0.0f; } } -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting a tensor split has no effect.\n"); + } } )); add_opt(llama_arg( @@ -1465,9 +1463,9 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), [](gpt_params & params, int value) { params.main_gpu = value; -#ifndef GGML_USE_CUDA_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. 
Setting the main GPU has no effect.\n"); -#endif // GGML_USE_CUDA_SYCL_VULKAN + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n"); + } } )); add_opt(llama_arg( diff --git a/common/common.cpp b/common/common.cpp index 5395eaa0e..d572d2408 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -56,14 +56,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) -#define GGML_USE_CUDA_SYCL -#endif - -#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN) -#define GGML_USE_CUDA_SYCL_VULKAN -#endif - #if defined(LLAMA_USE_CURL) #ifdef __linux__ #include From cb9c933eb2a0d2b514556bdcb934b56dfe5d6771 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Sep 2024 01:46:59 +0300 Subject: [PATCH 17/65] flake.lock: Update (#9360) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'flake-parts': 'github:hercules-ci/flake-parts/af510d4a62d071ea13925ce41c95e3dec816c01d?narHash=sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E%3D' (2024-08-30) → 'github:hercules-ci/flake-parts/567b938d64d4b4112ee253b9274472dc3a346eb6?narHash=sha256-%2Bebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y%3D' (2024-09-01) • Updated input 'flake-parts/nixpkgs-lib': 'https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz?narHash=sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q%3D' (2024-08-01) → 'https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz?narHash=sha256-Ss8QWLXdr2JCBPcYChJhz4xJm%2Bh/xjl4G0c0XlP6a74%3D' (2024-09-01) • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/71e91c409d1e654808b2621f28a327acfdad8dc2?narHash=sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w%3D' (2024-08-28) → 'github:NixOS/nixpkgs/574d1eac1c200690e27b8eb4e24887f8df7ac27c?narHash=sha256-v3rIhsJBOMLR8e/RNWxr828tB%2BWywYIoajrZKFM%2B0Gg%3D' (2024-09-06) Co-authored-by: github-actions[bot] --- flake.lock | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/flake.lock b/flake.lock index 10e1f8a29..e9382ff3d 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1725024810, - "narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=", + "lastModified": 1725234343, + "narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "af510d4a62d071ea13925ce41c95e3dec816c01d", + "rev": "567b938d64d4b4112ee253b9274472dc3a346eb6", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1724819573, - "narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=", + "lastModified": 1725634671, + "narHash": "sha256-v3rIhsJBOMLR8e/RNWxr828tB+WywYIoajrZKFM+0Gg=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "71e91c409d1e654808b2621f28a327acfdad8dc2", + "rev": "574d1eac1c200690e27b8eb4e24887f8df7ac27c", "type": "github" }, "original": { @@ -36,14 +36,14 @@ }, "nixpkgs-lib": { "locked": { - "lastModified": 1722555339, - "narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=", + "lastModified": 1725233747, + "narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=", "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz" + 
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz" }, "original": { "type": "tarball", - "url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz" + "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz" } }, "root": { From 51b603863627c4074e77b7e556e18ece86bdf9a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alberto=20Cabrera=20P=C3=A9rez?= Date: Wed, 11 Sep 2024 01:53:42 +0100 Subject: [PATCH 18/65] sycl : update support conditions (#9394) * sycl : update support condition to im2col Signed-off-by: Alberto Cabrera * Added TODO to remind supporting FP32 im2col --------- Signed-off-by: Alberto Cabrera --- ggml/src/ggml-sycl.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index 4f03b01e7..e60350399 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -5137,13 +5137,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_CLAMP: + return true; case GGML_OP_CONT: + return op->src[0]->type != GGML_TYPE_BF16; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: return true; case GGML_OP_ROPE: return ggml_is_contiguous(op->src[0]); case GGML_OP_IM2COL: + // TODO: add support for the new F32 operations + return op->src[0]->type == GGML_TYPE_F16; case GGML_OP_POOL_2D: case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: From b34e02348064c2f0cef1f89b44d9bee4eb15b9e7 Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Wed, 11 Sep 2024 09:46:55 +0800 Subject: [PATCH 19/65] musa: remove Clang builtins mapping (#9421) Signed-off-by: Xiaodong Ye --- ggml/src/ggml-cuda/vendors/musa.h | 39 ------------------------------- 1 file changed, 39 deletions(-) diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h index e50a103ac..8df571149 100644 --- a/ggml/src/ggml-cuda/vendors/musa.h +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -130,42 +130,3 @@ #define cudaKernelNodeParams musaKernelNodeParams #define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed #define cudaStreamEndCapture musaStreamEndCapture - -// XXX: Clang builtins mapping -#define __vsub4 __vsub4_musa -#define __vcmpeq4 __vcmpeq4_musa -#define __vcmpne4 __vcmpne4_musa - -#ifndef __has_builtin - #define __has_builtin(x) 0 -#endif - -typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); - -static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) { - return __vsubss4(a, b); -} - -static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 0xff : 0x00; - } - return c; -} - -static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 
0x00 : 0xff; - } - return c; -} From d2b496bff4f353a6429f8e833448f071bd237ba7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Sep 2024 10:03:54 +0300 Subject: [PATCH 20/65] batched-bench : remove unused code (#9305) --- examples/batched-bench/batched-bench.cpp | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index a91e7f4bd..89a4566c4 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -3,32 +3,10 @@ #include "llama.h" #include -#include #include #include #include -// mutates the input string -static std::vector parse_list(char * p) { - std::vector ret; - - char * q = p; - - while (*p) { - if (*p == ',') { - *p = '\0'; - ret.push_back(std::atoi(q)); - q = p + 1; - } - - ++p; - } - - ret.push_back(std::atoi(q)); - - return ret; -} - static void print_usage(int, char ** argv) { LOG_TEE("\nexample usage:\n"); LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); From 5af118efdaf1098798a06b24fd8a557760e99631 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 11 Sep 2024 10:22:40 +0200 Subject: [PATCH 21/65] CUDA: fix --split-mode row race condition (#9413) --- ggml/src/ggml-cuda/mmq.cu | 6 +++++- ggml/src/ggml-cuda/mmq.cuh | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 78d70cd7a..4935f8818 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -26,7 +26,11 @@ void ggml_cuda_op_mul_mat_q( // nrows_dst == nrows of the matrix that the kernel writes into const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; - const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst}; + // The stream-k decomposition is only faster for recent NVIDIA GPUs. + // Also its fixup needs to allocate a temporary buffer in the memory pool. + // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer. 
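// (As I understand the stream-k decomposition: the mat-mul work is split at
// k-iteration granularity, so several blocks can produce partial sums for the same
// output tile; a fixup pass then combines those partials through the temporary
// buffer, which is why that buffer must not be shared between concurrent streams.)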
+ const bool use_stream_k = compute_capability >= CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11; + const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k}; switch (src0->type) { case GGML_TYPE_Q4_0: diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index e8a957447..021a25682 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -2742,6 +2742,7 @@ struct mmq_args { int64_t ne00; int64_t ne01; int64_t stride01; int64_t ne10; int64_t ne11; int64_t stride11; int64_t ne0; + bool use_stream_k; }; template @@ -2777,8 +2778,7 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const int ntx = (args.ne11 + mmq_x - 1) / mmq_x; const dim3 block_nums_xy_tiling(nty, ntx, 1); - const bool use_stream_k = cc >= CC_VOLTA && cc < CC_OFFSET_AMD; - if (!use_stream_k) { + if (!args.use_stream_k) { if (args.ne01 % mmq_y == 0) { constexpr bool need_check = false; mul_mat_q<<>> From 67155ab7f5e47c01b62aa989eab30f517bf6dc67 Mon Sep 17 00:00:00 2001 From: Farbod Bijary <110523279+farbodbj@users.noreply.github.com> Date: Wed, 11 Sep 2024 12:52:37 +0330 Subject: [PATCH 22/65] feat: Implements retrying logic for downloading models using --model-url flag (#9255) * feat: Implements retrying logic for downloading models using --model-url flag * Update common/common.cpp Co-authored-by: Xuan Son Nguyen * Update common/common.cpp Co-authored-by: Xuan Son Nguyen * apply comments * implements a retry function to avoid duplication * fix editorconfig * change function name --------- Co-authored-by: farbod Co-authored-by: Xuan Son Nguyen Co-authored-by: slaren Co-authored-by: Xuan Son Nguyen --- common/common.cpp | 40 ++++++++++++++++++++++++++++++++-------- lora-tests | 1 + 2 files changed, 33 insertions(+), 8 deletions(-) create mode 160000 lora-tests diff --git a/common/common.cpp b/common/common.cpp index d572d2408..30c6e84c7 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -941,11 +941,37 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p #ifdef LLAMA_USE_CURL +#define CURL_MAX_RETRY 3 +#define CURL_RETRY_DELAY_SECONDS 2 + + static bool starts_with(const std::string & str, const std::string & prefix) { // While we wait for C++20's std::string::starts_with... 
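// rfind(prefix, 0) only considers matches that begin at or before position 0, so it
// returns 0 exactly when str starts with prefix; this is a prefix test that avoids
// allocating a temporary substring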
return str.rfind(prefix, 0) == 0; } +static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) { + int remaining_attempts = max_attempts; + + while (remaining_attempts > 0) { + fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); + + CURLcode res = curl_easy_perform(curl); + if (res == CURLE_OK) { + return true; + } + + int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000; + fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); + + remaining_attempts--; + std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); + } + + fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); + return false; +} + static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { // Initialize libcurl @@ -1049,9 +1075,8 @@ static bool llama_download_file(const std::string & url, const std::string & pat curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast(header_callback)); curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); - CURLcode res = curl_easy_perform(curl.get()); - if (res != CURLE_OK) { - fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); + bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); + if (!was_perform_successful) { return false; } @@ -1126,11 +1151,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat }; // start the download - fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, - llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); - auto res = curl_easy_perform(curl.get()); - if (res != CURLE_OK) { - fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); + fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); + bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); + if (!was_perform_successful) { return false; } diff --git a/lora-tests b/lora-tests new file mode 160000 index 000000000..c26d5fb85 --- /dev/null +++ b/lora-tests @@ -0,0 +1 @@ +Subproject commit c26d5fb85b4070a9e9c4e65d132c783b98086890 From 5bb2c5dbd26b246d334f0087b3cbd800f2e65c54 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 11 Sep 2024 12:02:09 +0200 Subject: [PATCH 23/65] files : remove accidentally added `lora_test` submodule (#9430) --- lora-tests | 1 - 1 file changed, 1 deletion(-) delete mode 160000 lora-tests diff --git a/lora-tests b/lora-tests deleted file mode 160000 index c26d5fb85..000000000 --- a/lora-tests +++ /dev/null @@ -1 +0,0 @@ -Subproject commit c26d5fb85b4070a9e9c4e65d132c783b98086890 From 0996c5597f680effacc046832bb807c14900e22d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 11 Sep 2024 12:59:13 +0200 Subject: [PATCH 24/65] llava : correct args for minicpmv-cli (#9429) --- examples/llava/minicpmv-cli.cpp | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 3475bbce5..afc74d279 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -18,8 +18,8 @@ struct llava_context { }; static void show_additional_info(int /*argc*/, char ** argv) { - LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); + LOG_TEE("\nexample usage:\n\n%s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); } static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { @@ -255,7 +255,7 @@ int main(int argc, char ** argv) { gpt_params params; - if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) { + if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { return 1; } From 8db003a19d7055b5bd248ce2afff9324e5b8da95 Mon Sep 17 00:00:00 2001 From: Pavel Zloi Date: Wed, 11 Sep 2024 15:29:51 +0300 Subject: [PATCH 25/65] py : support converting local models (#7547) * Support of converting local models added to convert-hf-to-gguf-update.py * Description fixed * shutil added to imports --- convert_hf_to_gguf_update.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index ff4955f9c..59a0b81a1 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -31,6 +31,7 @@ import re import requests import sys import json +import shutil from hashlib import sha256 from enum import IntEnum, auto @@ -125,12 +126,27 @@ def download_model(model): if tokt == TOKENIZER_TYPE.UGM: files.append("spiece.model") - for file in files: - save_path = f"models/tokenizers/{name}/{file}" - if os.path.isfile(save_path): - logger.info(f"{name}: File {save_path} already exists - skipping") - continue - download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path) + if os.path.isdir(repo): + # If repo is a path on the file system, copy the directory + for file in files: + src_path = os.path.join(repo, file) + dst_path = f"models/tokenizers/{name}/{file}" + if os.path.isfile(dst_path): + logger.info(f"{name}: File {dst_path} already exists - skipping") + continue + if os.path.isfile(src_path): + shutil.copy2(src_path, dst_path) + logger.info(f"{name}: Copied {src_path} to {dst_path}") + else: + logger.warning(f"{name}: Source file {src_path} does not exist") + else: + # If repo is a URL, download the files + for file in files: + save_path = f"models/tokenizers/{name}/{file}" + if os.path.isfile(save_path): + logger.info(f"{name}: File {save_path} already exists - skipping") + continue + download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path) for model in models: From 1b28061400eb9832603c9f1dfbec4d339a8490a2 Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 11 Sep 2024 17:52:13 +0200 Subject: [PATCH 26/65] llama : skip token bounds check when evaluating embeddings (#9437) --- src/llama.cpp | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 40db03517..f1a95b3a3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16076,19 +16076,21 @@ 
static int llama_decode_internal( return -1; } - for (uint32_t i = 0; i < n_tokens_all; ++i) { - if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]); - return -1; - } - } - const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT + if (batch_all.token) { + for (uint32_t i = 0; i < n_tokens_all; ++i) { + if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= model.vocab.n_vocab) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]); + return -1; + } + } + } + GGML_ASSERT(n_tokens_all <= cparams.n_batch); GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens"); @@ -16375,19 +16377,21 @@ static int llama_encode_internal( return -1; } - for (uint32_t i = 0; i < n_tokens; ++i) { - if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) { - LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]); - return -1; - } - } - const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT + if (batch.token) { + for (uint32_t i = 0; i < n_tokens; ++i) { + if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= model.vocab.n_vocab) { + LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]); + return -1; + } + } + } + // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot GGML_ASSERT(cparams.n_ubatch >= n_tokens && "encoder requires n_ubatch >= n_tokens"); From 449ccfb6f5f1bbd70e04f75a330d9d7c1af82187 Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Wed, 11 Sep 2024 20:29:53 -0400 Subject: [PATCH 27/65] Add Jais to list of supported models (#9439) Co-authored-by: fmz --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c945e125c..73041b1a2 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,7 @@ Typically finetunes of the base models below are supported as well. 
- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct) - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a) +- [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) From df4b7945aeccae2a71348e5a9c1eab5241e3e0ef Mon Sep 17 00:00:00 2001 From: Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com> Date: Thu, 12 Sep 2024 09:02:35 +0800 Subject: [PATCH 28/65] cann: Fix error when running a non-exist op (#9424) --- ggml/src/ggml-cann.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index 06930ba2e..24b8b752c 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -1942,7 +1942,7 @@ GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) { GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__); return nullptr; } - + ggml_cann_set_device(ctx->device); ggml_backend_t cann_backend = new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), /* .interface = */ ggml_backend_cann_interface, From c9c8575a1a8a170329afca4c4df4c005806efb1d Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Thu, 12 Sep 2024 17:44:17 +0800 Subject: [PATCH 29/65] enhance run script to be easy to change the parameters (#9448) Co-authored-by: arthw <14088817+arthw@users.noreply.github.com> --- examples/sycl/run-llama2.sh | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index 111366fb0..a8cf0aa64 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -4,33 +4,23 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: MIT -INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" source /opt/intel/oneapi/setvars.sh -if [ $# -gt 0 ]; then - GGML_SYCL_DEVICE=$1 - GGML_SYCL_SINGLE_GPU=1 -else - GGML_SYCL_DEVICE=0 - GGML_SYCL_SINGLE_GPU=0 -fi - #export GGML_SYCL_DEBUG=1 - #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer. 
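# Usage sketch (inferred from the argument handling below; invocation path assumed):
#   ./examples/sycl/run-llama2.sh      # split across all GPUs with the same max compute units
#   ./examples/sycl/run-llama2.sh 0    # run on GPU 0 only (-sm none)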
-if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then +INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:" +MODEL_FILE=llama-2-7b.Q4_0.gguf +NGL=33 + +if [ $# -gt 0 ]; then + GGML_SYCL_DEVICE=$1 echo "use $GGML_SYCL_DEVICE as main GPU" #use single GPU only - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -mg $GGML_SYCL_DEVICE -sm none + else #use multiple GPUs with same max compute units - ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 + ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 fi - -#use main GPU only -#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none - -#use multiple GPUs with same max compute units -#ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 From d6a04f872dea8ade92527bb1488d4b0b90cc49f0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 12 Sep 2024 14:23:49 +0300 Subject: [PATCH 30/65] ggml : hide ggml_object, ggml_cgraph, ggml_hash_set (#9408) * ggml : hide ggml_object, ggml_cgraph, ggml_hash_set ggml-ci * ggml : add ggml-impl.h to backends * ggml : fix compiler warnings ggml-ci * ggml : add assert upon adding nodes --- examples/benchmark/benchmark-matmult.cpp | 6 +- examples/cvector-generator/pca.hpp | 4 +- examples/export-lora/export-lora.cpp | 2 +- examples/llava/clip.cpp | 2 +- examples/llava/llava.cpp | 2 +- ggml/include/ggml.h | 87 +++++------------- ggml/src/ggml-blas.cpp | 1 + ggml/src/ggml-cann.cpp | 1 + ggml/src/ggml-cuda.cu | 2 +- ggml/src/ggml-impl.h | 32 +++++++ ggml/src/ggml-kompute.cpp | 2 +- ggml/src/ggml-metal.m | 4 +- ggml/src/ggml-rpc.cpp | 2 +- ggml/src/ggml-sycl.cpp | 2 +- ggml/src/ggml-vulkan.cpp | 2 +- ggml/src/ggml.c | 112 ++++++++++++++++------- src/llama.cpp | 22 ++--- tests/test-backend-ops.cpp | 14 +-- 18 files changed, 170 insertions(+), 129 deletions(-) diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 97622f4f4..922daf528 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -183,7 +183,7 @@ int main(int argc, char ** argv) { ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads); - TENSOR_DUMP(gf->nodes[0]); + TENSOR_DUMP(ggml_graph_node(gf, 0)); printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype)); @@ -224,7 +224,7 @@ int main(int argc, char ** argv) { // Let's use the F32 result from above as a reference for the quantized multiplication - float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]); + float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0)); printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n"); printf("=====================================================================================\n"); @@ -252,7 +252,7 @@ int main(int argc, char ** argv) { // Check that the matrix multiplication result is in the right ballpark // We cannot use the exact value from the F32 multiplication because the quantization will be slightly different - float sum_of_Q4_result 
= tensor_sum_elements(ggml_graph_node(gf31, 0)); float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference); float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6 diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index 05c66856c..a969c486d 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -226,8 +226,8 @@ static ggml_status compute_piter( result.eigenvectors.resize(params.n_batch); result.distances.resize(params.n_batch); // get output nodes - for (int i = 0; i < gf->n_nodes; ++i) { - auto node = gf->nodes[i]; + for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { + auto node = ggml_graph_node(gf, i); int iter = -1; // find b_tensor (without copying data from device) if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) { diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index ff324926a..90126ad1e 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -370,7 +370,7 @@ struct lora_merge_ctx { // write data to output file { - auto result = gf->nodes[gf->n_nodes - 1]; + auto * result = ggml_graph_node(gf, -1); size_t len = ggml_nbytes(result); if (read_buf.size() < len) { read_buf.resize(len); diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 9b890571e..5dfb333d1 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -2449,7 +2449,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_graph_compute(ctx->backend, gf); // the last node is the embedding tensor - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor * embeddings = ggml_graph_node(gf, -1); // copy the embeddings to the location passed by the user ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 851af0f00..e162586ed 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -184,7 +184,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false); ggml_build_forward_expand(gf, flatten); ggml_graph_compute_with_ctx(model.ctx, gf, 1); - struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1]; + struct ggml_tensor* result = ggml_graph_node(gf, -1); memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default behavior in llava_arch when not using unpad ): diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 536018b66..86ad6fb62 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -358,6 +358,7 @@ extern "C" { struct ggml_object; struct ggml_context; + struct ggml_cgraph; // NOTE: always add types at the end of the enum to keep backward compatibility enum ggml_type { @@ -575,23 +576,9 @@ extern "C" { GGML_TENSOR_FLAG_PARAM = 4, }; - // ggml object - struct ggml_object { - size_t offs; - size_t size; - - struct ggml_object * next; - - enum ggml_object_type type; - - char padding[4]; - }; - - static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); - // n-dimensional tensor struct ggml_tensor { - enum ggml_type type; + enum ggml_type type; GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor"); @@ -655,7 +642,7 @@ extern "C" { struct ggml_threadpool; // forward declaration, see ggml.c - 
typedef struct ggml_threadpool * ggml_threadpool_t; + typedef struct ggml_threadpool * ggml_threadpool_t; // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 @@ -671,35 +658,6 @@ extern "C" { void * abort_callback_data; }; - enum ggml_cgraph_eval_order { - GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, - GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, - GGML_CGRAPH_EVAL_ORDER_COUNT - }; - - typedef uint32_t ggml_bitset_t; - - struct ggml_hash_set { - size_t size; - ggml_bitset_t * used; // whether or not the keys are in use i.e. set - struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i) - }; - - // computation graph - struct ggml_cgraph { - int size; - int n_nodes; - int n_leafs; - - struct ggml_tensor ** nodes; - struct ggml_tensor ** grads; - struct ggml_tensor ** leafs; - - struct ggml_hash_set visited_hash_set; - - enum ggml_cgraph_eval_order order; - }; - // scratch buffer struct ggml_scratch { size_t offs; @@ -2017,8 +1975,6 @@ extern "C" { typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); - #define GGML_N_TASKS_MAX -1 - GGML_API struct ggml_tensor * ggml_map_custom1( struct ggml_context * ctx, struct ggml_tensor * a, @@ -2088,30 +2044,35 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * tensor); - GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep); // graph allocation in a context - GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false - GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads); - GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); - GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1); - GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads - GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); + GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false + GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); + GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); + GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads + GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); + + GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph); + GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i] + GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph); + GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph); + + GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * 
tensor); GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); - GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); - GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads); - GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1); - GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params); - GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); - GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); - GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); - GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); + GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); + GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); + GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); + GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); + GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); + GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); + GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); + GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp index 713731735..6d99c6bea 100644 --- a/ggml/src/ggml-blas.cpp +++ b/ggml/src/ggml-blas.cpp @@ -1,3 +1,4 @@ +#include "ggml-impl.h" #include "ggml-blas.h" #include "ggml-backend-impl.h" diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index 24b8b752c..e9c370b9b 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -30,6 +30,7 @@ #include #include +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-cann/aclnn_ops.h" #include "ggml-cann/common.h" diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index d53de4edd..54f1a7c2d 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -1,5 +1,5 @@ #include "ggml-cuda.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-cuda/common.cuh" diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 961f3c67b..cb7f7728b 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -629,8 +629,16 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #endif +enum ggml_cgraph_eval_order { + GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0, + GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT, + GGML_CGRAPH_EVAL_ORDER_COUNT +}; + // bitset +typedef uint32_t ggml_bitset_t; + static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated"); #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8) #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1) @@ -656,6 +664,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) { #define GGML_HASHSET_FULL ((size_t)-1) #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2) +struct ggml_hash_set { + size_t size; + ggml_bitset_t * used; // whether or not the keys are in use i.e. 
set + struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i) +}; + struct ggml_hash_set ggml_hash_set_new(size_t size); void ggml_hash_set_free(struct ggml_hash_set * hash_set); @@ -745,6 +759,24 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g GGML_ABORT("fatal error"); } +// computation graph + +struct ggml_cgraph { + int size; + int n_nodes; + int n_leafs; + + struct ggml_tensor ** nodes; + struct ggml_tensor ** grads; + struct ggml_tensor ** leafs; + + struct ggml_hash_set visited_hash_set; + + enum ggml_cgraph_eval_order order; +}; + +struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp index 41ac63fa4..7f0bd82d5 100644 --- a/ggml/src/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute.cpp @@ -1,4 +1,4 @@ -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend.h" #include "ggml-backend-impl.h" #include "ggml-kompute.h" diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 6d8a7c898..6c85acfec 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -1,7 +1,7 @@ #import "ggml-metal.h" +#import "ggml-impl.h" #import "ggml-backend-impl.h" -#import "ggml.h" #import @@ -882,7 +882,7 @@ static enum ggml_status ggml_metal_graph_compute( // create multiple command buffers and enqueue them // then, we encode the graph into the command buffers in parallel - const int n_nodes = gf->n_nodes; + const int n_nodes = gf->n_nodes; const int n_cb = ctx->n_cb; const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb; diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp index 9c600c7ca..a8a2eb85a 100644 --- a/ggml/src/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc.cpp @@ -1,5 +1,5 @@ #include "ggml-rpc.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp index e60350399..acef7c6d4 100644 --- a/ggml/src/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl.cpp @@ -33,7 +33,7 @@ #include #include "ggml-sycl.h" -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-sycl/backend.hpp" diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 83737c1d9..bad960510 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -21,7 +21,7 @@ #include #include -#include "ggml.h" +#include "ggml-impl.h" #include "ggml-backend-impl.h" #include "ggml-vulkan-shaders.hpp" diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d7157ca6d..47417c024 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -287,6 +287,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) 
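
The hash set above uses a side bitset rather than sentinel keys to mark occupied slots. A self-contained sketch of the same index arithmetic (`BITSET_SHR`/`BITSET_MASK` with 32-bit words), using hypothetical local helpers rather than the ggml-impl internals:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

typedef uint32_t bitset_t;

// Bit i lives in 32-bit word i >> 5, at position i & 31 --
// the same layout as ggml's BITSET_SHR = 5, BITSET_MASK = 31.
static bool bitset_get(const bitset_t * bs, size_t i) {
    return (bs[i >> 5] >> (i & 31)) & 1u;
}

static void bitset_set(bitset_t * bs, size_t i) {
    bs[i >> 5] |= 1u << (i & 31);
}

int main(void) {
    bitset_t used[2] = {0, 0}; // enough for 64 slots

    bitset_set(used, 37);      // mark slot 37 as occupied

    printf("%d %d\n", bitset_get(used, 37), bitset_get(used, 38)); // 1 0
    return 0;
}
```
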
{ #define GGML_DEBUG 0 #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 +#define GGML_N_TASKS_MAX (-1) #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 @@ -1120,21 +1121,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { #define GGML_F32x4_ADD vaddq_f32 #define GGML_F32x4_MUL vmulq_f32 #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f32(x[i], x[offset+i]); \ - } \ - res = GGML_F32x4_REDUCE_ONE(x[0]); \ +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \ + } \ + (res) = GGML_F32x4_REDUCE_ONE((x)[0]); \ } #define GGML_F32_VEC GGML_F32x4 @@ -1161,30 +1162,30 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) #define GGML_F16x8_ADD vaddq_f16 #define GGML_F16x8_MUL vmulq_f16 - #define GGML_F16x8_REDUCE(res, x) \ - do { \ - int offset = GGML_F16_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = vaddq_f16(x[i], x[offset+i]); \ - } \ - const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ - const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ - res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ + #define GGML_F16x8_REDUCE(res, x) \ + do { \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ + } \ + const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ + const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ + (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ } while (0) #define GGML_F16_VEC GGML_F16x8 #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i]) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i]) #define GGML_F16_VEC_FMA GGML_F16x8_FMA #define GGML_F16_VEC_ADD GGML_F16x8_ADD #define GGML_F16_VEC_MUL GGML_F16x8_MUL @@ -1893,6 +1894,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) { #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) #endif +// +// ggml object +// + +struct ggml_object { + size_t offs; + size_t size; + + struct ggml_object * next; + + enum ggml_object_type type; + + char padding[4]; +}; + +static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + // // ggml context // @@ -19161,6 +19179,34 @@ void ggml_graph_clear(struct ggml_cgraph 
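
Most of the churn in the `GGML_F32x4_REDUCE`/`GGML_F16x8_REDUCE` hunks above is whitespace, but the substantive change is wrapping the macro parameters `x` and `res` in parentheses. A standalone illustration of the precedence bug this guards against (hypothetical `SQR` macros, not from the codebase):

```cpp
#include <cstdio>

#define SQR_BAD(x) x * x       // parameter spliced in verbatim
#define SQR_OK(x)  ((x) * (x)) // parameter parenthesized, like (x)[i] above

int main(void) {
    int a = 3;
    printf("%d\n", SQR_BAD(a + 1)); // expands to a + 1 * a + 1 == 7
    printf("%d\n", SQR_OK (a + 1)); // expands to ((a + 1) * (a + 1)) == 16
    return 0;
}
```
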
* cgraph) { ggml_hash_set_reset(&cgraph->visited_hash_set); } +int ggml_graph_size(struct ggml_cgraph * cgraph) { + return cgraph->size; +} + +struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) { + if (i < 0) { + GGML_ASSERT(cgraph->n_nodes + i >= 0); + return cgraph->nodes[cgraph->n_nodes + i]; + } + + GGML_ASSERT(i < cgraph->n_nodes); + return cgraph->nodes[i]; +} + +struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) { + return cgraph->nodes; +} + +int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) { + return cgraph->n_nodes; +} + +void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { + GGML_ASSERT(cgraph->size > cgraph->n_nodes); + cgraph->nodes[cgraph->n_nodes] = tensor; + cgraph->n_nodes++; +} + // Android's libc implementation "bionic" does not support setting affinity #if defined(__gnu_linux__) static void set_numa_thread_affinity(int thread_n) { diff --git a/src/llama.cpp b/src/llama.cpp index f1a95b3a3..0f80b2402 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9877,8 +9877,8 @@ struct llm_build_context { struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) { // find result_norm tensor for input struct ggml_tensor * inp = nullptr; - for (int i = gf->n_nodes - 1; i >= 0; --i) { - inp = gf->nodes[i]; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + inp = ggml_graph_node(gf, i); if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) { break; } else { @@ -16207,8 +16207,8 @@ static int llama_decode_internal( ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false); // the output is always the last tensor in the graph - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2]; + struct ggml_tensor * res = ggml_graph_node(gf, -1); + struct ggml_tensor * embd = ggml_graph_node(gf, -2); if (lctx.n_outputs == 0) { // no output @@ -16217,9 +16217,9 @@ static int llama_decode_internal( } else if (cparams.embeddings) { res = nullptr; // do not extract logits for embedding case embd = nullptr; - for (int i = gf->n_nodes - 1; i >= 0; --i) { - if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) { - embd = gf->nodes[i]; + for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) { + if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) { + embd = ggml_graph_node(gf, i); break; } } @@ -16436,15 +16436,15 @@ static int llama_encode_internal( // there are two cases here if (llama_model_has_decoder(&lctx.model)) { // first case is an encoder-decoder T5 model where embeddings are passed to decoder - embd = gf->nodes[gf->n_nodes - 1]; + embd = ggml_graph_node(gf, -1); GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor"); } else { // second case is an encoder-only T5 model if (cparams.embeddings) { // only output embeddings if required - embd = gf->nodes[gf->n_nodes - 1]; + embd = ggml_graph_node(gf, -1); if (strcmp(embd->name, "result_embd_pooled") != 0) { - embd = gf->nodes[gf->n_nodes - 2]; + embd = ggml_graph_node(gf, -2); } GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor"); } @@ -18492,7 +18492,7 @@ struct llama_context * llama_new_context_with_model( // note: the number of splits during measure is higher than during inference due to the kv shift int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); - LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes); + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, 
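
The call-site rewrites above all go through the new accessors instead of reaching into `ggml_cgraph`, which is now private to `ggml-impl.h`. A minimal sketch of the accessor API, including the Python-style negative indexing of `ggml_graph_node`:

```cpp
#include "ggml.h"
#include <cstdio>

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    // i < 0 counts from the end: -1 is the graph output (nodes[n_nodes - 1]).
    struct ggml_tensor * out = ggml_graph_node(gf, -1); // == c here
    (void) out;

    for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
        printf("node %d: %s\n", i, ggml_graph_node(gf, i)->name);
    }

    // ggml_graph_add_node() asserts spare capacity, so graphs that are
    // appended to should be sized via ggml_new_graph_custom().
    ggml_free(ctx);
    return 0;
}
```
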
ggml_graph_n_nodes(gf)); LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits); } } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 635de01d7..aa7896def 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -519,7 +519,7 @@ struct test_case { // add sentinels as graph nodes so that they are checked in the callback for (ggml_tensor * sentinel : sentinels) { - gf->nodes[gf->n_nodes++] = sentinel; + ggml_graph_add_node(gf, sentinel); } // randomize tensors @@ -679,9 +679,9 @@ struct test_case { // duplicate the op size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU - int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1; + int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1; for (int i = 1; i < n_runs; i++) { - gf->nodes[gf->n_nodes++] = out; + ggml_graph_add_node(gf, out); } // calculate memory @@ -696,11 +696,11 @@ struct test_case { } return size; }; - for (int i = 0; i < gf->n_nodes; i++) { - if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) { + for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { + if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) { continue; } - mem += tensor_op_size(gf->nodes[i]); + mem += tensor_op_size(ggml_graph_node(gf, i)); } // run @@ -804,7 +804,7 @@ struct test_case { ggml_graph_cpy(gf, gb); ggml_build_backward_expand(ctx, gf, gb, false); if (expect.size() != 1 || expect[0] != 0.0f) { - GGML_ASSERT(gb->n_nodes > gf->n_nodes); + GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf)); for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE); } From 2b00fa799773cc75d53b841c03d21d7468a1e3a1 Mon Sep 17 00:00:00 2001 From: Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com> Date: Thu, 12 Sep 2024 16:24:31 +0500 Subject: [PATCH 31/65] riscv : modify Makefile and add a RISCV_VECT to print log info (#9442) - Added ggml_cpu_has_riscv_v() in GGML to print system info in log - Modified Makefile to only use flag when cross compiling for RISC-V --- Makefile | 9 +++++++-- common/common.cpp | 1 + ggml/include/ggml.h | 1 + ggml/src/ggml.c | 8 ++++++++ src/llama.cpp | 1 + 5 files changed, 18 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c12bc61f4..8d3fd3ee8 100644 --- a/Makefile +++ b/Makefile @@ -434,7 +434,7 @@ endif # TODO: probably these flags need to be tweaked on some architectures # feel free to update the Makefile for your architecture and send a pull request or issue -ifndef RISCV +ifndef RISCV_CROSS_COMPILE ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) # Use all CPU extensions that are available: @@ -514,7 +514,12 @@ ifneq ($(filter loongarch64%,$(UNAME_M)),) MK_CXXFLAGS += -mlasx endif -else +ifneq ($(filter riscv64%,$(UNAME_M)),) + MK_CFLAGS += -march=rv64gcv -mabi=lp64d + MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d +endif + +else # RISC-V CROSS COMPILATION MK_CFLAGS += -march=rv64gcv -mabi=lp64d MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d endif diff --git a/common/common.cpp b/common/common.cpp index 30c6e84c7..c492ae0cc 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1828,6 +1828,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? 
"true" : "false"); fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false"); fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false"); + fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false"); fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false"); fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false"); fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false"); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 86ad6fb62..13026ab32 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2470,6 +2470,7 @@ extern "C" { GGML_API int ggml_cpu_has_gpublas (void); GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_ssse3 (void); + GGML_API int ggml_cpu_has_riscv_v (void); GGML_API int ggml_cpu_has_sycl (void); GGML_API int ggml_cpu_has_rpc (void); GGML_API int ggml_cpu_has_vsx (void); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 47417c024..493ff7fc0 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -23288,6 +23288,14 @@ int ggml_cpu_has_arm_fma(void) { #endif } +int ggml_cpu_has_riscv_v(void) { +#if defined(__riscv_v_intrinsic) + return 1; +#else + return 0; +#endif +} + int ggml_cpu_has_metal(void) { #if defined(GGML_USE_METAL) return 1; diff --git a/src/llama.cpp b/src/llama.cpp index 0f80b2402..acda9e235 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -20672,6 +20672,7 @@ const char * llama_print_system_info(void) { s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; + s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | "; s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; From 39f852f44039b058fdd0611ee127c6efa7ba4a04 Mon Sep 17 00:00:00 2001 From: Molly Sophia Date: Thu, 12 Sep 2024 19:25:16 +0800 Subject: [PATCH 32/65] py : add special tokens in hf_converter for RWKV v6 (#9428) Signed-off-by: Molly Sophia --- convert_hf_to_gguf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ca473244e..f02c65026 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2771,6 +2771,8 @@ class Rwkv6Model(Model): self.gguf_writer.add_tokenizer_model("rwkv") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] From ff76e18516dbe269b35ba1bb500524ed5e39225c Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Thu, 12 Sep 2024 13:27:14 +0200 Subject: [PATCH 33/65] cmake : fixed the order of linking libraries for llama-quantize (#9450) --- examples/quantize/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 3ee4eb971..62680cda4 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET llama-quantize) add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) 
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) From 3c26a1644dacfa6b5d58af550210524efd7b93fc Mon Sep 17 00:00:00 2001 From: Trivikram Kamat <16024985+trivikr@users.noreply.github.com> Date: Thu, 12 Sep 2024 04:27:45 -0700 Subject: [PATCH 34/65] ci : bump actions/checkout to v4 (#9377) --- .github/workflows/build.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c36eaadfb..e58f095ba 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -375,7 +375,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -401,7 +401,7 @@ jobs: continue-on-error: true steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: add oneAPI to apt shell: bash @@ -442,7 +442,7 @@ jobs: continue-on-error: true steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: add oneAPI to apt shell: bash @@ -546,7 +546,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v1 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -576,7 +576,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v1 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -610,7 +610,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v1 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -969,7 +969,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install id: depends From c837981bba7cf6839b69d32b25552ce685936b14 Mon Sep 17 00:00:00 2001 From: daminho <37615795+daminho@users.noreply.github.com> Date: Thu, 12 Sep 2024 20:28:20 +0900 Subject: [PATCH 35/65] py : add Phi-1.5/Phi-2 tokenizer (#9361) * add phi2 tokenizer * add phi name to convert_hf_to_gguf_update.py * make tokenizer_pre consistent; llama.cpp work --- convert_hf_to_gguf.py | 3 +++ convert_hf_to_gguf_update.py | 1 + 2 files changed, 4 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f02c65026..01a8a50a2 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -626,6 +626,9 @@ class Model: if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae": # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct res = "exaone" + if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085": + # ref: https://huggingface.co/microsoft/phi-2 + res = "phi-2" if res is None: logger.warning("\n") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 59a0b81a1..021f65abd 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -98,6 +98,7 @@ models = [ {'name': "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom", }, {'name': "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small", }, {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", }, + {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2", }, ] From 4dc4f5f14ae522494649d82ad06b031cf9501038 Mon Sep 17 00:00:00 2001 From: Huang Qi Date: Thu, 12 Sep 2024 19:28:43 +0800 Subject: [PATCH 36/65] ci : update HIP SDK to 
24.Q3 (ROCm 6.1) (#9329) --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e58f095ba..181ef37e2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -976,7 +976,7 @@ jobs: run: | $ErrorActionPreference = "Stop" write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait write-host "Completed AMD HIP SDK installation" From 2a825116b6f7f3a9b1726e5e0c3eb22f7768bd33 Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Thu, 12 Sep 2024 13:30:01 +0200 Subject: [PATCH 37/65] cmake : fix for builds without `GGML_CDEF_PUBLIC` (#9338) * `GGML_TARGET_DEFINES-NOTFOUND` fix for builds without `GGML_CDEF_PUBLIC` * Update CMakeLists.txt, spaces fix --- CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a31320635..244019313 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,10 +139,16 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o # determining _precisely_ which defines are necessary for the llama-config # package. # +set(GGML_TRANSIENT_DEFINES) get_target_property(GGML_DIRECTORY ggml SOURCE_DIR) get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS) +if (GGML_DIR_DEFINES) + list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES}) +endif() get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS) -set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES}) +if (GGML_TARGET_DEFINES) + list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES}) +endif() get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h) From d4c3c10fad1bd6adec72d2f1f236761a8d6a07f8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 12 Sep 2024 13:33:57 +0200 Subject: [PATCH 38/65] lora : raise error if lm_head is ignored (#9103) * lora : raise error if lm_head is ignored * fix style * clarify comment --- convert_lora_to_gguf.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index ddd347a2a..d1c94e580 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -363,7 +363,13 @@ if __name__ == '__main__': yield (name, cast(torch.Tensor, LoraTorchTensor(tensor.A, tensor.B))) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - dest = super().modify_tensors(data_torch, name, bid) + dest = list(super().modify_tensors(data_torch, name, bid)) + # some archs may have the same tensor for lm_head and output (tie word embeddings) + # in this case, adapters targeting lm_head will fail when using llama-export-lora + # therefore, we ignore them for now + # see: https://github.com/ggerganov/llama.cpp/issues/9065 + if name == "lm_head.weight" and len(dest) == 0: + raise ValueError("lm_head is present in adapter, but is ignored in base model") for 
dest_name, dest_data in dest: assert isinstance(dest_data, LoraTorchTensor) lora_a, lora_b = dest_data.get_lora_A_B() From e665744317c77fc3483fc5224fe6d586b5166b33 Mon Sep 17 00:00:00 2001 From: fengerhu1 <2748250768@qq.com> Date: Thu, 12 Sep 2024 19:34:22 +0800 Subject: [PATCH 39/65] llava : fix the script error in MobileVLM README (#9054) Signed-off-by: Erhu Feng <2748250768@qq.com> --- examples/llava/MobileVLM-README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 06a65fba4..4f783f3ce 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -39,7 +39,7 @@ python ./examples/llava/llava_surgery.py -m path/to/MobileVLM-1.7B 3. Use `convert_image_encoder_to_gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF: ```sh -python ./examples/llava/convert_image_encoder_to_gguf \ +python ./examples/llava/convert_image_encoder_to_gguf.py \ -m path/to/clip-vit-large-patch14-336 \ --llava-projector path/to/MobileVLM-1.7B/llava.projector \ --output-dir path/to/MobileVLM-1.7B \ @@ -47,7 +47,7 @@ python ./examples/llava/convert_image_encoder_to_gguf \ ``` ```sh -python ./examples/llava/convert_image_encoder_to_gguf \ +python ./examples/llava/convert_image_encoder_to_gguf.py \ -m path/to/clip-vit-large-patch14-336 \ --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \ --output-dir path/to/MobileVLM-1.7B_V2 \ @@ -57,12 +57,12 @@ python ./examples/llava/convert_image_encoder_to_gguf \ 4. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: ```sh -python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B +python ./examples/convert_legacy_llama.py path/to/MobileVLM-1.7B --skip-unknown ``` -5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k` +5. Use `quantize` to convert LLaMA part's DataType from `fp32` to `q4_k` ```sh -./llama-quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s +./llama-quantize path/to/MobileVLM-1.7B/ggml-model-F32.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s ``` Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory. From e6b7801bd189d102d901d3e72035611a25456ef1 Mon Sep 17 00:00:00 2001 From: Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com> Date: Thu, 12 Sep 2024 19:46:43 +0800 Subject: [PATCH 40/65] cann: Add host buffer type for Ascend NPU (#9406) * feat: Add host buffer type for Ascend NPU(CANN backend) * fix some checking errors * Add a few comments --- ggml/include/ggml-cann.h | 7 +++ ggml/src/ggml-cann.cpp | 110 +++++++++++++++++++++++++++++++++++++++ src/llama.cpp | 4 ++ 3 files changed, 121 insertions(+) diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h index ca73211fe..031ad1ce2 100644 --- a/ggml/include/ggml-cann.h +++ b/ggml/include/ggml-cann.h @@ -80,6 +80,13 @@ ggml_backend_cann_buffer_type(int32_t device); */ GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void); +/** + * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU. + * + * @return A pointer to the host buffer type interface. + */ +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); + /** * @brief Retrieves the description of a specific CANN device. 
* diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp index e9c370b9b..aa315b83f 100644 --- a/ggml/src/ggml-cann.cpp +++ b/ggml/src/ggml-cann.cpp @@ -1221,6 +1221,116 @@ ggml_backend_cann_buffer_type(int32_t device) { return &ggml_backend_cann_buffer_types[device]; } +/** + * @brief Retrieves the name associated with a CANN host buffer type. + * + * This function returns the descriptive name associated with the specified + * CANN host buffer type context. + * + * @param buft Pointer to the host buffer type context. + * @return Const pointer to the C-style string containing the name. + */ +GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "CANN_Host"; + + GGML_UNUSED(buft); +} + +/** + * @brief Retrieves the name associated with a CANN host buffer. + * + * This function returns the descriptive name associated with the specified + * CANN host buffer context. + * + * @param buft Pointer to the host buffer context. + * @return Const pointer to the C-style string containing the name. + */ +GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) { + return "CANN_Host"; + + GGML_UNUSED(buffer); +} + +/** + * @brief Free resources associated with a CANN host buffer. + * + * This function frees the resources associated with a CANN host buffer, including + * its context. + * + * @param buffer The CANN host buffer to free. + */ +GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { + ACL_CHECK(aclrtFreeHost(buffer->context)); +} + +/** + * @brief Allocates a new CANN host buffer of the specified size. + * + * This function allocates a new CANN host buffer with the given size. + * @param size Size in bytes of the host buffer to allocate. + * @return Pointer to the allocated host buffer, or nullptr if allocation fails. + */ +static void * ggml_cann_host_malloc(size_t size) { + if (getenv("GGML_CANN_NO_PINNED") != nullptr) { + return nullptr; + } + + void * hostPtr = nullptr; + aclError err = aclrtMallocHost((void **) &hostPtr, size); + if (err != ACL_SUCCESS) { + + GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, + size / 1024.0 / 1024.0, aclGetRecentErrMsg()); + return nullptr; + } + return hostPtr; +} + +/** + * @brief Allocates a new CANN host buffer of the specified type and size. + * + * @param buft Pointer to the host buffer type context. + * @param size Size in bytes of the host buffer to allocate. + * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails. + */ +GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * hostPtr = ggml_cann_host_malloc(size); + + if (hostPtr == nullptr) { + // fallback to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size); + buffer->buft = buft; + buffer->iface.get_name = ggml_backend_cann_host_buffer_name; + buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free; + + return buffer; +} + +/** + * @brief Interface for managing CANN host buffer types in the GGML backend. + * + * Provides function pointers for allocating, querying properties, and managing + * memory for CANN buffer types in the GGML backend. 
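
A hypothetical usage sketch for the host buffer type being defined here: callers request pinned staging memory for faster CPU/NPU copies, and the allocation transparently degrades to a plain CPU buffer when pinning fails or `GGML_CANN_NO_PINNED` is set. Assumes a CANN-enabled build; the buffer size is arbitrary:

```cpp
ggml_backend_buffer_type_t buft = ggml_backend_cann_host_buffer_type();
ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 32u * 1024 * 1024);

// ... allocate input tensors from `buf` so host<->device copies can be DMA'd ...

ggml_backend_buffer_free(buf);
```
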
+ */ +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cann_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_cann_buffer_type_host; +} + /** * @brief Computes the forward operation for a given tensor using CANN * operations. diff --git a/src/llama.cpp b/src/llama.cpp index acda9e235..cdc3f1856 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2156,6 +2156,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer if (host_buffer) { buft = ggml_backend_sycl_host_buffer_type(); } +#elif defined(GGML_USE_CANN) + if (host_buffer) { + buft = ggml_backend_cann_host_buffer_type(); + } #elif defined(GGML_USE_CPU_HBM) buft = ggml_backend_cpu_hbm_buffer_type(); #elif defined(GGML_USE_VULKAN) From 78203641fee3b1f82abaff0c7f667e1b4a286390 Mon Sep 17 00:00:00 2001 From: Mathijs Henquet Date: Thu, 12 Sep 2024 22:30:11 +0200 Subject: [PATCH 41/65] server : Add option to return token pieces in /tokenize endpoint (#9108) * server : added with_pieces functionality to /tokenize endpoint * server : Add tokenize with pieces tests to server.feature * Handle case if tokenizer splits along utf8 continuation bytes * Add example of token splitting * Remove trailing ws * Fix trailing ws * Maybe fix ci * maybe this fix windows ci? --------- Co-authored-by: Xuan Son Nguyen --- .github/workflows/server.yml | 1 + examples/server/README.md | 39 ++++++++++++++++++- examples/server/server.cpp | 33 ++++++++++++++-- examples/server/tests/features/server.feature | 8 ++++ examples/server/tests/features/steps/steps.py | 29 ++++++++++++++ examples/server/utils.hpp | 35 ++++++++++++++++- 6 files changed, 139 insertions(+), 6 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 99feb28f2..29f8fd444 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -173,6 +173,7 @@ jobs: if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | cd examples/server/tests + $env:PYTHONIOENCODING = ":replace" behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp - name: Slow tests diff --git a/examples/server/README.md b/examples/server/README.md index 79196e9c1..44a73ca0a 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -407,9 +407,44 @@ Notice that each `probs` is an array of length `n_probs`. *Options:* - `content`: Set the text to tokenize. + `content`: (Required) The text to tokenize. - `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` + `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` + + `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false` + +**Response:** + +Returns a JSON object with a `tokens` field containing the tokenization result. 
The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise. + + +If `with_pieces` is `false`: +```json +{ + "tokens": [123, 456, 789] +} +``` + +If `with_pieces` is `true`: +```json +{ + "tokens": [ + {"id": 123, "piece": "Hello"}, + {"id": 456, "piece": " world"}, + {"id": 789, "piece": "!"} + ] +} +``` + +With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k +```json +{ + "tokens": [ + {"id": 198, "piece": [195]}, // hex C3 + {"id": 164, "piece": [161]} // hex A1 + ] +} +``` ### POST `/detokenize`: Convert tokens to text diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5b263f646..5e4dffadf 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3013,12 +3013,39 @@ int main(int argc, char ** argv) { const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) { const json body = json::parse(req.body); - std::vector tokens; + json tokens_response = json::array(); if (body.count("content") != 0) { const bool add_special = json_value(body, "add_special", false); - tokens = ctx_server.tokenize(body.at("content"), add_special); + const bool with_pieces = json_value(body, "with_pieces", false); + std::vector tokens = ctx_server.tokenize(body.at("content"), add_special); + + if (with_pieces) { + for (const auto& token : tokens) { + std::string piece = llama_token_to_piece(ctx_server.ctx, token); + json piece_json; + + // Check if the piece is valid UTF-8 + if (is_valid_utf8(piece)) { + piece_json = piece; + } else { + // If not valid UTF-8, store as array of byte values + piece_json = json::array(); + for (unsigned char c : piece) { + piece_json.push_back(static_cast(c)); + } + } + + tokens_response.push_back({ + {"id", token}, + {"piece", piece_json} + }); + } + } else { + tokens_response = tokens; + } } - const json data = format_tokenizer_response(tokens); + + const json data = format_tokenizer_response(tokens_response); res_ok(res, data); }; diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index b55971454..15e24c624 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -105,6 +105,14 @@ Feature: llama.cpp server Given first token is removed Then tokens can be detokenized + Scenario: Tokenize with pieces + When tokenizing with pieces: + """ + What is the capital of Germany? 
+ 媽 + """ + Then tokens are given with pieces + Scenario: Models available Given available models Then 1 models are supported diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 65b71a8e8..11587dd64 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1,3 +1,6 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + import asyncio import json import os @@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context): context.tokenize_add_special = True +@step("tokenizing with pieces") +@async_run_until_complete +async def step_tokenize_with_pieces(context): + context.tokenized_text = context_text(context) + async with aiohttp.ClientSession() as session: + tokenize_args = {"content": context.tokenized_text, "with_pieces": True} + if getattr(context, "tokenize_add_special", None) is not None: + tokenize_args["add_special"] = context.tokenize_add_special + + async with session.post( + f"{context.base_url}/tokenize", json=tokenize_args + ) as response: + assert response.status == 200 + tokenize_json = await response.json() + context.tokens_with_pieces = tokenize_json["tokens"] + + +@step("tokens are given with pieces") +@async_run_until_complete +async def step_tokenize_with_pieces(context): + # Verify that the response contains both token IDs and pieces + assert all( + "id" in token and "piece" in token for token in context.tokens_with_pieces + ) + + @step('tokenizing') @async_run_until_complete async def step_tokenize(context): diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index edfce65b6..adb1a1cb9 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso return res; } -static json format_tokenizer_response(const std::vector & tokens) { +static bool is_valid_utf8(const std::string & str) { + const unsigned char* bytes = reinterpret_cast(str.data()); + const unsigned char* end = bytes + str.length(); + + while (bytes < end) { + if (*bytes <= 0x7F) { + // 1-byte sequence (0xxxxxxx) + bytes++; + } else if ((*bytes & 0xE0) == 0xC0) { + // 2-byte sequence (110xxxxx 10xxxxxx) + if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80) + return false; + bytes += 2; + } else if ((*bytes & 0xF0) == 0xE0) { + // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx) + if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80) + return false; + bytes += 3; + } else if ((*bytes & 0xF8) == 0xF0) { + // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) + if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 || + (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80) + return false; + bytes += 4; + } else { + // Invalid UTF-8 lead byte + return false; + } + } + + return true; +} + +static json format_tokenizer_response(const json & tokens) { return json { {"tokens", tokens} }; From bd35cb0ae357185c173345f10dc89a4ff925fc25 Mon Sep 17 00:00:00 2001 From: "Gilad S." 
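
A quick sanity check of the validator above, using the same split-multibyte case the README documents ('á' = `C3 A1`); assumes `utils.hpp` is in scope:

```cpp
#include <cassert>
#include <string>

int main(void) {
    const std::string whole = "\xC3\xA1"; // "á": a complete 2-byte sequence
    const std::string torn  = "\xC3";     // lead byte whose continuation was split off

    assert( is_valid_utf8(whole));
    assert(!is_valid_utf8(torn)); // such pieces are emitted as byte arrays instead
    return 0;
}
```
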
<7817232+giladgd@users.noreply.github.com> Date: Fri, 13 Sep 2024 04:54:49 +0300 Subject: [PATCH 42/65] feat: remove a sampler from a chain (#9445) * feat: remove a sampler from a chain * fix: return removed sampler * fix: safer casting --- include/llama.h | 3 +++ src/llama-sampling.cpp | 15 ++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/llama.h b/include/llama.h index 405af912c..744ef9d90 100644 --- a/include/llama.h +++ b/include/llama.h @@ -1056,6 +1056,9 @@ extern "C" { LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i); LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain); + // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed + LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i); + // available samplers: LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index fd1b7f919..c828dc359 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -349,13 +349,26 @@ void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) { const auto * p = (const llama_sampler_chain *) chain->ctx; - if (i < 0 || i >= (int32_t) p->samplers.size()) { + if (i < 0 || (size_t) i >= p->samplers.size()) { return nullptr; } return p->samplers[i]; } +struct llama_sampler * llama_sampler_chain_remove(struct llama_sampler * chain, int32_t i) { + auto * p = (llama_sampler_chain *) chain->ctx; + + if (i < 0 || (size_t) i >= p->samplers.size()) { + return nullptr; + } + + auto * result = p->samplers[i]; + p->samplers.erase(p->samplers.begin() + i); + + return result; +} + int llama_sampler_chain_n(const struct llama_sampler * chain) { const auto * p = (const llama_sampler_chain *) chain->ctx; From 0abc6a2c25272d5cf01384dda8ee8bfec4ba8745 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 13 Sep 2024 09:53:38 +0300 Subject: [PATCH 43/65] llama : llama_perf + option to disable timings during decode (#9355) * llama : llama_perf + option to disable timings during decode ggml-ci * common : add llama_arg * Update src/llama.cpp Co-authored-by: Xuan Son Nguyen * perf : separate functions in the API ggml-ci * perf : safer pointer handling + naming update ggml-ci * minor : better local var name * perf : abort on invalid sampler pointer ggml-ci --------- Co-authored-by: Xuan Son Nguyen --- common/arg.cpp | 8 ++ common/common.cpp | 3 +- common/common.h | 2 + common/sampling.cpp | 6 +- examples/batched-bench/batched-bench.cpp | 2 +- examples/batched.swift/Sources/main.swift | 4 +- examples/batched/batched.cpp | 4 +- examples/embedding/embedding.cpp | 2 +- examples/eval-callback/eval-callback.cpp | 2 +- examples/imatrix/imatrix.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 2 +- examples/llava/llava-cli.cpp | 4 +- examples/llava/minicpmv-cli.cpp | 2 +- examples/lookup/lookup.cpp | 3 +- examples/parallel/parallel.cpp | 2 +- examples/passkey/passkey.cpp | 2 +- examples/perplexity/perplexity.cpp | 2 +- examples/retrieval/retrieval.cpp | 2 +- examples/simple/simple.cpp | 4 +- examples/speculative/speculative.cpp | 2 +- include/llama.h | 29 ++++-- src/llama-sampling.cpp | 34 +++++++ src/llama.cpp | 103 +++++++++------------- 23 files changed, 135 insertions(+), 91 deletions(-) diff --git a/common/arg.cpp 
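
The ownership rule in the header comment above is the easy thing to get wrong: once removed, a sampler is no longer freed with the chain. A minimal sketch:

```cpp
#include "llama.h"

int main(void) {
    llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    llama_sampler_chain_add(chain, llama_sampler_init_greedy());

    // Ownership transfers back to the caller; an out-of-range index returns NULL.
    llama_sampler * removed = llama_sampler_chain_remove(chain, 0);
    if (removed) {
        llama_sampler_free(removed); // our job now, not the chain's
    }

    llama_sampler_free(chain); // frees only samplers still owned by the chain
    return 0;
}
```
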
b/common/arg.cpp index ce6a27614..a1cd5830f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -720,6 +720,14 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.prompt = value; } )); + add_opt(llama_arg( + {"--no-perf"}, + format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), + [](gpt_params & params) { + params.no_perf = true; + params.sparams.no_perf = true; + } + ).set_env("LLAMA_ARG_NO_PERF")); add_opt(llama_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", diff --git a/common/common.cpp b/common/common.cpp index c492ae0cc..f9a831ec7 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -820,7 +820,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } llama_kv_cache_clear(lctx); llama_synchronize(lctx); - llama_perf_reset(lctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_reset(lctx); } iparams.model = model; @@ -916,6 +916,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.offload_kqv = !params.no_kv_offload; cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; cparams.type_k = kv_cache_type_from_str(params.cache_type_k); cparams.type_v = kv_cache_type_from_str(params.cache_type_v); diff --git a/common/common.h b/common/common.h index 23babdd09..e8025aeef 100644 --- a/common/common.h +++ b/common/common.h @@ -124,6 +124,7 @@ struct gpt_sampler_params { float mirostat_eta = 0.10f; // learning rate bool penalize_nl = false; // consider newlines as a repeatable token bool ignore_eos = false; + bool no_perf = false; // disable performance metrics std::vector samplers = { GPT_SAMPLER_TYPE_TOP_K, @@ -246,6 +247,7 @@ struct gpt_params { bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention + bool no_perf = false; // disable performance metrics bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool logits_all = false; // return logits for all tokens in the batch diff --git a/common/sampling.cpp b/common/sampling.cpp index 4498feb11..c07b5e940 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -142,7 +142,7 @@ std::string gpt_sampler_params::print() const { struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) { llama_sampler_chain_params lparams = llama_sampler_chain_default_params(); - lparams.no_perf = false; // TODO: control via params + lparams.no_perf = params.no_perf; auto * result = new gpt_sampler { /* .params = */ params, @@ -257,10 +257,10 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * // TODO: measure grammar performance if (gsmpl) { - llama_perf_print(gsmpl->chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN); + llama_perf_sampler_print(gsmpl->chain); } if (ctx) { - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); } } diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 89a4566c4..ec00fcf78 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -187,7 +187,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); 
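
For programs using libllama directly rather than `gpt_params`, the same opt-out that the new `--no-perf` flag provides looks roughly like this (a sketch; the two `no_perf` fields are the point):

```cpp
#include "llama.h"

static void make_quiet_params(llama_context_params & cparams, llama_sampler_chain_params & sparams) {
    cparams = llama_context_default_params();
    cparams.no_perf = true; // skip the ggml_time_us() bookkeeping per decode

    sparams = llama_sampler_chain_default_params();
    sparams.no_perf = true; // likewise for the sampler-chain counters
}
```
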
llama_batch_free(batch); diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index 9f7c49492..10f2e7fd1 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -200,8 +200,8 @@ let t_main_end = ggml_time_us() print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n\n") -llama_perf_print(UnsafeRawPointer(context), LLAMA_PERF_TYPE_CONTEXT) -llama_perf_print(UnsafeRawPointer(smpl), LLAMA_PERF_TYPE_SAMPLER_CHAIN) +llama_perf_sampler_print(smpl) +llama_perf_context_print(context) private func tokenize(text: String, add_bos: Bool) -> [llama_token] { let utf8Count = text.utf8.count diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 5d32153fe..f1df20c6e 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -229,8 +229,8 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_sampler_print(smpl); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index db00c6363..e94ae2955 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -306,7 +306,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); // clean up llama_batch_free(batch); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index bc7203143..af389abe1 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -182,7 +182,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 032a90136..73b54da7f 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -637,7 +637,7 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_free(ctx); llama_free_model(model); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index d7db5af72..2d90f65a0 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1630,7 +1630,7 @@ int main(int argc, char ** argv) { fflush(p_err->fout); } - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_free(ctx); diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index e9108a9bd..12fe7345f 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -308,7 +308,7 @@ int main(int argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); @@ -325,7 +325,7 @@ int main(int 
argc, char ** argv) { // process the prompt process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_llava->ctx_llama); llava_image_embed_free(image_embed); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index afc74d279..3ac455e69 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -319,7 +319,7 @@ int main(int argc, char ** argv) { } } printf("\n"); - llama_perf_print(ctx_llava->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_llava->ctx_llama); ctx_llava->model = NULL; llava_free(ctx_llava); diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index fff44a499..be6f8d7d7 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -240,8 +240,7 @@ int main(int argc, char ** argv){ LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_TEE("\ntarget:\n\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + gpt_perf_print(ctx, smpl); gpt_sampler_free(smpl); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index bc6301311..758393c3d 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -415,7 +415,7 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); // TODO: print sampling/grammar timings for all clients - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); llama_batch_free(batch); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index d3d5ab46f..52aa68bfc 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -256,7 +256,7 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 04df65b0a..29ff86bbc 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -2047,7 +2047,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); write_logfile(ctx, params, model, results); llama_free(ctx); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 7a360b731..d08679edb 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -292,7 +292,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx); // clean up llama_batch_free(query_batch); diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 3fdc04394..0c923d4ed 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -154,8 +154,8 @@ int main(int argc, char ** argv) { __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); LOG_TEE("\n"); - llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN); - llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_sampler_print(smpl); + llama_perf_context_print(ctx); fprintf(stderr, "\n"); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 
214e4932b..843579acd 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -616,7 +616,7 @@ int main(int argc, char ** argv) { LOG_TEE("\ndraft:\n\n"); // TODO: print sampling/grammar timings for all drafts - llama_perf_print(ctx_dft, LLAMA_PERF_TYPE_CONTEXT); + llama_perf_context_print(ctx_dft); LOG_TEE("\ntarget:\n\n"); gpt_perf_print(ctx_tgt, smpl); diff --git a/include/llama.h b/include/llama.h index 744ef9d90..cfc8d85dc 100644 --- a/include/llama.h +++ b/include/llama.h @@ -343,7 +343,7 @@ extern "C" { bool embeddings; // if true, extract embeddings (together with logits) bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool flash_attn; // whether to use flash attention [EXPERIMENTAL] - //bool no_perf; // whether to measure performance timings, TODO: implement + bool no_perf; // whether to measure performance timings // Abort callback // if it returns true, execution of llama_decode() will be aborted @@ -1176,13 +1176,30 @@ extern "C" { // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. // - enum llama_perf_type { - LLAMA_PERF_TYPE_CONTEXT = 0, - LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1, + struct llama_perf_context_data { + double t_start_ms; + double t_load_ms; + double t_p_eval_ms; + double t_eval_ms; + + int32_t n_p_eval; + int32_t n_eval; }; - LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type); - LLAMA_API void llama_perf_reset( void * ctx, enum llama_perf_type type); + struct llama_perf_sampler_data { + double t_sample_ms; + + int32_t n_sample; + }; + + LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx); + LLAMA_API void llama_perf_context_print(const struct llama_context * ctx); + LLAMA_API void llama_perf_context_reset( struct llama_context * ctx); + + // NOTE: the following work only with samplers constructed via llama_sampler_chain_init + LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain); + LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); + LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index c828dc359..5275b1d60 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1669,3 +1669,37 @@ uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) { return LLAMA_DEFAULT_SEED; } + +// perf + +struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * chain) { + struct llama_perf_sampler_data data = {}; + + if (chain == nullptr || chain->iface != &llama_sampler_chain_i) { + GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); + } + + const auto * ctx = (const struct llama_sampler_chain *) chain->ctx; + + data.t_sample_ms = 1e-3 * ctx->t_sample_us; + data.n_sample = std::max(0, ctx->n_sample); + + return data; +} + +void llama_perf_sampler_print(const struct llama_sampler * chain) { + const auto data = llama_perf_sampler(chain); + + LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample); +} + +void llama_perf_sampler_reset(struct llama_sampler * chain) { + if (chain == 
nullptr || chain->iface != &llama_sampler_chain_i) { + GGML_ABORT("%s: invalid sampler passed - requires a sampler created with llama_sampler_chain_init()\n", __func__); + } + + auto * ctx = (struct llama_sampler_chain *) chain->ctx; + + ctx->t_sample_us = ctx->n_sample = 0; +} diff --git a/src/llama.cpp b/src/llama.cpp index cdc3f1856..65afcc84a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2486,6 +2486,7 @@ struct llama_cparams { bool causal_attn; bool offload_kqv; bool flash_attn; + bool no_perf; enum llama_pooling_type pooling_type; @@ -6661,8 +6662,6 @@ static bool llm_load_tensors( bool use_mlock, llama_progress_callback progress_callback, void * progress_callback_user_data) { - model.t_start_us = ggml_time_us(); - auto & hparams = model.hparams; model.split_mode = split_mode; @@ -8593,14 +8592,13 @@ static bool llm_load_tensors( } } - // loading time will be recalculate after the first eval, so - // we take page faults deferred by mmap() into consideration - model.t_load_us = ggml_time_us() - model.t_start_us; return true; } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) { + model.t_start_us = ggml_time_us(); + try { llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides); @@ -8662,6 +8660,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam return -1; } + // loading time will be recalculate after the first eval, so + // we take page faults deferred by mmap() into consideration + model.t_load_us = ggml_time_us() - model.t_start_us; + return 0; } @@ -17949,6 +17951,7 @@ struct llama_context_params llama_context_default_params() { /*.embeddings =*/ false, /*.offload_kqv =*/ true, /*.flash_attn =*/ false, + /*.no_perf =*/ true, /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ nullptr, }; @@ -18159,6 +18162,7 @@ struct llama_context * llama_new_context_with_model( cparams.embeddings = params.embeddings; cparams.offload_kqv = params.offload_kqv; cparams.flash_attn = params.flash_attn; + cparams.no_perf = params.no_perf; cparams.pooling_type = params.pooling_type; cparams.n_ctx = params.n_ctx == 0 ? 
hparams.n_ctx_train : params.n_ctx; @@ -20077,10 +20081,14 @@ void llama_synchronize(struct llama_context * ctx) { // add the evaluation to the stats if (ctx->n_queued_tokens == 1) { - ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; + if (!ctx->cparams.no_perf) { + ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us; + } ctx->n_eval++; } else if (ctx->n_queued_tokens > 1) { - ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + if (!ctx->cparams.no_perf) { + ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us; + } ctx->n_p_eval += ctx->n_queued_tokens; } @@ -20688,65 +20696,40 @@ const char * llama_print_system_info(void) { return s.c_str(); } -void llama_perf_print(const void * ctx, enum llama_perf_type type) { - switch (type) { - case LLAMA_PERF_TYPE_CONTEXT: - { - const auto * p = (const struct llama_context *) ctx; +struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx) { + struct llama_perf_context_data data = {}; - const double t_start_ms = 1e-3 * p->t_start_us; - const double t_end_ms = 1.00 * ggml_time_ms(); - const double t_load_ms = 1e-3 * p->t_load_us; - const double t_p_eval_ms = 1e-3 * p->t_p_eval_us; - const double t_eval_ms = 1e-3 * p->t_eval_us; - - const int32_t n_p_eval = std::max(0, p->n_p_eval); - const int32_t n_eval = std::max(1, p->n_eval); - - LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms); - LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval); - LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval); - LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval)); - } break; - case LLAMA_PERF_TYPE_SAMPLER_CHAIN: - { - const auto * smpl = (const struct llama_sampler *) ctx; - const auto * p = (const struct llama_sampler_chain *) smpl->ctx; - - const double t_sampler_ms = 1e-3 * p->t_sample_us; - - const int32_t n_sampler = std::max(0, p->n_sample); - - LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler); - } break; - default: - GGML_ABORT("invalid perf type"); + if (ctx == nullptr) { + return data; } + + data.t_start_ms = 1e-3 * ctx->t_start_us; + data.t_load_ms = 1e-3 * ctx->t_load_us; + data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us; + data.t_eval_ms = 1e-3 * ctx->t_eval_us; + data.n_p_eval = std::max(1, ctx->n_p_eval); + data.n_eval = std::max(1, ctx->n_eval); + + return data; } -void llama_perf_reset(void * ctx, enum llama_perf_type type) { - switch (type) { - case LLAMA_PERF_TYPE_CONTEXT: - { - auto * p = (struct llama_context *) ctx; +void llama_perf_context_print(const struct llama_context * ctx) { + const auto data = llama_perf_context(ctx); - p->t_start_us = ggml_time_us(); - p->t_eval_us = p->n_eval = 0; - p->t_p_eval_us = p->n_p_eval = 0; - } break; - case LLAMA_PERF_TYPE_SAMPLER_CHAIN: - { - auto * smpl = (struct llama_sampler *) ctx; - auto * p = (struct llama_sampler_chain *) smpl->ctx; + const double t_end_ms = 1e-3 * ggml_time_us(); - p->t_sample_us = p->n_sample = 0; - } break; - default: - GGML_ABORT("invalid perf type"); - } + LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, 
data.t_load_ms); + LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval); + LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval); + LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval)); +} + +void llama_perf_context_reset(struct llama_context * ctx) { + ctx->t_start_us = ggml_time_us(); + ctx->t_eval_us = ctx->n_eval = 0; + ctx->t_p_eval_us = ctx->n_p_eval = 0; } void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) { From feff4aa8461da7c432d144c11da4802e41fef3cf Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 13 Sep 2024 14:23:11 +0200 Subject: [PATCH 44/65] server : add loading html page while model is loading (#9468) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adding loading page for '/' server requests * set content when model is loading * removed loading html file * updated cmakelist * updated makefile * cleaned up whitespace * cleanup for PR removed error * updated server test to handle 503 HTML * updated server test to handle 503 HTML * ca†ch 503 before parsing json * revert test * account for both api and web browser requests * precommit corrections * eol fix * revert changes to pre-commit * removed print statement * made loading message more descriptive * also support .html files --------- Co-authored-by: VJHack Co-authored-by: Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com> --- Makefile | 1 + examples/server/CMakeLists.txt | 1 + examples/server/public/loading.html | 12 ++++++++++++ examples/server/server.cpp | 11 +++++++++-- 4 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 examples/server/public/loading.html diff --git a/Makefile b/Makefile index 8d3fd3ee8..f41887a4d 100644 --- a/Makefile +++ b/Makefile @@ -1440,6 +1440,7 @@ llama-server: \ examples/server/system-prompts.js.hpp \ examples/server/prompt-formats.js.hpp \ examples/server/json-schema-to-grammar.mjs.hpp \ + examples/server/loading.html.hpp \ common/json.hpp \ common/stb_image.h \ $(OBJ_ALL) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index dbe41f1fd..580f3a824 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -30,6 +30,7 @@ set(PUBLIC_ASSETS system-prompts.js prompt-formats.js json-schema-to-grammar.mjs + loading.html ) foreach(asset ${PUBLIC_ASSETS}) diff --git a/examples/server/public/loading.html b/examples/server/public/loading.html new file mode 100644 index 000000000..c3fd19a0f --- /dev/null +++ b/examples/server/public/loading.html @@ -0,0 +1,12 @@ + + + + + + +
+ The model is loading. Please wait.
+ The user interface will appear soon. +
+ + diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5e4dffadf..73cd6aae7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -28,6 +28,7 @@ #include "system-prompts.js.hpp" #include "prompt-formats.js.hpp" #include "json-schema-to-grammar.mjs.hpp" +#include "loading.html.hpp" #include #include @@ -2592,10 +2593,16 @@ int main(int argc, char ** argv) { return false; }; - auto middleware_server_state = [&res_error, &state](const httplib::Request &, httplib::Response & res) { + auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) { server_state current_state = state.load(); if (current_state == SERVER_STATE_LOADING_MODEL) { - res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); + auto tmp = string_split(req.path, '.'); + if (req.path == "/" || tmp.back() == "html") { + res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); + res.status = 503; + } else { + res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); + } return false; } return true; From befaf1197fa447f61714de041828852a270659d2 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Sat, 14 Sep 2024 09:50:12 +0200 Subject: [PATCH 45/65] llama : make cell_id const in inp_s_mask block (#9470) This commit makes the cell_id variable const in the inp_s_mask block. The motivation for this change is consistency with the code in the inp_s_copy block. --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 65afcc84a..1986a90fb 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15826,7 +15826,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) { // clear unused states for (int i = 0; i < n_kv; ++i) { - uint32_t cell_id = i + kv_self.head; + const uint32_t cell_id = i + kv_self.head; llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id]; data[i] = (float) (kv_cell.src >= 0); From 1f4111e540bacec8d00ca9fd96417bf4c1339394 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 14 Sep 2024 10:55:05 +0300 Subject: [PATCH 46/65] cmake : use list(APPEND ...) instead of set() + dedup linker (#9463) * cmake : use list(APPEND ...) 
instead of set() + dedup linker ggml-ci * cmake : try fix sycl * cmake : try to fix sycl 2 * cmake : fix sycl build (#9469) * try fix sycl build * use CMAKE_CXX_FLAGS as a string variable --------- Co-authored-by: Georgi Gerganov * one more CMAKE_CXX_FLAGS fix (#9471) --------- Co-authored-by: Michael Podvitskiy --- ggml/src/CMakeLists.txt | 77 ++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 32 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index cd2dcd066..506b6dc7b 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -26,6 +26,8 @@ if (NOT MSVC) endif() endif() +unset(GGML_EXTRA_LIBS) + if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate) if (ACCELERATE_FRAMEWORK) @@ -35,7 +37,7 @@ if (APPLE AND GGML_ACCELERATE) add_compile_definitions(ACCELERATE_NEW_LAPACK) add_compile_definitions(ACCELERATE_LAPACK_ILP64) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) + list(APPEND GGML_EXTRA_LIBS ${ACCELERATE_FRAMEWORK}) else() message(WARNING "Accelerate framework not found") endif() @@ -87,7 +89,7 @@ if (GGML_METAL) COMMENT "Generate assembly for embedded Metal library" ) - set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM}) + list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM}) else() if (GGML_METAL_SHADER_DEBUG) # custom command to do the following: @@ -132,7 +134,7 @@ if (GGML_METAL) ) endif() # GGML_METAL_EMBED_LIBRARY - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} + list(APPEND GGML_EXTRA_LIBS ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK} @@ -157,11 +159,11 @@ if (GGML_OPENMP) add_compile_definitions(GGML_USE_OPENMP) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + list(APPEND GGML_EXTRA_LIBS OpenMP::OpenMP_C OpenMP::OpenMP_CXX) if (GGML_MUSA) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} "/usr/lib/llvm-10/include/openmp") - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} "/usr/lib/llvm-10/lib/libomp.so") + list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp") + list(APPEND GGML_EXTRA_LIBS "/usr/lib/llvm-10/lib/libomp.so") endif() else() message(WARNING "OpenMP not found") @@ -244,8 +246,8 @@ if (GGML_BLAS) set(GGML_HEADERS_BLAS ../include/ggml-blas.h) set(GGML_SOURCES_BLAS ggml-blas.cpp) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${BLAS_LIBRARIES}) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) + list(APPEND GGML_EXTRA_LIBS ${BLAS_LIBRARIES}) + list(APPEND GGML_EXTRA_INCLUDES ${BLAS_INCLUDE_DIRS}) else() message(WARNING "BLAS not found, please refer to " "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" @@ -368,19 +370,19 @@ if (GGML_CUDA) if (GGML_STATIC) if (WIN32) # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + list(APPEND GGML_EXTRA_LIBS CUDA::cudart_static CUDA::cublas CUDA::cublasLt) else () if (GGML_MUSA) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart_static MUSA::mublas_static) + list(APPEND GGML_EXTRA_LIBS MUSA::musart_static MUSA::mublas_static) else() - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + list(APPEND GGML_EXTRA_LIBS CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) endif() endif() else() if (GGML_MUSA) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart MUSA::mublas) + list(APPEND GGML_EXTRA_LIBS MUSA::musart MUSA::mublas) else() - 
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) + list(APPEND GGML_EXTRA_LIBS CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() endif() @@ -388,9 +390,9 @@ if (GGML_CUDA) # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) else() if (GGML_MUSA) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ... + list(APPEND GGML_EXTRA_LIBS MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ... else() - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... + list(APPEND GGML_EXTRA_LIBS CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... endif() endif() else() @@ -495,7 +497,7 @@ if (GGML_HIPBLAS) if (CXX_IS_HIPCC) set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} hip::device) + list(APPEND GGML_EXTRA_LIBS hip::device) else() set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) endif() @@ -504,7 +506,8 @@ if (GGML_HIPBLAS) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") endif() - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas) + # TODO: this "PUBLIC" here seems wrong + list(APPEND GGML_EXTRA_LIBS PUBLIC hip::host roc::rocblas roc::hipblas) endif() if (GGML_SYCL) @@ -513,7 +516,8 @@ if (GGML_SYCL) endif() check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL) - if ( DEFINED ENV{ONEAPI_ROOT}) + + if (DEFINED ENV{ONEAPI_ROOT}) message(STATUS "Using oneAPI Release SYCL compiler (icpx).") elseif(SUPPORTS_SYCL) message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}. 
@@ -551,21 +555,27 @@ if (GGML_SYCL) find_package(DNNL) message("-- DNNL found:" ${DNNL_FOUND}) + if (GGML_SYCL_TARGET STREQUAL "INTEL") add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND}) else() add_compile_definitions(GGML_SYCL_DNNL=0) endif() + + if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") + list(APPEND GGML_EXTRA_LIBS DNNL::dnnl) + endif() + if (WIN32) find_package(IntelSYCL REQUIRED) find_package(MKL REQUIRED) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) + list(APPEND GGML_EXTRA_LIBS IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) else() if (GGML_SYCL_TARGET STREQUAL "INTEL") - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) + list(APPEND GGML_EXTRA_LIBS OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl) + list(APPEND GGML_EXTRA_LIBS pthread m dl onemkl) endif() endif() if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") @@ -579,7 +589,7 @@ if (GGML_RPC) list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC) if (WIN32) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ws2_32) + list(APPEND GGML_EXTRA_LIBS ws2_32) endif() set(GGML_HEADERS_RPC ../include/ggml-rpc.h) @@ -657,8 +667,8 @@ if (GGML_VULKAN) set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header}) set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source}) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND GGML_EXTRA_LIBS Vulkan::Vulkan) + list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) else() message(WARNING "Vulkan not found") endif() @@ -817,8 +827,8 @@ if (GGML_KOMPUTE) list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE) - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} kompute) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND GGML_EXTRA_LIBS kompute) + list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) else() message(WARNING "Kompute not found") endif() @@ -883,9 +893,10 @@ if (GGML_CANN) message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}") message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}") - set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${CANN_LIBRARIES} ) - set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CANN_INCLUDE_DIRS}) - set(GGML_EXTRA_LIBDIRS ${GGML_EXTRA_LIBDIRS} ${CANN_INSTALL_DIR}/lib64) + list(APPEND GGML_EXTRA_LIBS ${CANN_LIBRARIES} ) + list(APPEND GGML_EXTRA_INCLUDES ${CANN_INCLUDE_DIRS}) + list(APPEND GGML_EXTRA_LIBDIRS ${CANN_INSTALL_DIR}/lib64) + list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN) endif() else() @@ -1322,12 +1333,14 @@ if (EMSCRIPTEN) set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128") endif() -target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC}) -target_include_directories(ggml PUBLIC ../include) +target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC}) +target_include_directories(ggml PUBLIC ../include) target_include_directories(ggml PRIVATE . 
${GGML_EXTRA_INCLUDES}) -target_link_directories(ggml PRIVATE ${GGML_EXTRA_LIBDIRS}) +target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS}) target_compile_features (ggml PRIVATE c_std_11) # don't bump +list(REMOVE_DUPLICATES GGML_EXTRA_LIBS) + target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS}) find_library(MATH_LIBRARY m) From dcdcee3a744f39714503ee2b19c49b7c7b6209c9 Mon Sep 17 00:00:00 2001 From: VoidIsVoid <343750470@qq.com> Date: Sat, 14 Sep 2024 17:36:44 +0800 Subject: [PATCH 47/65] server: add data: [DONE] to /chat/completions stream response (#9459) --- examples/server/server.cpp | 2 ++ examples/server/tests/features/steps/steps.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 73cd6aae7..14c4af3d9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2993,6 +2993,8 @@ int main(int argc, char ** argv) { }, [&](json error_data) { server_sent_event(sink, "error", error_data); }); + static const std::string ev_done = "data: [DONE]\n\n"; + sink.write(ev_done.data(), ev_done.size()); sink.done(); return true; }; diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 11587dd64..0f4249b13 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1020,6 +1020,8 @@ async def oai_chat_completions(user_prompt, event_data = line.split(': ', 1) assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```' chunk_raw = event_data[1] + if chunk_raw == '[DONE]': + break chunk = json.loads(chunk_raw) assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```" From 822b6322dea704110797a5671fc80ae39ee6ac97 Mon Sep 17 00:00:00 2001 From: Yuri Khrustalev Date: Sat, 14 Sep 2024 05:54:37 -0400 Subject: [PATCH 48/65] ggml : ggml_type_name return "NONE" for invalid values (#9458) When running on Windows, the quantization utility attempts to print the types that are not set which leads to a crash. --- ggml/src/ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 493ff7fc0..490c8d602 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3399,7 +3399,7 @@ double ggml_type_sizef(enum ggml_type type) { } GGML_CALL const char * ggml_type_name(enum ggml_type type) { - return type_traits[type].type_name; + return type < GGML_TYPE_COUNT ? 
type_traits[type].type_name : "NONE"; } GGML_CALL bool ggml_is_quantized(enum ggml_type type) { From 7596487bebd58eade3cd0133d42a9008aaaf9d09 Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Sun, 15 Sep 2024 09:06:38 +0200 Subject: [PATCH 49/65] cmake : try to fix sycl+intel build (#9487) --- ggml/src/CMakeLists.txt | 77 +++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 506b6dc7b..b25440769 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -26,7 +26,8 @@ if (NOT MSVC) endif() endif() -unset(GGML_EXTRA_LIBS) +unset(GGML_EXTRA_LIBS_PRIVATE) +unset(GGML_EXTRA_LIBS_PUBLIC) if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate) @@ -37,7 +38,7 @@ if (APPLE AND GGML_ACCELERATE) add_compile_definitions(ACCELERATE_NEW_LAPACK) add_compile_definitions(ACCELERATE_LAPACK_ILP64) - list(APPEND GGML_EXTRA_LIBS ${ACCELERATE_FRAMEWORK}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE ${ACCELERATE_FRAMEWORK}) else() message(WARNING "Accelerate framework not found") endif() @@ -89,7 +90,7 @@ if (GGML_METAL) COMMENT "Generate assembly for embedded Metal library" ) - list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM}) + list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM}) else() if (GGML_METAL_SHADER_DEBUG) # custom command to do the following: @@ -134,7 +135,7 @@ if (GGML_METAL) ) endif() # GGML_METAL_EMBED_LIBRARY - list(APPEND GGML_EXTRA_LIBS + list(APPEND GGML_EXTRA_LIBS_PRIVATE ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK} @@ -159,11 +160,11 @@ if (GGML_OPENMP) add_compile_definitions(GGML_USE_OPENMP) - list(APPEND GGML_EXTRA_LIBS OpenMP::OpenMP_C OpenMP::OpenMP_CXX) + list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) if (GGML_MUSA) - list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp") - list(APPEND GGML_EXTRA_LIBS "/usr/lib/llvm-10/lib/libomp.so") + list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp") + list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so") endif() else() message(WARNING "OpenMP not found") @@ -246,8 +247,8 @@ if (GGML_BLAS) set(GGML_HEADERS_BLAS ../include/ggml-blas.h) set(GGML_SOURCES_BLAS ggml-blas.cpp) - list(APPEND GGML_EXTRA_LIBS ${BLAS_LIBRARIES}) - list(APPEND GGML_EXTRA_INCLUDES ${BLAS_INCLUDE_DIRS}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE ${BLAS_LIBRARIES}) + list(APPEND GGML_EXTRA_INCLUDES ${BLAS_INCLUDE_DIRS}) else() message(WARNING "BLAS not found, please refer to " "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" @@ -370,19 +371,19 @@ if (GGML_CUDA) if (GGML_STATIC) if (WIN32) # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library - list(APPEND GGML_EXTRA_LIBS CUDA::cudart_static CUDA::cublas CUDA::cublasLt) + list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt) else () if (GGML_MUSA) - list(APPEND GGML_EXTRA_LIBS MUSA::musart_static MUSA::mublas_static) + list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart_static MUSA::mublas_static) else() - list(APPEND GGML_EXTRA_LIBS CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) + list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) endif() endif() else() if (GGML_MUSA) - list(APPEND GGML_EXTRA_LIBS MUSA::musart MUSA::mublas) + list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart MUSA::mublas) else() - list(APPEND GGML_EXTRA_LIBS CUDA::cudart CUDA::cublas 
CUDA::cublasLt) + list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() endif() @@ -390,9 +391,9 @@ if (GGML_CUDA) # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) else() if (GGML_MUSA) - list(APPEND GGML_EXTRA_LIBS MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ... + list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ... else() - list(APPEND GGML_EXTRA_LIBS CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... + list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... endif() endif() else() @@ -497,7 +498,7 @@ if (GGML_HIPBLAS) if (CXX_IS_HIPCC) set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) - list(APPEND GGML_EXTRA_LIBS hip::device) + list(APPEND GGML_EXTRA_LIBS_PRIVATE hip::device) else() set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) endif() @@ -506,8 +507,7 @@ if (GGML_HIPBLAS) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") endif() - # TODO: this "PUBLIC" here seems wrong - list(APPEND GGML_EXTRA_LIBS PUBLIC hip::host roc::rocblas roc::hipblas) + list(APPEND GGML_EXTRA_LIBS_PUBLIC hip::host roc::rocblas roc::hipblas) endif() if (GGML_SYCL) @@ -563,24 +563,23 @@ if (GGML_SYCL) endif() if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") - list(APPEND GGML_EXTRA_LIBS DNNL::dnnl) + list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl) endif() if (WIN32) find_package(IntelSYCL REQUIRED) find_package(MKL REQUIRED) - list(APPEND GGML_EXTRA_LIBS IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) + list(APPEND GGML_EXTRA_LIBS_PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) else() if (GGML_SYCL_TARGET STREQUAL "INTEL") - list(APPEND GGML_EXTRA_LIBS OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsycl") + list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") - list(APPEND GGML_EXTRA_LIBS pthread m dl onemkl) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsycl") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") + list(APPEND GGML_EXTRA_LIBS_PRIVATE pthread m dl onemkl) endif() endif() - if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") - list(APPEND GGML_EXTRA_LIBS DNNL::dnnl) - endif() endif() if (GGML_RPC) @@ -589,7 +588,7 @@ if (GGML_RPC) list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC) if (WIN32) - list(APPEND GGML_EXTRA_LIBS ws2_32) + list(APPEND GGML_EXTRA_LIBS_PRIVATE ws2_32) endif() set(GGML_HEADERS_RPC ../include/ggml-rpc.h) @@ -667,8 +666,8 @@ if (GGML_VULKAN) set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header}) set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source}) - list(APPEND GGML_EXTRA_LIBS Vulkan::Vulkan) - list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE Vulkan::Vulkan) + list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) else() message(WARNING "Vulkan not found") endif() @@ -827,8 +826,8 @@ if (GGML_KOMPUTE) list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE) - list(APPEND GGML_EXTRA_LIBS kompute) - 
list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE kompute) + list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) else() message(WARNING "Kompute not found") endif() @@ -893,9 +892,9 @@ if (GGML_CANN) message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}") message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}") - list(APPEND GGML_EXTRA_LIBS ${CANN_LIBRARIES} ) - list(APPEND GGML_EXTRA_INCLUDES ${CANN_INCLUDE_DIRS}) - list(APPEND GGML_EXTRA_LIBDIRS ${CANN_INSTALL_DIR}/lib64) + list(APPEND GGML_EXTRA_LIBS_PRIVATE ${CANN_LIBRARIES} ) + list(APPEND GGML_EXTRA_INCLUDES ${CANN_INCLUDE_DIRS}) + list(APPEND GGML_EXTRA_LIBDIRS ${CANN_INSTALL_DIR}/lib64) list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN) endif() @@ -1339,9 +1338,7 @@ target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES}) target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS}) target_compile_features (ggml PRIVATE c_std_11) # don't bump -list(REMOVE_DUPLICATES GGML_EXTRA_LIBS) - -target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS}) +list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads) find_library(MATH_LIBRARY m) if (MATH_LIBRARY) @@ -1350,6 +1347,10 @@ if (MATH_LIBRARY) endif() endif() +list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE) +list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC) +target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC}) + if (BUILD_SHARED_LIBS) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD) From d6b37c881f056bd32b681dcd7658a37ea6ec3a1e Mon Sep 17 00:00:00 2001 From: OSecret <135510162+OLSecret@users.noreply.github.com> Date: Sun, 15 Sep 2024 10:36:53 +0300 Subject: [PATCH 50/65] readme : update tools list (#9475) * Added link to proprietary wrapper for Unity3d into README.md Wrapper has prebuild library and was tested on iOS, Android, WebGL, PC, Mac platforms, has online demos like [this](https://d23myu0xfn2ttc.cloudfront.net/rich/index.html) and [that](https://d23myu0xfn2ttc.cloudfront.net/). 
* Update README.md Fixes upon review --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 73041b1a2..9a10ead83 100644 --- a/README.md +++ b/README.md @@ -173,6 +173,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage +- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example) **Infrastructure:** From 3c7989fd29a2db2b75e28fd708cc441febe99a82 Mon Sep 17 00:00:00 2001 From: Csaba Kecskemeti Date: Sun, 15 Sep 2024 00:48:25 -0700 Subject: [PATCH 51/65] py : add "LLaMAForCausalLM" conversion support (#9485) Co-authored-by: Csaba Kecskemeti --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 01a8a50a2..2c6d5d95b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1487,7 +1487,7 @@ class StableLMModel(Model): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") class LlamaModel(Model): model_arch = gguf.MODEL_ARCH.LLAMA From 6988da94a261444859f78595899212eeedc5ff9d Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Sun, 15 Sep 2024 18:55:52 +0200 Subject: [PATCH 52/65] cmake : correct order of sycl flags (#9497) --- ggml/src/CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index b25440769..11b877e19 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -572,12 +572,10 @@ if (GGML_SYCL) list(APPEND GGML_EXTRA_LIBS_PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL) else() if (GGML_SYCL_TARGET STREQUAL "INTEL") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsycl") - list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) + list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsycl") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") - list(APPEND GGML_EXTRA_LIBS_PRIVATE pthread m dl onemkl) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") + list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl) endif() endif() endif() From e6deac31f7e62db43b6afbc3be814f764fd5a187 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 15 Sep 2024 19:02:27 +0200 Subject: [PATCH 53/65] gguf-split : add basic checks (#9499) * gguf-split : do not overwrite existing files when merging * gguf-split : error when too many arguments are passed --- examples/gguf-split/gguf-split.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git 
a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 881f0451c..82c239b83 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -152,7 +152,7 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p throw std::invalid_argument("error: invalid parameter for argument: " + arg); } - if (argc - arg_idx < 2) { + if (argc - arg_idx != 2) { throw std::invalid_argument("error: bad arguments"); } @@ -389,10 +389,17 @@ static void gguf_merge(const split_params & split_params) { int n_split = 1; int total_tensors = 0; - auto * ctx_out = gguf_init_empty(); + // avoid overwriting existing output file + if (std::ifstream(split_params.output.c_str())) { + fprintf(stderr, "%s: output file %s already exists\n", __func__, split_params.output.c_str()); + exit(EXIT_FAILURE); + } + std::ofstream fout(split_params.output.c_str(), std::ios::binary); fout.exceptions(std::ofstream::failbit); // fail fast on write errors + auto * ctx_out = gguf_init_empty(); + std::vector read_data; std::vector ctx_metas; std::vector ctx_ggufs; From 6262d13e0b2da91f230129a93a996609a2f5a2f2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 15 Sep 2024 20:46:12 +0300 Subject: [PATCH 54/65] common : reimplement logging (#9418) https://github.com/ggerganov/llama.cpp/pull/9418 --- .github/workflows/build.yml | 3 + .github/workflows/server.yml | 6 + Makefile | 38 +- ci/run.sh | 3 + common/CMakeLists.txt | 24 +- common/arg.cpp | 115 ++- common/common.cpp | 210 +++-- common/common.h | 13 +- common/log.cpp | 401 +++++++++ common/log.h | 786 ++---------------- common/ngram-cache.cpp | 3 + common/sampling.cpp | 2 +- common/train.cpp | 2 + examples/batched-bench/batched-bench.cpp | 35 +- examples/batched/batched.cpp | 49 +- .../convert-llama2c-to-ggml.cpp | 91 +- .../cvector-generator/cvector-generator.cpp | 9 +- examples/embedding/embedding.cpp | 107 +-- examples/eval-callback/eval-callback.cpp | 50 +- examples/export-lora/export-lora.cpp | 2 +- examples/gritlm/gritlm.cpp | 2 + examples/imatrix/imatrix.cpp | 101 ++- examples/infill/infill.cpp | 182 ++-- examples/llava/clip.cpp | 144 ++-- examples/llava/llava-cli.cpp | 67 +- examples/llava/llava.cpp | 58 +- examples/llava/minicpmv-cli.cpp | 86 +- examples/lookahead/lookahead.cpp | 55 +- examples/lookup/lookup-stats.cpp | 29 +- examples/lookup/lookup.cpp | 55 +- examples/main/main.cpp | 282 +++---- examples/parallel/parallel.cpp | 66 +- examples/passkey/passkey.cpp | 59 +- examples/perplexity/perplexity.cpp | 343 ++++---- examples/retrieval/retrieval.cpp | 70 +- examples/server/CMakeLists.txt | 7 +- examples/server/README.md | 1 - examples/server/bench/README.md | 1 - examples/server/bench/bench.py | 1 - examples/server/server.cpp | 572 +++++-------- examples/server/tests/.gitignore | 1 + examples/server/tests/README.md | 1 - examples/server/tests/features/steps/steps.py | 2 - examples/server/utils.hpp | 110 +-- examples/simple/simple.cpp | 43 +- examples/speculative/speculative.cpp | 107 ++- examples/tokenize/tokenize.cpp | 50 +- ggml/include/ggml.h | 9 +- ggml/src/ggml-metal.m | 7 +- src/llama-impl.h | 1 + src/llama.cpp | 8 +- tests/CMakeLists.txt | 1 + tests/test-arg-parser.cpp | 2 +- tests/test-log.cpp | 39 + 54 files changed, 2092 insertions(+), 2419 deletions(-) create mode 100644 common/log.cpp create mode 100644 examples/server/tests/.gitignore create mode 100644 tests/test-log.cpp diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 181ef37e2..1777489ec 100644 --- 
a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -23,6 +23,9 @@ env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GGML_NLOOP: 3 GGML_N_THREADS: 1 + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 jobs: macOS-latest-cmake-arm64: diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 29f8fd444..699ac095d 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -20,6 +20,12 @@ on: types: [opened, synchronize, reopened] paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] +env: + LLAMA_LOG_COLORS: 1 + LLAMA_LOG_PREFIX: 1 + LLAMA_LOG_TIMESTAMPS: 1 + LLAMA_LOG_VERBOSITY: 10 + concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true diff --git a/Makefile b/Makefile index f41887a4d..cb5ff9f9d 100644 --- a/Makefile +++ b/Makefile @@ -54,6 +54,7 @@ TEST_TARGETS = \ tests/test-grammar-parser \ tests/test-json-schema-to-grammar \ tests/test-llama-grammar \ + tests/test-log \ tests/test-model-load-cancel \ tests/test-opt \ tests/test-quantize-fns \ @@ -148,6 +149,14 @@ GGML_NO_METAL := 1 DEPRECATE_WARNING := 1 endif +ifdef LLAMA_DISABLE_LOGS +REMOVE_WARNING := 1 +endif + +ifdef LLAMA_SERVER_VERBOSE +REMOVE_WARNING := 1 +endif + ifndef UNAME_S UNAME_S := $(shell uname -s) endif @@ -351,19 +360,11 @@ ifdef LLAMA_SANITIZE_UNDEFINED MK_LDFLAGS += -fsanitize=undefined -g endif -ifdef LLAMA_SERVER_VERBOSE - MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) -endif - ifdef LLAMA_SERVER_SSL MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT MK_LDFLAGS += -lssl -lcrypto endif -ifdef LLAMA_DISABLE_LOGS - MK_CPPFLAGS += -DLOG_DISABLE_LOGS -endif # LLAMA_DISABLE_LOGS - # warnings WARN_FLAGS = \ -Wall \ @@ -931,6 +932,7 @@ OBJ_LLAMA = \ OBJ_COMMON = \ common/common.o \ common/arg.o \ + common/log.o \ common/console.o \ common/ngram-cache.o \ common/sampling.o \ @@ -1027,6 +1029,14 @@ $(info - LLAMA_NO_CCACHE) $(info ) endif +ifdef REMOVE_WARNING +$(info !!! REMOVAL WARNING !!!) 
+$(info The following LLAMA_ options have been removed and are no longer supported) +$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418)) +$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418)) +$(info ) +endif + # # Build libraries # @@ -1168,6 +1178,11 @@ common/arg.o: \ common/arg.h $(CXX) $(CXXFLAGS) -c $< -o $@ +common/log.o: \ + common/log.cpp \ + common/log.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + common/sampling.o: \ common/sampling.cpp \ common/sampling.h \ @@ -1346,7 +1361,7 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \ $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \ - $(OBJ_GGML) $(OBJ_LLAMA) + $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -1528,6 +1543,11 @@ tests/test-llama-grammar: tests/test-llama-grammar.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +tests/test-log: tests/test-log.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + tests/test-grammar-parser: tests/test-grammar-parser.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/ci/run.sh b/ci/run.sh index 751bb0a02..1ac08ee4e 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -737,6 +737,9 @@ function gg_sum_embd_bge_small { ## main +export LLAMA_LOG_PREFIX=1 +export LLAMA_LOG_TIMESTAMPS=1 + if [ -z ${GG_BUILD_LOW_PERF} ]; then # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt rm -rf ${SRC}/models-mnt diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 22fd99689..042e895ad 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -51,21 +51,23 @@ endif() set(TARGET common) add_library(${TARGET} STATIC - base64.hpp - common.h - common.cpp - arg.h arg.cpp - sampling.h - sampling.cpp - console.h + arg.h + base64.hpp + common.cpp + common.h console.cpp - json.hpp + console.h json-schema-to-grammar.cpp - train.h - train.cpp - ngram-cache.h + json.hpp + log.cpp + log.h ngram-cache.cpp + ngram-cache.h + sampling.cpp + sampling.h + train.cpp + train.h ) if (BUILD_SHARED_LIBS) diff --git a/common/arg.cpp b/common/arg.cpp index a1cd5830f..8fcb8c25f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1,15 +1,17 @@ #include "arg.h" +#include "log.h" #include "sampling.h" #include -#include -#include -#include +#include +#include #include #include -#include -#include +#include +#include +#include +#include #include "json-schema-to-grammar.h" @@ -383,20 +385,6 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, exit(0); } )); - add_opt(llama_arg( - {"-v", "--verbose"}, - "print verbose information", - [](gpt_params & params) { - params.verbosity = 1; - } - )); - add_opt(llama_arg( - {"--verbosity"}, "N", - format("set specific verbosity level (default: %d)", params.verbosity), - [](gpt_params & params, int value) { - params.verbosity = value; - } - )); add_opt(llama_arg( {"--verbose-prompt"}, format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? 
"true" : "false"), @@ -417,7 +405,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, [](gpt_params & params) { params.use_color = true; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); add_opt(llama_arg( {"-t", "--threads"}, "N", format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), @@ -876,7 +864,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.input_prefix = value; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", @@ -884,7 +872,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.input_suffix = value; params.enable_chat_template = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN})); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(llama_arg( {"--no-warmup"}, "skip warming up the model with an empty run", @@ -1824,19 +1812,6 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.system_prompt = system_prompt; } ).set_examples({LLAMA_EXAMPLE_SERVER})); - add_opt(llama_arg( - {"--log-format"}, "{text, json}", - "log output format: json or text (default: json)", - [](gpt_params & params, const std::string & value) { - if (value == "json") { - params.log_json = true; - } else if (value == "text") { - params.log_json = false; - } else { - throw std::invalid_argument("invalid value"); - } - } - ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(llama_arg( {"--metrics"}, format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), @@ -1956,39 +1931,57 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, else { std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_BENCH})); -#ifndef LOG_DISABLE_LOGS - // TODO: make this looks less weird - add_opt(llama_arg( - {"--log-test"}, - "Log test", - [](gpt_params &) { log_param_single_parse("--log-test"); } - )); add_opt(llama_arg( {"--log-disable"}, "Log disable", - [](gpt_params &) { log_param_single_parse("--log-disable"); } - )); - add_opt(llama_arg( - {"--log-enable"}, - "Log enable", - [](gpt_params &) { log_param_single_parse("--log-enable"); } - )); - add_opt(llama_arg( - {"--log-new"}, - "Log new", - [](gpt_params &) { log_param_single_parse("--log-new"); } - )); - add_opt(llama_arg( - {"--log-append"}, - "Log append", - [](gpt_params &) { log_param_single_parse("--log-append"); } + [](gpt_params &) { + gpt_log_pause(gpt_log_main()); + } )); add_opt(llama_arg( {"--log-file"}, "FNAME", - "Log file", - [](gpt_params &, const std::string & value) { log_param_pair_parse(false, "--log-file", value); } + "Log to file", + [](gpt_params &, const std::string & value) { + gpt_log_set_file(gpt_log_main(), value.c_str()); + } )); -#endif // LOG_DISABLE_LOGS + add_opt(llama_arg( + {"--log-colors"}, + "Enable colored logging", + [](gpt_params &) { + gpt_log_set_colors(gpt_log_main(), true); + } + ).set_env("LLAMA_LOG_COLORS")); + add_opt(llama_arg( + {"-v", "--verbose", "--log-verbose"}, + "Set verbosity level to infinity (i.e. 
log all messages, useful for debugging)", + [](gpt_params & params) { + params.verbosity = INT_MAX; + gpt_log_set_verbosity_thold(INT_MAX); + } + )); + add_opt(llama_arg( + {"-lv", "--verbosity", "--log-verbosity"}, "N", + "Set the verbosity threshold. Messages with a higher verbosity will be ignored.", + [](gpt_params & params, int value) { + params.verbosity = value; + gpt_log_set_verbosity_thold(value); + } + ).set_env("LLAMA_LOG_VERBOSITY")); + add_opt(llama_arg( + {"--log-prefix"}, + "Enable prefx in log messages", + [](gpt_params &) { + gpt_log_set_prefix(gpt_log_main(), true); + } + ).set_env("LLAMA_LOG_PREFIX")); + add_opt(llama_arg( + {"--log-timestamps"}, + "Enable timestamps in log messages", + [](gpt_params &) { + gpt_log_set_timestamps(gpt_log_main(), true); + } + ).set_env("LLAMA_LOG_TIMESTAMPS")); return ctx_arg; } diff --git a/common/common.cpp b/common/common.cpp index f9a831ec7..8d0ed4f95 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -3,6 +3,7 @@ #endif #include "common.h" +#include "log.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT #include "json.hpp" @@ -25,6 +26,7 @@ #include #include #include +#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -48,7 +50,6 @@ #if defined(LLAMA_USE_CURL) #include #include -#include #include #endif @@ -226,7 +227,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { } if (!SetPriorityClass(GetCurrentProcess(), p)) { - fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); + LOG_WRN("failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); return false; } @@ -251,7 +252,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { } if (!setpriority(PRIO_PROCESS, 0, p)) { - fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno); + LOG_WRN("failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno); return false; } return true; @@ -284,14 +285,14 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) if (n_set && n_set < cpuparams.n_threads) { // Not enough set bits, may experience performance issues. - fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); + LOG_WRN("Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); } } bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) { size_t dash_loc = range.find('-'); if (dash_loc == std::string::npos) { - fprintf(stderr, "Format of CPU range is invalid! Expected []-[].\n"); + LOG_ERR("Format of CPU range is invalid! 
Expected []-[].\n"); return false; } @@ -303,7 +304,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE } else { start_i = std::stoull(range.substr(0, dash_loc)); if (start_i >= GGML_MAX_N_THREADS) { - fprintf(stderr, "Start index out of bounds!\n"); + LOG_ERR("Start index out of bounds!\n"); return false; } } @@ -313,7 +314,7 @@ bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THRE } else { end_i = std::stoull(range.substr(dash_loc + 1)); if (end_i >= GGML_MAX_N_THREADS) { - fprintf(stderr, "End index out of bounds!\n"); + LOG_ERR("End index out of bounds!\n"); return false; } } @@ -348,7 +349,7 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD } else if (c >= 'A' && c <= 'F') { id -= 'A' - 10; } else { - fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i)); + LOG_ERR("Invalid hex character '%c' at position %d\n", c, int32_t(i)); return false; } @@ -361,6 +362,22 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD return true; } +void gpt_init() { + llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) { + if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) { + gpt_log_add(gpt_log_main(), level, "%s", text); + } + }, NULL); + +#ifdef NDEBUG + const char * build_type = ""; +#else + const char * build_type = " (debug)"; +#endif + + LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); +} + std::string gpt_params_get_system_info(const gpt_params & params) { std::ostringstream os; @@ -441,6 +458,94 @@ void string_replace_all(std::string & s, const std::string & search, const std:: s = std::move(builder); } +std::string string_from(bool value) { + return value ? 
"true" : "false"; +} + +std::string string_from(const std::vector & values) { + std::stringstream buf; + + buf << "[ "; + bool first = true; + for (auto e : values) { + if (first) { + first = false; + } else { + buf << ", "; + } + buf << std::to_string(e); + } + buf << " ]"; + + return buf.str(); +} + +std::string string_from(const struct llama_context * ctx, const std::vector & tokens) { + std::stringstream buf; + + buf << "[ "; + + bool first = true; + for (const auto & token : tokens) { + if (!first) { + buf << ", "; + } else { + first = false; + } + + auto detokenized = llama_token_to_piece(ctx, token); + + detokenized.erase( + std::remove_if( + detokenized.begin(), + detokenized.end(), + [](const unsigned char c) { return !std::isprint(c); }), + detokenized.end()); + + buf << "'" << detokenized << "'" + << ":" << std::to_string(token); + } + + buf << " ]"; + + return buf.str(); +} + +std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) { + std::stringstream buf; + + buf << "[ "; + + bool first = true; + for (int i = 0; i < batch.n_tokens; ++i) { + if (!first) { + buf << ", "; + } else { + first = false; + } + + auto detokenized = llama_token_to_piece(ctx, batch.token[i]); + + detokenized.erase( + std::remove_if( + detokenized.begin(), + detokenized.end(), + [](const unsigned char c) { return !std::isprint(c); }), + detokenized.end()); + + buf << "\n" << std::to_string(i) + << ":token '" << detokenized << "'" + << ":pos " << std::to_string(batch.pos[i]) + << ":n_seq_id " << std::to_string(batch.n_seq_id[i]) + << ":seq_id " << std::to_string(batch.seq_id[i][0]) + << ":logits " << std::to_string(batch.logits[i]); + } + + buf << " ]"; + + return buf.str(); +} + void string_process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -481,7 +586,7 @@ void string_process_escapes(std::string & input) { bool string_parse_kv_override(const char * data, std::vector & overrides) { const char * sep = strchr(data, '='); if (sep == nullptr || sep - data >= 128) { - fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); + LOG_ERR("%s: malformed KV override '%s'\n", __func__, data); return false; } llama_model_kv_override kvo; @@ -504,20 +609,20 @@ bool string_parse_kv_override(const char * data, std::vector 127) { - fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); + LOG_ERR("%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); return false; } strncpy(kvo.val_str, sep, 127); kvo.val_str[127] = '\0'; } else { - fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); + LOG_ERR("%s: invalid type for KV override '%s'\n", __func__, data); return false; } overrides.emplace_back(std::move(kvo)); @@ -729,7 +834,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } if (model == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str()); return iparams; } @@ -737,7 +842,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { llama_context * lctx = llama_new_context_with_model(model, cparams); if (lctx == NULL) { - fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str()); llama_free_model(model); return 
iparams; } @@ -773,7 +878,7 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { loaded_la.scale = la.scale; loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str()); if (loaded_la.adapter == nullptr) { - fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); + LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str()); llama_free(lctx); llama_free_model(model); return iparams; @@ -785,12 +890,12 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) { } if (params.sparams.ignore_eos && llama_token_eos(model) == -1) { - fprintf(stderr, "%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); + LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__); params.sparams.ignore_eos = false; } if (params.warmup) { - LOG("warming up the model with an empty run\n"); + LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); std::vector tmp; llama_token bos = llama_token_bos(model); @@ -955,7 +1060,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_ int remaining_attempts = max_attempts; while (remaining_attempts > 0) { - fprintf(stderr, "%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); + LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); CURLcode res = curl_easy_perform(curl); if (res == CURLE_OK) { @@ -963,13 +1068,14 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_ } int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000; - fprintf(stderr, "%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); + LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); remaining_attempts--; std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); } - fprintf(stderr, "%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); + LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); + return false; } @@ -978,7 +1084,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat // Initialize libcurl std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); if (!curl) { - fprintf(stderr, "%s: error initializing libcurl\n", __func__); + LOG_ERR("%s: error initializing libcurl\n", __func__); return false; } @@ -1019,11 +1125,11 @@ static bool llama_download_file(const std::string & url, const std::string & pat if (metadata_in.good()) { try { metadata_in >> metadata; - fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); + LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); if (metadata.contains("url") && metadata.at("url").is_string()) { auto previous_url = metadata.at("url").get(); if (previous_url != url) { - fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); + LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); return 
false; } } @@ -1034,12 +1140,12 @@ static bool llama_download_file(const std::string & url, const std::string & pat last_modified = metadata.at("lastModified"); } } catch (const nlohmann::json::exception & e) { - fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); + LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); return false; } } } else { - fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str()); + LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); } // Send a HEAD request to retrieve the etag and last-modified headers @@ -1087,26 +1193,26 @@ static bool llama_download_file(const std::string & url, const std::string & pat // HEAD not supported, we don't know if the file has changed // force trigger downloading force_download = true; - fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code); + LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); } } bool should_download = !file_exists || force_download; if (!should_download) { if (!etag.empty() && etag != headers.etag) { - fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); + LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); should_download = true; } else if (!last_modified.empty() && last_modified != headers.last_modified) { - fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); + LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); should_download = true; } } if (should_download) { std::string path_temporary = path + ".downloadInProgress"; if (file_exists) { - fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); + LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); if (remove(path.c_str()) != 0) { - fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str()); + LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); return false; } } @@ -1121,7 +1227,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb")); if (!outfile) { - fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str()); + LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str()); return false; } @@ -1152,7 +1258,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat }; // start the download - fprintf(stderr, "%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); if (!was_perform_successful) { @@ -1162,7 +1268,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat long http_code = 0; curl_easy_getinfo (curl.get(),
CURLINFO_RESPONSE_CODE, &http_code); if (http_code < 200 || http_code >= 400) { - fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); + LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code); return false; } @@ -1176,10 +1282,10 @@ static bool llama_download_file(const std::string & url, const std::string & pat {"lastModified", headers.last_modified} }); std::ofstream(metadata_path) << metadata.dump(4); - fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); + LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); if (rename(path_temporary.c_str(), path.c_str()) != 0) { - fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); + LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); return false; } } @@ -1194,7 +1300,7 @@ struct llama_model * llama_load_model_from_url( const struct llama_model_params & params) { // Basic validation of the model_url if (!model_url || strlen(model_url) == 0) { - fprintf(stderr, "%s: invalid model_url\n", __func__); + LOG_ERR("%s: invalid model_url\n", __func__); return NULL; } @@ -1211,7 +1317,7 @@ struct llama_model * llama_load_model_from_url( }; auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); if (!ctx_gguf) { - fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model); + LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, path_model); return NULL; } @@ -1231,14 +1337,12 @@ struct llama_model * llama_load_model_from_url( // and extract split URL and PATH prefixes { if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { - fprintf(stderr, "\n%s: unexpected model file name: %s" - " n_split=%d\n", __func__, path_model, n_split); + LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, path_model, n_split); return NULL; } if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { - fprintf(stderr, "\n%s: unexpected model url: %s" - " n_split=%d\n", __func__, model_url, n_split); + LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url, n_split); return NULL; } } @@ -1298,7 +1402,7 @@ struct llama_model * llama_load_model_from_url( const char * /*path_model*/, const char * /*hf_token*/, const struct llama_model_params & /*params*/) { - fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); + LOG_WRN("%s: llama.cpp built without libcurl, downloading from a URL is not supported.\n", __func__); return nullptr; } @@ -1308,7 +1412,7 @@ struct llama_model * llama_load_model_from_hf( const char * /*path_model*/, const char * /*hf_token*/, const struct llama_model_params & /*params*/) { - fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); + LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face is not supported.\n", __func__); return nullptr; } @@ -1636,13 +1740,13 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr }; struct gguf_context * ctx_gguf = gguf_init_from_file(load_info.fname.c_str(), meta_gguf_params); if (!ctx_gguf) { - fprintf(stderr, "%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: failed to load control vector file from %s\n", __func__, load_info.fname.c_str()); return result; } int32_t
n_tensors = gguf_get_n_tensors(ctx_gguf); if (n_tensors == 0) { - fprintf(stderr, "%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); + LOG_WRN("%s: no direction tensors found in %s\n", __func__, load_info.fname.c_str()); } for (int i = 0; i < n_tensors; i++) { @@ -1660,23 +1764,23 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr } } if (layer_idx < 0) { - fprintf(stderr, "%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid/unparsable direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } else if (layer_idx == 0) { - fprintf(stderr, "%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid (zero) direction tensor layer index in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } struct ggml_tensor * tensor = ggml_get_tensor(ctx, name.c_str()); if (tensor->type != GGML_TYPE_F32) { - fprintf(stderr, "%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid (non-F32) direction tensor type in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } if (ggml_n_dims(tensor) != 1) { - fprintf(stderr, "%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: invalid (non-1D) direction tensor shape in %s\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } @@ -1684,7 +1788,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr if (result.n_embd == -1) { result.n_embd = ggml_nelements(tensor); } else if (ggml_nelements(tensor) != result.n_embd) { - fprintf(stderr, "%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str()); + LOG_ERR("%s: direction tensor in %s does not match previous dimensions\n", __func__, load_info.fname.c_str()); result.n_embd = -1; break; } @@ -1701,7 +1805,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr } if (result.n_embd == -1) { - fprintf(stderr, "%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str()); + LOG_WRN("%s: skipping %s due to invalid direction tensors\n", __func__, load_info.fname.c_str()); result.data.clear(); } @@ -1722,7 +1826,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector #include +#include #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' @@ -343,6 +341,10 @@ struct gpt_params { bool batched_bench_output_jsonl = false; }; +// call once at the start of a program if it uses libcommon +// initializes the logging system and prints info about the build +void gpt_init(); + std::string gpt_params_get_system_info(const gpt_params & params); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); @@ -378,6 +380,11 @@ static std::vector<std::string> string_split(const std::string & str, char delim) { bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides); void string_process_escapes(std::string & input); +std::string string_from(bool value); +std::string string_from(const std::vector<int> & values); +std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens); +std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch); + // // Filesystem utils // diff --git a/common/log.cpp b/common/log.cpp new file mode 100644 index
000000000..2825a227e --- /dev/null +++ b/common/log.cpp @@ -0,0 +1,401 @@ +#include "log.h" + +#include <condition_variable> +#include <cstdarg> +#include <cstdio> +#include <mutex> +#include <sstream> +#include <thread> +#include <vector> + +int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA; + +void gpt_log_set_verbosity_thold(int verbosity) { + gpt_log_verbosity_thold = verbosity; +} + +#define LOG_COL_DEFAULT "\033[0m" +#define LOG_COL_BOLD "\033[1m" +#define LOG_COL_RED "\033[31m" +#define LOG_COL_GREEN "\033[32m" +#define LOG_COL_YELLOW "\033[33m" +#define LOG_COL_BLUE "\033[34m" +#define LOG_COL_MAGENTA "\033[35m" +#define LOG_COL_CYAN "\033[36m" +#define LOG_COL_WHITE "\033[37m" + +static int64_t t_us() { + return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count(); +} + +// colors +enum gpt_log_col : int { + GPT_LOG_COL_DEFAULT = 0, + GPT_LOG_COL_BOLD, + GPT_LOG_COL_RED, + GPT_LOG_COL_GREEN, + GPT_LOG_COL_YELLOW, + GPT_LOG_COL_BLUE, + GPT_LOG_COL_MAGENTA, + GPT_LOG_COL_CYAN, + GPT_LOG_COL_WHITE, +}; + +// disable colors by default +static std::vector<const char *> g_col = { + "", + "", + "", + "", + "", + "", + "", + "", + "", +}; + +struct gpt_log_entry { + enum ggml_log_level level; + + bool prefix; + + int64_t timestamp; + + std::vector<char> msg; + + // signals the worker thread to stop + bool is_end; + + void print(FILE * file = nullptr) const { + FILE * fcur = file; + if (!fcur) { + // stderr displays DBG messages only when their verbosity level is not higher than the threshold + // these messages will still be logged to a file + if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) { + return; + } + + fcur = stdout; + + if (level != GGML_LOG_LEVEL_NONE) { + fcur = stderr; + } + } + + if (level != GGML_LOG_LEVEL_NONE && prefix) { + if (timestamp) { + // [M.s.ms.us] + fprintf(fcur, "%s%d.%02d.%03d.%03d%s ", + g_col[GPT_LOG_COL_BLUE], + (int) (timestamp / 1000000 / 60), + (int) (timestamp / 1000000 % 60), + (int) (timestamp / 1000 % 1000), + (int) (timestamp % 1000), + g_col[GPT_LOG_COL_DEFAULT]); + } + + switch (level) { + case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break; + case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break; + case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break; + case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break; + default: + break; + } + } + + fprintf(fcur, "%s", msg.data()); + + if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) { + fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]); + } + + fflush(fcur); + } +}; + +struct gpt_log { + // default capacity - will be expanded if needed + gpt_log() : gpt_log(256) {} + + gpt_log(size_t capacity) { + file = nullptr; + prefix = false; + timestamps = false; + running = false; + t_start = t_us(); + + // initial message size - will be expanded if longer messages arrive + entries.resize(capacity); + for (auto & entry : entries) { + entry.msg.resize(256); + } + + head = 0; + tail = 0; + + resume(); + } + + ~gpt_log() { + pause(); + if (file) { + fclose(file); + } + } + +private: + std::mutex mtx; + std::thread thrd; + std::condition_variable cv; + + FILE * file; + + bool prefix; + bool timestamps; + bool running; + + int64_t t_start; + + // ring buffer of entries + std::vector<gpt_log_entry> entries; + size_t head; + size_t tail; + + // worker thread copies into this + gpt_log_entry cur; + +public: + void add(enum ggml_log_level level, const char *
fmt, va_list args) { + std::lock_guard<std::mutex> lock(mtx); + + if (!running) { + // discard messages while the worker thread is paused + return; + } + + auto & entry = entries[tail]; + + { + // cannot use args twice, so make a copy in case we need to expand the buffer + va_list args_copy; + va_copy(args_copy, args); + +#if 1 + const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args); + if (n >= entry.msg.size()) { + entry.msg.resize(n + 1); + vsnprintf(entry.msg.data(), entry.msg.size(), fmt, args_copy); + } +#else + // hack for bolding arguments + + std::stringstream ss; + for (int i = 0; fmt[i] != 0; i++) { + if (fmt[i] == '%') { + ss << LOG_COL_BOLD; + while (fmt[i] != ' ' && fmt[i] != ')' && fmt[i] != ']' && fmt[i] != 0) ss << fmt[i++]; + ss << LOG_COL_DEFAULT; + if (fmt[i] == 0) break; + } + ss << fmt[i]; + } + const size_t n = vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args); + if (n >= entry.msg.size()) { + entry.msg.resize(n + 1); + vsnprintf(entry.msg.data(), entry.msg.size(), ss.str().c_str(), args_copy); + } +#endif + } + + entry.level = level; + entry.prefix = prefix; + entry.timestamp = 0; + if (timestamps) { + entry.timestamp = t_us() - t_start; + } + entry.is_end = false; + + tail = (tail + 1) % entries.size(); + if (tail == head) { + // expand the buffer + std::vector<gpt_log_entry> new_entries(2*entries.size()); + + size_t new_tail = 0; + + do { + new_entries[new_tail] = std::move(entries[head]); + + head = (head + 1) % entries.size(); + new_tail = (new_tail + 1); + } while (head != tail); + + head = 0; + tail = new_tail; + + for (size_t i = tail; i < new_entries.size(); i++) { + new_entries[i].msg.resize(256); + } + + entries = std::move(new_entries); + } + + cv.notify_one(); + } + + void resume() { + std::lock_guard<std::mutex> lock(mtx); + + if (running) { + return; + } + + running = true; + + thrd = std::thread([this]() { + while (true) { + { + std::unique_lock<std::mutex> lock(mtx); + cv.wait(lock, [this]() { return head != tail; }); + + cur = entries[head]; + + head = (head + 1) % entries.size(); + } + + if (cur.is_end) { + break; + } + + cur.print(); // stdout and stderr + + if (file) { + cur.print(file); + } + } + }); + } + + void pause() { + { + std::lock_guard<std::mutex> lock(mtx); + + if (!running) { + return; + } + + running = false; + + // push an entry to signal the worker thread to stop + { + auto & entry = entries[tail]; + entry.is_end = true; + + tail = (tail + 1) % entries.size(); + } + + cv.notify_one(); + } + + thrd.join(); + } + + void set_file(const char * path) { + pause(); + + if (file) { + fclose(file); + } + + if (path) { + file = fopen(path, "w"); + } else { + file = nullptr; + } + + resume(); + } + + void set_colors(bool colors) { + pause(); + + if (colors) { + g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT; + g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD; + g_col[GPT_LOG_COL_RED] = LOG_COL_RED; + g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN; + g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW; + g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE; + g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA; + g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN; + g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE; + } else { + for (size_t i = 0; i < g_col.size(); i++) { + g_col[i] = ""; + } + } + + resume(); + } + + void set_prefix(bool prefix) { + std::lock_guard<std::mutex> lock(mtx); + + this->prefix = prefix; + } + + void set_timestamps(bool timestamps) { + std::lock_guard<std::mutex> lock(mtx); + + this->timestamps = timestamps; + } +}; + +// +// public API +// + +struct gpt_log * gpt_log_init() { + return new gpt_log; +} + +struct gpt_log *
gpt_log_main() { + static struct gpt_log log; + + return &log; +} + +void gpt_log_pause(struct gpt_log * log) { + log->pause(); +} + +void gpt_log_resume(struct gpt_log * log) { + log->resume(); +} + +void gpt_log_free(struct gpt_log * log) { + delete log; +} + +void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) { + va_list args; + va_start(args, fmt); + log->add(level, fmt, args); + va_end(args); +} + +void gpt_log_set_file(struct gpt_log * log, const char * file) { + log->set_file(file); +} + +void gpt_log_set_colors(struct gpt_log * log, bool colors) { + log->set_colors(colors); +} + +void gpt_log_set_prefix(struct gpt_log * log, bool prefix) { + log->set_prefix(prefix); +} + +void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) { + log->set_timestamps(timestamps); +} diff --git a/common/log.h b/common/log.h index 1bc5328ce..d13f72d89 100644 --- a/common/log.h +++ b/common/log.h @@ -1,724 +1,90 @@ #pragma once -#include <chrono> -#include <cstring> -#include <sstream> -#include <iostream> -#include <thread> -#include <vector> -#include <algorithm> -#include <cinttypes> +#include "ggml.h" // for ggml_log_level -// -------------------------------- -// -// Basic usage: -// -// -------- -// -// The LOG() and LOG_TEE() macros are ready to go by default -// they do not require any initialization. -// -// LOGLN() and LOG_TEELN() are variants which automatically -// include \n character at the end of the log string. -// -// LOG() behaves exactly like printf, by default writing to a logfile. -// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ). -// -// Default logfile is named -// "llama.<threadpid>.log" -// Default LOG_TEE() secondary output target is -// stderr -// -// Logs can be dynamically disabled or enabled using functions: -// log_disable() -// and -// log_enable() -// -// A log target can be changed with: -// log_set_target( string ) -// creating and opening, or re-opening a file by string filename -// or -// log_set_target( FILE* ) -// allowing to point at stderr, stdout, or any valid FILE* file handler. -// -// -------- -// -// End of Basic usage. -// -// -------------------------------- - -// Specifies a log target. -// default uses log_handler() with "llama.log" log file -// this can be changed, by defining LOG_TARGET -// like so: -// -// #define LOG_TARGET (a valid FILE*) -// #include "log.h" -// -// or it can be simply redirected to stdout or stderr -// like so: -// -// #define LOG_TARGET stderr -// #include "log.h" -// -// The log target can also be redirected to a different function -// like so: -// -// #define LOG_TARGET log_handler_different() -// #include "log.h" -// -// FILE* log_handler_different() -// { -// return stderr; -// } -// -// or: -// -// #define LOG_TARGET log_handler_another_one("somelog.log") -// #include "log.h" -// -// FILE* log_handler_another_one(char*filename) -// { -// static FILE* logfile = nullptr; -// (...) -// if( !logfile ) -// { -// fopen(...) -// } -// (...) -// return logfile -// } -// -#ifndef LOG_TARGET - #define LOG_TARGET log_handler() -#endif - -#ifndef LOG_TEE_TARGET - #define LOG_TEE_TARGET stderr -#endif - -// Utility for synchronizing log configuration state -// since std::optional was introduced only in c++17 -enum LogTriState -{ - LogTriStateSame, - LogTriStateFalse, - LogTriStateTrue -}; - -// Utility to obtain "pid" like unique process id and use it when creating log files.
-inline std::string log_get_pid() -{ - static std::string pid; - if (pid.empty()) - { - // std::this_thread::get_id() is the most portable way of obtaining a "process id" - // it's not the same as "pid" but is unique enough to solve multiple instances - // trying to write to the same log. - std::stringstream ss; - ss << std::this_thread::get_id(); - pid = ss.str(); - } - - return pid; -} - -// Utility function for generating log file names with unique id based on thread id. -// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log" -// where the number is a runtime id of the current thread. - -#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension) - -// INTERNAL, DO NOT USE -inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension) -{ - static bool _multilog = false; - - if (multilog != LogTriStateSame) - { - _multilog = multilog == LogTriStateTrue; - } - - std::stringstream buf; - - buf << log_file_basename; - if (_multilog) - { - buf << "."; - buf << log_get_pid(); - } - buf << "."; - buf << log_file_extension; - - return buf.str(); -} - -#ifndef LOG_DEFAULT_FILE_NAME - #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log") -#endif - -// Utility for turning #define values into string literals -// so we can have a define for stderr and -// we can print "stderr" instead of literal stderr, etc. -#define LOG_STRINGIZE1(s) #s -#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s) - -#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET) - -// Allows disabling timestamps. -// in order to disable, define LOG_NO_TIMESTAMPS -// like so: -// -// #define LOG_NO_TIMESTAMPS -// #include "log.h" -// -#ifndef LOG_NO_TIMESTAMPS - #ifndef _MSC_VER - #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count() - #else - #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count() - #endif +#ifndef __GNUC__ +# define LOG_ATTRIBUTE_FORMAT(...) +#elif defined(__MINGW32__) +# define LOG_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) #else - #define LOG_TIMESTAMP_FMT "%s" - #define LOG_TIMESTAMP_VAL ,"" +# define LOG_ATTRIBUTE_FORMAT(...)
__attribute__((format(printf, __VA_ARGS__))) #endif -#ifdef LOG_TEE_TIMESTAMPS - #ifndef _MSC_VER - #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count() - #else - #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " - #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count() - #endif -#else - #define LOG_TEE_TIMESTAMP_FMT "%s" - #define LOG_TEE_TIMESTAMP_VAL ,"" -#endif +#define LOG_DEFAULT_DEBUG 1 +#define LOG_DEFAULT_LLAMA 0 -// Allows disabling file/line/function prefix -// in order to disable, define LOG_NO_FILE_LINE_FUNCTION -// like so: +// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity of the message is higher than the threshold +// set via gpt_log_set_verbosity_thold() +extern int gpt_log_verbosity_thold; + +void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe + +// the gpt_log uses an internal worker thread to print/write log messages +// when the worker thread is paused, incoming log messages are discarded +struct gpt_log; + +struct gpt_log * gpt_log_init(); +struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit +void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe +void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe +void gpt_log_free (struct gpt_log * log); + +LOG_ATTRIBUTE_FORMAT(3, 4) +void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...); + +// defaults: file = NULL, colors = false, prefix = false, timestamps = false // -// #define LOG_NO_FILE_LINE_FUNCTION -// #include "log.h" +// regular log output: // -#ifndef LOG_NO_FILE_LINE_FUNCTION - #ifndef _MSC_VER - #define LOG_FLF_FMT "[%24s:%5d][%24s] " - #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ - #else - #define LOG_FLF_FMT "[%24s:%5ld][%24s] " - #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__ - #endif -#else - #define LOG_FLF_FMT "%s" - #define LOG_FLF_VAL ,"" -#endif - -#ifdef LOG_TEE_FILE_LINE_FUNCTION - #ifndef _MSC_VER - #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] " - #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ - #else - #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] " - #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__ - #endif -#else - #define LOG_TEE_FLF_FMT "%s" - #define LOG_TEE_FLF_VAL ,"" -#endif - -// INTERNAL, DO NOT USE -// USE LOG() INSTEAD +// ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34) +// llm_load_tensors: ggml ctx size = 0.27 MiB +// llm_load_tensors: offloading 32 repeating layers to GPU +// llm_load_tensors: offloading non-repeating layers to GPU // -#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) - #define LOG_IMPL(str, ...)
\ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ +// with prefix = true, timestamps = true, the log output will look like this: +// +// 0.00.035.060 D ggml_backend_metal_log_allocated_size: allocated buffer, size = 6695.84 MiB, ( 6695.91 / 21845.34) +// 0.00.035.064 I llm_load_tensors: ggml ctx size = 0.27 MiB +// 0.00.090.578 I llm_load_tensors: offloading 32 repeating layers to GPU +// 0.00.090.579 I llm_load_tensors: offloading non-repeating layers to GPU +// +// I - info (stdout, V = 0) +// W - warning (stderr, V = 0) +// E - error (stderr, V = 0) +// D - debug (stderr, V = LOG_DEFAULT_DEBUG) +// + +void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe +void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe +void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log +void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix + +// helper macros for logging +// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold +// +// for example: +// +// LOG_DBG("this is a debug message: %d\n", expensive_function()); +// +// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold +// + +#define LOG_TMPL(level, verbosity, ...) \ + do { \ + if ((verbosity) <= gpt_log_verbosity_thold) { \ + gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \ + } \ } while (0) -#else - #define LOG_IMPL(str, ...) \ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ - } while (0) -#endif -// INTERNAL, DO NOT USE -// USE LOG_TEE() INSTEAD -// -#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) - #define LOG_TEE_IMPL(str, ...) \ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ - if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \ - { \ - fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \ - fflush(LOG_TEE_TARGET); \ - } \ - } while (0) -#else - #define LOG_TEE_IMPL(str, ...) \ - do { \ - if (LOG_TARGET != nullptr) \ - { \ - fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \ - fflush(LOG_TARGET); \ - } \ - if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \ - { \ - fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \ - fflush(LOG_TEE_TARGET); \ - } \ - } while (0) -#endif +#define LOG(...) LOG_TMPL(GGML_LOG_LEVEL_NONE, 0, __VA_ARGS__) +#define LOGV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_NONE, verbosity, __VA_ARGS__) -// The '\0' as a last argument, is a trick to bypass the silly -// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro" -// so we can have a single macro which can be called just like printf. +#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, 0, __VA_ARGS__) +#define LOG_WRN(...) 
LOG_TMPL(GGML_LOG_LEVEL_WARN, 0, __VA_ARGS__) +#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, 0, __VA_ARGS__) +#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, LOG_DEFAULT_DEBUG, __VA_ARGS__) -// Main LOG macro. -// behaves like printf, and supports arguments the exact same way. -// -#if !defined(_MSC_VER) || defined(__clang__) - #define LOG(...) LOG_IMPL(__VA_ARGS__, "") -#else - #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "") -#endif - -// Main TEE macro. -// does the same as LOG -// and -// simultaneously writes stderr. -// -// Secondary target can be changed just like LOG_TARGET -// by defining LOG_TEE_TARGET -// -#if !defined(_MSC_VER) || defined(__clang__) - #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "") -#else - #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "") -#endif - -// LOG macro variants with auto endline. -#if !defined(_MSC_VER) || defined(__clang__) - #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") - #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") -#else - #define LOGLN(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "\n") - #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "\n") -#endif - -// INTERNAL, DO NOT USE -inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr) -{ - static bool _initialized = false; - static bool _append = false; - static bool _disabled = filename.empty() && target == nullptr; - static std::string log_current_filename{filename}; - static FILE *log_current_target{target}; - static FILE *logfile = nullptr; - - if (change) - { - if (append != LogTriStateSame) - { - _append = append == LogTriStateTrue; - return logfile; - } - - if (disable == LogTriStateTrue) - { - // Disable primary target - _disabled = true; - } - // If previously disabled, only enable, and keep previous target - else if (disable == LogTriStateFalse) - { - _disabled = false; - } - // Otherwise, process the arguments - else if (log_current_filename != filename || log_current_target != target) - { - _initialized = false; - } - } - - if (_disabled) - { - // Log is disabled - return nullptr; - } - - if (_initialized) - { - // with fallback in case something went wrong - return logfile ? logfile : stderr; - } - - // do the (re)initialization - if (target != nullptr) - { - if (logfile != nullptr && logfile != stdout && logfile != stderr) - { - fclose(logfile); - } - - log_current_filename = LOG_DEFAULT_FILE_NAME; - log_current_target = target; - - logfile = target; - } - else - { - if (log_current_filename != filename) - { - if (logfile != nullptr && logfile != stdout && logfile != stderr) - { - fclose(logfile); - } - } - - logfile = fopen(filename.c_str(), _append ? "a" : "w"); - } - - if (!logfile) - { - // Verify whether the file was opened, otherwise fallback to stderr - logfile = stderr; - - fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno)); - fflush(stderr); - - // At this point we let the init flag be to true below, and let the target fallback to stderr - // otherwise we would repeatedly fopen() which was already unsuccessful - } - - _initialized = true; - - return logfile ? 
logfile : stderr; -} - -// INTERNAL, DO NOT USE -inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME) -{ - return log_handler1_impl(change, append, disable, filename, target); -} - -// Disables logs entirely at runtime. -// Makes LOG() and LOG_TEE() produce no output, -// until enabled back. -#define log_disable() log_disable_impl() - -// INTERNAL, DO NOT USE -inline FILE *log_disable_impl() -{ - return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue); -} - -// Enables logs at runtime. -#define log_enable() log_enable_impl() - -// INTERNAL, DO NOT USE -inline FILE *log_enable_impl() -{ - return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse); -} - -// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*) -#define log_set_target(target) log_set_target_impl(target) - -// INTERNAL, DO NOT USE -inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); } -inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); } - -// INTERNAL, DO NOT USE -inline FILE *log_handler() { return log_handler1_impl(); } - -// Enable or disable creating separate log files for each run. -// can ONLY be invoked BEFORE first log use. -#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "") -// Enable or disable append mode for log file. -// can ONLY be invoked BEFORE first log use. -#define log_append(enable) log_append_impl(enable) -// INTERNAL, DO NOT USE -inline FILE *log_append_impl(bool enable) -{ - return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame); -} - -inline void log_test() -{ - log_disable(); - LOG("01 Hello World to nobody, because logs are disabled!\n"); - log_enable(); - LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! 
)!\n", LOG_STRINGIZE(LOG_TARGET)); - LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n"); - log_set_target(stderr); - LOG("04 Hello World to stderr!\n"); - LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n"); - log_set_target(LOG_DEFAULT_FILE_NAME); - LOG("06 Hello World to default log file!\n"); - log_set_target(stdout); - LOG("07 Hello World to stdout!\n"); - log_set_target(LOG_DEFAULT_FILE_NAME); - LOG("08 Hello World to default log file again!\n"); - log_disable(); - LOG("09 Hello World _1_ into the void!\n"); - log_enable(); - LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n"); - log_disable(); - log_set_target("llama.anotherlog.log"); - LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n"); - log_enable(); - LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n"); - log_set_target("llama.yetanotherlog.log"); - LOG("13 Hello World this time in yet new file?\n"); - log_set_target(log_filename_generator("llama_autonamed", "log")); - LOG("14 Hello World in log with generated filename!\n"); -#ifdef _MSC_VER - LOG_TEE("15 Hello msvc TEE without arguments\n"); - LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test"); - LOG_TEELN("17 Hello msvc TEELN without arguments\n"); - LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test"); - LOG("19 Hello msvc LOG without arguments\n"); - LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test"); - LOGLN("21 Hello msvc LOGLN without arguments\n"); - LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test"); -#endif -} - -inline bool log_param_single_parse(const std::string & param) -{ - if ( param == "--log-test") - { - log_test(); - return true; - } - - if ( param == "--log-disable") - { - log_disable(); - return true; - } - - if ( param == "--log-enable") - { - log_enable(); - return true; - } - - if (param == "--log-new") - { - log_multilog(true); - return true; - } - - if (param == "--log-append") - { - log_append(true); - return true; - } - - return false; -} - -inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string()) -{ - if ( param == "--log-file") - { - if (!check_but_dont_parse) - { - log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log")); - } - - return true; - } - - return false; -} - -inline void log_print_usage() -{ - printf("log options:\n"); - /* format - printf(" -h, --help show this help message and exit\n");*/ - /* spacing - printf("__-param----------------Description\n");*/ - printf(" --log-test Run simple logging test\n"); - printf(" --log-disable Disable trace logs\n"); - printf(" --log-enable Enable trace logs\n"); - printf(" --log-file Specify a log filename (without extension)\n"); - printf(" --log-new Create a separate new log file on start. 
" - "Each log file will have unique name: \"..log\"\n"); - printf(" --log-append Don't truncate the old log file.\n"); - printf("\n"); -} - -#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) - -// INTERNAL, DO NOT USE -inline void log_dump_cmdline_impl(int argc, char **argv) -{ - std::stringstream buf; - for (int i = 0; i < argc; ++i) - { - if (std::string(argv[i]).find(' ') != std::string::npos) - { - buf << " \"" << argv[i] <<"\""; - } - else - { - buf << " " << argv[i]; - } - } - LOGLN("Cmd:%s", buf.str().c_str()); -} - -#define log_tostr(var) log_var_to_string_impl(var).c_str() - -inline std::string log_var_to_string_impl(bool var) -{ - return var ? "true" : "false"; -} - -inline std::string log_var_to_string_impl(std::string var) -{ - return var; -} - -inline std::string log_var_to_string_impl(const std::vector & var) -{ - std::stringstream buf; - buf << "[ "; - bool first = true; - for (auto e : var) - { - if (first) - { - first = false; - } - else - { - buf << ", "; - } - buf << std::to_string(e); - } - buf << " ]"; - - return buf.str(); -} - -template -inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens) -{ - std::stringstream buf; - buf << "[ "; - - bool first = true; - for (const auto & token : tokens) - { - if (!first) { - buf << ", "; - } else { - first = false; - } - - auto detokenized = llama_token_to_piece(ctx, token); - - detokenized.erase( - std::remove_if( - detokenized.begin(), - detokenized.end(), - [](const unsigned char c) { return !std::isprint(c); }), - detokenized.end()); - - buf - << "'" << detokenized << "'" - << ":" << std::to_string(token); - } - buf << " ]"; - - return buf.str(); -} - -template -inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch) -{ - std::stringstream buf; - buf << "[ "; - - bool first = true; - for (int i = 0; i < batch.n_tokens; ++i) - { - if (!first) { - buf << ", "; - } else { - first = false; - } - - auto detokenized = llama_token_to_piece(ctx, batch.token[i]); - - detokenized.erase( - std::remove_if( - detokenized.begin(), - detokenized.end(), - [](const unsigned char c) { return !std::isprint(c); }), - detokenized.end()); - - buf - << "\n" << std::to_string(i) - << ":token '" << detokenized << "'" - << ":pos " << std::to_string(batch.pos[i]) - << ":n_seq_id " << std::to_string(batch.n_seq_id[i]) - << ":seq_id " << std::to_string(batch.seq_id[i][0]) - << ":logits " << std::to_string(batch.logits[i]); - } - buf << " ]"; - - return buf.str(); -} - -#ifdef LOG_DISABLE_LOGS - -#undef LOG -#define LOG(...) // dummy stub -#undef LOGLN -#define LOGLN(...) // dummy stub - -#undef LOG_TEE -#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf - -#undef LOG_TEELN -#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf - -#undef LOG_DISABLE -#define LOG_DISABLE() // dummy stub - -#undef LOG_ENABLE -#define LOG_ENABLE() // dummy stub - -#undef LOG_ENABLE -#define LOG_ENABLE() // dummy stub - -#undef LOG_SET_TARGET -#define LOG_SET_TARGET(...) // dummy stub - -#undef LOG_DUMP_CMDLINE -#define LOG_DUMP_CMDLINE(...) // dummy stub - -#endif // LOG_DISABLE_LOGS +#define LOG_INFV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_INFO, verbosity, __VA_ARGS__) +#define LOG_WRNV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_WARN, verbosity, __VA_ARGS__) +#define LOG_ERRV(verbosity, ...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, verbosity, __VA_ARGS__) +#define LOG_DBGV(verbosity, ...) 
LOG_TMPL(GGML_LOG_LEVEL_DEBUG, verbosity, __VA_ARGS__) diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp index 3ca112ef1..7953c723e 100644 --- a/common/ngram-cache.cpp +++ b/common/ngram-cache.cpp @@ -2,8 +2,11 @@ #include "common.h" #include "log.h" +#include #include +#include #include +#include void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp, int nnew, bool print_progress) { diff --git a/common/sampling.cpp b/common/sampling.cpp index c07b5e940..e51d07611 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -325,7 +325,7 @@ llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) { } std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) { - std::string result = "\tlogits "; + std::string result = "logits "; for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) { const auto * smpl = llama_sampler_chain_get(gsmpl->chain, i); diff --git a/common/train.cpp b/common/train.cpp index fef1e57c9..661ad8382 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -1,9 +1,11 @@ #include "train.h" #include "common.h" +#include #include #include #include +#include struct random_normal_distribution { std::mt19937 gen; diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index ec00fcf78..4a15941f1 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -1,5 +1,6 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -8,9 +9,9 @@ #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -20,6 +21,8 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + int is_pp_shared = params.is_pp_shared; std::vector n_pp = params.n_pp; @@ -76,7 +79,7 @@ int main(int argc, char ** argv) { const int ret = llama_decode(ctx, batch_view); if (ret != 0) { - LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); + LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); return false; } @@ -93,17 +96,17 @@ int main(int argc, char ** argv) { } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } } if (!params.batched_bench_output_jsonl) { - LOG_TEE("\n"); - LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); - LOG_TEE("\n"); - LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); - LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); + LOG("\n"); + LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads 
= %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG("\n"); + LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); + LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); } for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) { @@ -133,7 +136,7 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -155,7 +158,7 @@ int main(int argc, char ** argv) { } if (!decode_helper(ctx, batch, ctx_params.n_batch)) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } } @@ -173,20 +176,20 @@ int main(int argc, char ** argv) { const float speed = n_kv / t; if(params.batched_bench_output_jsonl) { - LOG_TEE( + LOG( "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, " "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n", n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch, pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed ); } else { - LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); + LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); } } } } - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); llama_batch_free(batch); @@ -196,7 +199,7 @@ int main(int argc, char ** argv) { llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index f1df20c6e..7887a43d6 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -1,5 +1,6 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -8,9 +9,9 @@ #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -23,6 +24,7 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); // number of parallel batches int n_parallel = params.n_parallel; @@ -42,7 +44,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { - fprintf(stderr , "%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: error: unable to load model\n" , __func__); return 1; } @@ -72,31 +74,29 @@ int main(int argc, char ** argv) { llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed)); if (ctx == NULL) { - fprintf(stderr , "%s: error: failed 
to create the llama_context\n" , __func__); + LOG_ERR("%s: error: failed to create the llama_context\n" , __func__); return 1; } const int n_ctx = llama_n_ctx(ctx); - LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); + LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { - LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); - LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__); + LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req); + LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__); return 1; } // print the prompt token-by-token - fprintf(stderr, "\n"); + LOG("\n"); for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } - fflush(stderr); - // create a llama_batch // we use this object to submit token data for decoding llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel); @@ -114,7 +114,7 @@ int main(int argc, char ** argv) { if (llama_model_has_encoder(model)) { if (llama_encode(ctx, batch)) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -131,7 +131,7 @@ int main(int argc, char ** argv) { batch.logits[batch.n_tokens - 1] = true; if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -142,7 +142,7 @@ int main(int argc, char ** argv) { //} if (n_parallel > 1) { - LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); + LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); } // main loop @@ -175,9 +175,9 @@ int main(int argc, char ** argv) { // is it an end of generation? 
-> mark the stream as finished if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { i_batch[i] = -1; - LOG_TEE("\n"); + LOG("\n"); if (n_parallel > 1) { - LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); + LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur); } continue; @@ -185,8 +185,7 @@ int main(int argc, char ** argv) { // if there is only one stream, we print immediately to stdout if (n_parallel == 1) { - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); - fflush(stdout); + LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); } streams[i] += llama_token_to_piece(ctx, new_token_id); @@ -208,27 +207,25 @@ int main(int argc, char ** argv) { // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); return 1; } } - LOG_TEE("\n"); - if (n_parallel > 1) { - LOG_TEE("\n"); + LOG("\n"); for (int32_t i = 0; i < n_parallel; ++i) { - LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); + LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str()); } } const auto t_main_end = ggml_time_us(); - LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG_TEE("\n"); + LOG("\n"); llama_perf_sampler_print(smpl); llama_perf_context_print(ctx); diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 8ca9f8915..ecff95f9a 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -105,43 +106,43 @@ static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_ const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 
1 : p->n_heads / p->n_kv_heads; try { w->token_embedding_table.resize(p->vocab_size * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); w->rms_att_weight.resize(p->n_layers * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); w->rms_ffn_weight.resize(p->n_layers * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); w->wq.resize(p->n_layers * p->dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); w->wo.resize(p->n_layers * p->dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); w->w1.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); w->w2.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); w->w3.resize(p->n_layers * p->hidden_dim * p->dim); - LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for 
w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); w->rms_final_weight.resize(p->dim); - LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); + LOG_INF("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); if (shared_weights) { w->wcls = {}; } else { w->wcls.resize(p->vocab_size * p->dim); - LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + LOG_INF("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); } } catch (std::length_error &) { @@ -173,7 +174,7 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL fseek(f, 0, SEEK_END); auto end = ftell(f); if (curr != end) { - LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); + LOG_ERR("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); return 1; } @@ -181,20 +182,20 @@ static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FIL } static void print_sample_weights(TransformerWeights *w){ - LOG("----- Quick print of first of the weight vales of all the variables\n"); - LOG("%f\n", w->token_embedding_table[0]); - LOG("%f\n", w->rms_att_weight[0]); - LOG("%f\n", w->rms_ffn_weight[0]); + LOG_INF("----- Quick print of first of the weight vales of all the variables\n"); + LOG_INF("%f\n", w->token_embedding_table[0]); + LOG_INF("%f\n", w->rms_att_weight[0]); + LOG_INF("%f\n", w->rms_ffn_weight[0]); - LOG("%f\n", w->wq[0]); - LOG("%f\n", w->wk[0]); - LOG("%f\n", w->wv[0]); - LOG("%f\n", w->wo[0]); - LOG("%f\n", w->w1[0]); - LOG("%f\n", w->w2[0]); - LOG("%f\n", w->w3[0]); - LOG("%f\n", w->rms_att_weight[0]); - if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]); + LOG_INF("%f\n", w->wq[0]); + LOG_INF("%f\n", w->wk[0]); + LOG_INF("%f\n", w->wv[0]); + LOG_INF("%f\n", w->wo[0]); + LOG_INF("%f\n", w->w1[0]); + LOG_INF("%f\n", w->w2[0]); + LOG_INF("%f\n", w->w3[0]); + LOG_INF("%f\n", w->rms_att_weight[0]); + if (!w->wcls.empty()) LOG_INF("%f\n", w->wcls[0]); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -318,20 +319,20 @@ struct train_params { }; static void print_params(struct my_llama_hparams * params) { - LOG("%s: n_vocab: %u\n", __func__, params->n_vocab); - LOG("%s: n_ctx: %u\n", __func__, params->n_ctx); - LOG("%s: n_embd: %u\n", __func__, params->n_embd); - LOG("%s: n_mult: %u\n", __func__, params->n_mult); - LOG("%s: n_head: %u\n", __func__, params->n_head); - LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv); - LOG("%s: n_ff: %u\n", __func__, params->n_ff); - LOG("%s: n_layer: %u\n", __func__, params->n_layer); - LOG("%s: n_rot: %u\n", __func__, params->n_rot); + LOG_INF("%s: n_vocab: %u\n", __func__, params->n_vocab); + LOG_INF("%s: n_ctx: %u\n", __func__, params->n_ctx); + LOG_INF("%s: n_embd: %u\n", __func__, params->n_embd); + LOG_INF("%s: n_mult: %u\n", __func__, params->n_mult); + LOG_INF("%s: n_head: %u\n", __func__, params->n_head); + LOG_INF("%s: n_head_kv: %u\n", __func__, params->n_head_kv); + LOG_INF("%s: n_ff: %u\n", __func__, params->n_ff); + LOG_INF("%s: n_layer: %u\n", __func__, params->n_layer); + LOG_INF("%s: n_rot: %u\n", __func__, params->n_rot); } static void print_tensor_info(const struct ggml_context * ctx) { for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = 
ggml_get_next_tensor(ctx, t)) { - LOG("%s: Allocating ", __func__); + LOG_INF("%s: Allocating ", __func__); int64_t total = 1; int i = 0; for (; i < ggml_n_dims(t); ++i) { @@ -526,7 +527,7 @@ static std::string llama_escape_whitespaces(const std::string & text) { static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) { if (is_ggml_file(filename)) { - LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename); + LOG_INF("%s: Loading vocabulary from gguf file %s\n", __func__, filename); struct ggml_context * ctx_data = NULL; struct gguf_init_params params = { @@ -574,7 +575,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam gguf_free(ctx); } else { // assume llama2.c vocabulary - LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); + LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); llama_file file(filename, "rb"); if (!file.fp) { die_fmt("%s: %s", strerror(errno), filename); @@ -871,23 +872,25 @@ static std::string basename(const std::string &path) { } int main(int argc, char ** argv) { + gpt_init(); + struct train_params params = get_default_train_params(); if (!params_parse(argc, argv, ¶ms)) { return 1; } - log_set_target(stdout); + Config config; TransformerWeights weights = {}; { - LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); + LOG_INF("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); FILE * file = fopen(params.fn_llama2c_model, "rb"); if (!file) { - LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); + LOG_ERR("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); return 1; } // read in the config header if (fread(&config, sizeof(Config), 1, file) != 1) { - LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); + LOG_ERR("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); return 1; } auto shared_weights = config.vocab_size > 0; @@ -896,7 +899,7 @@ int main(int argc, char ** argv) { // read in the Transformer weights alloc_weights(&weights, &config, shared_weights); if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { - LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); + LOG_ERR("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); return 1; } fclose(file); @@ -929,7 +932,7 @@ int main(int argc, char ** argv) { model.name = basename(params.fn_llama2c_model); save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); - LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); + LOG_INF("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); ggml_free(model.ctx); return 0; diff --git a/examples/cvector-generator/cvector-generator.cpp b/examples/cvector-generator/cvector-generator.cpp index 569b6c38f..41bf4eb2a 100644 --- a/examples/cvector-generator/cvector-generator.cpp +++ b/examples/cvector-generator/cvector-generator.cpp @@ -13,14 +13,15 @@ #include "ggml-metal.h" #endif +#include +#include #include +#include +#include +#include #include #include #include -#include -#include -#include -#include ////////////////////////////////////////////////// diff --git 
a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index e94ae2955..a438dcb5a 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,5 +1,6 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -39,16 +40,16 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu llama_kv_cache_clear(ctx); // run model - fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) { // encoder-only model if (llama_encode(ctx, batch) < 0) { - fprintf(stderr, "%s : failed to encode\n", __func__); + LOG_ERR("%s : failed to encode\n", __func__); } } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) { // decoder-only model if (llama_decode(ctx, batch) < 0) { - fprintf(stderr, "%s : failed to decode\n", __func__); + LOG_ERR("%s : failed to decode\n", __func__); } } @@ -84,12 +85,12 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + params.embedding = true; // For non-causal models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; - print_build_info(); - llama_backend_init(); llama_numa_init(params.numa); @@ -99,7 +100,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + LOG_ERR("%s: unable to load model\n", __func__); return 1; } @@ -109,19 +110,19 @@ int main(int argc, char ** argv) { const enum llama_pooling_type pooling_type = llama_pooling_type(ctx); if (llama_model_has_encoder(model) && llama_model_has_decoder(model)) { - fprintf(stderr, "%s: error: computing embeddings in encoder-decoder models is not supported\n", __func__); + LOG_ERR("%s: computing embeddings in encoder-decoder models is not supported\n", __func__); return 1; } if (n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + LOG_WRN("%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } // split the prompt into lines @@ -136,7 +137,7 @@ int main(int argc, char ** argv) { for (const auto & prompt : prompts) { auto inp = ::llama_tokenize(ctx, prompt, true, false); if (inp.size() > n_batch) { - fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", + LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", __func__, (long long int) inp.size(), (long long int) n_batch); return 1; } @@ -147,20 +148,20 @@ int main(int argc, char ** argv) { // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true' for (auto & inp : inputs) { if (inp.empty() || inp.back() != llama_token_sep(model)) { - fprintf(stderr, "%s: warning: last token in the prompt is not SEP\n", __func__); - fprintf(stderr, "%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); + LOG_WRN("%s: last token in the prompt is not SEP\n", __func__); + 
LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__); } } // tokenization stats if (params.verbose_prompt) { for (int i = 0; i < (int) inputs.size(); i++) { - fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); - fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); + LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size()); for (int j = 0; j < (int) inputs[i].size(); j++) { - fprintf(stderr, "%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); + LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str()); } - fprintf(stderr, "\n\n"); + LOG("\n\n"); } } @@ -211,57 +212,57 @@ int main(int argc, char ** argv) { batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize); if (params.embd_out.empty()) { - fprintf(stdout, "\n"); + LOG("\n"); if (pooling_type == LLAMA_POOLING_TYPE_NONE) { for (int j = 0; j < n_embd_count; j++) { - fprintf(stdout, "embedding %d: ", j); + LOG("embedding %d: ", j); for (int i = 0; i < std::min(3, n_embd); i++) { if (params.embd_normalize == 0) { - fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + LOG("%6.0f ", emb[j * n_embd + i]); } else { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + LOG("%9.6f ", emb[j * n_embd + i]); } } - fprintf(stdout, " ... "); + LOG(" ... "); for (int i = n_embd - 3; i < n_embd; i++) { if (params.embd_normalize == 0) { - fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + LOG("%6.0f ", emb[j * n_embd + i]); } else { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + LOG("%9.6f ", emb[j * n_embd + i]); } } - fprintf(stdout, "\n"); + LOG("\n"); } } else { // print the first part of the embeddings or for a single prompt, the full embedding for (int j = 0; j < n_prompts; j++) { - fprintf(stdout, "embedding %d: ", j); + LOG("embedding %d: ", j); for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { if (params.embd_normalize == 0) { - fprintf(stdout, "%6.0f ", emb[j * n_embd + i]); + LOG("%6.0f ", emb[j * n_embd + i]); } else { - fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); + LOG("%9.6f ", emb[j * n_embd + i]); } } - fprintf(stdout, "\n"); + LOG("\n"); } // print cosine similarity matrix if (n_prompts > 1) { - fprintf(stdout, "\n"); - printf("cosine similarity matrix:\n\n"); + LOG("\n"); + LOG("cosine similarity matrix:\n\n"); for (int i = 0; i < n_prompts; i++) { - fprintf(stdout, "%6.6s ", prompts[i].c_str()); + LOG("%6.6s ", prompts[i].c_str()); } - fprintf(stdout, "\n"); + LOG("\n"); for (int i = 0; i < n_prompts; i++) { for (int j = 0; j < n_prompts; j++) { float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - fprintf(stdout, "%6.2f ", sim); + LOG("%6.2f ", sim); } - fprintf(stdout, "%1.10s", prompts[i].c_str()); - fprintf(stdout, "\n"); + LOG("%1.10s", prompts[i].c_str()); + LOG("\n"); } } } @@ -270,42 +271,42 @@ int main(int argc, char ** argv) { if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") { const bool notArray = params.embd_out != "array"; - fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "["); + LOG(notArray ? 
"{\n \"object\": \"list\",\n \"data\": [\n" : "["); for (int j = 0;;) { // at least one iteration (one prompt) - if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j); - fprintf(stdout, "["); + if (notArray) LOG(" {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j); + LOG("["); for (int i = 0;;) { // at least one iteration (n_embd > 0) - fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); + LOG(params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]); i++; - if (i < n_embd) fprintf(stdout, ","); else break; + if (i < n_embd) LOG(","); else break; } - fprintf(stdout, notArray ? "]\n }" : "]"); + LOG(notArray ? "]\n }" : "]"); j++; - if (j < n_embd_count) fprintf(stdout, notArray ? ",\n" : ","); else break; + if (j < n_embd_count) LOG(notArray ? ",\n" : ","); else break; } - fprintf(stdout, notArray ? "\n ]" : "]\n"); + LOG(notArray ? "\n ]" : "]\n"); if (params.embd_out == "json+" && n_prompts > 1) { - fprintf(stdout, ",\n \"cosineSimilarity\": [\n"); + LOG(",\n \"cosineSimilarity\": [\n"); for (int i = 0;;) { // at least two iteration (n_embd_count > 1) - fprintf(stdout, " ["); + LOG(" ["); for (int j = 0;;) { // at least two iteration (n_embd_count > 1) float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - fprintf(stdout, "%6.2f", sim); + LOG("%6.2f", sim); j++; - if (j < n_embd_count) fprintf(stdout, ", "); else break; + if (j < n_embd_count) LOG(", "); else break; } - fprintf(stdout, " ]"); + LOG(" ]"); i++; - if (i < n_embd_count) fprintf(stdout, ",\n"); else break; + if (i < n_embd_count) LOG(",\n"); else break; } - fprintf(stdout, "\n ]"); + LOG("\n ]"); } - if (notArray) fprintf(stdout, "\n}\n"); + if (notArray) LOG("\n}\n"); } - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); // clean up diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index af389abe1..6d629fe4e 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -1,12 +1,11 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include "ggml.h" #include -#include #include -#include #include /** @@ -32,22 +31,22 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne GGML_ASSERT(n > 0); float sum = 0; for (int64_t i3 = 0; i3 < ne[3]; i3++) { - printf(" [\n"); + LOG(" [\n"); for (int64_t i2 = 0; i2 < ne[2]; i2++) { if (i2 == n && ne[2] > 2*n) { - printf(" ..., \n"); + LOG(" ..., \n"); i2 = ne[2] - n; } - printf(" [\n"); + LOG(" [\n"); for (int64_t i1 = 0; i1 < ne[1]; i1++) { if (i1 == n && ne[1] > 2*n) { - printf(" ..., \n"); + LOG(" ..., \n"); i1 = ne[1] - n; } - printf(" ["); + LOG(" ["); for (int64_t i0 = 0; i0 < ne[0]; i0++) { if (i0 == n && ne[0] > 2*n) { - printf("..., "); + LOG("..., "); i0 = ne[0] - n; } size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; @@ -65,16 +64,16 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne } else { GGML_ABORT("fatal error"); } - printf("%12.4f", v); + LOG("%12.4f", v); sum += v; - if (i0 < ne[0] - 1) printf(", "); + if (i0 < ne[0] - 1) LOG(", "); } - printf("],\n"); + LOG("],\n"); } - printf(" ],\n"); + LOG(" ],\n"); } - printf(" ]\n"); - printf(" sum = %f\n", sum); + LOG(" ]\n"); + LOG(" sum = %f\n", sum); } } @@ -103,11 +102,11 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { snprintf(src1_str, sizeof(src1_str), 
"%s{%s}", src1->name, ggml_ne_string(src1).c_str()); } - printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, - t->name, ggml_type_name(t->type), ggml_op_desc(t), - src0->name, ggml_ne_string(src0).c_str(), - src1 ? src1_str : "", - ggml_ne_string(t).c_str()); + LOG("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, + t->name, ggml_type_name(t->type), ggml_op_desc(t), + src0->name, ggml_ne_string(src0).c_str(), + src1 ? src1_str : "", + ggml_ne_string(t).c_str()); // copy the data from the GPU memory if needed @@ -133,7 +132,7 @@ static bool run(llama_context * ctx, const gpt_params & params) { std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return false; } @@ -149,7 +148,7 @@ int main(int argc, char ** argv) { return 1; } - print_build_info(); + gpt_init(); llama_backend_init(); llama_numa_init(params.numa); @@ -166,14 +165,15 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); + LOG_ERR("%s : failed to init\n", __func__); return 1; } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); } bool OK = run(ctx, params); @@ -181,7 +181,7 @@ int main(int argc, char ** argv) { return 1; } - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); llama_free(ctx); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 90126ad1e..0051a5eb6 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -406,7 +406,7 @@ int main(int argc, char ** argv) { return 1; } - g_verbose = (params.verbosity == 1); + g_verbose = (params.verbosity > 1); try { lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads); ctx.run_merge(); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 14c715202..20b99a4fd 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -158,6 +158,8 @@ int main(int argc, char * argv[]) { return 1; } + gpt_init(); + llama_model_params mparams = llama_model_params_from_gpt_params(params); llama_context_params cparams = llama_context_params_from_gpt_params(params); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 73b54da7f..265281699 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -1,5 +1,6 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include @@ -19,12 +20,12 @@ #endif static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s \\\n" - " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n" + LOG("\nexample usage:\n"); + LOG("\n %s \\\n" + " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] \\\n" " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n" " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]); - LOG_TEE("\n"); + LOG("\n"); } struct Stats { @@ -125,12 +126,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * 
e.counts.resize(src1->ne[0]*n_as, 0); } else if (e.values.size() != (size_t)src1->ne[0]*n_as) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); + LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); exit(1); //GGML_ABORT("fatal error"); } - if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); - } + LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); // loop over all possible experts, regardless if they are used or not in the batch for (int ex = 0; ex < n_as; ++ex) { size_t e_start = ex*src1->ne[0]; @@ -151,7 +150,8 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * e.values[e_start + j] += x[j]*x[j]; e.counts[e_start + j]++; if (!std::isfinite(e.values[e_start + j])) { - fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str()); + LOG("\n"); + LOG_ERR("%f detected in %s\n", e.values[e_start + j], wname.c_str()); exit(1); } } @@ -174,20 +174,18 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * e.counts.resize(src1->ne[0], 0); } else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); + LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); exit(1); //GGML_ABORT("fatal error"); } ++e.ncall; - if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); - } + LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); for (int row = 0; row < (int)src1->ne[1]; ++row) { const float * x = data + row * src1->ne[0]; for (int j = 0; j < (int)src1->ne[0]; ++j) { e.values[j] += x[j]*x[j]; e.counts[j]++; if (!std::isfinite(e.values[j])) { - fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str()); + LOG_ERR("%f detected in %s\n", e.values[j], wname.c_str()); exit(1); } } @@ -239,17 +237,17 @@ void IMatrixCollector::save_imatrix(int ncall) const { } if (n_zeros != 0 && is_first) { - fprintf(stderr, "\n"); + LOG_INF("\n"); is_first = false; } if (n_zeros == n_all) { - fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str()); + LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str()); continue; } if (n_zeros > 0) { - fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); + LOG_WRN("%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); continue; } @@ -258,7 +256,7 @@ void IMatrixCollector::save_imatrix(int ncall) const { } if (to_store.size() < m_stats.size()) { - fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); + LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); } std::ofstream out(fname, std::ios::binary); 
@@ -290,21 +288,20 @@ void IMatrixCollector::save_imatrix(int ncall) const { out.write(m_params.prompt_file.c_str(), len); } - if (m_params.verbosity > 0) { - fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); - } + LOGV(1, "\n"); + LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); } bool IMatrixCollector::load_imatrix(const char * fname) { std::ifstream in(fname, std::ios::binary); if (!in) { - printf("%s: failed to open %s\n",__func__, fname); + LOG_ERR("%s: failed to open %s\n",__func__, fname); return false; } int n_entries; in.read((char*)&n_entries, sizeof(n_entries)); if (in.fail() || n_entries < 1) { - printf("%s: no data in file %s\n", __func__, fname); + LOG_ERR("%s: no data in file %s\n", __func__, fname); return false; } for (int i = 0; i < n_entries; ++i) { @@ -312,7 +309,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { std::vector name_as_vec(len+1); in.read((char *)name_as_vec.data(), len); if (in.fail()) { - printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); + LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); return false; } name_as_vec[len] = 0; @@ -323,7 +320,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { int nval; in.read((char *)&nval, sizeof(nval)); if (in.fail() || nval < 1) { - printf("%s: failed reading number of values for entry %d\n",__func__,i); + LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i); m_stats = {}; return false; } @@ -336,7 +333,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { std::vector tmp(nval); in.read((char*)tmp.data(), nval*sizeof(float)); if (in.fail()) { - printf("%s: failed reading data for entry %d\n",__func__,i); + LOG_ERR("%s: failed reading data for entry %d\n",__func__,i); m_stats = {}; return false; } @@ -437,26 +434,25 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { const int n_ctx = llama_n_ctx(ctx); auto tim1 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + LOG_INF("%s: tokenizing the input ..\n", __func__); std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); if (params.i_chunk > 0) { if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) { - fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); + LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); return false; } - fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); + LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx); } if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx, - n_ctx); - fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx); + LOG_ERR("%s: the 
data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size()); return false; } @@ -478,7 +474,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { double nll = 0.0; double nll2 = 0.0; - fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); + LOG_INF("%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); std::vector workers(std::thread::hardware_concurrency() - 1); @@ -514,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { // TODO: use batch.logits to save computations instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return false; } @@ -531,29 +527,29 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { if (i == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); + LOG("%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + LOG("%.2f minutes\n", total_seconds / 60.0); } if (params.compute_ppl) { const int first = n_ctx/2; - const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); + const auto * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); count += n_ctx - first - 1; - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + LOG("[%d]%.4lf,", i + 1, std::exp(nll / count)); fflush(stdout); logits.clear(); } } - printf("\n"); + LOG("\n"); if (params.compute_ppl) { nll2 /= count; @@ -562,9 +558,9 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { nll2 -= nll * nll; if (nll2 > 0) { nll2 = sqrt(nll2/(count-1)); - printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); } else { - printf("Unexpected negative standard deviation of log(prob)\n"); + LOG("Unexpected negative standard deviation of log(prob)\n"); } } @@ -576,26 +572,27 @@ int main(int argc, char ** argv) { params.n_ctx = 512; params.logits_all = true; - params.verbosity = 1; if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) { return 1; } + gpt_init(); + params.n_batch = std::min(params.n_batch, params.n_ctx); g_collector.set_params(params); for (const auto & in_file : params.in_files) { - printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); + LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); if (!g_collector.load_imatrix(in_file.c_str())) { - fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str()); + LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str()); return 1; } } if (params.in_files.size() > 1) { - printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); + LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, 
params.out_file.c_str()); g_collector.save_imatrix(); } @@ -614,20 +611,20 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; if (model == nullptr || ctx == nullptr) { - fprintf(stderr, "%s : failed to init\n", __func__); + LOG_ERR("%s : failed to init\n", __func__); return 1; } const int n_ctx_train = llama_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, params.n_ctx); } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } if (!compute_imatrix(ctx, params)) { @@ -636,7 +633,7 @@ int main(int argc, char ** argv) { g_collector.save_imatrix(); - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); llama_free(ctx); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 7e252ce09..b77b876cc 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -2,6 +2,7 @@ #include "common.h" #include "console.h" #include "sampling.h" +#include "log.h" #include "llama.h" #include @@ -55,7 +56,7 @@ static void write_logfile( const bool success = fs_create_directory_with_parents(params.logdir); if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); return; } @@ -64,7 +65,7 @@ static void write_logfile( FILE * logfile = fopen(logfile_path.c_str(), "w"); if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); return; } @@ -93,7 +94,7 @@ static void sigint_handler(int signo) { is_interacting = true; } else { console::cleanup(); - printf("\n"); + LOG("\n"); gpt_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); _exit(130); @@ -110,56 +111,51 @@ int main(int argc, char ** argv) { return 1; } - auto & sparams = params.sparams; + gpt_init(); -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("infill", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS + auto & sparams = params.sparams; console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); if (params.logits_all) { - printf("\n************\n"); - printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("\n************\n"); + LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.embedding) { - printf("\n************\n"); - printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("\n************\n"); + LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + LOG_WRN("%s: minimum context size is 8, using minimum size.\n", __func__); 
params.n_ctx = 8; } + if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) { - printf("\n************\n"); - printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); - printf("************\n\n"); + LOG_ERR("\n************\n"); + LOG_ERR("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.rope_freq_base != 0.0) { - LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + LOG_WRN("%s: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); } if (params.rope_freq_scale != 0.0) { - LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + LOG_WRN("%s: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - print_build_info(); - - LOG("%s: llama backend init\n", __func__); + LOG_INF("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); @@ -172,34 +168,32 @@ int main(int argc, char ** argv) { g_smpl = &smpl; // load the model and apply lora adapter, if any - LOG("%s: load the model and apply lora adapter, if any\n", __func__); + LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); llama_init_result llama_init = llama_init_from_gpt_params(params); model = llama_init.model; ctx = llama_init.context; if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n", __func__); + LOG_ERR("%s: unable to load model\n", __func__); return 1; } const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); + LOG_DBG("n_ctx: %d\n", n_ctx); if (n_ctx > n_ctx_train) { - LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print system information { - LOG_TEE("\n"); - LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } const bool add_bos = llama_add_bos_token(model); GGML_ASSERT(!llama_add_eos_token(model)); - LOG("add_bos: %d\n", add_bos); std::vector embd_inp; std::vector embd_end; @@ -224,18 +218,19 @@ int main(int argc, char ** argv) { embd_inp.push_back(middle_token); } - LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix)); - LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_DBG("add_bos: %d\n", add_bos); + LOG_DBG("prefix: \"%s\"\n", params.input_prefix.c_str()); + LOG_DBG("suffix: \"%s\"\n", params.input_suffix.c_str()); + LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); // Should not run without any tokens if (embd_inp.empty()) { embd_inp.push_back(llama_token_bos(model)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); } if ((int) embd_inp.size() > n_ctx - 4) { - LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } @@ -244,9 +239,8 @@ int main(int argc, char ** argv) { params.n_keep = 
(int)embd_inp.size(); } - LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); - LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); - + LOG_INF("inp_pfx: %s\n", string_from(ctx, inp_pfx).c_str()); + LOG_INF("inp_sfx: %s\n", string_from(ctx, inp_sfx).c_str()); // enable interactive mode if interactive start is specified if (params.interactive_first) { @@ -254,21 +248,21 @@ int main(int argc, char ** argv) { } if (params.verbose_prompt) { - LOG_TEE("\n"); - LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + LOG_INF("\n"); + LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > 0) { - LOG_TEE("%s: static prompt based on n_keep: '", __func__); + LOG_INF("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - LOG_TEE("'\n"); + LOG("'\n"); } - LOG_TEE("\n"); + LOG_INF("\n"); } if (params.interactive) { @@ -285,28 +279,30 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - LOG_TEE("%s: interactive mode on.\n", __func__); + LOG_INF("%s: interactive mode on.\n", __func__); if (params.input_prefix_bos) { - LOG_TEE("Input prefix with BOS\n"); + LOG_INF("Input prefix with BOS\n"); } if (!params.input_prefix.empty()) { - LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str()); } if (!params.input_suffix.empty()) { - LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); + LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str()); } } smpl = gpt_sampler_init(model, sparams); - LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl)); - LOG_TEE("sampling: \n%s\n", sparams.print().c_str()); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); - LOG_TEE("\n\n"); + LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); + LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); + LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); - LOG_TEE("\n##### Infill mode #####\n\n"); + LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + + LOG("\n"); + LOG("\n##### Infill mode #####\n\n"); if (params.interactive) { const char *control_message; if (params.multiline_input) { @@ -317,11 +313,11 @@ int main(int argc, char ** argv) { " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } - LOG_TEE("== Running in interactive mode. ==\n"); + LOG("== Running in interactive mode. 
==\n"); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); + LOG( " - Press Ctrl+C to interject at any time.\n"); #endif - LOG_TEE( "%s\n", control_message); + LOG( "%s\n", control_message); is_interacting = params.interactive_first; } @@ -354,9 +350,8 @@ int main(int argc, char ** argv) { embd.resize(max_embd_size); console::set_display(console::error); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); console::set_display(console::reset); - fflush(stdout); } // infinite text generation via context swapping @@ -365,14 +360,14 @@ int main(int argc, char ** argv) { // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches if (n_past + (int) embd.size() > n_ctx) { if (params.n_predict == -2) { - LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; } const int n_left = n_past - params.n_keep - 1; const int n_discard = n_left/2; - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); @@ -380,9 +375,9 @@ int main(int argc, char ** argv) { n_past -= n_discard; - LOG("after swap: n_past = %d\n", n_past); + LOG_DBG("after swap: n_past = %d\n", n_past); - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); } @@ -394,16 +389,16 @@ int main(int argc, char ** argv) { n_eval = params.n_batch; } - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } n_past += n_eval; - LOG("n_past = %d\n", n_past); + LOG_DBG("n_past = %d\n", n_past); } } @@ -415,7 +410,7 @@ int main(int argc, char ** argv) { gpt_sampler_accept(smpl, id, true); - // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str()); + // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); embd.push_back(id); @@ -425,10 +420,10 @@ int main(int argc, char ** argv) { // decrement remaining sampling budget --n_remain; - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing - LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); @@ -447,7 +442,7 @@ int main(int argc, char ** argv) { if (input_echo) { for (auto id : embd) { const std::string token_str = llama_token_to_piece(ctx, id); - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); if (embd.size() > 1) { input_tokens.push_back(id); @@ -456,7 +451,6 @@ int main(int argc, char ** argv) { output_ss << token_str; } } - fflush(stdout); } // reset color to default if we there is no pending user input if (input_echo && 
(int) embd_inp.size() == n_consumed) { @@ -469,10 +463,9 @@ int main(int argc, char ** argv) { if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){ if (is_interacting && !params.interactive_first) { // print an eot token - printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); + LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); } - fflush(stdout); - printf("\n"); + LOG("\n"); console::set_display(console::user_input); std::string buffer; std::string line; @@ -528,35 +521,33 @@ int main(int argc, char ** argv) { n_remain = params.n_predict; n_past = 0; n_consumed = 0; - // LOG_TEE("took new input\n"); is_interacting = false; } // deal with end of generation tokens in interactive mode else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { - LOG("found EOS token\n"); + LOG_DBG("found EOS token\n"); if (params.interactive) { is_interacting = true; - printf("\n"); + LOG("\n"); console::set_display(console::user_input); - fflush(stdout); } } if (n_past > 0 && is_interacting && !params.interactive) { - LOG("waiting for user input\n"); + LOG_DBG("waiting for user input\n"); if (params.input_prefix_bos) { - LOG("adding input prefix BOS token\n"); + LOG_DBG("adding input prefix BOS token\n"); embd_inp.push_back(llama_token_bos(model)); } std::string buffer; if (!params.input_prefix.empty()) { - LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); buffer += params.input_prefix; - printf("%s", buffer.c_str()); + LOG("%s", buffer.c_str()); } std::string line; @@ -574,17 +565,17 @@ int main(int argc, char ** argv) { if (buffer.length() > 1) { // append input suffix if any if (!params.input_suffix.empty()) { - LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); buffer += params.input_suffix; - printf("%s", params.input_suffix.c_str()); + LOG("%s", params.input_suffix.c_str()); } - LOG("buffer: '%s'\n", buffer.c_str()); + LOG_DBG("buffer: '%s'\n", buffer.c_str()); const size_t original_size = embd_inp.size(); const auto line_inp = ::llama_tokenize(ctx, buffer, false); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); @@ -595,9 +586,9 @@ int main(int argc, char ** argv) { } n_remain -= line_inp.size(); - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { - LOG("empty line, passing control back\n"); + LOG_DBG("empty line, passing control back\n"); } input_echo = false; // do not echo this again @@ -624,11 +615,10 @@ int main(int argc, char ** argv) { } } if (!params.interactive && n_remain <= 0) { - printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); - fflush(stdout); + LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); } - LOG_TEE("\n"); + LOG("\n"); gpt_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); @@ -638,9 +628,5 @@ int main(int argc, char ** argv) { gpt_sampler_free(smpl); llama_backend_free(); -#ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n"); -#endif // LOG_DISABLE_LOGS - return 0; } diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 5dfb333d1..8aa7b0750 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -3,7 +3,6 @@ 
// I'll gradually clean and extend it // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch #include "clip.h" -#include "log.h" #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" @@ -40,6 +39,11 @@ #include #include +#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0) + //#define CLIP_DEBUG_FUNCTIONS // RGB uint8 image @@ -165,7 +169,7 @@ static std::map PROJECTOR_TYPE_NAMES = { static int get_key_idx(const gguf_context * ctx, const char * key) { int i = gguf_find_key(ctx, key); if (i == -1) { - LOG_TEE("key %s not found in file\n", key); + LOG_ERR("key %s not found in file\n", key); throw std::runtime_error(format("Missing required key: %s", key)); } @@ -270,7 +274,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { size_t tensor_size = ggml_nbytes(tensor); - LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", + LOG_INF("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", prefix, ggml_n_dims(tensor), tensor->name, tensor_size, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type)); } @@ -288,7 +292,7 @@ static projector_type clip_projector_type_from_string(const std::string & name) static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { std::ofstream file(filename, std::ios::binary); if (!file.is_open()) { - LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); return; } @@ -307,7 +311,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { std::ofstream file(filename, std::ios::binary); if (!file.is_open()) { - LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); + LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); return; } @@ -568,7 +572,7 @@ struct clip_ctx { static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) { if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return nullptr; } @@ -582,7 +586,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 if (load_image_size == nullptr) { load_image_size = clip_image_size_init(); } - LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); + LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); image_size_width = load_image_size->width; image_size_height = load_image_size->height; if (is_inf) { @@ -1047,21 +1051,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { const int idx_name = gguf_find_key(ctx, KEY_NAME); if (idx_name != -1) { // make name optional temporarily as some of the uploaded 
models missing it due to a bug const std::string name = gguf_get_val_str(ctx, idx_name); - LOG_TEE("%s: model name: %s\n", __func__, name.c_str()); + LOG_INF("%s: model name: %s\n", __func__, name.c_str()); } - LOG_TEE("%s: description: %s\n", __func__, description.c_str()); - LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); - LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); - LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors); - LOG_TEE("%s: n_kv: %d\n", __func__, n_kv); - LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str()); - LOG_TEE("\n"); + LOG_INF("%s: description: %s\n", __func__, description.c_str()); + LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); + LOG_INF("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + LOG_INF("%s: n_tensors: %d\n", __func__, n_tensors); + LOG_INF("%s: n_kv: %d\n", __func__, n_kv); + LOG_INF("%s: ftype: %s\n", __func__, ftype_str.c_str()); + LOG_INF("\n"); } const int n_tensors = gguf_get_n_tensors(ctx); // kv const int n_kv = gguf_get_n_kv(ctx); - LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", + LOG_INF("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", __func__, n_kv, n_tensors, fname); { std::map n_type; @@ -1072,7 +1076,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { n_type[type]++; } - LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + LOG_INF("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); for (int i = 0; i < n_kv; i++) { const char * name = gguf_get_key(ctx, i); const enum gguf_type type = gguf_get_kv_type(ctx, i); @@ -1088,7 +1092,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } replace_all(value, "\n", "\\n"); - LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + LOG_INF("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); } // print type counts @@ -1097,7 +1101,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { continue; } - LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); + LOG_INF("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); } } @@ -1112,7 +1116,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { size_t tensor_size = ggml_nbytes(cur); model_size += tensor_size; if (verbosity >= 3) { - LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", + LOG_INF("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); } } @@ -1139,27 +1143,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { #ifdef GGML_USE_CUDA new_clip->backend = ggml_backend_cuda_init(0); - LOG_TEE("%s: CLIP using CUDA backend\n", __func__); + LOG_INF("%s: CLIP using CUDA backend\n", __func__); #endif #ifdef GGML_USE_METAL new_clip->backend = ggml_backend_metal_init(); - LOG_TEE("%s: CLIP using Metal backend\n", __func__); + LOG_INF("%s: CLIP using Metal backend\n", __func__); #endif #ifdef GGML_USE_CANN 
new_clip->backend = ggml_backend_cann_init(0); - LOG_TEE("%s: CLIP using CANN backend\n", __func__); + LOG_INF("%s: CLIP using CANN backend\n", __func__); #endif #ifdef GGML_USE_VULKAN new_clip->backend = ggml_backend_vk_init(0); - LOG_TEE("%s: CLIP using Vulkan backend\n", __func__); + LOG_INF("%s: CLIP using Vulkan backend\n", __func__); #endif if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); - LOG_TEE("%s: CLIP using CPU backend\n", __func__); + LOG_INF("%s: CLIP using CPU backend\n", __func__); } // model size and capabilities @@ -1194,16 +1198,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->use_gelu = gguf_get_val_bool(ctx, idx); if (verbosity >= 1) { - LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); - LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); - LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); - LOG_TEE("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); - LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); - LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); + LOG_INF("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); + LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); + LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); + LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); + LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); + LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); } } - LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); + LOG_INF("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); // load tensors { @@ -1216,7 +1220,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->ctx_data = ggml_init(params); if (!new_clip->ctx_data) { - LOG_TEE("%s: ggml_init() failed\n", __func__); + LOG_ERR("%s: ggml_init() failed\n", __func__); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1224,7 +1228,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { - LOG_TEE("cannot open model file for loading tensors\n"); + LOG_ERR("cannot open model file for loading tensors\n"); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1246,7 +1250,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); fin.seekg(offset, std::ios::beg); if (!fin) { - LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name); + LOG_ERR("%s: failed to seek for tensor %s\n", __func__, name); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1317,23 +1321,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } if (verbosity >= 2) { - LOG_TEE("\n%s: vision model hparams\n", __func__); - LOG_TEE("image_size %d\n", hparams.image_size); - LOG_TEE("patch_size %d\n", hparams.patch_size); - LOG_TEE("v_hidden_size %d\n", hparams.hidden_size); - LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate); - LOG_TEE("v_projection_dim %d\n", hparams.projection_dim); - LOG_TEE("v_n_head 
%d\n", hparams.n_head); - LOG_TEE("v_n_layer %d\n", hparams.n_layer); - LOG_TEE("v_eps %f\n", hparams.eps); - LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); - LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); - LOG_TEE("v_image_grid_pinpoints: "); + LOG_INF("\n%s: vision model hparams\n", __func__); + LOG_INF("image_size %d\n", hparams.image_size); + LOG_INF("patch_size %d\n", hparams.patch_size); + LOG_INF("v_hidden_size %d\n", hparams.hidden_size); + LOG_INF("v_n_intermediate %d\n", hparams.n_intermediate); + LOG_INF("v_projection_dim %d\n", hparams.projection_dim); + LOG_INF("v_n_head %d\n", hparams.n_head); + LOG_INF("v_n_layer %d\n", hparams.n_layer); + LOG_INF("v_eps %f\n", hparams.eps); + LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); + LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); + LOG_INF("v_image_grid_pinpoints: "); for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { - LOG_TEE("%d ", hparams.image_grid_pinpoints[i]); + LOG_INF("%d ", hparams.image_grid_pinpoints[i]); } - LOG_TEE("\n"); - LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); + LOG_INF("\n"); + LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); } @@ -1371,7 +1375,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); } catch(const std::exception& /*e*/) { - LOG_TEE("%s: failed to load vision model tensors\n", __func__); + LOG_ERR("%s: failed to load vision model tensors\n", __func__); } // LLaVA projection @@ -1400,7 +1404,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } catch (std::runtime_error & /*e*/) { } try { vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); - // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); + // LOG_INF("%s: image_newline tensor (llava-1.6) found\n", __func__); } catch (std::runtime_error & /*e*/) { } } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection @@ -1501,7 +1505,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false); ggml_gallocr_reserve(new_clip->compute_alloc, gf); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); - LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); + LOG_INF("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); } return new_clip; @@ -1552,7 +1556,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { int nx, ny, nc; auto * data = stbi_load(fname, &nx, &ny, &nc, 3); if (!data) { - LOG_TEE("%s: failed to load image '%s'\n", __func__, fname); + LOG_ERR("%s: failed to load image '%s'\n", __func__, fname); return false; } build_clip_img_from_data(data, nx, ny, img); @@ -1564,7 +1568,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length int nx, ny, nc; auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); if (!data) { - LOG_TEE("%s: failed to 
decode image bytes\n", __func__); + LOG_ERR("%s: failed to decode image bytes\n", __func__); return false; } build_clip_img_from_data(data, nx, ny, img); @@ -1754,7 +1758,7 @@ static std::pair select_best_resolution(const std::pair & or int downscaled_height = static_cast(original_height * scale); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int wasted_resolution = (width * height) - effective_resolution; - // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { max_effective_resolution = effective_resolution; min_wasted_resolution = wasted_resolution; @@ -1872,7 +1876,7 @@ static std::vector> uhd_slice_image(const clip_imag const int multiple = fmin(ceil(ratio), max_slice_nums); std::vector> images; - LOG_TEE("%s: multiple %d\n", __func__, multiple); + LOG_INF("%s: multiple %d\n", __func__, multiple); images.push_back(std::vector()); if (multiple <= 1) { @@ -1887,17 +1891,17 @@ static std::vector> uhd_slice_image(const clip_imag clip_image_u8 * source_image = clip_image_u8_init(); bicubic_resize(*img, *source_image, best_size.first, best_size.second); // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) - LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); + LOG_INF("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); images[images.size()-1].push_back(source_image); std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); - LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); + LOG_INF("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); clip_image_u8 * refine_image = clip_image_u8_init(); bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); - LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); + LOG_INF("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); // split_to_patches int width = refine_image->nx; @@ -1954,7 +1958,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli int idx = 0; for (size_t i = 0; i < imgs.size(); ++i) { for (size_t j = 0; j < imgs[i].size(); ++j) { - LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); + LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); clip_image_f32 * res = clip_image_f32_init(); normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std); res_imgs->data[idx++] = *res; @@ -1966,7 +1970,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli bool pad_to_square = true; if (!ctx->has_vision_encoder) { - 
LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return false; } auto & params = ctx->vision_model.hparams; @@ -2043,7 +2047,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } for (size_t i = 0; i < patches.size(); i++) { - // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); + // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); clip_image_u8_free(patches[i]); } @@ -2279,7 +2283,7 @@ static std::vector> get_2d_sincos_pos_embed(int embed_dim, co bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return false; } @@ -2291,7 +2295,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { if (!ctx->has_vision_encoder) { - LOG_TEE("This gguf file seems to have no vision encoder\n"); + LOG_ERR("This gguf file seems to have no vision encoder\n"); return false; } @@ -2521,7 +2525,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i new_type = type; if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type - // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); + // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); } const size_t n_elms = ggml_nelements(cur); float * f32_data; @@ -2540,7 +2544,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i f32_data = (float *)conv_buf.data(); break; default: - LOG_TEE("Please use an input file in f32 or f16\n"); + LOG_ERR("Please use an input file in f32 or f16\n"); gguf_free(ctx_out); return false; } @@ -2567,7 +2571,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i fout.put(0); } - LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, + LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); } @@ -2583,8 +2587,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i gguf_free(ctx_out); { - LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); - LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); + LOG_INF("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); + LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); } return true; diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 12fe7345f..8f437863f 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -10,6 +10,7 @@ #include #include +#include #include static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { @@ -20,7 +21,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); + LOG_ERR("%s: invalid base64 image tag. 
must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
         return NULL;
     }
 
@@ -89,7 +90,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
 
     auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
     if (!embed) {
-        LOG_TEE("%s: could not load image from base64 string.\n", __func__);
+        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
         return NULL;
     }
 
@@ -114,9 +115,9 @@ struct llava_context {
 };
 
 static void print_usage(int, char ** argv) {
-    LOG_TEE("\n example usage:\n");
-    LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
+    LOG("\n example usage:\n");
+    LOG("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 
 static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
@@ -126,11 +127,11 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
     auto prompt = params->prompt;
     if (prompt_contains_image(prompt)) {
         if (!params->image.empty()) {
-            LOG_TEE("using base64 encoded image instead of command line image path\n");
+            LOG_INF("using base64 encoded image instead of command line image path\n");
         }
         embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
         if (!embed) {
-            LOG_TEE("%s: can't load image from prompt\n", __func__);
+            LOG_ERR("%s: can't load image from prompt\n", __func__);
             return NULL;
         }
         params->prompt = remove_image_from_prompt(prompt);
@@ -156,18 +157,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
         system_prompt = prompt.substr(0, image_pos);
         user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
+        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
-        LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
+        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
     } else {
@@ -177,7 +178,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
     }
@@ -188,11 +189,11 @@ static void 
process_prompt(struct llava_context * ctx_llava, struct llava_image_
 
     // generate the response
 
-    LOG_TEE("\n");
+    LOG("\n");
 
     struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
     if (!smpl) {
-        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);
     }
 
@@ -202,7 +203,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         response += tmp;
         if (strcmp(tmp, "</s>") == 0) break;
         if (strstr(tmp, "###")) break; // Yi-VL behavior
-        printf("%s", tmp);
+        LOG("%s", tmp);
         if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
         if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
         if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@@ -211,7 +212,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     }
 
     gpt_sampler_free(smpl);
-    printf("\n");
+    LOG("\n");
 }
 
 static struct llama_model * llava_init(gpt_params * params) {
@@ -222,7 +223,7 @@ static struct llama_model * llava_init(gpt_params * params) {
 
     llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
-        LOG_TEE("%s: error: unable to load model\n" , __func__);
+        LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
     }
     return model;
@@ -245,11 +246,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
 
     llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
 
     if (ctx_llama == NULL) {
-        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
         return NULL;
     }
 
-    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
 
     ctx_llava->ctx_llama = ctx_llama;
     ctx_llava->ctx_clip = ctx_clip;
@@ -268,12 +269,6 @@ static void llava_free(struct llava_context * ctx_llava) {
     llama_backend_free();
 }
 
-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    LOG_TEE("%s", text);
-}
-
 int main(int argc, char ** argv) {
     ggml_time_init();
 
@@ -283,27 +278,23 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("llava", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-    llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
+    gpt_init();
 
     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
         print_usage(argc, argv);
         return 1;
     }
-    auto model = llava_init(&params);
+
+    auto * model = llava_init(&params);
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
         return 1;
     }
 
     if (prompt_contains_image(params.prompt)) {
-        auto ctx_llava = llava_init_context(&params, model);
+        auto * ctx_llava = llava_init_context(&params, model);
 
-        auto image_embed = load_image(ctx_llava, &params, "");
+        auto * image_embed = load_image(ctx_llava, &params, "");
 
         // process the prompt
         process_prompt(ctx_llava, image_embed, &params, params.prompt);
@@ -314,11 +305,11 @@ int main(int argc, char ** argv) {
         llava_free(ctx_llava);
     } else {
         for (auto & image : params.image) {
-            auto ctx_llava = llava_init_context(&params, model);
+            auto * ctx_llava = llava_init_context(&params, model);
 
-            auto image_embed = 
load_image(ctx_llava, ¶ms, image); + auto * image_embed = load_image(ctx_llava, ¶ms, image); if (!image_embed) { - std::cerr << "error: failed to load image " << image << ". Terminating\n\n"; + LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str()); return 1; } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index e162586ed..8558c6bdc 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -1,13 +1,23 @@ #include "clip.h" -#include "common.h" -#include "llama.h" #include "llava.h" -#include "base64.hpp" +#include "llama.h" + +#include +#include #include #include +#include +#include #include -#include + +#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) +#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) + +#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) // RGB uint8 image struct clip_image_u8 { @@ -54,7 +64,7 @@ static std::pair select_best_resolution(const std::pair& ori int downscaled_height = static_cast(original_height * scale); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int wasted_resolution = (width * height) - effective_resolution; - // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { max_effective_resolution = effective_resolution; min_wasted_resolution = wasted_resolution; @@ -236,7 +246,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli img_res_v.size = 0; img_res_v.data = nullptr; if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { - LOG_TEE("%s: unable to preprocess image\n", __func__); + LOG_ERR("%s: unable to preprocess image\n", __func__); delete[] img_res_v.data; return false; } @@ -265,14 +275,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); } if (!encoded) { - LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; } const int64_t t_img_enc_steop_batch_us = ggml_time_us(); - LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); + LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); } const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, 
(int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); int n_img_pos_out = 0; for (size_t i = 0; i < image_embd_v.size(); i++) { @@ -287,7 +297,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli load_image_size->width = img->nx; load_image_size->height = img->ny; clip_add_load_image_size(ctx_clip, load_image_size); - LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); + LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); } else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding @@ -295,7 +305,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 delete[] img_res_v.data; if (!encoded) { - LOG_TEE("Unable to encode image\n"); + LOG_ERR("Unable to encode image\n"); return false; } @@ -309,12 +319,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside if (!encoded) { - LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; } } const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); const int32_t * image_grid = clip_image_grid(ctx_clip); @@ -347,12 +357,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); } - LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); + LOG_INF("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); const int64_t t_img_enc_end_us = ggml_time_us(); float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; - LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); + LOG_INF("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); return true; } @@ -362,7 +372,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); auto n_image_embd = clip_n_mmproj_embd(ctx_clip); if (n_image_embd != n_llama_embd) { - LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); + LOG_ERR("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). 
Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); return false; } return true; @@ -375,13 +385,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co } float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model if (!image_embd) { - LOG_TEE("Unable to allocate memory for image embeddings\n"); + LOG_ERR("Unable to allocate memory for image embeddings\n"); return false; } int n_img_pos; if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) { - LOG_TEE("%s: cannot encode image, aborting\n", __func__); + LOG_ERR("%s: cannot encode image, aborting\n", __func__); free(image_embd); return false; } @@ -401,7 +411,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ } llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; if (llama_decode(ctx_llama, batch)) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return false; } *n_past += n_eval; @@ -413,7 +423,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c clip_image_u8 * img = clip_image_u8_init(); if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { clip_image_u8_free(img); - LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__); + LOG_ERR("%s: can't load image from bytes, is it a valid image?", __func__); return NULL; } @@ -422,7 +432,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); if (!image_embed_result) { clip_image_u8_free(img); - LOG_TEE("%s: coulnd't embed the image\n", __func__); + LOG_ERR("%s: coulnd't embed the image\n", __func__); return NULL; } @@ -436,7 +446,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) { auto file = fopen(path, "rb"); if (file == NULL) { - LOG_TEE("%s: can't read file %s\n", __func__, path); + LOG_ERR("%s: can't read file %s\n", __func__, path); return false; } @@ -446,7 +456,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data if (buffer == NULL) { - LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); + LOG_ERR("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); perror("Memory allocation error"); fclose(file); return false; @@ -471,7 +481,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx long image_bytes_length; auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); if (!loaded) { - LOG_TEE("%s: failed to load %s\n", __func__, image_path); + LOG_ERR("%s: failed to load %s\n", __func__, image_path); return NULL; } diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 3ac455e69..c5156c35b 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -7,9 +7,12 @@ #include "llama.h" #include "ggml.h" +#include #include #include +#include #include +#include // TODO: remove me struct llava_context { struct clip_ctx * ctx_clip = NULL; @@ -18,14 +21,8 @@ struct llava_context { 
}; static void show_additional_info(int /*argc*/, char ** argv) { - LOG_TEE("\nexample usage:\n\n%s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG_TEE("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); -} - -static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); + LOG("\nexample usage:\n\n%s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); } static struct llama_model * llava_init(gpt_params * params) { @@ -36,7 +33,7 @@ static struct llama_model * llava_init(gpt_params * params) { llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: unable to load model\n" , __func__); return NULL; } return model; @@ -51,7 +48,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); if (params->n_ctx < 2048) { // warn user here, "Image processing requires at least 2048 context, setting context to 2048" - LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); + LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); ctx_params.n_ctx = 2048; } else { ctx_params.n_ctx = params->n_ctx; @@ -60,11 +57,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); if (ctx_llama == NULL) { - LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: failed to create the llama_context\n" , __func__); return NULL; } - auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); + auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); ctx_llava->ctx_llama = ctx_llama; ctx_llava->model = model; @@ -89,7 +86,7 @@ static struct clip_ctx * clip_init_context(gpt_params * params) { if (prompt.empty()) { prompt = "describe the image in detail."; } - auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); + auto * ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); return ctx_clip; } @@ -101,7 +98,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vectorctx_clip)); std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip)); - auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); + auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); slice_embed->embed = image_embed; slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip); llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past); @@ -143,7 +140,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e else if (has_minicpmv_projector == 3) { system_prompt = "<|im_start|>user\n"; } - LOG_TEE("%s: image token past: %d\n", __func__, n_past); + LOG_INF("%s: image token past: %d\n", __func__, n_past); eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); process_eval_image_embed(ctx_llava, embeds, 
params->n_batch, &n_past, idx++);
     eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
@@ -162,7 +159,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
         }
         eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
     }
-    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
+    LOG_INF("%s: image token past: %d\n", __func__, n_past);
 }
 
 static const char * sample(struct gpt_sampler * smpl,
@@ -181,42 +178,42 @@ static const char * sample(struct gpt_sampler * smpl,
 }
 
 static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
-    auto ctx_clip = clip_init_context(params);
-    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
+    auto * ctx_clip = clip_init_context(params);
+    auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
     if (!embeds) {
-        std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
+        LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str());
         return NULL;
     }
 
     // process the prompt
     if (params->prompt.empty() && params->interactive == false) {
-        LOG_TEE("prompt should be given or interactive mode should be on");
+        LOG_ERR("prompt should be given or interactive mode should be on");
         return NULL;
     }
 
-    auto model = llava_init(params);
+    auto * model = llava_init(params);
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
         return NULL;
     }
     const int64_t t_llava_init_start_us = ggml_time_us();
-    auto ctx_llava = llava_init_context(params, model);
+    auto * ctx_llava = llava_init_context(params, model);
     ctx_llava->ctx_clip = ctx_clip;
     const int64_t t_llava_init_end_us = ggml_time_us();
     float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
-    LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
+    LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
 
     const int64_t t_process_image_start_us = ggml_time_us();
     process_image(ctx_llava, embeds, params, n_past);
    const int64_t t_process_image_end_us = ggml_time_us();
     float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
-    LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
+    LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
 
     llava_image_embed_free(embeds);
     return ctx_llava;
 }
 
-static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
+static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
     std::string user_prompt = prompt;
     int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
     if (!is_first) {
@@ -238,7 +235,7 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par
 
     // generate the response
 
-    LOG_TEE("\n");
+    LOG_INF("\n");
 
     struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
     return smpl;
@@ -259,12 +256,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("llava", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-    llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
+    gpt_init();
 
     if (params.mmproj.empty() || 
(params.image.empty())) {
         show_additional_info(argc, argv);
@@ -273,21 +265,23 @@ int main(int argc, char ** argv) {
 
     for (auto & image : params.image) {
         int n_past = 0;
-        auto ctx_llava = minicpmv_init(&params, image, n_past);
+        auto * ctx_llava = minicpmv_init(&params, image, n_past);
 
         if (!params.prompt.empty()) {
-            LOG_TEE("%s\n", params.prompt.c_str());
-            LOG_TEE("<assistant>");
-            auto smpl = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
+            LOG("%s\n", params.prompt.c_str());
+            LOG("<assistant>");
+            auto * smpl = llama_init(ctx_llava, &params, params.prompt, n_past, true);
             const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
-            std::string response = "";
+            std::string response;
             bool have_tmp = false;
             for (int i = 0; i < max_tgt_len; i++) {
-                auto tmp = llama_loop(ctx_llava, smpl, n_past);
+                const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
                 response += tmp;
                 if (strcmp(tmp, "</s>") == 0){
-                    if(!have_tmp)continue;
-                    else break;
+                    if (!have_tmp) {
+                        continue;
+                    }
+                    break;
                 }
                 if (strstr(tmp, "###")) break; // Yi-VL behavior
                 have_tmp = true;
@@ -299,15 +293,15 @@ int main(int argc, char ** argv) {
 
             gpt_sampler_free(smpl);
         }else {
             while (true) {
-                LOG_TEE("<user>");
+                LOG("<user>");
                 std::string prompt;
                 std::getline(std::cin, prompt);
-                LOG_TEE("<assistant>");
-                auto smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
+                LOG("<assistant>");
+                auto * smpl = llama_init(ctx_llava, &params, prompt, n_past, true);
                 const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
-                std::string response = "";
+                std::string response;
                 for (int i = 0; i < max_tgt_len; i++) {
-                    auto tmp = llama_loop(ctx_llava, smpl, n_past);
+                    const auto * tmp = llama_loop(ctx_llava, smpl, n_past);
                     response += tmp;
                     if (strcmp(tmp, "</s>") == 0) break;
                     if (strstr(tmp, "###")) break; // Yi-VL behavior
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index de8b792f2..49870b4a4 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -1,6 +1,7 @@
 #include "arg.h"
 #include "common.h"
 #include "sampling.h"
+#include "log.h"
 #include "llama.h"
 
 #include 
@@ -42,18 +43,14 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    gpt_init();
+
     const int W = 15; // lookahead window
     const int N = 5;  // n-gram size
     const int G = 15; // max verification n-grams
 
     const bool dump_kv_cache = params.dump_kv_cache;
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("lookahead", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -75,14 +72,14 @@ int main(int argc, char ** argv) {
     const int max_tokens_list_size = max_context_size - 4;
 
     if ((int) inp.size() > max_tokens_list_size) {
-        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
+        LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
         return 1;
     }
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     for (auto id : inp) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", llama_token_to_piece(ctx, id).c_str());
     }
 
     fflush(stderr);
@@ -166,7 +163,7 @@ int main(int argc, char ** argv) {
 
         {
             const std::string token_str = llama_token_to_piece(ctx, id);
-            printf("%s", token_str.c_str());
+            LOG("%s", token_str.c_str());
             fflush(stdout);
         }
     }
@@ -256,7 +253,7 @@ int main(int argc, char ** argv) {
         }
 
         if (llama_decode(ctx, batch) != 0) {
-            fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", 
__func__); + LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__); return 1; } @@ -293,10 +290,10 @@ int main(int argc, char ** argv) { const std::string token_str = llama_token_to_piece(ctx, id); if (v == 0) { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } else { // print light cyan - printf("\033[0;96m%s\033[0m", token_str.c_str()); + LOG("\033[0;96m%s\033[0m", token_str.c_str()); } fflush(stdout); @@ -330,21 +327,21 @@ int main(int argc, char ** argv) { // print known n-grams starting with token id (debug) if (0 && v == 0) { if (ngrams_observed.cnt[id] > 0) { - printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); + LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str()); } for (int i = 0; i < ngrams_observed.cnt[id]; i++) { - printf(" - ngram %2d: ", i); + LOG(" - ngram %2d: ", i); const int idx = id*(N - 1)*G + i*(N - 1); for (int j = 0; j < N - 1; j++) { const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]); - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } - printf("\n"); + LOG("\n"); } } @@ -455,20 +452,20 @@ int main(int argc, char ** argv) { auto t_dec_end = ggml_time_us(); - LOG_TEE("\n\n"); + LOG("\n\n"); - LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - LOG_TEE("\n"); - LOG_TEE("W = %2d\n", W); - LOG_TEE("N = %2d\n", N); - LOG_TEE("G = %2d\n", G); - LOG_TEE("\n"); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_accept = %d\n", n_accept); + LOG_INF("\n"); + LOG_INF("W = %2d\n", W); + LOG_INF("N = %2d\n", N); + LOG_INF("G = %2d\n", G); + LOG_INF("\n"); + LOG_INF("n_predict = %d\n", n_predict); + LOG_INF("n_accept = %d\n", n_accept); - LOG_TEE("\n"); + LOG_INF("\n"); gpt_perf_print(ctx, smpl); gpt_sampler_free(smpl); @@ -482,7 +479,7 @@ int main(int argc, char ** argv) { llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index f299d68a9..6d1e1ceb9 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -5,13 +5,12 @@ #include "llama.h" #include "ggml.h" -#include #include #include +#include #include #include #include -#include int main(int argc, char ** argv){ gpt_params params; @@ -20,6 +19,8 @@ int main(int argc, char ** argv){ return 1; } + gpt_init(); + const int n_draft = params.n_draft; // init llama.cpp @@ -49,7 +50,7 @@ int main(int argc, char ** argv){ try { ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); } catch (std::ifstream::failure const &) { - fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); + LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); exit(1); } } @@ -128,7 +129,7 @@ int main(int argc, char ** argv){ const 
int64_t eta_min = eta_ms / (60*1000); const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000; - LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s); + LOG_INF("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s); } // After each chunk, update the dynamic ngram cache with the context ngram cache: @@ -136,24 +137,24 @@ int main(int argc, char ** argv){ ngram_cache_context.clear(); } - LOG_TEE("\n"); + LOG("\n"); - LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); - LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + LOG_INF("\n"); + LOG_INF("n_draft = %d\n", n_draft); + LOG_INF("n_predict = %d\n", n_input - n_input % n_ctx); + LOG_INF("n_drafted = %d\n", n_drafted); + LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); + LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_INF("n_accept = %d\n", n_accept); + LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); llama_free(ctx); llama_free_model(model); llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index be6f8d7d7..2ccd0e6c1 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -3,6 +3,7 @@ #include "common.h" #include "ngram-cache.h" #include "sampling.h" +#include "log.h" #include "llama.h" #include @@ -18,17 +19,13 @@ int main(int argc, char ** argv){ return 1; } + gpt_init(); + // max. 
number of additional tokens to draft if match is found const int n_draft = params.n_draft; const bool dump_kv_cache = params.dump_kv_cache; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("lookup", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -58,7 +55,7 @@ int main(int argc, char ** argv){ try { ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); } catch (std::ifstream::failure const &) { - fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); + LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); exit(1); } } @@ -76,14 +73,14 @@ int main(int argc, char ** argv){ const int max_tokens_list_size = max_context_size - 4; if ((int) inp.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); + LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); return 1; } - fprintf(stderr, "\n\n"); + LOG("\n\n"); for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } fflush(stderr); @@ -124,7 +121,7 @@ int main(int argc, char ** argv){ } // print current draft sequence - LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str()); + LOG_DBG("drafted %s\n", string_from(ctx, draft).c_str()); int i_dft = 0; while (true) { @@ -136,7 +133,7 @@ int main(int argc, char ** argv){ const std::string token_str = llama_token_to_piece(ctx, id); if (!params.use_color) { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } if (llama_token_is_eog(model, id)) { @@ -147,7 +144,7 @@ int main(int argc, char ** argv){ // check if the target token matches the draft if (i_dft < (int) draft.size() && id == draft[i_dft]) { - LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); + LOG_DBG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str()); ++n_accept; ++n_past; ++i_dft; @@ -161,19 +158,19 @@ int main(int argc, char ** argv){ if (params.use_color) { // color accepted draft token - printf("\033[34m%s\033[0m", token_str.c_str()); + LOG("\033[34m%s\033[0m", token_str.c_str()); fflush(stdout); } continue; } if (params.use_color) { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } fflush(stdout); - LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); + LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str()); draft.clear(); draft.push_back(id); @@ -224,22 +221,22 @@ int main(int argc, char ** argv){ llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); - LOG_TEE("\n\n"); + LOG("\n\n"); - LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - 
t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); - LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + LOG_INF("\n"); + LOG_INF("n_draft = %d\n", n_draft); + LOG_INF("n_predict = %d\n", n_predict); + LOG_INF("n_drafted = %d\n", n_drafted); + LOG_INF("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); + LOG_INF("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_INF("n_accept = %d\n", n_accept); + LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - LOG_TEE("\ntarget:\n\n"); + LOG_INF("\ntarget:\n\n"); gpt_perf_print(ctx, smpl); gpt_sampler_free(smpl); @@ -251,7 +248,7 @@ int main(int argc, char ** argv){ llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index f41be5308..d9e45ce2f 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -1,12 +1,11 @@ #include "arg.h" #include "common.h" #include "console.h" +#include "log.h" #include "sampling.h" #include "llama.h" #include -#include -#include #include #include #include @@ -42,11 +41,13 @@ static std::vector * g_output_tokens; static bool is_interacting = false; static bool need_insert_eot = false; -static void print_usage(int, char ** argv) { - printf("\nexample usage:\n"); - printf("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); - printf("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); - printf("\n"); +static void print_usage(int argc, char ** argv) { + (void) argc; + + LOG("\nexample usage:\n"); + LOG("\n text generation: %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]); + LOG("\n chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]); + LOG("\n"); } static bool file_exists(const std::string & path) { @@ -74,8 +75,7 @@ static void write_logfile( const bool success = fs_create_directory_with_parents(params.logdir); if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", - __func__, params.logdir.c_str()); + LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); return; } @@ -83,7 +83,7 @@ static void write_logfile( FILE * logfile = fopen(logfile_path.c_str(), "w"); if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); return; } @@ -113,7 +113,7 @@ static void sigint_handler(int signo) { need_insert_eot = true; } else { console::cleanup(); - printf("\n"); + LOG("\n"); gpt_perf_print(*g_ctx, *g_smpl); write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); _exit(130); @@ -122,17 +122,11 @@ static void sigint_handler(int signo) { } #endif -static void llama_log_callback_logTee(ggml_log_level level, 
const char * text, void * user_data) { - (void) level; - (void) user_data; - LOG_TEE("%s", text); -} - -static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, std::string role, std::string content) { +static std::string chat_add_and_format(struct llama_model * model, std::vector & chat_msgs, const std::string & role, const std::string & content) { llama_chat_msg new_msg{role, content}; auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user"); chat_msgs.push_back({role, content}); - LOG("formatted: %s\n", formatted.c_str()); + LOG_DBG("formatted: '%s'\n", formatted.c_str()); return formatted; } @@ -143,55 +137,46 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + auto & sparams = params.sparams; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("main", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); - llama_log_set(llama_log_callback_logTee, nullptr); -#endif // LOG_DISABLE_LOGS - - // TODO: Dump params ? - //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); - // save choice to use color for later // (note for later: this is a slightly awkward choice) console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); if (params.logits_all) { - printf("\n************\n"); - printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.embedding) { - printf("\n************\n"); - printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); - printf("************\n\n"); + LOG_ERR("************\n"); + LOG_ERR("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + LOG_ERR("************\n\n"); return 0; } if (params.n_ctx != 0 && params.n_ctx < 8) { - LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + LOG_WRN("%s: warning: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; } if (params.rope_freq_base != 0.0) { - LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + LOG_WRN("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); } if (params.rope_freq_scale != 0.0) { - LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + LOG_WRN("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); } - print_build_info(); + LOG_INF("%s: llama backend init\n", __func__); - LOG("%s: llama backend init\n", __func__); llama_backend_init(); llama_numa_init(params.numa); @@ -206,21 +191,19 @@ int main(int argc, char ** argv) { g_smpl = &smpl; // load the model and apply lora adapter, if any - LOG("%s: load the model and apply lora adapter, if any\n", __func__); + LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__); llama_init_result llama_init = llama_init_from_gpt_params(params); model = llama_init.model; ctx = llama_init.context; if (model == NULL) { - LOG_TEE("%s: error: unable to load model\n", __func__); + LOG_ERR("%s: error: unable to load model\n", __func__); return 1; } - LOG("%s: llama threadpool init = n_threads = %d\n", - __func__, - (int) params.cpuparams.n_threads - ); + LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, 
(int) params.cpuparams.n_threads); + struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); struct ggml_threadpool_params tpp = @@ -232,8 +215,8 @@ int main(int argc, char ** argv) { if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { threadpool_batch = ggml_threadpool_new(&tpp_batch); if (!threadpool_batch) { - LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); - exit(1); + LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); + return 1; } // Start the non-batch threadpool in the paused state @@ -242,55 +225,54 @@ int main(int argc, char ** argv) { struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); if (!threadpool) { - LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); - exit(1); + LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + return 1; } llama_attach_threadpool(ctx, threadpool, threadpool_batch); const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx = llama_n_ctx(ctx); - LOG("n_ctx: %d\n", n_ctx); if (n_ctx > n_ctx_train) { - LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", - __func__, n_ctx_train, n_ctx); + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx); } // print chat template example in conversation mode if (params.conversation) { if (params.enable_chat_template) { - LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); + LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str()); } else { - LOG_TEE("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); + LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__); } } // print system information { - LOG_TEE("\n"); - LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); } std::string path_session = params.path_prompt_cache; std::vector session_tokens; if (!path_session.empty()) { - LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + LOG_INF("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); if (!file_exists(path_session)) { - LOG_TEE("%s: session file does not exist, will create.\n", __func__); + LOG_INF("%s: session file does not exist, will create.\n", __func__); } else if (file_is_empty(path_session)) { - LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__); + LOG_INF("%s: The session file is empty. 
A new session will be initialized.\n", __func__); } else { // The file exists and is not empty session_tokens.resize(n_ctx); size_t n_token_count_out = 0; if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { - LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + LOG_ERR("%s: failed to load session file '%s'\n", __func__, path_session.c_str()); return 1; } session_tokens.resize(n_token_count_out); - LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); + LOG_INF("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); } } @@ -298,7 +280,8 @@ int main(int argc, char ** argv) { if (!llama_model_has_encoder(model)) { GGML_ASSERT(!llama_add_eos_token(model)); } - LOG("add_bos: %d\n", add_bos); + + LOG_DBG("n_ctx: %d, add_bos: %d\n", n_ctx, add_bos); std::vector embd_inp; @@ -307,31 +290,31 @@ int main(int argc, char ** argv) { ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode : params.prompt; if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { - LOG("tokenize the prompt\n"); + LOG_DBG("tokenize the prompt\n"); embd_inp = ::llama_tokenize(ctx, prompt, true, true); } else { - LOG("use session tokens\n"); + LOG_DBG("use session tokens\n"); embd_inp = session_tokens; } - LOG("prompt: \"%s\"\n", log_tostr(prompt)); - LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_DBG("prompt: \"%s\"\n", prompt.c_str()); + LOG_DBG("tokens: %s\n", string_from(ctx, embd_inp).c_str()); } // Should not run without any tokens if (embd_inp.empty()) { if (add_bos) { embd_inp.push_back(llama_token_bos(model)); - LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + LOG_WRN("embd_inp was considered empty and bos was added: %s\n", string_from(ctx, embd_inp).c_str()); } else { - LOG_TEE("error: input is empty\n"); + LOG_ERR("input is empty\n"); return -1; } } // Tokenize negative prompt if ((int) embd_inp.size() > n_ctx - 4) { - LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + LOG_ERR("%s: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; } @@ -345,29 +328,28 @@ int main(int argc, char ** argv) { n_matching_session_tokens++; } if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { - LOG_TEE("%s: using full prompt from session file\n", __func__); + LOG_INF("%s: using full prompt from session file\n", __func__); } else if (n_matching_session_tokens >= embd_inp.size()) { - LOG_TEE("%s: session file has exact match for prompt!\n", __func__); + LOG_INF("%s: session file has exact match for prompt!\n", __func__); } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { - LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_WRN("%s: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, embd_inp.size()); } else { - LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", - __func__, n_matching_session_tokens, embd_inp.size()); + LOG_INF("%s: session file matches %zu / %zu tokens of prompt\n", + __func__, 
n_matching_session_tokens, embd_inp.size()); } // remove any "future" tokens that we might have inherited from the previous session llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); } - LOGLN( - "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu", - log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size()); + LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n", + embd_inp.size(), n_matching_session_tokens, embd_inp.size(), session_tokens.size()); // if we will use the cache for the full prompt without reaching the end of the cache, force // reevaluation of the last token to recalculate the cached logits if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { - LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); + LOG_DBG("recalculate the cached logits (do): session_tokens.resize( %zu )\n", embd_inp.size() - 1); session_tokens.resize(embd_inp.size() - 1); } @@ -389,21 +371,20 @@ int main(int argc, char ** argv) { } if (params.verbose_prompt) { - LOG_TEE("\n"); - LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } if (params.n_keep > add_bos) { - LOG_TEE("%s: static prompt based on n_keep: '", __func__); + LOG_INF("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { - LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + LOG("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - LOG_TEE("'\n"); + LOG("'\n"); } - LOG_TEE("\n"); + LOG_INF("\n"); } // ctrl+C handling @@ -423,40 +404,40 @@ int main(int argc, char ** argv) { } if (params.interactive) { - LOG_TEE("%s: interactive mode on.\n", __func__); + LOG("%s: interactive mode on.\n", __func__); if (!params.antiprompt.empty()) { for (const auto & antiprompt : params.antiprompt) { - LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); + LOG("Reverse prompt: '%s'\n", antiprompt.c_str()); if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } } if (params.input_prefix_bos) { - LOG_TEE("Input prefix with BOS\n"); + LOG("Input prefix with BOS\n"); } if (!params.input_prefix.empty()) { - LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + LOG("Input prefix: '%s'\n", params.input_prefix.c_str()); if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } if (!params.input_suffix.empty()) { - LOG_TEE("Input suffix: 
'%s'\n", params.input_suffix.c_str()); + LOG("Input suffix: '%s'\n", params.input_suffix.c_str()); if (params.verbose_prompt) { auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); for (int i = 0; i < (int) tmp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + LOG("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); } } } @@ -464,15 +445,15 @@ int main(int argc, char ** argv) { smpl = gpt_sampler_init(model, sparams); if (!smpl) { - fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); - exit(1); + LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); + return 1; } - LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl)); - LOG_TEE("sampling params: \n%s\n", sparams.print().c_str()); - LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str()); + LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl)); + LOG_INF("sampler params: \n%s\n", sparams.print().c_str()); + LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str()); - LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); // group-attention state // number of grouped KV tokens so far (used only if params.grp_attn_n > 1) @@ -486,9 +467,9 @@ int main(int argc, char ** argv) { GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT - LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); + LOG_INF("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w); } - LOG_TEE("\n\n"); + LOG("\n"); if (params.interactive) { const char * control_message; @@ -500,11 +481,11 @@ int main(int argc, char ** argv) { " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } - LOG_TEE("== Running in interactive mode. ==\n"); + LOG("== Running in interactive mode. ==\n"); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) - LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); + LOG( " - Press Ctrl+C to interject at any time.\n"); #endif - LOG_TEE( "%s\n", control_message); + LOG( "%s\n", control_message); is_interacting = params.interactive_first; } @@ -543,7 +524,7 @@ int main(int argc, char ** argv) { llama_token * enc_input_buf = embd_inp.data(); if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size, 0, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } @@ -569,9 +550,8 @@ int main(int argc, char ** argv) { embd.resize(max_embd_size); console::set_display(console::error); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + LOG_WRN("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); console::set_display(console::reset); - fflush(stdout); } if (ga_n == 1) { @@ -581,14 +561,14 @@ int main(int argc, char ** argv) { // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches if (n_past + (int) embd.size() >= n_ctx) { if (params.n_predict == -2) { - LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; } const int n_left = n_past - params.n_keep; const int n_discard = n_left/2; - LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", n_past, n_left, n_ctx, params.n_keep, n_discard); llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); @@ -596,11 +576,11 @@ int main(int argc, char ** argv) { n_past -= n_discard; - LOG("after swap: n_past = %d\n", n_past); + LOG_DBG("after swap: n_past = %d\n", n_past); - LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); - LOG("clear session path\n"); + LOG_DBG("clear session path\n"); path_session.clear(); } } else { @@ -610,10 +590,10 @@ int main(int argc, char ** argv) { const int bd = (ga_w/ga_n)*(ga_n - 1); const int dd = (ga_w/ga_n) - ib*bd - ga_w; - LOG("\n"); - LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); - LOG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); - LOG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); + LOG_DBG("\n"); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i, n_past, ib*bd, ga_i + ib*bd, n_past + ib*bd); + LOG_DBG("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n); + LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd); llama_kv_cache_seq_add(ctx, 0, ga_i, n_past, ib*bd); llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n); @@ -623,7 +603,7 @@ int main(int argc, char ** argv) { ga_i += ga_w/ga_n; - LOG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); + LOG_DBG("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", n_past + bd, n_past, ga_i); } } @@ -655,19 +635,19 @@ int main(int argc, char ** argv) { n_eval = params.n_batch; } - LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str()); if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return 1; } n_past += n_eval; - LOG("n_past = %d\n", n_past); + LOG_DBG("n_past = %d\n", n_past); // Display total tokens alongside total time if (params.n_print > 0 && n_past % params.n_print == 0) { - LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + LOG_DBG("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); } } @@ -685,14 +665,14 @@ int main(int argc, char ** argv) { need_to_save_session = false; llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); - LOG("saved 
session to %s\n", path_session.c_str()); + LOG_DBG("saved session to %s\n", path_session.c_str()); } const llama_token id = gpt_sampler_sample(smpl, ctx, -1); - gpt_sampler_accept(smpl, id, /* apply_grammar= */ true); + gpt_sampler_accept(smpl, id, /* accept_grammar= */ true); - // LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, smpl->prev.to_vector()).c_str()); + // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str()); embd.push_back(id); @@ -702,16 +682,16 @@ int main(int argc, char ** argv) { // decrement remaining sampling budget --n_remain; - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { // some user input remains from prompt or interaction, forward it to processing - LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + LOG_DBG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); while ((int) embd_inp.size() > n_consumed) { embd.push_back(embd_inp[n_consumed]); // push the prompt in the sampling context in order to apply repetition penalties later // for the prompt, we don't apply grammar rules - gpt_sampler_accept(smpl, embd_inp[n_consumed], /* apply_grammar= */ false); + gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false); ++n_consumed; if ((int) embd.size() >= params.n_batch) { @@ -726,7 +706,7 @@ int main(int argc, char ** argv) { const std::string token_str = llama_token_to_piece(ctx, id, params.special); // Console/Stream Output - fprintf(stdout, "%s", token_str.c_str()); + LOG("%s", token_str.c_str()); // Record Displayed Tokens To Log // Note: Generated tokens are created one by one hence this check @@ -738,8 +718,6 @@ int main(int argc, char ** argv) { output_tokens.push_back(id); output_ss << token_str; } - - fflush(stdout); } } @@ -788,13 +766,13 @@ int main(int argc, char ** argv) { } if (is_antiprompt) { - LOG("found antiprompt: %s\n", last_output.c_str()); + LOG_DBG("found antiprompt: %s\n", last_output.c_str()); } } // deal with end of generation tokens in interactive mode if (llama_token_is_eog(model, gpt_sampler_last(smpl))) { - LOG("found an EOG token\n"); + LOG_DBG("found an EOG token\n"); if (params.interactive) { if (!params.antiprompt.empty()) { @@ -808,7 +786,7 @@ int main(int argc, char ** argv) { chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str()); } is_interacting = true; - printf("\n"); + LOG("\n"); } } @@ -819,21 +797,21 @@ int main(int argc, char ** argv) { } if (n_past > 0 && is_interacting) { - LOG("waiting for user input\n"); + LOG_DBG("waiting for user input\n"); if (params.conversation) { - printf("\n> "); + LOG("\n> "); } if (params.input_prefix_bos) { - LOG("adding input prefix BOS token\n"); + LOG_DBG("adding input prefix BOS token\n"); embd_inp.push_back(llama_token_bos(model)); } std::string buffer; if (!params.input_prefix.empty() && !params.conversation) { - LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); - printf("%s", params.input_prefix.c_str()); + LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + LOG("%s", params.input_prefix.c_str()); } // color user input only @@ -856,11 +834,11 @@ int main(int argc, char ** argv) { if (buffer.length() > 1) { // append input suffix if any if (!params.input_suffix.empty() && !params.conversation) { - LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); - printf("%s", params.input_suffix.c_str()); + LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + LOG("%s", 
params.input_suffix.c_str()); } - LOG("buffer: '%s'\n", buffer.c_str()); + LOG_DBG("buffer: '%s'\n", buffer.c_str()); const size_t original_size = embd_inp.size(); @@ -877,7 +855,7 @@ int main(int argc, char ** argv) { const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); - LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str()); // if user stop generation mid-way, we must add EOT to finish model's last response if (need_insert_eot && format_chat) { @@ -900,9 +878,9 @@ int main(int argc, char ** argv) { assistant_ss.str(""); n_remain -= line_inp.size(); - LOG("n_remain: %d\n", n_remain); + LOG_DBG("n_remain: %d\n", n_remain); } else { - LOG("empty line, passing control back\n"); + LOG_DBG("empty line, passing control back\n"); } input_echo = false; // do not echo this again @@ -918,7 +896,7 @@ int main(int argc, char ** argv) { // end of generation if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) { - LOG_TEE(" [end of text]\n"); + LOG(" [end of text]\n"); break; } @@ -931,11 +909,11 @@ int main(int argc, char ** argv) { } if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { - LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + LOG("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } - LOG_TEE("\n"); + LOG("\n\n"); gpt_perf_print(ctx, smpl); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); @@ -949,9 +927,5 @@ int main(int argc, char ** argv) { ggml_threadpool_free(threadpool); ggml_threadpool_free(threadpool_batch); -#ifndef LOG_DISABLE_LOGS - LOG_TEE("Log end\n"); -#endif // LOG_DISABLE_LOGS - return 0; } diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 758393c3d..81e2f7ed7 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -4,6 +4,7 @@ #include "arg.h" #include "common.h" #include "sampling.h" +#include "log.h" #include "llama.h" #include @@ -83,7 +84,9 @@ static void print_date_time() { char buffer[80]; strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time); - printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer); + LOG_INF("\n"); + LOG_INF("\033[35mrun parameters as of %s\033[0m\n", buffer); + LOG_INF("\n"); } // Define a split string function to ... 
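The pattern throughout these hunks is uniform: direct `printf`/`fprintf(stderr, ...)` calls and the old `LOG_TEE` macro are replaced by the severity-tagged macros from the new `log.h` (`LOG`, `LOG_INF`, `LOG_WRN`, `LOG_ERR`, `LOG_DBG`), and each example calls `gpt_init()` once at startup instead of wiring up log targets by hand. A minimal sketch of the resulting call-site style, assuming only the printf-style interface these diff hunks show and that `gpt_init()` is declared via the `common.h` the patched files already include:

```cpp
#include "common.h"   // assumed to declare gpt_init(), as in the files patched here
#include "log.h"      // LOG, LOG_INF, LOG_WRN, LOG_ERR, LOG_DBG (printf-style macros)

int main(int argc, char ** argv) {
    gpt_init();  // one-time logging setup, replacing the old log_set_target()/log_dump_cmdline() blocks

    LOG_INF("%s: starting with %d argument(s)\n", __func__, argc);  // prefixed status line
    if (argc < 2) {
        LOG_ERR("%s: missing required argument\n", __func__);       // error line
        return 1;
    }
    LOG_DBG("argv[1] = '%s'\n", argv[1]);                           // debug-level detail
    LOG("plain user-facing output\n");                               // bare LOG, no prefix
    return 0;
}
```

Note the division of labour the diff keeps to: bare `LOG` is used for text that must reach the user verbatim (generated tokens, prompts, banners), while the `*_INF`/`*_WRN`/`*_ERR`/`*_DBG` variants carry status and diagnostics that previously went through `LOG_TEE` or `stderr`.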
@@ -106,6 +109,8 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + // number of simultaneous "clients" to simulate const int32_t n_clients = params.n_parallel; @@ -120,12 +125,6 @@ int main(int argc, char ** argv) { const bool dump_kv_cache = params.dump_kv_cache; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("parallel", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -138,23 +137,22 @@ int main(int argc, char ** argv) { // load the prompts from an external file if there are any if (params.prompt.empty()) { - printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); + LOG_INF("\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); } else { // Output each line of the input params.prompts vector and copy to k_prompts int index = 0; - printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str()); + LOG_INF("\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str()); std::vector prompts = split_string(params.prompt, '\n'); for (const auto& prompt : prompts) { k_prompts.resize(index + 1); k_prompts[index] = prompt; index++; - printf("%3d prompt: %s\n", index, prompt.c_str()); + LOG_INF("%3d prompt: %s\n", index, prompt.c_str()); } } - fprintf(stderr, "\n\n"); - fflush(stderr); + LOG_INF("\n\n"); const int n_ctx = llama_n_ctx(ctx); @@ -183,19 +181,19 @@ int main(int argc, char ** argv) { const auto t_main_start = ggml_time_us(); - LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__); - LOG_TEE("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); - LOG_TEE("\n"); + LOG_INF("%s: Simulating parallel requests from clients:\n", __func__); + LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); + LOG_INF("\n"); { - LOG_TEE("%s: Evaluating the system prompt ...\n", __func__); + LOG_INF("%s: Evaluating the system prompt ...\n", __func__); for (int32_t i = 0; i < n_tokens_system; ++i) { llama_batch_add(batch, tokens_system[i], i, { 0 }, false); } if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -204,10 +202,10 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } - LOG_TEE("\n"); + LOG_INF("\n"); } - LOG_TEE("Processing requests ...\n\n"); + LOG_INF("Processing requests ...\n\n"); while (true) { if (dump_kv_cache) { @@ -238,7 +236,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } - LOG_TEE("%s: clearing the KV cache\n", __func__); + LOG_INF("%s: clearing the KV cache\n", __func__); } // insert new sequences for decoding @@ -273,7 +271,7 @@ int main(int argc, char ** argv) { client.n_decoded = 0; client.i_batch = batch.n_tokens - 1; - LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); + LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); g_seq_id += 1; @@ -317,11 +315,11 @@ int main(int argc, char ** argv) { if (ret != 0) { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it via the context size - LOG_TEE("%s : failed to decode the 
batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); + LOG_ERR("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); return 1; } - LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); + LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2); n_cache_miss += 1; @@ -332,7 +330,7 @@ int main(int argc, char ** argv) { continue; } - LOG("%s : decoded batch of %d tokens\n", __func__, n_tokens); + LOG_DBG("%s : decoded batch of %d tokens\n", __func__, n_tokens); for (auto & client : clients) { if (client.i_batch < (int) i || client.i_batch >= (int) (i + n_tokens)) { @@ -377,7 +375,7 @@ int main(int argc, char ** argv) { const auto t_main_end = ggml_time_us(); - LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n", + LOG_INF("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\n\033[35mResponse: %s\033[0m\n\n", client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded, (t_main_end - client.t_start_prompt) / 1e6, (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6, @@ -400,19 +398,19 @@ int main(int argc, char ** argv) { print_date_time(); - LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); + LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system); if (params.prompt_file.empty()) { params.prompt_file = "used built-in defaults"; } - LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str()); - LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str()); + LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str()); + LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str()); - LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); - LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); - LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6); - LOG_TEE("Cache misses: %6d\n", n_cache_miss); + LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); + LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); + LOG_INF("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6); + LOG_INF("Cache misses: %6d\n", n_cache_miss); - LOG_TEE("\n"); + LOG_INF("\n"); // TODO: print sampling/grammar timings for all clients llama_perf_context_print(ctx); @@ -424,7 +422,7 @@ int main(int argc, char ** argv) { llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 52aa68bfc..7ef8d14f3 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -1,5 +1,6 @@ #include "arg.h" #include "common.h" +#include "log.h" #include 
"llama.h" #include @@ -8,9 +9,9 @@ #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -24,6 +25,8 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + int n_junk = params.n_junk; int n_keep = params.n_keep; int n_grp = params.grp_attn_n; @@ -63,7 +66,7 @@ int main(int argc, char ** argv) { llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); if (model == NULL) { - fprintf(stderr , "%s: error: unable to load model\n" , __func__); + LOG_ERR("%s: unable to load model\n" , __func__); return 1; } @@ -77,7 +80,7 @@ int main(int argc, char ** argv) { llama_context * ctx = llama_new_context_with_model(model, ctx_params); if (ctx == NULL) { - fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + LOG_ERR("%s: failed to create the llama_context\n" , __func__); return 1; } @@ -107,14 +110,14 @@ int main(int argc, char ** argv) { const int n_batch = ctx_params.n_batch; const int n_batch_grp = ctx_params.n_batch/n_grp; - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); + LOG_INF("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d, n_junk = %d, i_pos = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch, n_junk, i_pos); // print the prompt token-by-token - LOG_TEE("\n"); - LOG_TEE("prefix tokens: %d\n", n_tokens_prefix); - LOG_TEE("prompt tokens: %d\n", n_tokens_all); - //LOG_TEE("prompt: %s\n", params.prompt.c_str()); + LOG_INF("\n"); + LOG_INF("prefix tokens: %d\n", n_tokens_prefix); + LOG_INF("prompt tokens: %d\n", n_tokens_all); + //LOG_INF("prompt: %s\n", params.prompt.c_str()); llama_batch batch = llama_batch_init(params.n_batch, 0, 1); @@ -145,11 +148,11 @@ int main(int argc, char ** argv) { } if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_INF("%s: llama_decode() failed\n", __func__); return 1; } - LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); + LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); if (i + n_batch >= n_tokens_all) { break; @@ -159,7 +162,7 @@ int main(int argc, char ** argv) { for (int i = n_ctx; i < n_tokens_all; i += n_batch) { const int n_discard = n_batch; - LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard); + LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); @@ -179,18 +182,18 @@ int main(int argc, char ** argv) { } if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } - LOG_TEE("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); + LOG_INF("%s: processed: [%6d, %6d)\n", __func__, i, std::min(i + n_batch, n_tokens_all)); } { const int n_discard = n_past - n_ctx + n_predict; if (n_discard > 0) { - LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard); + LOG_INF("%s: shifting KV 
cache with %d to free space for the answer\n", __func__, n_discard); llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx, -n_discard); @@ -201,17 +204,16 @@ int main(int argc, char ** argv) { } } - LOG_TEE("\n"); - LOG_TEE("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk); - LOG_TEE("\n"); + LOG_INF("\n"); + LOG_INF("%s: passkey = %d, inserted at position %d / %d (token pos: ~%d)\n", __func__, passkey, i_pos, n_junk, (i_pos * n_tokens_all) / n_junk); + LOG_INF("\n"); // main loop int n_cur = n_tokens_all; int n_decode = 0; - LOG_TEE("%s", prompt_suffix.c_str()); - fflush(stdout); + LOG_INF("%s", prompt_suffix.c_str()); const auto t_main_start = ggml_time_us(); @@ -222,13 +224,12 @@ int main(int argc, char ** argv) { // is it an end of generation? if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { - LOG_TEE("\n"); + LOG("\n"); break; } - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); - fflush(stdout); + LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); n_decode += 1; @@ -243,22 +244,22 @@ int main(int argc, char ** argv) { // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); return 1; } } - LOG_TEE("\n"); + LOG("\n"); const auto t_main_end = ggml_time_us(); - LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); - fprintf(stderr, "\n"); + LOG("\n"); llama_sampler_free(smpl); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 29ff86bbc..18e75a7a2 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1,7 +1,9 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" +#include #include #include #include @@ -41,7 +43,7 @@ static void write_logfile( } if (params.hellaswag) { - fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__); + LOG_WRN("%s: logging results is not implemented for HellaSwag. 
No files will be written.\n", __func__); return; } @@ -49,7 +51,7 @@ static void write_logfile( const bool success = fs_create_directory_with_parents(params.logdir); if (!success) { - fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str()); return; } @@ -58,7 +60,7 @@ static void write_logfile( FILE * logfile = fopen(logfile_path.c_str(), "w"); if (logfile == NULL) { - fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); return; } @@ -344,16 +346,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const bool add_bos = llama_add_bos_token(llama_get_model(ctx)); GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx))); - fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + LOG_INF("%s: tokenizing the input ..\n", __func__); std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); const int n_ctx = llama_n_ctx(ctx); if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, + LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, n_ctx); - fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); return {std::move(tokens), 0., {}, {}}; } @@ -364,16 +366,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & prob_history.resize(tokens.size()); if (params.ppl_stride <= 0) { - fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride); + LOG_ERR("%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride); return {tokens, -1, logit_history, prob_history}; } const int calc_chunk = n_ctx; - fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk); + LOG_INF("%s: have %zu tokens. 
Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk); if (int(tokens.size()) <= calc_chunk) { - fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, + LOG_ERR("%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__, tokens.size(), n_ctx, params.ppl_stride); return {tokens, -1, logit_history, prob_history}; } @@ -387,14 +389,14 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & int count = 0; double nll = 0.0; - fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); + LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); for (int i = 0; i < n_chunk; ++i) { const int start = i * params.ppl_stride; const int end = start + calc_chunk; const int num_batches = (calc_chunk + n_batch - 1) / n_batch; - //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches); + //LOG_DBG("%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches); std::vector logits; @@ -407,10 +409,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); - //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); + //LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); // TODO: use llama_batch.logits instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { - //fprintf(stderr, "%s : failed to eval\n", __func__); + //LOG_ERR("%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } @@ -434,16 +436,17 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & if (i == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); + LOG("%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + LOG("%.2f minutes\n", total_seconds / 60.0); } + LOG("\n"); - //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); + //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start); for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) { // Calculate probability of next token, given the previous ones. 
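For reference, the quantity printed in the following hunks is the running perplexity described by the code's own comment, "perplexity is e^(average negative log-likelihood)". A minimal sketch of that accumulation, with hypothetical names and assuming per-token probabilities have already been obtained from the model:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// ppl = exp( (1/N) * sum_i -log p(token_i | tokens_<i) )
static double perplexity_from_probs(const std::vector<double> & token_probs) {
    double nll = 0.0;                    // accumulated negative log-likelihood
    for (double p : token_probs) {
        nll += -std::log(p);             // add -log p for each predicted token
    }
    return std::exp(nll / token_probs.size());  // e^(average NLL)
}

int main() {
    // e.g. three tokens predicted with these probabilities
    std::vector<double> probs = {0.25, 0.50, 0.10};
    printf("PPL = %.4f\n", perplexity_from_probs(probs));
    return 0;
}
```

In the patch itself this corresponds to the per-chunk `std::exp(nll / count)` values being logged and the `Final estimate: PPL = ...` line at the end of the run.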
@@ -460,13 +463,12 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & } // perplexity is e^(average negative log-likelihood) if (params.ppl_output_type == 0) { - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + LOG("[%d]%.4lf,", i + 1, std::exp(nll / count)); } else { - printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count)); + LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count)); } - fflush(stdout); } - printf("\n"); + LOG("\n"); return {tokens, std::exp(nll / count), logit_history, prob_history}; } @@ -488,26 +490,26 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par if (!params.logits_file.empty()) { logits_stream.open(params.logits_file.c_str(), std::ios::binary); if (!logits_stream.is_open()) { - fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: failed to open %s for writing\n", __func__, params.logits_file.c_str()); return {}; } - fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str()); + LOG_INF("%s: saving all logits to %s\n", __func__, params.logits_file.c_str()); logits_stream.write("_logits_", 8); logits_stream.write(reinterpret_cast(&n_ctx), sizeof(n_ctx)); } auto tim1 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + LOG_INF("%s: tokenizing the input ..\n", __func__); std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); - fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); if (int(tokens.size()) < 2*n_ctx) { - fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, + LOG_ERR("%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx, n_ctx); - fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); return {std::move(tokens), 0., {}, {}}; } @@ -540,7 +542,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par logits.reserve((size_t)n_ctx * n_vocab); } - fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); + LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); std::vector workers(std::thread::hardware_concurrency() - 1); @@ -613,7 +615,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_INF("%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } @@ -628,14 +630,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par llama_synchronize(ctx); const auto t_end = std::chrono::high_resolution_clock::now(); const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total*n_chunk/n_seq); if (total_seconds >= 60*60) { - 
fprintf(stderr, "%d hours ", total_seconds / (60*60)); + LOG("%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + LOG("%.2f minutes\n", total_seconds / 60.0); } + LOG("\n"); for (int seq = 0; seq < n_seq_batch; seq++) { const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first); @@ -656,19 +659,18 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par // perplexity is e^(average negative log-likelihood) if (params.ppl_output_type == 0) { - printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count)); + LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count)); } else { double av = nll/count; double av2 = nll2/count - av*av; if (av2 > 0) av2 = sqrt(av2/(count-1)); - printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); + LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); } } - fflush(stdout); logits.clear(); } - printf("\n"); + LOG("\n"); nll2 /= count; nll /= count; @@ -676,9 +678,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par nll2 -= nll * nll; if (nll2 > 0) { nll2 = sqrt(nll2/(count-1)); - printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + LOG_INF("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); } else { - printf("Unexpected negative standard deviation of log(prob)\n"); + LOG_ERR("Unexpected negative standard deviation of log(prob)\n"); } llama_batch_free(batch); @@ -704,7 +706,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector< const int ret = llama_decode(ctx, batch_view); if (ret != 0) { - LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); + LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); return false; } @@ -790,15 +792,15 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } if (prompt_lines.size() % 6 != 0) { - fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__); + LOG_ERR("%s : number of lines in prompt not a multiple of 6.\n", __func__); return; } size_t hs_task_count = prompt_lines.size()/6; - fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); + LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM; - fprintf(stderr, "================================= is_spm = %d\n", is_spm); + LOG_INF("================================= is_spm = %d\n", is_spm); // The tasks should be randomized so the score stabilizes quickly. 
bool randomize_tasks = true; @@ -825,7 +827,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { std::vector seq_tokens[4]; }; - fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") ); + LOG_INF("%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first") ); // Select and read data from prompt lines std::vector hs_data(hs_task_count); @@ -871,9 +873,9 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } } - fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__); + LOG_INF("%s : calculating hellaswag score over selected tasks.\n", __func__); - printf("\ntask\tacc_norm\n"); + LOG("\ntask\tacc_norm\n"); double acc = 0.0f; @@ -941,7 +943,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } if (i0 == i1) { - fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); + LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0); return; } @@ -949,7 +951,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - fprintf(stderr, "%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return; } @@ -999,7 +1001,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } } - //printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx); + //LOG("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_cur.gold_ending_idx); // If the gold ending got the maximum logprobe add one accuracy point if (ending_logprob_max_idx == hs_cur.gold_ending_idx) { @@ -1007,8 +1009,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } // Print the accumulated accuracy mean x 100 - printf("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0); - fflush(stdout); + LOG("%zu\t%.8lf\n", i + 1, acc/double(i + 1)*100.0); } i0 = i1 - 1; @@ -1016,7 +1017,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { llama_batch_free(batch); - printf("\n"); + LOG("\n"); } struct winogrande_entry { @@ -1060,7 +1061,7 @@ static std::vector load_winogrande_from_csv(const std::string } } if (ipos != 4) { - printf("%s: failed to find comma separators in <%s>\n", __func__, line.c_str()); + LOG_ERR("%s: failed to find comma separators in <%s>\n", __func__, line.c_str()); continue; } auto sentence = line[comma_pos[0]+1] == '"' ? 
line.substr(comma_pos[0]+2, comma_pos[1] - comma_pos[0] - 3) @@ -1074,13 +1075,13 @@ static std::vector load_winogrande_from_csv(const std::string if (sentence[where] == '_') break; } if (where == int(sentence.size())) { - printf("%s: no _ in <%s>\n", __func__, sentence.c_str()); + LOG_ERR("%s: no _ in <%s>\n", __func__, sentence.c_str()); continue; } std::istringstream stream(answer.c_str()); int i_answer; stream >> i_answer; if (stream.fail() || i_answer < 1 || i_answer > 2) { - printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str()); + LOG_ERR("%s: failed to parse answer <%s>\n", __func__, answer.c_str()); continue; } result.emplace_back(); @@ -1109,14 +1110,14 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { auto data = load_winogrande_from_csv(params.prompt); if (data.empty()) { - fprintf(stderr, "%s: no tasks\n", __func__); + LOG_ERR("%s: no tasks\n", __func__); return; } - fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, data.size()); + LOG_INF("%s : loaded %zu tasks from prompt.\n", __func__, data.size()); if (params.winogrande_tasks > 0 && params.winogrande_tasks < data.size()) { - fprintf(stderr, "%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks); + LOG_INF("%s : selecting %zu random tasks\n", __func__, params.winogrande_tasks); std::mt19937 rng(1); std::vector aux(data.size()); for (int i = 0; i < int(data.size()); ++i) { @@ -1134,7 +1135,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { data = std::move(selected); } - fprintf(stderr, "%s : tokenizing selected tasks\n", __func__); + LOG_INF("%s : tokenizing selected tasks\n", __func__); for (auto & task : data) { task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true); @@ -1157,7 +1158,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size(); } - fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__); + LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__); const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); @@ -1218,7 +1219,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { } if (i0 == i1) { - fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); + LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0); return; } @@ -1226,7 +1227,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - fprintf(stderr, "%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return; } @@ -1286,20 +1287,20 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { ++n_done; // print the accumulated accuracy mean x 100 - printf("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer); - fflush(stdout); + LOG("%zu\t%.4lf\t%10.6f %10.6f %d %d\n", i+1, 100.0 * n_correct/n_done, score_1st, score_2nd, result, task.answer); } i0 = i1 - 1; } - printf("\n"); + LOG("\n"); if (n_done < 100) return; const float p = 1.f*n_correct/n_done; const float sigma = 100.f*sqrt(p*(1-p)/(n_done-1)); - printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma); + + 
LOG_INF("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma); } static bool deserialize_string(std::istream & in, std::string & str) { @@ -1348,7 +1349,7 @@ struct multiple_choice_task { static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) { if (task.question.empty() || task.mc1.answers.empty()) { if (log_error) { - printf("%s: found bad task with empty question and/or answers\n", __func__); + LOG_ERR("%s: found bad task with empty question and/or answers\n", __func__); } return false; } @@ -1356,7 +1357,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic for (auto& answer : task.mc1.answers) { if (answer.empty()) { if (log_error) { - printf("%s: found empty answer\n", __func__); + LOG_ERR("%s: found empty answer\n", __func__); } return false; } @@ -1410,14 +1411,14 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params uint32_t n_task; strstream.read((char *)&n_task, sizeof(n_task)); if (strstream.fail() || n_task == 0) { - printf("%s: no tasks\n", __func__); + LOG_ERR("%s: no tasks\n", __func__); return; } - printf("%s: there are %u tasks in prompt\n", __func__, n_task); + LOG_INF("%s: there are %u tasks in prompt\n", __func__, n_task); std::vector task_pos(n_task); strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t)); if (strstream.fail()) { - printf("%s: failed to read task positions from prompt\n", __func__); + LOG_ERR("%s: failed to read task positions from prompt\n", __func__); return; } @@ -1425,21 +1426,21 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) { // Use all tasks tasks.resize(n_task); - printf("%s: reading tasks", __func__); + LOG_INF("%s: reading tasks", __func__); int n_dot = std::max((int) n_task/100, 1); int i = 0; for (auto& task : tasks) { ++i; if (!task.deserialize(strstream)) { - printf("%s: failed to read task %d of %u\n", __func__, i, n_task); + LOG_ERR("%s: failed to read task %d of %u\n", __func__, i, n_task); return; } - if (i%n_dot == 0) printf("."); + if (i%n_dot == 0) LOG("."); } - printf("done\n"); + LOG("done\n"); } else { - printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task); + LOG_INF("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task); std::mt19937 rng(1); std::vector aux(n_task); for (uint32_t i = 0; i < n_task; ++i) aux[i] = i; @@ -1452,18 +1453,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params aux.pop_back(); strstream.seekg(task_pos[idx], std::ios::beg); if (!task.deserialize(strstream)) { - printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]); + LOG_ERR("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]); return; } } n_task = params.multiple_choice_tasks; } - printf("%s: preparing task data", __func__); - fflush(stdout); + LOG_INF("%s: preparing task data", __func__); if (n_task > 500) { - printf("..."); - fflush(stdout); + LOG("..."); std::atomic counter(0); std::atomic n_bad(0); auto prepare = [&counter, &n_bad, &tasks, ctx] () { @@ -1487,11 +1486,10 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params for (auto& w : workers) w = std::thread(prepare); prepare(); for (auto& w : workers) w.join(); - printf("done\n"); - 
fflush(stdout); + LOG("done\n"); int nbad = n_bad; if (nbad > 0) { - printf("%s: found %d malformed tasks\n", __func__, nbad); + LOG_ERR("%s: found %d malformed tasks\n", __func__, nbad); return; } } else { @@ -1503,16 +1501,15 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params return; } if (i_task%n_dot == 0) { - printf("."); - fflush(stdout); + LOG("."); } } - printf("done\n"); + LOG("done\n"); } - printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size()); + LOG_INF("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size()); - printf("\ntask\tacc_norm\n"); + LOG("\ntask\tacc_norm\n"); const int n_vocab = llama_n_vocab(llama_get_model(ctx)); const int n_ctx = llama_n_ctx(ctx); @@ -1591,7 +1588,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params } if (i0 == i1) { - fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0); + LOG_ERR("%s : task %zu does not fit in the context window\n", __func__, i0); return; } @@ -1599,7 +1596,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params // decode all tasks [i0, i1) if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) { - fprintf(stderr, "%s: llama_decode() failed\n", __func__); + LOG_ERR("%s: llama_decode() failed\n", __func__); return; } @@ -1623,13 +1620,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params // compute the logprobs for each ending of the decoded tasks for (size_t i = i0; i < i1; ++i) { auto & cur_task = tasks[i]; - //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str()); + //LOG("==== Evaluating <%s> with correct answer ", cur_task.question.c_str()); //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) { // if (cur_task.mc1.labels[j] == 1) { - // printf("%d", j+1); + // LOG("%d", j+1); // } //} - //printf("\n common_prefix: %zu\n", cur_task.common_prefix); + //LOG("\n common_prefix: %zu\n", cur_task.common_prefix); // get the logits of the last token of the common prefix std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float)); @@ -1641,13 +1638,13 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params size_t count = 1; float log_prob = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]); for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { - //printf(" %zu %g\n", ir, eval_results[ir]); + //LOG(" %zu %g\n", ir, eval_results[ir]); ++count; log_prob += eval_results[ir++]; } cur_task.log_probs[s] = log_prob / count; - //printf(" Final: %g\n", log_prob / count); - //printf(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count); + //LOG(" Final: %g\n", log_prob / count); + //LOG(" <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count); } // Find the ending with maximum logprob @@ -1667,8 +1664,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params ++n_done; // Print the accumulated accuracy mean x 100 - printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done); - fflush(stdout); + LOG("%d\t%.8lf\n", n_done, 100.*n_correct/n_done); } i0 = i1 - 1; @@ -1680,29 +1676,30 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params float p = 1.f*n_correct/n_done; float sigma = sqrt(p*(1-p)/(n_done-1)); - printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); + LOG("\n"); + LOG_INF("Final result: 
%.4f +/- %.4f\n", 100.f*p, 100.f*sigma); p = 1.f*n_done/n_tot_answers; sigma = sqrt(p*(1-p)/(n_done-1)); - printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); + LOG_INF("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma); - printf("\n"); + LOG_INF("\n"); } static void kl_divergence(llama_context * ctx, const gpt_params & params) { if (params.logits_file.empty()) { - fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); + LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__); return; } std::ifstream in(params.logits_file.c_str(), std::ios::binary); if (!in) { - fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: failed to open %s\n", __func__, params.logits_file.c_str()); return; } { char check[9]; check[8] = 0; in.read(check, 8); if (in.fail() || strncmp("_logits_", check, 8) != 0) { - fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str()); return; } } @@ -1710,7 +1707,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { uint32_t n_ctx; in.read((char *)&n_ctx, sizeof(n_ctx)); if (n_ctx > llama_n_ctx(ctx)) { - fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n", + LOG_ERR("%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n", __func__, params.logits_file.c_str(), n_ctx, params.n_ctx); } @@ -1718,16 +1715,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { in.read((char *)&n_vocab, sizeof(n_vocab)); in.read((char *)&n_chunk, sizeof(n_chunk)); if (in.fail()) { - fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str()); return; } if (n_vocab != llama_n_vocab(llama_get_model(ctx))) { - fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); + LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx))); } std::vector tokens(n_ctx * n_chunk); if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) { - fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); + LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str()); return; } @@ -1776,7 +1773,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { const auto t_start = std::chrono::high_resolution_clock::now(); if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) { - fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i); + LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i); return; } @@ -1797,7 +1794,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { // TODO: use llama_batch.logits instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_ERR("%s : failed to eval\n", __func__); return; } @@ 
-1814,16 +1811,16 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { if (i == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); - fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); if (total_seconds >= 60*60) { - fprintf(stderr, "%d hours ", total_seconds / (60*60)); + LOG("%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); } - fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); - - printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n"); + LOG("%.2f minutes\n", total_seconds / 60.0); } + LOG("\n"); + LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n"); const int first = n_ctx/2; const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); @@ -1832,79 +1829,77 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { p_diff_ptr += n_ctx - 1 - first; kld_ptr += n_ctx - 1 - first; - printf("%4d", i+1); + LOG("%4d", i+1); auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); const double ppl_val = exp(log_ppl.first); const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) - printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc); + LOG(" %9.4lf ± %9.4lf", ppl_val, ppl_unc); auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); - printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc); + LOG(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc); auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); - printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second); + LOG(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second); auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); const double p_diff_rms_val = sqrt(p_diff_mse.first); const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; - printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); + LOG(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); double p_top_val = 1.*kld.n_same_top/kld.count; double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1)); - printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc); + LOG(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc); - printf("\n"); - - fflush(stdout); + LOG("\n"); logits.clear(); } - printf("\n"); + LOG("\n"); if (kld.count < 100) return; // we do not wish to do statistics on so few values std::sort(kld_values.begin(), kld_values.end()); std::sort(p_diff_values.begin(), p_diff_values.end()); - printf("====== Perplexity statistics ======\n"); + LOG("====== Perplexity statistics ======\n"); auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); const double ppl_val = exp(log_ppl.first); const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) - printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc); + LOG("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc); auto log_ppl_base = 
mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); const double ppl_base_val = exp(log_ppl_base.first); const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 ) - printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc); + LOG("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc); const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); - // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov); + // LOG("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov); const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second); - printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor); + LOG("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor); const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); - printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc); + LOG("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc); const double ppl_ratio_val = exp(log_ppl_ratio_val); const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 ) - printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc); + LOG("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc); const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov; const double ppl_diff_val = ppl_val - ppl_base_val; const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov); - printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc); + LOG("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc); - printf("\n"); + LOG("\n"); - printf("====== KL divergence statistics ======\n"); + LOG("====== KL divergence statistics ======\n"); auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); - printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second); + LOG("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second); auto kld_median = kld_values.size()%2 == 0 ? 
0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1]) : kld_values[kld_values.size()/2];
@@ -1916,50 +1911,48 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
         return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)];
     };
-    printf("Maximum KLD: %10.6f\n", kld_values.back());
-    printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
-    printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
-    printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
-    printf("Median KLD: %10.6f\n", kld_median);
-    printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
-    printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
-    printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
-    printf("Minimum KLD: %10.6f\n", kld_values.front());
+    LOG("Maximum KLD: %10.6f\n", kld_values.back());
+    LOG("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f));
+    LOG("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f));
+    LOG("Median KLD: %10.6f\n", kld_median);
+    LOG("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f));
+    LOG(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f));
+    LOG(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f));
+    LOG("Minimum KLD: %10.6f\n", kld_values.front());
-    printf("\n");
+    LOG("\n");
-    printf("====== Token probability statistics ======\n");
+    LOG("====== Token probability statistics ======\n");
     auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count);
-    printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
+    LOG("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second);
     auto p_diff_median = p_diff_values.size()%2 == 0 ?
0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1]) : p_diff_values[p_diff_values.size()/2]; - printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back()); - printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f)); - printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f)); - printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f)); - printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f)); - printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f)); - printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median); - printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f)); - printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f)); - printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f)); - printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f)); - printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f)); - printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front()); + LOG("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back()); + LOG("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f)); + LOG("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f)); + LOG("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f)); + LOG("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f)); + LOG("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f)); + LOG("Median Δp: %6.3lf%%\n", 100.0*p_diff_median); + LOG("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f)); + LOG("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f)); + LOG(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f)); + LOG(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f)); + LOG(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f)); + LOG("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front()); auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); - // printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second); + // LOG("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second); const double p_diff_rms_val = sqrt(p_diff_mse.first); const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; - printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); + LOG("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); const double same_top_p = 1.0*kld.n_same_top/kld.count; - printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1))); - + LOG("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1))); } int main(int argc, char ** argv) { @@ -1972,10 +1966,12 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + const int32_t n_ctx = params.n_ctx; if (n_ctx <= 0) { - fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__); + LOG_ERR("%s: perplexity tool requires '--ctx-size' > 0\n", __func__); return 1; } @@ -2000,13 +1996,11 @@ int main(int argc, char ** argv) { } if (params.ppl_stride > 0) { - fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n", + LOG_INF("Will perform strided perplexity calculation -> adjusting context size from %d to %d\n", params.n_ctx, params.n_ctx + params.ppl_stride/2); params.n_ctx += params.ppl_stride/2; } - 
print_build_info(); - llama_backend_init(); llama_numa_init(params.numa); @@ -2016,21 +2010,21 @@ int main(int argc, char ** argv) { llama_model * model = llama_init.model; llama_context * ctx = llama_init.context; if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); + LOG_ERR("%s: unable to load model\n", __func__); return 1; } const int n_ctx_train = llama_n_ctx_train(model); if (params.n_ctx > n_ctx_train) { - fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, params.n_ctx); } // print system information { - fprintf(stderr, "\n"); - fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); } struct results_perplexity results; @@ -2046,8 +2040,9 @@ int main(int argc, char ** argv) { results = perplexity(ctx, params, n_ctx); } - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); + write_logfile(ctx, params, model, results); llama_free(ctx); diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index d08679edb..5971690f1 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -1,14 +1,16 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" #include #include +#include // TODO: remove me static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); + LOG("\n"); } struct chunk { @@ -17,7 +19,7 @@ struct chunk { // original file position size_t filepos; // original text data - std::string textdata = ""; + std::string textdata; // tokenized text data std::vector tokens; // embedding @@ -31,14 +33,14 @@ static std::vector chunk_file(const std::string & filename, int chunk_siz std::ifstream f(filename.c_str()); if (!f.is_open()) { - fprintf(stderr, "Error: could not open file %s\n", filename.c_str()); + LOG_ERR("could not open file %s\n", filename.c_str()); return chunks; } chunk current_chunk; char buffer[1024]; int64_t filepos = 0; - std::string current = ""; + std::string current; while (f.read(buffer, 1024)) { current += std::string(buffer, f.gcount()); size_t pos; @@ -84,9 +86,9 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu llama_kv_cache_clear(ctx); // run model - fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); if (llama_decode(ctx, batch) < 0) { - fprintf(stderr, "%s : failed to decode\n", __func__); + LOG_ERR("%s : failed to decode\n", __func__); } for (int i = 0; i < batch.n_tokens; i++) { @@ -99,7 +101,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu if (embd == NULL) { embd = llama_get_embeddings_ith(ctx, i); if (embd == NULL) { - fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i); + LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i); continue; } } @@ -116,24 +118,24 @@ int main(int argc, char ** argv) { return 1; } 
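Both tools touched in this series follow the same migration recipe: call `gpt_init()` once at startup (as the next hunk does for `retrieval`), then replace every `printf`/`fprintf(stderr, ...)` with a leveled macro. A minimal sketch of the convention, assuming the `common.h`/`log.h` interfaces exactly as the surrounding hunks use them:

```cpp
// Minimal sketch of the logging convention adopted in this series.
// Assumes the common.h/log.h interfaces as used in the surrounding hunks.
#include "common.h"
#include "log.h"

int main() {
    gpt_init(); // set up the common logger before any LOG_* call

    LOG_INF("%s: loading model\n", __func__);                   // status/info
    LOG_WRN("%s: falling back to defaults\n", __func__);        // warnings
    LOG_ERR("%s: failed to open file\n", __func__);             // errors
    LOG_DBG("%s: only shown at higher verbosity\n", __func__);  // debug
    LOG("raw output line without a level prefix\n");            // results, tables, dots

    return 0;
}
```

In these diffs, plain `LOG` carries the tool's actual output (score tables, progress dots) while the leveled variants carry diagnostics, which is why `printf("done\n")` becomes `LOG("done\n")` but `fprintf(stderr, ...)` becomes `LOG_ERR(...)`.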
+    gpt_init();
+
     // For BERT models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
     params.embedding = true;
     if (params.chunk_size <= 0) {
-        fprintf(stderr, "chunk_size must be positive\n");
+        LOG_ERR("chunk_size must be positive\n");
         return 1;
     }
     if (params.context_files.empty()) {
-        fprintf(stderr, "context_files must be specified\n");
+        LOG_ERR("context_files must be specified\n");
         return 1;
     }
-    print_build_info();
-
-    printf("processing files:\n");
+    LOG_INF("processing files:\n");
     for (auto & context_file : params.context_files) {
-        printf("%s\n", context_file.c_str());
+        LOG_INF("%s\n", context_file.c_str());
     }
     std::vector chunks;
@@ -141,7 +143,7 @@ int main(int argc, char ** argv) {
         std::vector file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
         chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
     }
-    printf("Number of chunks: %ld\n", chunks.size());
+    LOG_INF("Number of chunks: %zu\n", chunks.size());
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -153,7 +155,7 @@ int main(int argc, char ** argv) {
     llama_context * ctx = llama_init.context;
     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
     }
@@ -162,19 +164,19 @@ int main(int argc, char ** argv) {
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
     if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        LOG_ERR("%s: pooling type NONE not supported\n", __func__);
         return 1;
     }
     if (n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
+        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
     }
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
+        LOG_INF("\n");
+        LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
     }
     // max batch size
@@ -185,7 +187,7 @@ int main(int argc, char ** argv) {
     for (auto & chunk : chunks) {
         auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
         if (inp.size() > n_batch) {
-            fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
+            LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
             return 1;
         }
@@ -199,12 +201,12 @@ int main(int argc, char ** argv) {
     // tokenization stats
     if (params.verbose_prompt) {
         for (int i = 0; i < (int) chunks.size(); i++) {
-            fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
-            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
+            LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
+            LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
             for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
-                fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+                LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
            }
-            fprintf(stderr, "\n\n");
+            LOG_INF("\n\n");
         }
     }
@@ -256,7 +258,7 @@ int main(int argc, char ** argv) {
     // start loop, receive query and return top k similar chunks based on cosine similarity
     std::string query;
     while (true) {
-
printf("Enter query: "); + LOG("Enter query: "); std::getline(std::cin, query); std::vector query_tokens = llama_tokenize(ctx, query, true); @@ -280,18 +282,18 @@ int main(int argc, char ** argv) { return a.second > b.second; }); - printf("Top %d similar chunks:\n", params.sparams.top_k); + LOG("Top %d similar chunks:\n", params.sparams.top_k); for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) { - printf("filename: %s\n", chunks[similarities[i].first].filename.c_str()); - printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); - printf("similarity: %f\n", similarities[i].second); - printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str()); - printf("--------------------\n"); + LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str()); + LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); + LOG("similarity: %f\n", similarities[i].second); + LOG("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str()); + LOG("--------------------\n"); } } } - LOG_TEE("\n"); + LOG("\n"); llama_perf_context_print(ctx); // clean up diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 580f3a824..3e717e882 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET llama-server) -option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) -option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) + +option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) @@ -46,9 +46,6 @@ endforeach() add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) -target_compile_definitions(${TARGET} PRIVATE - SERVER_VERBOSE=$ -) target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/server/README.md b/examples/server/README.md index 44a73ca0a..168e14a99 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -121,7 +121,6 @@ The project is under active development, and we are [looking for feedback and co | `-to, --timeout N` | server read/write timeout in seconds (default: 600) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications | -| `--log-format {text, json}` | log output format: json or text (default: json) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | | `--no-slots` | disables slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index 0f18ca396..353368e13 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -40,7 +40,6 @@ server --host localhost --port 8080 \ --parallel 8 \ --batch-size 512 \ --ctx-size 4096 \ - --log-format text \ -ngl 33 ``` diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index 2daac0884..a9ed747f5 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -272,7 +272,6 @@ def start_server_background(args): server_args.append('--cont-batching') server_args.append('--metrics') server_args.append('--flash-attn') - server_args.extend(['--log-format', "text"]) args = [str(arg) for arg in [server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") pkwargs = { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 14c4af3d9..b5f264ff1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2,6 +2,7 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "sampling.h" #include "json-schema-to-grammar.h" #include "llama.h" @@ -31,22 +32,34 @@ #include "loading.html.hpp" #include -#include #include #include -#include -#include -#include -#include -#include -#include +#include #include +#include +#include +#include +#include +#include +#include + +#define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_ERR(slot, fmt, ...) LOG_ERR("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) +#define SLT_DBG(slot, fmt, ...) LOG_DBG("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__) + +#define SRV_INF(fmt, ...) LOG_INF("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_WRN(fmt, ...) LOG_WRN("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_ERR(fmt, ...) LOG_ERR("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define SRV_DBG(fmt, ...) LOG_DBG("srv %12.*s: " fmt, 12, __func__, __VA_ARGS__) + +#define QUE_INF(fmt, ...) LOG_INF("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_WRN(fmt, ...) LOG_WRN("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) +#define QUE_DBG(fmt, ...) 
LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) using json = nlohmann::ordered_json; -bool server_verbose = false; -bool server_log_json = true; - enum stop_type { STOP_TYPE_FULL, STOP_TYPE_PARTIAL, @@ -197,6 +210,8 @@ struct server_slot { std::function callback_on_release; void reset() { + SLT_DBG(*this, "%s", "\n"); + n_prompt_tokens = 0; generated_text = ""; truncated = false; @@ -234,8 +249,9 @@ struct server_slot { return state != SLOT_STATE_IDLE; } - void add_token_string(const completion_token_output & token) { + void add_token(const completion_token_output & token) { if (!is_processing()) { + SLT_WRN(*this, "%s", "slot is not processing\n"); return; } generated_token_probs.push_back(token); @@ -243,14 +259,10 @@ struct server_slot { void release() { if (is_processing()) { + SLT_INF(*this, "stop processing: n_past = %d, truncated = %d\n", n_past, truncated); + t_token_generation = (ggml_time_us() - t_start_generation) / 1e3; state = SLOT_STATE_IDLE; - LOG_INFO("slot released", { - {"id_slot", id}, - {"id_task", id_task}, - {"n_past", n_past}, - {"truncated", truncated}, - }); callback_on_release(id); } } @@ -298,49 +310,20 @@ struct server_slot { } void print_timings() const { - char buffer[512]; + const double t_prompt = t_prompt_processing / n_prompt_tokens_processed; + const double n_prompt_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; - double t_token = t_prompt_processing / n_prompt_tokens_processed; - double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; + const double t_gen = t_token_generation / n_decoded; + const double n_gen_second = 1e3 / t_token_generation * n_decoded; - snprintf(buffer, 512, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", - t_prompt_processing, n_prompt_tokens_processed, - t_token, n_tokens_second); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_prompt_processing", t_prompt_processing}, - {"n_prompt_tokens_processed", n_prompt_tokens_processed}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, - }); - - t_token = t_token_generation / n_decoded; - n_tokens_second = 1e3 / t_token_generation * n_decoded; - - snprintf(buffer, 512, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", - t_token_generation, n_decoded, - t_token, n_tokens_second); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_token_generation", t_token_generation}, - {"n_decoded", n_decoded}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, - }); - - snprintf(buffer, 512, " total time = %10.2f ms", t_prompt_processing + t_token_generation); - - LOG_INFO(buffer, { - {"id_slot", id}, - {"id_task", id_task}, - {"t_prompt_processing", t_prompt_processing}, - {"t_token_generation", t_token_generation}, - {"t_total", t_prompt_processing + t_token_generation}, - }); + SLT_INF(*this, + "\n" + "\rprompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" + "\r eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n" + "\r total time = %10.2f ms / %5d tokens\n", + t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second, + t_token_generation, n_decoded, t_gen, n_gen_second, + t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded); } }; @@ -416,8 +399,8 @@ struct server_queue { std::unique_lock lock(mutex_tasks); if (task.id == -1) { task.id = id++; - LOG_VERBOSE("new task id", {{"new_id", 
task.id}}); } + QUE_DBG("new task, id = %d, front = %d\n", task.id, front); if (front) { queue_tasks.push_front(std::move(task)); } else { @@ -433,8 +416,8 @@ struct server_queue { for (auto & task : tasks) { if (task.id == -1) { task.id = id++; - LOG_VERBOSE("new task id", {{"new_id", task.id}}); } + QUE_DBG("new task, id = %d/%d, front = %d\n", task.id, (int) tasks.size(), front); if (front) { queue_tasks.push_front(std::move(task)); } else { @@ -448,6 +431,7 @@ struct server_queue { // Add a new task, but defer until one slot is available void defer(server_task task) { std::unique_lock lock(mutex_tasks); + QUE_DBG("defer task, id = %d\n", task.id); queue_tasks_deferred.push_back(std::move(task)); condition_tasks.notify_one(); } @@ -456,7 +440,6 @@ struct server_queue { int get_new_id() { std::unique_lock lock(mutex_tasks); int new_id = id++; - LOG_VERBOSE("new task id", {{"new_id", new_id}}); return new_id; } @@ -498,7 +481,7 @@ struct server_queue { running = true; while (true) { - LOG_VERBOSE("new task may arrive", {}); + QUE_DBG("%s", "processing new tasks\n"); while (true) { std::unique_lock lock(mutex_tasks); @@ -509,21 +492,22 @@ struct server_queue { server_task task = queue_tasks.front(); queue_tasks.pop_front(); lock.unlock(); - LOG_VERBOSE("callback_new_task", {{"id_task", task.id}}); + + QUE_DBG("processing task, id = %d\n", task.id); callback_new_task(task); } // all tasks in the current loop is processed, slots data is now ready - LOG_VERBOSE("callback_update_slots", {}); + QUE_DBG("%s", "update slots\n"); callback_update_slots(); - LOG_VERBOSE("wait for new task", {}); + QUE_DBG("%s", "waiting for new tasks\n"); { std::unique_lock lock(mutex_tasks); if (queue_tasks.empty()) { if (!running) { - LOG_VERBOSE("ending start_loop", {}); + QUE_DBG("%s", "terminate\n"); return; } condition_tasks.wait(lock, [&]{ @@ -547,7 +531,7 @@ struct server_response { // add the id_task to the list of tasks waiting for response void add_waiting_task_id(int id_task) { - LOG_VERBOSE("waiting for task id", {{"id_task", id_task}}); + SRV_DBG("waiting for task id = %d\n", id_task); std::unique_lock lock(mutex_results); waiting_task_ids.insert(id_task); @@ -561,7 +545,7 @@ struct server_response { // when the request is finished, we can remove task associated with it void remove_waiting_task_id(int id_task) { - LOG_VERBOSE("remove waiting for task id", {{"id_task", id_task}}); + SRV_DBG("task id = %d is done\n", id_task); std::unique_lock lock(mutex_results); waiting_task_ids.erase(id_task); @@ -595,12 +579,13 @@ struct server_response { // Send a new result to a waiting id_task void send(server_task_result & result) { - LOG_VERBOSE("send new result", {{"id_task", result.id}}); + SRV_DBG("sending result for task id = %d\n", result.id); std::unique_lock lock(mutex_results); for (const auto & id_task : waiting_task_ids) { if (result.id == id_task) { - LOG_VERBOSE("queue_results.push_back", {{"id_task", id_task}}); + SRV_DBG("task id = %d moved to result queue\n", result.id); + queue_results.push_back(std::move(result)); condition_results.notify_all(); return; @@ -612,7 +597,7 @@ struct server_response { struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; - std::vector lora_adapters; + std::vector loras; gpt_params params; @@ -672,11 +657,13 @@ struct server_context { llama_init_result llama_init = llama_init_from_gpt_params(params); model = llama_init.model; - ctx = llama_init.context; - lora_adapters = llama_init.lora_adapters; + ctx = llama_init.context; + loras = 
llama_init.lora_adapters; + params.n_parallel -= 1; // but be sneaky about it + if (model == nullptr) { - LOG_ERROR("unable to load model", {{"model", params.model}}); + SRV_ERR("failed to load model, '%s'\n", params.model.c_str()); return false; } @@ -699,7 +686,7 @@ struct server_context { void init() { const int32_t n_ctx_slot = n_ctx / params.n_parallel; - LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); + SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel); for (int i = 0; i < params.n_parallel; i++) { server_slot slot; @@ -708,10 +695,7 @@ struct server_context { slot.n_ctx = n_ctx_slot; slot.n_predict = params.n_predict; - LOG_INFO("new slot", { - {"id_slot", slot.id}, - {"n_ctx_slot", slot.n_ctx} - }); + SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx); const int ga_n = params.grp_attn_n; const int ga_w = params.grp_attn_w; @@ -722,11 +706,7 @@ struct server_context { //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - LOG_INFO("slot self-extend", { - {"id_slot", slot.id}, - {"ga_n", ga_n}, - {"ga_w", ga_w} - }); + SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w); } slot.ga_i = 0; @@ -849,11 +829,7 @@ struct server_context { } if (ret != nullptr) { - LOG_VERBOSE("selected slot by lcp similarity", { - {"id_slot", ret->id}, - {"max_lcp_len", max_lcp_len}, - {"similarity", similarity}, - }); + SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity); } } @@ -874,10 +850,7 @@ struct server_context { } if (ret != nullptr) { - LOG_VERBOSE("selected slot by lru", { - {"id_slot", ret->id}, - {"t_last", t_last}, - }); + SLT_DBG(*ret, "selected slot by lru, t_last = %" PRId64 "\n", t_last); } } @@ -941,17 +914,14 @@ struct server_context { } if (slot.params.cache_prompt && slot.ga_n != 1) { - LOG_WARNING("cache_prompt is not supported with group-attention", {}); slot.params.cache_prompt = false; + SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n"); } if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) { // Might be better to reject the request with a 400 ? 
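The `SLT_`/`SRV_`/`QUE_` wrappers defined at the top of this file pin `__func__` into a fixed 12-character column using `printf`'s `%.*s` form, where the precision is read from an extra `int` argument. A standalone illustration in plain C++ (the helper name is made up):

```cpp
#include <cstdio>

// Demonstrates the "%12.*s" trick from the SLT_/SRV_/QUE_ macros:
// the width (12) right-aligns short names, the precision (the extra
// int argument, also 12) truncates long ones, so the log prefix
// column stays aligned no matter which function emits the message.
static void log_prefixed(const char * func) {
    std::printf("srv %12.*s: message\n", 12, func);
}

int main() {
    log_prefixed("init");                  // padded to the right edge of the column
    log_prefixed("update_slots");          // exactly 12 characters, fits as-is
    log_prefixed("launch_slot_with_task"); // truncated to "launch_slot_"
    return 0;
}
```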
- LOG_WARNING("Max tokens to predict exceeds server configuration", { - {"params.n_predict", slot.params.n_predict}, - {"slot.n_predict", slot.n_predict}, - }); slot.params.n_predict = slot.n_predict; + SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.n_predict, slot.n_predict); } // infill @@ -1060,16 +1030,13 @@ struct server_context { slot.state = SLOT_STATE_PROCESSING_PROMPT; slot.prompt_tokens.clear(); - LOG_INFO("slot is processing task", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - }); + SLT_INF(slot, "%s", "processing task\n"); return true; } void kv_cache_clear() { - LOG_VERBOSE("clearing KV cache", {}); + SRV_DBG("%s", "clearing KV cache\n"); // clear the entire KV cache llama_kv_cache_clear(ctx); @@ -1077,9 +1044,7 @@ struct server_context { } void system_prompt_update() { - LOG_VERBOSE("system prompt update", { - {"system_prompt", system_prompt}, - }); + SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str()); kv_cache_clear(); system_tokens.clear(); @@ -1100,7 +1065,7 @@ struct server_context { } if (llama_decode(ctx, batch) != 0) { - LOG_ERROR("llama_decode() failed", {}); + SRV_ERR("%s", "llama_decode() failed\n"); return; } } @@ -1115,11 +1080,9 @@ struct server_context { } bool system_prompt_set(const std::string & sys_prompt) { - system_prompt = sys_prompt; + SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str()); - LOG_VERBOSE("system prompt process", { - {"system_prompt", system_prompt}, - }); + system_prompt = sys_prompt; // release all slots for (server_slot & slot : slots) { @@ -1187,7 +1150,7 @@ struct server_context { // add the token to slot queue and cache } - slot.add_token_string(result); + slot.add_token(result); if (slot.params.stream) { send_partial_response(slot, result); } @@ -1202,55 +1165,30 @@ struct server_context { slot.stopped_limit = true; slot.has_next_token = false; - LOG_VERBOSE("stopped by limit", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_decoded", slot.n_decoded}, - {"n_predict", slot.params.n_predict}, - }); + SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict); } if (llama_token_is_eog(model, result.tok)) { slot.stopped_eos = true; slot.has_next_token = false; - LOG_VERBOSE("eos token found", {}); + SLT_DBG(slot, "%s", "stopped by EOS\n"); } - auto n_ctx_train = llama_n_ctx_train(model); - if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 - && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { - LOG_WARNING("n_predict is not set and self-context extend is disabled." - " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", { - { "id_slot", slot.id }, - { "params.n_predict", slot.params.n_predict }, - { "slot.n_prompt_tokens", slot.n_prompt_tokens }, - { "slot.n_decoded", slot.n_decoded }, - { "slot.n_predict", slot.n_predict }, - { "n_slots", params.n_parallel }, - { "slot.n_ctx", slot.n_ctx }, - { "n_ctx", n_ctx }, - { "n_ctx_train", n_ctx_train }, - { "ga_n", slot.ga_n }, - }); + const auto n_ctx_train = llama_n_ctx_train(model); + + if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { slot.truncated = true; slot.stopped_limit = true; slot.has_next_token = false; // stop prediction + + SLT_WRN(slot, + "n_predict (%d) is not set and self-context extend is disabled. 
" + "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n", + slot.params.n_predict, n_ctx_train); } - LOG_VERBOSE("next token", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"token", result.tok}, - {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, - {"n_decoded", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - }); + SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str()); return slot.has_next_token; // continue } @@ -1307,10 +1245,7 @@ struct server_context { } void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { - LOG_ERROR("task error", { - {"id_task", id_task}, - {"error", error}, - }); + SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); server_task_result res; res.id = id_task; @@ -1429,10 +1364,7 @@ struct server_context { } if (embd == NULL) { - LOG_ERROR("failed to get embeddings", { - {"token", batch.token [i]}, - {"seq_id", batch.seq_id[i][0]} - }); + SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); res.data = json { {"embedding", std::vector(n_embd, 0.0f)}, @@ -1449,6 +1381,8 @@ struct server_context { }; } + SLT_DBG(slot, "%s", "sending embeddings\n"); + queue_results.send(res); } @@ -1465,7 +1399,7 @@ struct server_context { task.type = SERVER_TASK_TYPE_COMPLETION; if (replace_prompt) { task.data = task_data; - task.data["prompt"] = prompt; + task.data["prompt"] = std::move(prompt); } else { task.data = std::move(task_data); } @@ -1509,7 +1443,8 @@ struct server_context { std::vector cancel_tasks; cancel_tasks.reserve(id_tasks.size()); for (const auto & id_task : id_tasks) { - LOG_VERBOSE("cancel task", {{"id_task", id_task}}); + SRV_WRN("cancel task, id_task = %d\n", id_task); + server_task task; task.type = SERVER_TASK_TYPE_CANCEL; task.id_target = id_task; @@ -1521,7 +1456,10 @@ struct server_context { } // receive the results from task(s) created by create_tasks_cmpl - void receive_cmpl_results(const std::unordered_set & id_tasks, std::function&)> result_handler, std::function error_handler) { + void receive_cmpl_results( + const std::unordered_set & id_tasks, + const std::function&)> & result_handler, + const std::function & error_handler) { // TODO: currently, there is no way to detect the client has cancelled the request std::vector results(id_tasks.size()); for (size_t i = 0; i < id_tasks.size(); i++) { @@ -1540,7 +1478,10 @@ struct server_context { } // receive the results from task(s) created by create_tasks_cmpl, in stream mode - void receive_cmpl_results_stream(const std::unordered_set & id_tasks, std::function result_handler, std::function error_handler) { + void receive_cmpl_results_stream( + const std::unordered_set & id_tasks, const + std::function & result_handler, const + std::function & error_handler) { size_t n_finished = 0; while (true) { server_task_result result = queue_results.recv(id_tasks); @@ -1588,13 +1529,13 @@ struct server_context { if (slot == nullptr) { // if no slot is available, we defer this task for processing later - LOG_VERBOSE("no slot is available", {{"id_task", task.id}}); + SRV_DBG("no slot is available, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; 
} if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1616,7 +1557,7 @@ struct server_context { slot->index = json_value(task.data, "index", 0); if (!launch_slot_with_task(*slot, task)) { - LOG_ERROR("error while launching slot", task.data); + SRV_ERR("failed to launch slot with task, id_task = %d\n", task.id); break; } } break; @@ -1665,18 +1606,7 @@ struct server_context { slots_data.push_back(slot_data); } - LOG_INFO("slot data", { - {"id_task", task.id}, - {"n_idle_slots", n_idle_slots}, - {"n_processing_slots", n_processing_slots} - }); - - LOG_VERBOSE("slot data", { - {"id_task", task.id}, - {"n_idle_slots", n_idle_slots}, - {"n_processing_slots", n_processing_slots}, - {"slots", slots_data} - }); + SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); server_task_result res; res.id = task.id; @@ -1722,7 +1652,7 @@ struct server_context { } if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1763,7 +1693,7 @@ struct server_context { } if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1811,7 +1741,7 @@ struct server_context { } if (slot->is_processing()) { // if requested slot is unavailable, we defer this task for processing later - LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id); queue_tasks.defer(task); break; } @@ -1833,7 +1763,7 @@ struct server_context { } break; case SERVER_TASK_TYPE_SET_LORA: { - llama_lora_adapters_apply(ctx, lora_adapters); + llama_lora_adapters_apply(ctx, loras); server_task_result result; result.id = task.id; result.stop = true; @@ -1861,7 +1791,7 @@ struct server_context { } if (all_idle) { - LOG_INFO("all slots are idle", {}); + SRV_INF("%s", "all slots are idle\n"); if (system_prompt.empty() && clean_kv_cache) { kv_cache_clear(); } @@ -1871,7 +1801,7 @@ struct server_context { } { - LOG_VERBOSE("posting NEXT_RESPONSE", {}); + SRV_DBG("%s", "posting NEXT_RESPONSE\n"); server_task task; task.type = SERVER_TASK_TYPE_NEXT_RESPONSE; @@ -1890,17 +1820,7 @@ struct server_context { const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; const int n_discard = slot.params.n_discard ? 
slot.params.n_discard : (n_left / 2); - LOG_INFO("slot context shift", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_keep", n_keep}, - {"n_left", n_left}, - {"n_discard", n_discard}, - {"n_ctx", n_ctx}, - {"n_past", slot.n_past}, - {"n_system_tokens", system_tokens.size()}, - {"n_cache_tokens", slot.cache_tokens.size()} - }); + SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); @@ -1943,15 +1863,8 @@ struct server_context { slot.cache_tokens.push_back(slot.sampled); } - LOG_VERBOSE("slot decode token", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", n_ctx}, - {"n_past", slot.n_past}, - {"n_system_tokens", system_tokens.size()}, - {"n_cache_tokens", slot.cache_tokens.size()}, - {"truncated", slot.truncated} - }); + SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n", + slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated); } // process in chunks of params.n_batch @@ -1972,10 +1885,7 @@ struct server_context { // we haven't tokenized the prompt yet - do it now: if (prompt_tokens.empty()) { - LOG_VERBOSE("tokenizing prompt", { - {"id_slot", slot.id}, - {"id_task", slot.id_task} - }); + SLT_INF(slot, "tokenizing prompt, len = %d\n", (int) slot.prompt.size()); slot.t_start_process_prompt = ggml_time_us(); slot.t_start_generation = 0; @@ -2019,21 +1929,11 @@ struct server_context { slot.n_past = 0; slot.n_prompt_tokens = prompt_tokens.size(); - LOG_VERBOSE("prompt tokenized", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_prompt_tokens", slot.n_prompt_tokens}, - {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, - }); + SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens); // empty prompt passed -> release the slot and send empty response if (prompt_tokens.empty()) { - LOG_INFO("empty prompt - releasing slot", { - {"id_slot", slot.id}, - {"id_task", slot.id_task} - }); + SLT_WRN(slot, "%s", "empty prompt - releasing slot\n"); slot.release(); slot.print_timings(); @@ -2075,15 +1975,7 @@ struct server_context { slot.truncated = true; slot.n_prompt_tokens = prompt_tokens.size(); - LOG_VERBOSE("input truncated", { - {"id_slot", slot.id}, - {"id_task", slot.id_task}, - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, - {"n_prompt_tokens", slot.n_prompt_tokens}, - {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, - }); + SLT_WRN(slot, "input truncated, n_ctx = %d, n_keep = %d, n_left = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, n_left, slot.n_prompt_tokens); GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } @@ -2108,10 +2000,7 @@ struct server_context { if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. 
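The context-shift warning above logs `n_keep`, `n_left` and `n_discard`; a worked example with invented numbers (system tokens ignored for brevity) shows how they relate:

```cpp
#include <cstdio>

// Worked example (numbers invented) of the context-shift arithmetic:
// n_keep tokens stay pinned at the front, half of the rest is dropped.
int main() {
    const int n_past = 4096; // the KV cache is full
    const int n_keep = 256;  // prompt prefix that must survive the shift

    const int n_left    = n_past - n_keep; // 3840 tokens eligible for discarding
    const int n_discard = n_left / 2;      // default: drop half, here 1920

    // llama_kv_cache_seq_rm drops [n_keep, n_keep + n_discard), then
    // llama_kv_cache_seq_add slides the remaining tail left by n_discard
    std::printf("n_left = %d, n_discard = %d, tokens kept = %d\n",
                n_left, n_discard, n_past - n_discard);
    return 0;
}
```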
- LOG_INFO("we have to evaluate at least 1 token to generate logits", { - { "id_slot", slot.id }, - { "id_task", slot.id_task } - }); + SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens); slot.n_past--; if (slot.ga_i > 0) { @@ -2160,11 +2049,7 @@ struct server_context { // remove the non-common part from the cache slot.cache_tokens.resize(slot.n_past); - LOG_INFO("kv cache rm [p0, end)", { - { "id_slot", slot.id }, - { "id_task", slot.id_task }, - { "p0", p0 } - }); + SLT_INF(slot, "kv cache rm [%d, end)\n", p0); int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past; @@ -2193,13 +2078,7 @@ struct server_context { slot_npast++; } - LOG_VERBOSE("prompt processing progress", { - {"id_slot", slot.id}, - {"n_past", slot.n_past}, - {"n_ctx", n_ctx}, - {"n_tokens", batch.n_tokens}, - {"progress", (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens}, - }); + SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); // entire prompt has been processed if (slot.n_past == slot.n_prompt_tokens) { @@ -2213,12 +2092,7 @@ struct server_context { slot.n_decoded = 0; slot.i_batch = batch.n_tokens - 1; - LOG_VERBOSE("prompt done", { - {"id_slot", slot.id}, - {"n_past", slot.n_past}, - {"n_ctx", n_ctx}, - {"n_tokens", batch.n_tokens}, - }); + SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens); } } @@ -2229,13 +2103,11 @@ struct server_context { } if (batch.n_tokens == 0) { - LOG_VERBOSE("no tokens to decode", {}); + SRV_WRN("%s", "no tokens to decode\n"); return; } - LOG_VERBOSE("decoding batch", { - {"n_tokens", batch.n_tokens}, - }); + SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); // make sure we're in the right embedding mode llama_set_embeddings(ctx, batch_type == 1); @@ -2253,10 +2125,9 @@ struct server_context { const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1); const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w; - LOG_TEE("\n"); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); - LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); - LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); + SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd); + SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); + SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd); llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); @@ -2266,7 +2137,7 @@ struct server_context { slot.ga_i += slot.ga_w / slot.ga_n; - LOG_TEE("\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", 
slot.n_past_se + bd, slot.n_past_se, slot.ga_i); + SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i); } slot.n_past_se += n_tokens; @@ -2290,11 +2161,7 @@ struct server_context { if (ret != 0) { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it via the context size - LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", { - {"i", i}, - {"n_batch", n_batch}, - {"ret", ret}, - }); + SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); for (auto & slot : slots) { slot.release(); send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size."); @@ -2306,11 +2173,7 @@ struct server_context { n_batch /= 2; i -= n_batch; - LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", { - {"i", i}, - {"n_batch", n_batch}, - {"ret", ret}, - }); + SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); continue; // continue loop of n_batch } @@ -2370,7 +2233,7 @@ struct server_context { } } - LOG_VERBOSE("run slots completed", {}); + SRV_DBG("%s", "run slots completed\n"); } json model_meta() const { @@ -2391,19 +2254,18 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp return; } - LOG_INFO("request", { - {"remote_addr", req.remote_addr}, - {"remote_port", req.remote_port}, - {"status", res.status}, - {"method", req.method}, - {"path", req.path}, - {"params", req.params}, - }); + //LOG_INFO("request", { + // {"remote_addr", req.remote_addr}, + // {"remote_port", req.remote_port}, + // {"status", res.status}, + // {"method", req.method}, + // {"path", req.path}, + // {"params", req.params}, + //}); + LOG_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); - LOG_VERBOSE("request", { - {"request", req.body}, - {"response", res.body}, - }); + LOG_DBG("request: %s\n", req.body.c_str()); + LOG_DBG("response: %s\n", res.body.c_str()); } std::function shutdown_handler; @@ -2421,9 +2283,6 @@ inline void signal_handler(int signal) { } int main(int argc, char ** argv) { -#if SERVER_VERBOSE != 1 - log_disable(); -#endif // own arguments required by this example gpt_params params; @@ -2431,9 +2290,11 @@ int main(int argc, char ** argv) { return 1; } - // TODO: not great to use extern vars - server_log_json = params.log_json; - server_verbose = params.verbosity > 0; + gpt_init(); + + // enabling this will output extra debug information in the HTTP responses from the server + // see format_final_response_oaicompat() + const bool verbose = params.verbosity > 9; // struct that contains llama context and inference server_context ctx_server; @@ -2449,17 +2310,10 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); - LOG_INFO("build info", { - {"build", LLAMA_BUILD_NUMBER}, - {"commit", LLAMA_COMMIT} - }); - - LOG_INFO("system info", { - {"n_threads", params.cpuparams.n_threads}, - {"n_threads_batch", params.cpuparams_batch.n_threads}, - {"total_threads", std::thread::hardware_concurrency()}, - {"system_info", llama_print_system_info()}, - }); + LOG_INF("system info: n_threads 
= %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); + LOG_INF("\n"); + LOG_INF("%s\n", gpt_params_get_system_info(params).c_str()); + LOG_INF("\n"); std::unique_ptr svr; #ifdef CPPHTTPLIB_OPENSSL_SUPPORT @@ -2491,13 +2345,13 @@ int main(int argc, char ** argv) { svr->set_logger(log_server_request); - auto res_error = [](httplib::Response & res, json error_data) { + auto res_error = [](httplib::Response & res, const json & error_data) { json final_response {{"error", error_data}}; res.set_content(final_response.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); res.status = json_value(error_data, "code", 500); }; - auto res_ok = [](httplib::Response & res, json data) { + auto res_ok = [](httplib::Response & res, const json & data) { res.set_content(data.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON); res.status = 200; }; @@ -2505,7 +2359,7 @@ int main(int argc, char ** argv) { svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) { std::string message; try { - std::rethrow_exception(std::move(ep)); + std::rethrow_exception(ep); } catch (std::exception & e) { message = e.what(); } catch (...) { @@ -2513,7 +2367,7 @@ int main(int argc, char ** argv) { } json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - LOG_VERBOSE("Got exception", formatted_error); + LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); res_error(res, formatted_error); }); @@ -2588,7 +2442,7 @@ int main(int argc, char ** argv) { // API key is invalid or not provided res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); - LOG_WARNING("Unauthorized: Invalid API Key", {}); + LOG_WRN("Unauthorized: Invalid API Key\n"); return false; }; @@ -2925,14 +2779,14 @@ int main(int argc, char ** argv) { } res_ok(res, arr); } - }, [&](json error_data) { + }, [&](const json & error_data) { res_error(res, error_data); }); } else { const auto chunked_content_provider = [task_ids, &ctx_server](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result result) -> bool { + ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { return server_sent_event(sink, "data", result.data); - }, [&](json error_data) { + }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); sink.done(); @@ -2953,7 +2807,7 @@ int main(int argc, char ** argv) { }; // TODO: maybe merge this function with "handle_completions_generic" - const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) { if (ctx_server.params.embedding) { res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; @@ -2970,16 +2824,16 @@ int main(int argc, char ** argv) { const auto completion_id = gen_chatcmplid(); if (!stream) { - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { + ctx_server.receive_cmpl_results(task_ids, [&](const std::vector & results) { // multitask is never support in chat completion, there is only one result - json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id); + json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id, /*.streaming =*/ false, verbose); res_ok(res, result_oai); - }, [&](json error_data) { + }, [&](const json & error_data) { res_error(res, error_data); }); } else { const auto chunked_content_provider = [task_ids, &ctx_server, completion_id](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result result) -> bool { + ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { std::vector result_array = format_partial_response_oaicompat(result.data, completion_id); for (auto & event_data : result_array) { if (event_data.empty()) { @@ -2990,7 +2844,7 @@ int main(int argc, char ** argv) { } } return true; // ok - }, [&](json error_data) { + }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); static const std::string ev_done = "data: [DONE]\n\n"; @@ -3103,7 +2957,7 @@ int main(int argc, char ** argv) { for (const auto & res : results) { responses.push_back(res.data); } - }, [&](json error_data) { + }, [&](const json & error_data) { res_error(res, error_data); error = true; }); @@ -3122,12 +2976,12 @@ int main(int argc, char ** argv) { const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) { json result = json::array(); - for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) { - auto & la = ctx_server.lora_adapters[i]; + for (size_t i = 0; i < ctx_server.loras.size(); ++i) { + auto & lora = ctx_server.loras[i]; result.push_back({ {"id", i}, - {"path", la.path}, - {"scale", la.scale}, + {"path", lora.path}, + {"scale", lora.scale}, }); } res_ok(res, result); @@ -3136,11 +2990,11 @@ int main(int argc, char ** argv) { const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) { const std::vector body = json::parse(req.body); - int max_idx = ctx_server.lora_adapters.size(); + int max_idx = ctx_server.loras.size(); // clear existing value - for (auto & la : ctx_server.lora_adapters) { - la.scale = 0.0f; + for (auto & lora : ctx_server.loras) { + lora.scale = 0.0f; } // set value @@ -3148,7 +3002,7 @@ int main(int argc, char ** argv) { int id = entry.at("id"); float scale = entry.at("scale"); if (0 <= id && id < max_idx) { - ctx_server.lora_adapters[id].scale = scale; + ctx_server.loras[id].scale = scale; } else { throw std::runtime_error("invalid adapter id"); } @@ -3243,59 +3097,59 @@ int main(int argc, char ** argv) { // bind HTTP listen port, run the HTTP server in a thread if (!svr->bind_to_port(params.hostname, params.port)) { - LOG_ERROR("couldn't bind HTTP server socket", { - {"hostname", params.hostname}, - {"port", params.port}, - }); + //LOG_ERROR("couldn't bind HTTP server socket", { + // {"hostname", params.hostname}, + // {"port", params.port}, + //}); + LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port); clean_up(); - 
LOG_ERROR("exiting due to HTTP server error", {}); return 1; } std::thread t([&]() { svr->listen_after_bind(); }); svr->wait_until_ready(); - LOG_INFO("HTTP server is listening", log_data); + //LOG_INFO("HTTP server is listening", log_data); + LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http); // load the model - LOG_INFO("loading model", log_data); + LOG_INF("%s: loading model\n", __func__); + if (!ctx_server.load_model(params)) { clean_up(); t.join(); - LOG_ERROR("exiting due to model loading error", {}); + LOG_ERR("%s: exiting due to model loading error\n", __func__); return 1; - } else { - ctx_server.init(); - state.store(SERVER_STATE_READY); - - LOG_INFO("model loaded", {}); - - // if a custom chat template is not supplied, we will use the one that comes with the model (if any) - if (params.chat_template.empty()) { - if (!ctx_server.validate_model_chat_template()) { - LOG_WARNING("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); - params.chat_template = "chatml"; - } - } - - // print sample chat example to make it clear which template is used - { - LOG_INFO("chat template", { - {"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)}, - {"built_in", params.chat_template.empty()}, - }); - } - - ctx_server.queue_tasks.on_new_task(std::bind( - &server_context::process_single_task, &ctx_server, std::placeholders::_1)); - ctx_server.queue_tasks.on_update_slots(std::bind( - &server_context::update_slots, &ctx_server)); - - shutdown_handler = [&](int) { - ctx_server.queue_tasks.terminate(); - }; - ctx_server.queue_tasks.start_loop(); } + ctx_server.init(); + state.store(SERVER_STATE_READY); + + LOG_INF("%s: model loaded\n", __func__); + + // if a custom chat template is not supplied, we will use the one that comes with the model (if any) + if (params.chat_template.empty()) { + if (!ctx_server.validate_model_chat_template()) { + LOG_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. 
This may cause the model to output suboptimal responses\n", __func__); + params.chat_template = "chatml"; + } + } + + // print sample chat example to make it clear which template is used + LOG_INF("%s: chat template, built_in: %d, chat_example: '%s\n'", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str()); + + ctx_server.queue_tasks.on_new_task(std::bind( + &server_context::process_single_task, &ctx_server, std::placeholders::_1)); + ctx_server.queue_tasks.on_update_slots(std::bind( + &server_context::update_slots, &ctx_server)); + + shutdown_handler = [&](int) { + ctx_server.queue_tasks.terminate(); + }; + + LOG_INF("%s: server is listening on %s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port); + + ctx_server.queue_tasks.start_loop(); + #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = signal_handler; diff --git a/examples/server/tests/.gitignore b/examples/server/tests/.gitignore new file mode 100644 index 000000000..1d17dae13 --- /dev/null +++ b/examples/server/tests/.gitignore @@ -0,0 +1 @@ +.venv diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index 5e6cb277b..10f22c447 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -40,7 +40,6 @@ It's possible to override some scenario steps values with environment variables: | `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` | | `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` | | `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` | -| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format | | `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` | ### Run @bug, @wip or @wrong_usage annotated scenario diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 0f4249b13..062f084be 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1372,8 +1372,6 @@ def start_server_background(context): server_args.append('--verbose') if context.lora_file: server_args.extend(['--lora', context.lora_file]) - if 'SERVER_LOG_FORMAT_JSON' not in os.environ: - server_args.extend(['--log-format', "text"]) args = [str(arg) for arg in [context.server_path, *server_args]] print(f"bench: starting server with: {' '.join(args)}") diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index adb1a1cb9..537c8a223 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -1,7 +1,8 @@ #pragma once -#include "llama.h" #include "common.h" +#include "log.h" +#include "llama.h" #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error @@ -15,10 +16,10 @@ #define JSON_ASSERT GGML_ASSERT #include "json.hpp" +#include +#include #include #include -#include -#include #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" @@ -35,32 +36,6 @@ enum error_type { ERROR_TYPE_NOT_SUPPORTED, // custom error }; -extern bool server_verbose; -extern bool server_log_json; - -#ifndef SERVER_VERBOSE -#define SERVER_VERBOSE 1 -#endif - -#if SERVER_VERBOSE != 1 -#define LOG_VERBOSE(MSG, ...) -#else -#define LOG_VERBOSE(MSG, ...) 
\ - do \ - { \ - if (server_verbose) \ - { \ - server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \ - } \ - } while (0) -#endif - -#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) - -static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra); - template static T json_value(const json & body, const std::string & key, const T & default_value) { // Fallback null to default value @@ -68,9 +43,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul try { return body.at(key); } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) { - std::stringstream ss; - ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value."; - LOG_WARNING(ss.str().c_str(), body); + LOG_WRN("Wrong type supplied for parameter '%s'. Expected '%s', using default value\n", key.c_str(), json(default_value).type_name()); return default_value; } } else { @@ -78,48 +51,6 @@ static T json_value(const json & body, const std::string & key, const T & defaul } } -static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) { - std::stringstream ss_tid; - ss_tid << std::this_thread::get_id(); - json log = json{ - {"tid", ss_tid.str()}, - {"timestamp", time(nullptr)}, - }; - - if (server_log_json) { - log.merge_patch({ - {"level", level}, - {"function", function}, - {"line", line}, - {"msg", message}, - }); - - if (!extra.empty()) { - log.merge_patch(extra); - } - - printf("%s\n", log.dump(-1, ' ', false, json::error_handler_t::replace).c_str()); - } else { - char buf[1024]; - snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); - - if (!extra.empty()) { - log.merge_patch(extra); - } - std::stringstream ss; - ss << buf << " |"; - for (const auto & el : log.items()) - { - const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); - ss << " " << el.key() << "=" << value; - } - - const std::string str = ss.str(); - printf("%.*s\n", (int)str.size(), str.data()); - } - fflush(stdout); -} - // // chat template utils // @@ -153,8 +84,9 @@ inline std::string format_chat(const struct llama_model * model, const std::stri chat.push_back({role, content}); } - auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); - LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}}); + const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true); + LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str()); + return formatted_chat; } @@ -243,10 +175,7 @@ static std::string random_string() { } static std::string gen_chatcmplid() { - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - - return chatcmplid.str(); + return "chatcmpl-" + random_string(); } // @@ -287,7 +216,7 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin return std::string::npos; } -static bool json_is_array_of_numbers(json data) { +static bool json_is_array_of_numbers(const json & data) { if (data.is_array()) { for (const auto & e : data) { if (!e.is_number()) { @@ -363,15 +292,13 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector unsupported_params { "tools", 
"tool_choice" }; - for (auto & param : unsupported_params) { + for (const auto & param : unsupported_params) { if (body.contains(param)) { throw std::runtime_error("Unsupported param: " + param); } @@ -444,7 +371,7 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) { +static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) { bool stopped_word = result.count("stopped_word") != 0; bool stopped_eos = json_value(result, "stopped_eos", false); int num_tokens_predicted = json_value(result, "tokens_predicted", 0); @@ -481,7 +408,8 @@ static json format_final_response_oaicompat(const json & request, json result, c {"id", completion_id} }; - if (server_verbose) { + // extra fields for debugging purposes + if (verbose) { res["__verbose"] = result; } @@ -493,7 +421,7 @@ static json format_final_response_oaicompat(const json & request, json result, c } // return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat(json result, const std::string & completion_id) { +static std::vector format_partial_response_oaicompat(const json & result, const std::string & completion_id) { if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { return std::vector({result}); } @@ -595,7 +523,7 @@ static std::vector format_partial_response_oaicompat(json result, const st static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { json data = json::array(); int i = 0; - for (auto & elem : embeddings) { + for (const auto & elem : embeddings) { data.push_back(json{ {"embedding", json_value(elem, "embedding", json::array())}, {"index", i++}, diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 0c923d4ed..c2b7267c8 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -1,16 +1,14 @@ #include "arg.h" #include "common.h" +#include "log.h" #include "llama.h" -#include -#include -#include #include static void print_usage(int, char ** argv) { - LOG_TEE("\nexample usage:\n"); - LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); - LOG_TEE("\n"); + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); + LOG("\n"); } int main(int argc, char ** argv) { @@ -23,6 +21,8 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + // total length of the sequence including the prompt const int n_predict = params.n_predict; @@ -69,25 +69,24 @@ int main(int argc, char ** argv) { const int n_ctx = llama_n_ctx(ctx); const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size()); - LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req); + LOG("\n"); + LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { - LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); - LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__); + LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); + LOG_ERR("%s: either reduce n_predict or increase 
n_ctx\n", __func__); return 1; } // print the prompt token-by-token - fprintf(stderr, "\n"); + LOG("\n"); for (auto id : tokens_list) { - fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str()); + LOG("%s", llama_token_to_piece(ctx, id).c_str()); } - fflush(stderr); - // create a llama_batch with size 512 // we use this object to submit token data for decoding @@ -102,7 +101,7 @@ int main(int argc, char ** argv) { batch.logits[batch.n_tokens - 1] = true; if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG("%s: llama_decode() failed\n", __func__); return 1; } @@ -116,16 +115,16 @@ int main(int argc, char ** argv) { while (n_cur <= n_predict) { // sample the next token { - const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1); + const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1); // is it an end of generation? if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { - LOG_TEE("\n"); + LOG("\n"); break; } - LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str()); + LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str()); fflush(stdout); // prepare the next batch @@ -141,23 +140,23 @@ int main(int argc, char ** argv) { // evaluate the current batch with the transformer model if (llama_decode(ctx, batch)) { - fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); return 1; } } - LOG_TEE("\n"); + LOG("\n"); const auto t_main_end = ggml_time_us(); - LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", + LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n", __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f)); - LOG_TEE("\n"); + LOG("\n"); llama_perf_sampler_print(smpl); llama_perf_context_print(ctx); - fprintf(stderr, "\n"); + LOG("\n"); llama_batch_free(batch); llama_sampler_free(smpl); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 843579acd..fbac21811 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -1,13 +1,16 @@ #include "arg.h" #include "common.h" #include "sampling.h" +#include "log.h" #include "llama.h" +#include #include +#include +#include +#include #include #include -#include -#include #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 @@ -33,8 +36,10 @@ int main(int argc, char ** argv) { return 1; } + gpt_init(); + if (params.model_draft.empty()) { - fprintf(stderr, "%s: error: --model-draft is required\n", __func__); + LOG_ERR("%s: --model-draft is required\n", __func__); return 1; } @@ -47,12 +52,6 @@ int main(int argc, char ** argv) { std::default_random_engine rng(params.sparams.seed); std::uniform_real_distribution<> u_dist; -#ifndef LOG_DISABLE_LOGS - log_set_target(log_filename_generator("speculative", "log")); - LOG_TEE("Log start\n"); - log_dump_cmdline(argc, argv); -#endif // LOG_DISABLE_LOGS - // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); @@ -81,14 +80,14 @@ int main(int argc, char ** argv) { ctx_dft = llama_init_dft.context; const bool vocab_type_tgt = llama_vocab_type(model_tgt); - LOG("vocab_type tgt: %d\n", vocab_type_tgt); + LOG_DBG("vocab_type tgt: %d\n", vocab_type_tgt); const bool vocab_type_dft = llama_vocab_type(model_dft); - LOG("vocab_type dft: %d\n", vocab_type_dft); + LOG_DBG("vocab_type dft: %d\n", vocab_type_dft); if (vocab_type_tgt 
!= vocab_type_dft) { - fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__); - fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt); + LOG_ERR("%s: draft model vocab type must match target model to use speculation but ", __func__); + LOG_ERR("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt); return 1; } @@ -98,7 +97,7 @@ int main(int argc, char ** argv) { llama_token_bos(model_tgt) != llama_token_bos(model_dft) || llama_token_eos(model_tgt) != llama_token_eos(model_dft) ) { - fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__); + LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__); return 1; } @@ -110,8 +109,8 @@ int main(int argc, char ** argv) { : n_vocab_dft - n_vocab_tgt; if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) { - fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__); - fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", + LOG_ERR("%s: draft model vocab must closely match target model to use speculation but ", __func__); + LOG_ERR("target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n", n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE); return 1; } @@ -120,8 +119,8 @@ int main(int argc, char ** argv) { const char * token_text_tgt = llama_token_get_text(model_tgt, i); const char * token_text_dft = llama_token_get_text(model_dft, i); if (std::strcmp(token_text_tgt, token_text_dft) != 0) { - fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__); - fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i, + LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__); + LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i, llama_token_to_piece(ctx_tgt, i).c_str(), llama_token_to_piece(ctx_dft, i).c_str()); return 1; @@ -138,18 +137,16 @@ int main(int argc, char ** argv) { const int max_tokens_list_size = max_context_size - 4; if ((int) inp.size() > max_tokens_list_size) { - fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); + LOG_ERR("%s: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size); return 1; } - fprintf(stderr, "\n\n"); + LOG("\n\n"); for (auto id : inp) { - fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str()); + LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str()); } - fflush(stderr); - const int n_input = inp.size(); const auto t_enc_start = ggml_time_us(); @@ -211,7 +208,7 @@ int main(int argc, char ** argv) { active_seqs.insert(s); const auto & tokens = drafts[s].tokens; - LOG("draft %d: %s\n", s, LOG_TOKENS_TOSTR_PRETTY(ctx_dft, tokens).c_str()); + LOG_DBG("draft %d: %s\n", s, string_from(ctx_dft, tokens).c_str()); } int i_dft = 0; @@ -254,7 +251,7 @@ int main(int argc, char ** argv) { continue; } - LOG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size()); + LOG_DBG("verifying sequence #%d at pos #%d from %d active sequence(s)\n", s, i_dft, (int) active_seqs.size()); float r = u_dist(rng); llama_token_data_array dist_dft = { drafts[s].dists[i_dft].data() , 
drafts[s].dists[i_dft].size(), LLAMA_TOKEN_NULL, true }; @@ -272,7 +269,7 @@ int main(int argc, char ** argv) { break; } } - LOG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt); + LOG_DBG("r = %f, p_dft = %f, p_tgt = %f\n", r, p_dft, p_tgt); if (r <= p_tgt / p_dft) { s_keep = s; accept = true; @@ -280,10 +277,10 @@ int main(int argc, char ** argv) { token_str = llama_token_to_piece(ctx_tgt, token_id); gpt_sampler_accept(smpl, token_id, true); - LOG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); + LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str()); break; } else { - LOG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); + LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str()); drafts[s].active = false; // calculate residual probability @@ -338,7 +335,7 @@ int main(int argc, char ** argv) { if (!accept) { // all drafted tokens were rejected // sample from the target model - LOG("all drafted tokens were rejected, sampling from residual distribution\n"); + LOG_DBG("all drafted tokens were rejected, sampling from residual distribution\n"); std::vector probs(dist_tgt.size); for (size_t i = 0; i < dist_tgt.size; ++i) { probs[i] = dist_tgt.data[i].p; @@ -356,13 +353,11 @@ int main(int argc, char ** argv) { // greedy verification // sample from the target model - LOG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); + LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]); token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]); gpt_sampler_accept(smpl, token_id, true); - //LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, smpl->prev).c_str()); - token_str = llama_token_to_piece(ctx_tgt, token_id); for (int s = 0; s < n_seq_dft; ++s) { @@ -371,7 +366,7 @@ int main(int argc, char ** argv) { } if (i_dft < (int) drafts[s].tokens.size() && token_id == drafts[s].tokens[i_dft]) { - LOG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str()); + LOG_DBG("the sampled target token matches the %dth drafted token of sequence %d (%d, '%s') - accepted\n", i_dft, s, token_id, token_str.c_str()); s_keep = s; accept = true; @@ -393,26 +388,24 @@ int main(int argc, char ** argv) { ++i_dft; if (params.use_color) { // Color token according to its origin sequence - printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str()); + LOG("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str()); } else { - printf("%s", token_str.c_str()); + LOG("%s", token_str.c_str()); } - fflush(stdout); continue; } else { - printf("%s", token_str.c_str()); - fflush(stdout); + LOG("%s", token_str.c_str()); break; } } } { - LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str()); + LOG_DBG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", token_id, token_str.c_str()); // TODO: simplify { - LOG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft); + LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, 
n_past_dft); llama_kv_cache_seq_keep(ctx_dft, s_keep); llama_kv_cache_seq_cp (ctx_dft, s_keep, 0, -1, -1); @@ -439,7 +432,7 @@ int main(int argc, char ** argv) { llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true); llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1); - // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); + // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str()); llama_decode(ctx_dft, batch_dft); ++n_past_dft; @@ -486,7 +479,7 @@ int main(int argc, char ** argv) { const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl); for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) { - LOG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", + LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n", k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); } @@ -495,7 +488,7 @@ int main(int argc, char ** argv) { // attempt to split the branch if the probability is high enough for (int f = 1; f < 8; ++f) { if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) { - LOG("splitting seq %3d into %3d\n", s, n_seq_cur); + LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur); llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1); llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1); @@ -584,7 +577,7 @@ int main(int argc, char ** argv) { llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1); } - // LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); + // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str()); llama_decode(ctx_tgt, batch_tgt); ++n_past_tgt; } @@ -602,23 +595,25 @@ int main(int argc, char ** argv) { auto t_dec_end = ggml_time_us(); - LOG_TEE("\n\n"); + LOG("\n\n"); - LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); - LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); + LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); + LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); - LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_INF("\n"); + LOG_INF("n_draft = %d\n", n_draft); + LOG_INF("n_predict = %d\n", n_predict); + LOG_INF("n_drafted = %d\n", n_drafted); + LOG_INF("n_accept = %d\n", n_accept); + LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); - LOG_TEE("\ndraft:\n\n"); + LOG_INF("\n"); + LOG_INF("draft:\n\n"); // TODO: print sampling/grammar timings for all drafts llama_perf_context_print(ctx_dft); - LOG_TEE("\ntarget:\n\n"); + LOG_INF("\n"); + LOG_INF("target:\n\n"); gpt_perf_print(ctx_tgt, smpl); gpt_sampler_free(smpl); @@ -637,7 +632,7 @@ int main(int argc, char ** argv) { llama_backend_free(); - fprintf(stderr, "\n\n"); + LOG("\n\n"); return 0; } diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index c817be566..a9af6471f 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ 
-1,11 +1,13 @@ #include "common.h" +//#include "log.h" // TODO: start using log.h #include "llama.h" -#include #include +#include #include #include #include +#include // TODO: remove me #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN @@ -13,25 +15,25 @@ #include // For CommandLineToArgvW #endif -static void print_usage_information(const char * argv0, FILE * stream) { - fprintf(stream, "usage: %s [options]\n\n", argv0); - fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n"); - fprintf(stream, "and prints the resulting tokens to standard output.\n\n"); - fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n"); - fprintf(stream, "to control the behavior of the tokenizer.\n\n"); - fprintf(stream, " The possible options are:\n"); - fprintf(stream, "\n"); - fprintf(stream, " -h, --help print this help and exit\n"); - fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n"); - fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n"); - fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n"); - fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n"); - fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); - fprintf(stream, " --stdin read prompt from standard input.\n"); - fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); - fprintf(stream, " --no-parse-special do not parse control tokens.\n"); - fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n"); - fprintf(stream, " --show-count print the total number of tokens.\n"); +static void print_usage_information(const char * argv0) { + printf("usage: %s [options]\n\n", argv0); + printf("The tokenize program tokenizes a prompt using a given model,\n"); + printf("and prints the resulting tokens to standard output.\n\n"); + printf("It needs a model file, a prompt, and optionally other flags\n"); + printf("to control the behavior of the tokenizer.\n\n"); + printf(" The possible options are:\n"); + printf("\n"); + printf(" -h, --help print this help and exit\n"); + printf(" -m MODEL_PATH, --model MODEL_PATH path to model.\n"); + printf(" --ids if given, only print numerical token IDs, and not token strings.\n"); + printf(" The output format looks like [1, 2, 3], i.e. parseable by Python.\n"); + printf(" -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n"); + printf(" -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); + printf(" --stdin read prompt from standard input.\n"); + printf(" --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); + printf(" --no-parse-special do not parse control tokens.\n"); + printf(" --log-disable disable logs. 
Makes stderr quiet when loading the model.\n"); + printf(" --show-count print the total number of tokens.\n"); } static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) { @@ -185,7 +187,7 @@ int main(int raw_argc, char ** raw_argv) { const int argc = argv.size(); if (argc <= 1) { - print_usage_information(argv[0].c_str(), stderr); + print_usage_information(argv[0].c_str()); return 1; } @@ -214,7 +216,7 @@ int main(int raw_argc, char ** raw_argv) { for (; iarg < argc; ++iarg) { std::string arg{argv[iarg]}; if (arg == "-h" || arg == "--help") { - print_usage_information(argv[0].c_str(), stdout); + print_usage_information(argv[0].c_str()); return 0; } else if (arg == "--ids") { @@ -323,10 +325,6 @@ int main(int raw_argc, char ** raw_argv) { // Start actually doing the tokenizing stuff. ////// -#ifdef LOG_DISABLE_LOGS - disable_logging = true; -#endif - if (disable_logging) { llama_log_set(llama_log_callback_null, NULL); } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 13026ab32..a413df357 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -564,10 +564,11 @@ extern "C" { }; enum ggml_log_level { - GGML_LOG_LEVEL_ERROR = 2, - GGML_LOG_LEVEL_WARN = 3, - GGML_LOG_LEVEL_INFO = 4, - GGML_LOG_LEVEL_DEBUG = 5 + GGML_LOG_LEVEL_NONE = 0, + GGML_LOG_LEVEL_INFO = 1, + GGML_LOG_LEVEL_WARN = 2, + GGML_LOG_LEVEL_ERROR = 3, + GGML_LOG_LEVEL_DEBUG = 4, }; enum ggml_tensor_flag { diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 6c85acfec..7f85dc30d 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -13,13 +13,16 @@ #define MAX(a, b) ((a) > (b) ? (a) : (b)) #ifdef GGML_METAL_NDEBUG +#define GGML_METAL_LOG(...) #define GGML_METAL_LOG_INFO(...) #define GGML_METAL_LOG_WARN(...) #define GGML_METAL_LOG_ERROR(...) #else +#define GGML_METAL_LOG(...) ggml_metal_log(GGML_LOG_LEVEL_NONE, __VA_ARGS__) #define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__) #define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__) #define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) +#define GGML_METAL_LOG_DEBUG(...) ggml_metal_log(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__) #endif #define UNUSED(x) (void)(x) @@ -3183,7 +3186,7 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s #ifndef GGML_METAL_NDEBUG #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { - GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)", + GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n", __func__, size_aligned / 1024.0 / 1024.0, device.currentAllocatedSize / 1024.0 / 1024.0, @@ -3191,8 +3194,6 @@ static void ggml_backend_metal_log_allocated_size(id device, size_t s if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); - } else { - GGML_METAL_LOG_INFO("\n"); } } else { GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n", diff --git a/src/llama-impl.h b/src/llama-impl.h index 87012617f..2bde75ec1 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -24,6 +24,7 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3) void llama_log_internal (ggml_log_level level, const char * format, ...); void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data); +#define LLAMA_LOG(...) 
llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__) #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) #define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__) #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__) diff --git a/src/llama.cpp b/src/llama.cpp index 1986a90fb..c917d1c7b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -18074,9 +18074,9 @@ struct llama_model * llama_load_model_from_file( unsigned percentage = (unsigned) (100 * progress); while (percentage > *cur_percentage_p) { *cur_percentage_p = percentage; - LLAMA_LOG_INFO("."); + LLAMA_LOG("."); if (percentage >= 100) { - LLAMA_LOG_INFO("\n"); + LLAMA_LOG("\n"); } } return true; @@ -20781,8 +20781,8 @@ static void llama_log_internal_v(ggml_log_level level, const char * format, va_l if (len < 128) { g_state.log_callback(level, buffer, g_state.log_callback_user_data); } else { - char* buffer2 = new char[len+1]; - vsnprintf(buffer2, len+1, format, args_copy); + char * buffer2 = new char[len + 1]; + vsnprintf(buffer2, len + 1, format, args_copy); buffer2[len] = 0; g_state.log_callback(level, buffer2, g_state.log_callback_user_data); delete[] buffer2; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 30e71cfd4..7dcd3fce8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -108,6 +108,7 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) # llama_target_and_test(test-double-float.cpp) # SLOW +llama_target_and_test(test-log.cpp) llama_target_and_test(test-arg-parser.cpp) llama_target_and_test(test-quantize-fns.cpp) llama_target_and_test(test-quantize-perf.cpp) diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index f26707910..e07d09733 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -85,7 +85,7 @@ int main(void) { argv = {"binary_name", "--verbose"}; assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - assert(params.verbosity == 1); + assert(params.verbosity > 1); argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); diff --git a/tests/test-log.cpp b/tests/test-log.cpp new file mode 100644 index 000000000..211222369 --- /dev/null +++ b/tests/test-log.cpp @@ -0,0 +1,39 @@ +#include "log.h" + +#include +#include + +int main() { + const int n_thread = 8; + + std::thread threads[n_thread]; + for (int i = 0; i < n_thread; i++) { + threads[i] = std::thread([i]() { + const int n_msg = 1000; + + for (int j = 0; j < n_msg; j++) { + const int log_type = std::rand() % 4; + + switch (log_type) { + case 0: LOG_INF("Thread %d: %d\n", i, j); break; + case 1: LOG_WRN("Thread %d: %d\n", i, j); break; + case 2: LOG_ERR("Thread %d: %d\n", i, j); break; + case 3: LOG_DBG("Thread %d: %d\n", i, j); break; + default: + break; + } + + if (rand () % 10 < 5) { + gpt_log_set_timestamps(gpt_log_main(), rand() % 2); + gpt_log_set_prefix (gpt_log_main(), rand() % 2); + } + } + }); + } + + for (int i = 0; i < n_thread; i++) { + threads[i].join(); + } + + return 0; +} From 90a2fff0e7f80c6fea3fc6cf9a7b482744f3f164 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Sep 2024 05:14:23 +0300 Subject: [PATCH 55/65] flake.lock: Update (#9488) --- 
flake.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/flake.lock b/flake.lock index e9382ff3d..0db5ff92a 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1725234343, - "narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=", + "lastModified": 1726153070, + "narHash": "sha256-HO4zgY0ekfwO5bX0QH/3kJ/h4KvUDFZg8YpkNwIbg1U=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "567b938d64d4b4112ee253b9274472dc3a346eb6", + "rev": "bcef6817a8b2aa20a5a6dbb19b43e63c5bf8619a", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1725634671, - "narHash": "sha256-v3rIhsJBOMLR8e/RNWxr828tB+WywYIoajrZKFM+0Gg=", + "lastModified": 1726062873, + "narHash": "sha256-IiA3jfbR7K/B5+9byVi9BZGWTD4VSbWe8VLpp9B/iYk=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "574d1eac1c200690e27b8eb4e24887f8df7ac27c", + "rev": "4f807e8940284ad7925ebd0a0993d2a1791acb2f", "type": "github" }, "original": { From c4965a64f72ac9434c21cf0e1c3421d13e889155 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Sep 2024 09:05:56 +0300 Subject: [PATCH 56/65] metal : handle zero-sized allocs (#9466) --- ggml/src/ggml-metal.m | 48 ++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m index 7f85dc30d..f87181d19 100644 --- a/ggml/src/ggml-metal.m +++ b/ggml/src/ggml-metal.m @@ -3225,15 +3225,19 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff ctx->n_buffers = 1; if (ctx->all_data != NULL) { - ctx->buffers[0].data = ctx->all_data; - ctx->buffers[0].size = size; - ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data - length:size_aligned - options:MTLResourceStorageModeShared - deallocator:nil]; + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + ctx->buffers[0].metal = nil; + + if (size_aligned > 0) { + ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data + length:size_aligned + options:MTLResourceStorageModeShared + deallocator:nil]; + } } - if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) { + if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) { GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); free(ctx); ggml_backend_metal_free_device(); @@ -3310,14 +3314,17 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, // the buffer fits into the max buffer size allowed by the device if (size_aligned <= device.maxBufferLength) { - ctx->buffers[ctx->n_buffers].data = data; - ctx->buffers[ctx->n_buffers].size = size; + ctx->buffers[ctx->n_buffers].data = data; + ctx->buffers[ctx->n_buffers].size = size; + ctx->buffers[ctx->n_buffers].metal = nil; - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; + if (size_aligned > 0) { + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); - return false; + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed 
to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); + return false; + } } ggml_backend_metal_log_allocated_size(device, size_aligned); @@ -3333,14 +3340,17 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, for (size_t i = 0; i < size; i += size_step) { const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); - ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); - ctx->buffers[ctx->n_buffers].size = size_step_aligned; + ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + ctx->buffers[ctx->n_buffers].metal = nil; - ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; + if (size_step_aligned > 0) { + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; - if (ctx->buffers[ctx->n_buffers].metal == nil) { - GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); - return false; + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0); + return false; + } } ggml_backend_metal_log_allocated_size(device, size_step_aligned); From 441b72b91f818fe69497e5816f87969e90c73c43 Mon Sep 17 00:00:00 2001 From: Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com> Date: Mon, 16 Sep 2024 01:20:01 -0500 Subject: [PATCH 57/65] main : option to disable context shift (#9484) * added cli arg to disable context shift * reverted precommit * updated README.md for main * white space * allow disabling context shift in the server * Update common/arg.cpp no-context-shift only works for main example Co-authored-by: Georgi Gerganov * added server example to --no-context-shift args * removed server changes * white space --------- Co-authored-by: Georgi Gerganov --- common/arg.cpp | 8 ++++++- common/common.h | 1 + examples/main/README.md | 2 ++ examples/main/main.cpp | 46 +++++++++++++++++++++++------------------ 4 files changed, 36 insertions(+), 21 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 8fcb8c25f..60e37a89a 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -685,6 +685,13 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, params.n_keep = value; } )); + add_opt(llama_arg( + {"--no-context-shift"}, + format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? 
"disabled" : "enabled"), + [](gpt_params & params) { + params.ctx_shift = false; + } + ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(llama_arg( {"--chunks"}, "N", format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), @@ -1985,4 +1992,3 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, return ctx_arg; } - diff --git a/common/common.h b/common/common.h index e100c8fa7..cb87c4479 100644 --- a/common/common.h +++ b/common/common.h @@ -246,6 +246,7 @@ struct gpt_params { bool cont_batching = true; // insert new sequences for decoding on-the-fly bool flash_attn = false; // flash attention bool no_perf = false; // disable performance metrics + bool ctx_shift = true; // context shift on inifinite text generation bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool logits_all = false; // return logits for all tokens in the batch diff --git a/examples/main/README.md b/examples/main/README.md index 9396a34fa..6730effdf 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -161,6 +161,8 @@ A value of -1 will enable infinite text generation, even though we have a finite If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled. +The `--no-context-shift` option allows you to stop the infinite text generation once the finite context window is full. + It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `--predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter. 
### Temperature diff --git a/examples/main/main.cpp b/examples/main/main.cpp index d9e45ce2f..91fea9326 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -559,29 +559,35 @@ int main(int argc, char ** argv) { // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + if (n_past + (int) embd.size() >= n_ctx) { - if (params.n_predict == -2) { - LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + if (!params.ctx_shift){ + LOG_DBG("\n\n%s: context full and context shift is disabled => stopping\n", __func__); break; + } else { + if (params.n_predict == -2) { + LOG_DBG("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep; + const int n_discard = n_left/2; + + LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); + llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + LOG_DBG("after swap: n_past = %d\n", n_past); + + LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); + + LOG_DBG("clear session path\n"); + path_session.clear(); } - - const int n_left = n_past - params.n_keep; - const int n_discard = n_left/2; - - LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", - n_past, n_left, n_ctx, params.n_keep, n_discard); - - llama_kv_cache_seq_rm (ctx, 0, params.n_keep , params.n_keep + n_discard); - llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard); - - n_past -= n_discard; - - LOG_DBG("after swap: n_past = %d\n", n_past); - - LOG_DBG("embd: %s\n", string_from(ctx, embd).c_str()); - - LOG_DBG("clear session path\n"); - path_session.clear(); } } else { // context extension via Self-Extend From 95ca85168b5b089b80a811b59528ce0a2f1bd1dd Mon Sep 17 00:00:00 2001 From: CarryFun <76023481+CarryFun@users.noreply.github.com> Date: Mon, 16 Sep 2024 14:45:20 +0800 Subject: [PATCH 58/65] llama : support MiniCPM3 (#9322) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: 范睿凯 --- convert_hf_to_gguf.py | 54 +++++++ gguf-py/gguf/constants.py | 19 +++ src/llama.cpp | 298 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 371 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2c6d5d95b..d995ed764 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1841,6 +1841,60 @@ class MiniCPMModel(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("MiniCPM3ForCausalLM") +class MiniCPM3Model(Model): + model_arch = gguf.MODEL_ARCH.MINICPM3 + + def set_gguf_parameters(self): + hparams = self.hparams + + rope_dims = hparams["qk_rope_head_dim"] + + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(hparams["hidden_size"]) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) + self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) + 
self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: + self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) + self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + + rope_scaling = self.find_hparam(['rope_scaling'], True) + if rope_scaling is None: + return + + long_factors = rope_scaling.get('long_factor', None) + short_factors = rope_scaling.get('short_factor', None) + + if long_factors is None or short_factors is None: + raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') + + if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: + raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}') + + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32)) + self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32)) + + def set_vocab(self): + self._set_vocab_llama_hf() + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + @Model.register("QWenLMHeadModel") class QwenModel(Model): model_arch = gguf.MODEL_ARCH.QWEN diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c87d08782..2c8545455 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -210,6 +210,7 @@ class MODEL_ARCH(IntEnum): ORION = auto() INTERNLM2 = auto() MINICPM = auto() + MINICPM3 = auto() GEMMA = auto() GEMMA2 = auto() STARCODER2 = auto() @@ -364,6 +365,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.ORION: "orion", MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.MINICPM3: "minicpm3", MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.GEMMA2: "gemma2", MODEL_ARCH.STARCODER2: "starcoder2", @@ -867,6 +869,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.MINICPM3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q_A, + MODEL_TENSOR.ATTN_Q_B, + MODEL_TENSOR.ATTN_KV_A_MQA, + MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_Q_A_NORM, + MODEL_TENSOR.ATTN_KV_A_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.GEMMA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/src/llama.cpp b/src/llama.cpp index c917d1c7b..aa565723b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -193,6 +193,7 @@ enum llm_arch { LLM_ARCH_ORION, LLM_ARCH_INTERNLM2, LLM_ARCH_MINICPM, + LLM_ARCH_MINICPM3, LLM_ARCH_GEMMA, LLM_ARCH_GEMMA2, LLM_ARCH_STARCODER2, @@ -241,6 +242,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_ORION, "orion" }, { LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_MINICPM3, "minicpm3" }, { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_GEMMA2, 
"gemma2" }, { LLM_ARCH_STARCODER2, "starcoder2" }, @@ -1034,6 +1036,29 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, }, }, + { + LLM_ARCH_MINICPM3, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" }, + { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, + { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" }, + { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, + { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, + { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, + }, { LLM_ARCH_GEMMA, { @@ -5390,6 +5415,17 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_MINICPM3: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); + ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + + switch (hparams.n_layer) { + case 62: model.type = e_model::MODEL_4B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_GROK: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -6897,6 +6933,54 @@ static bool llm_load_tensors( } } } break; + case LLM_ARCH_MINICPM3: + { + const int64_t n_embd_head_qk_rope = hparams.n_rot; + const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + + const int64_t q_lora_rank = hparams.n_lora_q; + const int64_t kv_lora_rank = hparams.n_lora_kv; + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + } + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_q_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}); + + layer.attn_kv_a_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}); + + layer.wq_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}); + layer.wq_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}); + + layer.wkv_a_mqa = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}); + layer.wkv_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i), 
{kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + + layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head_qk_rope/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + } + } break; case LLM_ARCH_GROK: { if (n_expert == 0) { @@ -12843,6 +12927,215 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_minicpm3() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + //TODO: if the model varies, these parameters need to be read from the model + const int64_t n_embd_base = 256; + const float scale_embd = 12.0f; + const float scale_depth = 1.4f; + const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k)); + + const uint32_t n_embd_head_qk_rope = hparams.n_rot; + const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // scale the input embeddings + inpL = ggml_scale(ctx0, inpL, scale_embd); + cb(inpL, "inp_scaled", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + struct ggml_tensor * rope_factors = build_rope_factors(il); + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self_attention + { + struct ggml_tensor * q = NULL; + // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + cb(q, "q", il); + + q = llm_build_norm(ctx0, q, hparams, + model.layers[il].attn_q_a_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(q, "q", il); + + // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} + q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); + cb(q, "q", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + 0); + cb(q_nope, "q_nope", il); + + // and {n_head * n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + 
ggml_row_size(q->type, n_embd_head_qk_nope)); + cb(q_pe, "q_pe", il); + + // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_pe_compresseed, "kv_pe_compresseed", il); + + // split into {kv_lora_rank, n_tokens} + struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, + kv_pe_compresseed->nb[1], + 0); + cb(kv_compressed, "kv_compressed", il); + + // and {n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_pe_compresseed->nb[1], + kv_pe_compresseed->nb[1], + ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); + cb(k_pe, "k_pe", il); + + kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm + kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + model.layers[il].attn_kv_a_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(kv_compressed, "kv_compressed", il); + + // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} + struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); + cb(kv, "kv", il); + + // split into {n_head * n_embd_head_qk_nope, n_tokens} + struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), + ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), + 0); + cb(k_nope, "k_nope", il); + + // and {n_head * n_embd_head_v, n_tokens} + struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), + ggml_row_size(kv->type, (n_embd_head_qk_nope))); + cb(v_states, "v_states", il); + + v_states = ggml_cont(ctx0, v_states); + cb(v_states, "v_states", il); + + v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, + ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), + 0); + cb(v_states, "v_states", il); + + q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE + q_pe = ggml_rope_ext( + ctx0, q_pe, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(q_pe, "q_pe", il); + + // shared RoPE key + k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE + k_pe = ggml_rope_ext( + ctx0, k_pe, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(k_pe, "k_pe", il); + + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); + cb(q_states, "q_states", il); + + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + cb(k_states, "k_states", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, NULL, + k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + 
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // scale_res - scale the hidden states for residual connection + const float scale_res = scale_depth/sqrtf(float(n_layer)); + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled", il); + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + // scale the hidden states for residual connection + cur = ggml_scale(ctx0, cur, scale_res); + cb(cur, "hidden_scaled_ffn", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head scaling + const float scale_lmhead = float(n_embd_base)/float(n_embd); + cur = ggml_scale(ctx0, cur, scale_lmhead); + cb(cur, "lmhead_scaling", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_gemma() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); @@ -15383,6 +15676,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_minicpm(); } break; + case LLM_ARCH_MINICPM3: + { + result = llm.build_minicpm3(); + } break; case LLM_ARCH_GEMMA: { result = llm.build_gemma(); @@ -18609,6 +18906,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_CODESHELL: case LLM_ARCH_NEMOTRON: case LLM_ARCH_EXAONE: + case LLM_ARCH_MINICPM3: return LLAMA_ROPE_TYPE_NEOX; // all model arches should be listed explicitly here From 0aadac10c7dd704f8285ddf5a63d6f764cb340aa Mon Sep 17 00:00:00 2001 From: Shane A Date: Sun, 15 Sep 2024 23:47:37 -0700 Subject: [PATCH 59/65] llama : support OLMoE (#9462) --- README.md | 1 + convert_hf_to_gguf.py | 60 ++++++++++ gguf-py/gguf/constants.py | 19 +++ gguf-py/gguf/tensor_mapping.py | 30 ++--- src/llama.cpp | 203 +++++++++++++++++++++++++++++++++ 5 files changed, 298 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 9a10ead83..4d24dd591 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,7 @@ Typically finetunes of the base models below are supported as well. 
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [OLMo](https://allenai.org/olmo) +- [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924) - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330) - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia) - [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d995ed764..f026977e9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2998,6 +2998,66 @@ class OlmoModel(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("OlmoeForCausalLM") +class OlmoeModel(Model): + model_arch = gguf.MODEL_ARCH.OLMOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_layer_norm_rms_eps(1e-5) + if (n_experts := self.hparams.get("num_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + + _experts: list[dict[str, Tensor]] | None = None + + # Copied from: Qwen2MoeModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + # Copied from: Qwen2MoeModel + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + @Model.register("JinaBertModel", "JinaBertForMaskedLM") class JinaBertV2Model(BertModel): model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2c8545455..0d88649d8 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -220,6 +220,7 @@ class MODEL_ARCH(IntEnum): COMMAND_R = auto() DBRX = auto() OLMO = auto() + OLMOE = auto() OPENELM = auto() ARCTIC = auto() DEEPSEEK2 = auto() @@ -375,6 +376,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.OLMOE: "olmoe", MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.ARCTIC: "arctic", MODEL_ARCH.DEEPSEEK2: "deepseek2", @@ -1027,6 +1029,23 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.OLMOE: [ + MODEL_TENSOR.TOKEN_EMBD, 
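+        # OLMoE's tensor set mirrors qwen2moe minus its shared-expert tensors:
+        # extra RMS norms on the attention Q/K projections, a FFN_GATE_INP
+        # router, and merged per-layer expert tensors (*_EXP) in place of a
+        # dense FFN.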
+ MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + ], MODEL_ARCH.OPENELM: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index bc9a13ee5..2ebfa2b43 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -13,7 +13,7 @@ class TensorNameMap: "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone "transformer.word_embeddings", # falcon "word_embeddings", # bloom - "model.embed_tokens", # llama-hf nemotron + "model.embed_tokens", # llama-hf nemotron olmoe "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert nomic-bert "language_model.embedding.word_embeddings", # persimmon @@ -54,7 +54,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 @@ -66,7 +66,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox "transformer.ln_f", # gpt2 gpt-j falcon jais exaone - "model.norm", # llama-hf baichuan internlm2 + "model.norm", # llama-hf baichuan internlm2 olmoe "norm", # llama-pth "transformer.norm_f", # mpt dbrx "ln_f", # refact bloom qwen gpt2 @@ -98,7 +98,7 @@ class TensorNameMap: "transformer.h.{bid}.input_layernorm", # falcon7b "h.{bid}.input_layernorm", # bloom "transformer.h.{bid}.ln_mlp", # falcon40b - "model.layers.{bid}.input_layernorm", # llama-hf nemotron + "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe "layers.{bid}.attention_norm", # llama-pth "language_model.encoder.layers.{bid}.input_layernorm", # persimmon "model.layers.{bid}.ln1", # yi @@ -142,7 +142,7 @@ class TensorNameMap: # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wq", # llama-pth "encoder.layer.{bid}.attention.self.query", # bert "transformer.h.{bid}.attn.q_proj", # gpt-j @@ -154,7 +154,7 @@ class TensorNameMap: # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wk", # llama-pth "encoder.layer.{bid}.attention.self.key", # bert "transformer.h.{bid}.attn.k_proj", # gpt-j @@ -167,7 +167,7 @@ class TensorNameMap: # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wv", # llama-pth "encoder.layer.{bid}.attention.self.value", # bert "transformer.h.{bid}.attn.v_proj", # gpt-j @@ -185,7 +185,7 @@ class TensorNameMap: "transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron + "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe "layers.{bid}.attention.wo", # 
llama-pth "encoder.layer.{bid}.attention.output.dense", # bert "transformer.h.{bid}.attn.out_proj", # gpt-j @@ -229,7 +229,7 @@ class TensorNameMap: "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone "h.{bid}.post_attention_layernorm", # bloom "transformer.blocks.{bid}.norm_2", # mpt - "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron + "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe "layers.{bid}.ffn_norm", # llama-pth "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "model.layers.{bid}.ln2", # yi @@ -253,7 +253,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_GATE_INP: ( "layers.{bid}.feed_forward.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral - "model.layers.{bid}.mlp.gate", # qwen2moe + "model.layers.{bid}.mlp.gate", # qwen2moe olmoe "transformer.decoder_layer.{bid}.router", # Grok "transformer.blocks.{bid}.ffn.router.layer", # dbrx ), @@ -295,7 +295,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx - "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged) + "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_UP_SHEXP: ( @@ -327,7 +327,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx - "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged) + "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_GATE_SHEXP: ( @@ -367,7 +367,7 @@ class TensorNameMap: "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx - "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged) + "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_DOWN_SHEXP: ( @@ -378,7 +378,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", "model.layers.{bid}.self_attn.q_layernorm", # persimmon - "model.layers.{bid}.self_attn.q_norm", # cohere + "model.layers.{bid}.self_attn.q_norm", # cohere olmoe "transformer.blocks.{bid}.attn.q_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 "transformer.layers.{bid}.attn.q_norm", # openelm @@ -387,7 +387,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", "model.layers.{bid}.self_attn.k_layernorm", # persimmon - "model.layers.{bid}.self_attn.k_norm", # cohere + "model.layers.{bid}.self_attn.k_norm", # cohere olmoe "transformer.blocks.{bid}.attn.k_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 "transformer.layers.{bid}.attn.k_norm", # openelm diff --git a/src/llama.cpp b/src/llama.cpp index aa565723b..30997bf15 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -202,6 +202,7 @@ enum llm_arch { LLM_ARCH_COMMAND_R, LLM_ARCH_DBRX, LLM_ARCH_OLMO, + LLM_ARCH_OLMOE, LLM_ARCH_OPENELM, LLM_ARCH_ARCTIC, LLM_ARCH_DEEPSEEK2, @@ -251,6 +252,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_OLMO, "olmo" }, + { LLM_ARCH_OLMOE, "olmoe" }, { LLM_ARCH_OPENELM, "openelm" }, { LLM_ARCH_ARCTIC, "arctic" 
}, { LLM_ARCH_DEEPSEEK2, "deepseek2" }, @@ -1193,6 +1195,26 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_OLMOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_OPENELM, { @@ -2277,6 +2299,7 @@ enum e_model { MODEL_MEDIUM, MODEL_LARGE, MODEL_XL, + MODEL_A1_7B, MODEL_A2_7B, MODEL_8x7B, MODEL_8x22B, @@ -5241,6 +5264,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_MEDIUM: return "0.4B"; case MODEL_LARGE: return "0.8B"; case MODEL_XL: return "1.5B"; + case MODEL_A1_7B: return "A1.7B"; case MODEL_A2_7B: return "A2.7B"; case MODEL_8x7B: return "8x7B"; case MODEL_8x22B: return "8x22B"; @@ -5791,6 +5815,14 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_OLMOE: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 16: model.type = e_model::MODEL_A1_7B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_OPENELM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -8018,6 +8050,44 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_OLMOE: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + + GGML_ASSERT(n_expert > 0); + GGML_ASSERT(n_expert_used > 0); + + // MoE branch + layer.ffn_gate_exps = ml.create_tensor(ctx_split, 
tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + } + } break; case LLM_ARCH_OPENELM: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -13832,6 +13902,134 @@ struct llm_build_context { return gf; } + // based on the build_qwen2moe() function, changes: + // * removed shared experts + // * removed bias + // * added q, k norm + struct ggml_cgraph * build_olmoe() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self_attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(Qcur, "Qcur_normed", il); + + Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(Kcur, "Kcur_normed", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur_rope", il); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur_rope", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // MoE branch + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = 
llm_build_moe_ffn(ctx0, lctx, cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, 0.0, + cb, il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_openelm() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); @@ -15712,6 +15910,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_olmo(); } break; + case LLM_ARCH_OLMOE: + { + result = llm.build_olmoe(); + } break; case LLM_ARCH_OPENELM: { result = llm.build_openelm(); @@ -18896,6 +19098,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_QWEN: case LLM_ARCH_QWEN2: case LLM_ARCH_QWEN2MOE: + case LLM_ARCH_OLMOE: case LLM_ARCH_PHI2: case LLM_ARCH_PHI3: case LLM_ARCH_GEMMA: From 5c3d0f1824714e9a97fc9b06e046eefcb6ecc721 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Mon, 16 Sep 2024 06:48:24 +0000 Subject: [PATCH 60/65] ggml : IQ4_NL sgemm + Q4_0 AVX optimization (#9422) * squashed readd my iq4_nl sgemm PR https://github.com/ggerganov/llama.cpp/pull/8049 have ggml_vec_dot_q4_0 do two blocks per loop for avx try out f16c ggml_vec_dot_iq4_nl, but it's not really faster. 
as per https://github.com/ggerganov/llama.cpp/pull/8549 we can calculate several blocks at a time with no issue * shuffle * remove f16c iq4_nl as i cant make it faster than before --- ggml/src/ggml-quants.c | 69 +++++++++++++++++------------------- ggml/src/llamafile/sgemm.cpp | 38 ++++++++++++++++++++ 2 files changed, 71 insertions(+), 36 deletions(-) diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 322c85d2a..7c1ec8d54 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -230,6 +230,12 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) return _mm_packus_epi16( bytes1, bytes2); } + +static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { + const __m128i ax = _mm_sign_epi8(x, x); + const __m128i sy = _mm_sign_epi8(y, x); + return _mm_maddubs_epi16(ax, sy); +} #endif #elif defined(__SSSE3__) // horizontally add 4x4 floats @@ -4206,37 +4212,37 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r sumf = hsum_float_8(acc); #elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); + const __m128i mone = _mm_set1_epi16(1); - // Main loop - for (; ib < nb; ++ib) { - // Compute combined scale for the block - const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d) ); + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (; ib + 1 < nb; ib += 2) { + const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs); + const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs); + const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs); + const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1); + const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs); + const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1); - const __m128i lowMask = _mm_set1_epi8(0xF); - const __m128i off = _mm_set1_epi8(8); - - const __m128i tmp = _mm_loadu_si128((const __m128i *)x[ib].qs); - - __m128i bx_0 = _mm_and_si128(lowMask, tmp); - __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); - - bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); - by_0 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16)); - bx_0 = _mm_sub_epi8(bx_0, off); - const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0); - - // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); - - // Apply the scale, and accumulate - acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); + const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8)); + const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8)); + const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8)); + const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8)); + const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0); + const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1); + const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0); + const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1); + const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone); + const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone); + const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone); + const __m128i p_2_1 = 
_mm_madd_epi16(p16_2_1, mone); + accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)), + _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1); + accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)), + _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2); } - sumf = hsum_float_8(acc); + sumf = hsum_float_8(_mm256_add_ps(accum1, accum2)); #elif defined(__SSSE3__) // set constants const __m128i lowMask = _mm_set1_epi8(0xF); @@ -11819,15 +11825,6 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * #endif } - -#if defined(__AVX__) -static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) { - const __m128i ax = _mm_sign_epi8(x, x); - const __m128i sy = _mm_sign_epi8(y, x); - return _mm_maddubs_epi16(ax, sy); -} -#endif - #if defined(__AVX2__) static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { const __m256i ax = _mm256_sign_epi8(x, x); diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp index d0c2bb284..1be6fdb57 100644 --- a/ggml/src/llamafile/sgemm.cpp +++ b/ggml/src/llamafile/sgemm.cpp @@ -235,6 +235,14 @@ template <> inline __m512 load(const ggml_fp16_t *p) { } #endif // __AVX512F__ +//////////////////////////////////////////////////////////////////////////////////////////////////// +// CONSTANTS + +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; +static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl); +#endif + //////////////////////////////////////////////////////////////////////////////////////////////////// // FLOATING POINT MATRIX MULTIPLICATION @@ -933,6 +941,20 @@ class tinyBLAS_Q0_AVX { return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8)); } + inline __m256i load(const block_iq4_nl *b) { + return MM256_SET_M128I(load1(b), load0(b)); + } + + inline __m128i load0(const block_iq4_nl *b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), x)); + } + + inline __m128i load1(const block_iq4_nl *b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + return _mm_shuffle_epi8(iq4nlt, _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4))); + } + inline __m256 updot(__m256i u, __m256i s) { __m256i res; #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__)) @@ -1159,6 +1181,22 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda #endif } + case GGML_TYPE_IQ4_NL: { + if (Btype != GGML_TYPE_Q8_0) + return false; +#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) + tinyBLAS_Q0_AVX tb{ + k, (const block_iq4_nl *)A, lda, + (const block_q8_0 *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n); + return true; +#else + return false; +#endif + } + default: return false; } From 19514d632e5274bc0c27c2269e8f2ad88b526d62 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 16 Sep 2024 10:27:50 +0300 Subject: [PATCH 61/65] cmake : do not hide GGML options + rename option (#9465) * cmake : do not hide GGML options ggml-ci * build : rename flag GGML_CUDA_USE_GRAPHS -> GGML_CUDA_GRAPHS for consistency ggml-ci --- CMakeLists.txt | 6 +++--- Makefile | 2 +- ggml/CMakeLists.txt | 13 +++++++++++-- ggml/src/CMakeLists.txt | 2 
+- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 244019313..973907819 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,11 +82,11 @@ set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS}) # change the default for these ggml options if (NOT DEFINED GGML_LLAMAFILE) - set(GGML_LLAMAFILE ON) + set(GGML_LLAMAFILE_DEFAULT ON) endif() -if (NOT DEFINED GGML_CUDA_USE_GRAPHS) - set(GGML_CUDA_USE_GRAPHS ON) +if (NOT DEFINED GGML_CUDA_GRAPHS) + set(GGML_CUDA_GRAPHS_DEFAULT ON) endif() # transition helpers diff --git a/Makefile b/Makefile index cb5ff9f9d..f922f7083 100644 --- a/Makefile +++ b/Makefile @@ -619,7 +619,7 @@ ifdef GGML_CUDA CUDA_PATH ?= /usr/local/cuda endif - MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS + MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib MK_NVCCFLAGS += -use_fast_math endif # GGML_MUSA diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 532534bcb..89fdf9d1c 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -56,6 +56,15 @@ else() set(GGML_NATIVE_DEFAULT ON) endif() +# defaults +if (NOT GGML_LLAMAFILE_DEFAULT) + set(GGML_LLAMAFILE_DEFAULT OFF) +endif() + +if (NOT GGML_CUDA_GRAPHS_DEFAULT) + set(GGML_CUDA_GRAPHS_DEFAULT OFF) +endif() + # general option(GGML_STATIC "ggml: static link libraries" OFF) option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT}) @@ -110,7 +119,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework" option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT}) set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING "ggml: BLAS library vendor") -option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF) +option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT}) option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_MUSA "ggml: use MUSA" OFF) @@ -127,7 +136,7 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF) option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF) option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) -option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF) +option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT}) option(GGML_HIPBLAS "ggml: use hipBLAS" OFF) option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 11b877e19..042ea9b77 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -329,7 +329,7 @@ if (GGML_CUDA) add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) - if (GGML_CUDA_USE_GRAPHS) + if (GGML_CUDA_GRAPHS) add_compile_definitions(GGML_CUDA_USE_GRAPHS) endif() From d54c21df7e2669c6cd7492713479d1aeb5846883 Mon Sep 17 00:00:00 2001 From: compilade Date: Mon, 16 Sep 2024 03:30:22 -0400 Subject: [PATCH 62/65] convert : identify missing model files (#9397) --- convert_hf_to_gguf.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/convert_hf_to_gguf.py 
b/convert_hf_to_gguf.py index f026977e9..c7e6ae0ca 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -132,12 +132,14 @@ class Model: def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_names_from_parts: set[str] = set() - if len(self.part_names) > 1: + index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" + index_name += ".index.json" + index_file = self.dir_model / index_name + + if index_file.is_file(): self.tensor_names = set() - index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" - index_name += ".index.json" logger.info(f"gguf: loading model weight map from '{index_name}'") - with open(self.dir_model / index_name, "r", encoding="utf-8") as f: + with open(index_file, "r", encoding="utf-8") as f: index: dict[str, Any] = json.load(f) weight_map = index.get("weight_map") if weight_map is None or not isinstance(weight_map, dict): @@ -145,6 +147,7 @@ class Model: self.tensor_names.update(weight_map.keys()) else: self.tensor_names = tensor_names_from_parts + weight_map = {} for part_name in self.part_names: logger.info(f"gguf: loading model part '{part_name}'") @@ -171,9 +174,17 @@ class Model: data = LazyTorchTensor.from_eager(data) yield name, data - # only verify tensor name presence; it doesn't matter if they are not in the right files - if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: - raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}") + # verify tensor name presence and identify potentially missing files + if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: + missing = sorted(self.tensor_names.difference(tensor_names_from_parts)) + extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) + missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) + if len(extra) == 0 and len(missing_files) > 0: + raise ValueError(f"Missing or incomplete model files: {missing_files}") + else: + raise ValueError("Mismatch between weight map and model parts for tensor names:\n" + f"Missing tensors: {missing}\n" + f"Extra tensors: {extra}") def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: if key not in gguf.MODEL_TENSORS[self.model_arch]: From a6a3a5c531c73aef85750a847d21e7d4671e723d Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Mon, 16 Sep 2024 13:06:50 +0200 Subject: [PATCH 63/65] ggml : link MATH_LIBRARY not by its full path (#9339) --- ggml/src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 042ea9b77..527c22c68 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -1341,7 +1341,7 @@ list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads) find_library(MATH_LIBRARY m) if (MATH_LIBRARY) if (NOT WIN32 OR NOT GGML_SYCL) - target_link_libraries(ggml PRIVATE ${MATH_LIBRARY}) + list(APPEND GGML_EXTRA_LIBS_PRIVATE m) endif() endif() From acb2c32c336ce60d765bb189563cc216e57e9fc2 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Mon, 16 Sep 2024 13:07:13 +0200 Subject: [PATCH 64/65] llama : rename n_embed to n_embd in rwkv6_time_mix (#9504) This commit renames n_embed to n_embd in llm_build_rwkv6_time_mix. The motivation for this change is consistency with the other rwkv6 functions like build_rwkv6 (and other parts of the code base). 
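The rename is mechanical but touches many lines because the fused projection in this function is sliced into five [n_embd, n_tokens] views (mw, mk, mv, mr, mg), each at a byte offset that is a multiple of n_embd * n_tokens floats, and each view then feeds its own time-mix interpolation of `cur` and `sx`. A minimal standalone sketch of that offset arithmetic (the helper name and sizes here are illustrative, not from the patch):

```c
#include <stdio.h>
#include <stddef.h>

// byte offset of plane j inside a fused buffer of five [n_embd x n_tokens]
// float planes, mirroring the view offsets in llm_build_rwkv6_time_mix
static size_t plane_offset(size_t n_embd, size_t n_tokens, int j) {
    return n_embd * n_tokens * (size_t) j * sizeof(float);
}

int main(void) {
    // illustrative sizes only; the real values come from the model hparams
    const size_t n_embd = 2048, n_tokens = 8;
    const char * names[5] = {"mw", "mk", "mv", "mr", "mg"};
    for (int j = 0; j < 5; ++j) {
        printf("%s at byte offset %zu\n", names[j], plane_offset(n_embd, n_tokens, j));
    }
    return 0;
}
```

Keeping the width named n_embd everywhere makes these strides grep-able across the rwkv6 builders.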
--- src/llama.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 30997bf15..0da764f9d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9571,7 +9571,7 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( struct ggml_tensor * cur, struct ggml_tensor * x_prev, struct ggml_tensor ** wkv_state) { - size_t n_embed = cur->ne[0]; + size_t n_embd = cur->ne[0]; size_t n_seq_tokens = cur->ne[1]; size_t n_seqs = cur->ne[2]; @@ -9582,8 +9582,8 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur); - sx = ggml_reshape_2d(ctx, sx, n_embed, n_tokens); - cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens); + sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens); + cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); struct ggml_tensor * xxx = ggml_add(ctx, ggml_mul(ctx, sx, layer->time_mix_lerp_x), cur); @@ -9608,11 +9608,11 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( xxx ); - struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], 0); - struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * sizeof(float)); - struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 2 * sizeof(float)); - struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 3 * sizeof(float)); - struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embed, n_tokens, xxx->nb[1], n_embed * n_tokens * 4 * sizeof(float)); + struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0); + struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float)); + struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float)); + struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float)); + struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float)); struct ggml_tensor * xw = ggml_add( ctx, @@ -9681,7 +9681,7 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( ) ); - w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embed)); + w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embd)); w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w))); w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens); @@ -9690,21 +9690,21 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix( r = ggml_transpose(ctx, r); struct ggml_tensor * wkv_output = ggml_rwkv_wkv(ctx, k, v, r, layer->time_mix_first, w, *wkv_state); - cur = ggml_view_1d(ctx, wkv_output, n_embed * n_tokens, 0); - *wkv_state = ggml_view_1d(ctx, wkv_output, n_embed * head_size * n_seqs, n_embed * n_tokens * sizeof(float)); + cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0); + *wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); // group norm with head_count groups - cur = ggml_reshape_3d(ctx, cur, n_embed / head_count, head_count, n_tokens); + cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens); cur = ggml_norm(ctx, cur, 64e-5f); // Convert back to regular vectors. 
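  // (i.e. collapse the [n_embd/head_count, head_count, n_tokens] grouped view
  // used for the norm back into a plain [n_embd, n_tokens] matrix)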
- cur = ggml_reshape_2d(ctx, cur, n_embed, n_tokens); + cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens); cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b); cur = ggml_mul(ctx, cur, g); cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur); - return ggml_reshape_3d(ctx, cur, n_embed, n_seq_tokens, n_seqs); + return ggml_reshape_3d(ctx, cur, n_embd, n_seq_tokens, n_seqs); } static struct ggml_tensor * llm_build_rwkv6_channel_mix( From 23e0d70bacaaca1429d365a44aa9e7434f17823b Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 16 Sep 2024 16:22:07 +0200 Subject: [PATCH 65/65] ggml : move common CPU backend impl to new header (#9509) --- ggml/src/ggml-aarch64.c | 1 + ggml/src/ggml-cpu-impl.h | 614 ++++++++++++++++++++++++++++++++++ ggml/src/ggml-impl.h | 622 +---------------------------------- ggml/src/ggml-quants.c | 1 + ggml/src/ggml.c | 1 + ggml/src/llamafile/sgemm.cpp | 1 + 6 files changed, 631 insertions(+), 609 deletions(-) create mode 100644 ggml/src/ggml-cpu-impl.h diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c index 72cb83c9b..27375d0d7 100644 --- a/ggml/src/ggml-aarch64.c +++ b/ggml/src/ggml-aarch64.c @@ -4,6 +4,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" +#include "ggml-cpu-impl.h" #include #include diff --git a/ggml/src/ggml-cpu-impl.h b/ggml/src/ggml-cpu-impl.h new file mode 100644 index 000000000..5b45155b0 --- /dev/null +++ b/ggml/src/ggml-cpu-impl.h @@ -0,0 +1,614 @@ +#pragma once + +// GGML CPU internal header + +#include "ggml.h" +#include "ggml-impl.h" +#include // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ +//#include +#include +#include // memcpy +#include // fabsf + + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_MSC_VER) + +#define m512bh(p) p +#define m512i(p) p + +#else + +#define m512bh(p) (__m512bh)(p) +#define m512i(p) (__m512i)(p) + +#endif + +/** + * Converts brain16 to float32. + * + * The bfloat16 floating point format has the following structure: + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌──┴───┐┌─┴───┐ + * 0b0000000000000000 brain16 + * + * Since bf16 has the same number of exponent bits as a 32bit float, + * encoding and decoding numbers becomes relatively straightforward. + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌──┴───┐┌─┴───────────────────┐ + * 0b00000000000000000000000000000000 IEEE binary32 + * + * For comparison, the standard fp16 format has fewer exponent bits. + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌─┴─┐┌─┴──────┐ + * 0b0000000000000000 IEEE binary16 + * + * @see IEEE 754-2008 + */ +static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { + union { + float f; + uint32_t i; + } u; + u.i = (uint32_t)h.bits << 16; + return u.f; +} + +/** + * Converts float32 to brain16. + * + * This is binary identical with Google Brain float conversion. + * Floats shall round to nearest even, and NANs shall be quiet. + * Subnormals aren't flushed to zero, except perhaps when used. + * This code should vectorize nicely if using modern compilers. 
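+ *
+ * As a worked example of the rounding step below (illustrative bit
+ * patterns): for input bits 0x3F808000 the discarded low half is
+ * exactly 0x8000 (a tie) and the kept half 0x3F80 has an even LSB, so
+ * the bias 0x7fff + 0 leaves the result at 0x3F80; for input bits
+ * 0x3F818000 the kept half 0x3F81 is odd, the bias becomes 0x7fff + 1,
+ * and the result rounds up to 0x3F82.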
+ */ +static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { + ggml_bf16_t h; + union { + float f; + uint32_t i; + } u; + u.f = s; + if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */ + h.bits = (u.i >> 16) | 64; /* force to quiet */ + return h; + } + h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; + return h; +} + +#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) +#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) + +// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 +#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) +#ifndef __FMA__ +#define __FMA__ +#endif +#ifndef __F16C__ +#define __F16C__ +#endif +#endif + +// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available +#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)) +#ifndef __SSE3__ +#define __SSE3__ +#endif +#ifndef __SSSE3__ +#define __SSSE3__ +#endif +#endif + +#if defined(__ARM_FEATURE_SVE) +#include +#include +#endif + +// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +#if defined(__ARM_NEON) + +// if YCM cannot find , make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include + +#ifdef _MSC_VER + +typedef uint16_t ggml_fp16_internal_t; + +#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } + +#else + +typedef __fp16 ggml_fp16_internal_t; + +#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } + +#endif // _MSC_VER + +#if !defined(__aarch64__) + +// 32-bit ARM compatibility + +// vaddlvq_s16 +// vpaddq_s16 +// vpaddq_s32 +// vaddvq_s32 +// vaddvq_f32 +// vmaxvq_f32 +// vcvtnq_s32_f32 +// vzip1_u8 +// vzip2_u8 + +inline static int32_t vaddlvq_s16(int16x8_t v) { + int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v))); + return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2); +} + +inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { + int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); + int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); + return vcombine_s16(a0, b0); +} + +inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { + int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); + int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); + return vcombine_s32(a0, b0); +} + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +inline static float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { + int32x4_t res; + + res[0] = roundf(vgetq_lane_f32(v, 0)); + res[1] = roundf(vgetq_lane_f32(v, 1)); + res[2] = roundf(vgetq_lane_f32(v, 2)); + res[3] = roundf(vgetq_lane_f32(v, 3)); + + return res; +} + +inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[0]; res[1] = b[0]; + res[2] = a[1]; res[3] = b[1]; + res[4] = a[2]; res[5] = b[2]; + res[6] = a[3]; res[7] = b[3]; + + return res; +} + +inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[4]; res[1] = b[4]; 
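+    // vzip2 interleaves the upper halves of a and b, i.e. the result is
+    // { a[4], b[4], a[5], b[5], a[6], b[6], a[7], b[7] }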
+    res[2] = a[5]; res[3] = b[5];
+    res[4] = a[6]; res[5] = b[6];
+    res[6] = a[7]; res[7] = b[7];
+
+    return res;
+}
+
+// vld1q_s16_x2
+// vld1q_u8_x2
+// vld1q_u8_x4
+// vld1q_s8_x2
+// vld1q_s8_x4
+// TODO: double-check these work correctly
+
+typedef struct ggml_int16x8x2_t {
+    int16x8_t val[2];
+} ggml_int16x8x2_t;
+
+inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
+    ggml_int16x8x2_t res;
+
+    res.val[0] = vld1q_s16(ptr + 0);
+    res.val[1] = vld1q_s16(ptr + 8);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x2_t {
+    uint8x16_t val[2];
+} ggml_uint8x16x2_t;
+
+inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
+    ggml_uint8x16x2_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_uint8x16x4_t {
+    uint8x16_t val[4];
+} ggml_uint8x16x4_t;
+
+inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
+    ggml_uint8x16x4_t res;
+
+    res.val[0] = vld1q_u8(ptr + 0);
+    res.val[1] = vld1q_u8(ptr + 16);
+    res.val[2] = vld1q_u8(ptr + 32);
+    res.val[3] = vld1q_u8(ptr + 48);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x2_t {
+    int8x16_t val[2];
+} ggml_int8x16x2_t;
+
+inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
+    ggml_int8x16x2_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+
+    return res;
+}
+
+typedef struct ggml_int8x16x4_t {
+    int8x16_t val[4];
+} ggml_int8x16x4_t;
+
+inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
+    ggml_int8x16x4_t res;
+
+    res.val[0] = vld1q_s8(ptr + 0);
+    res.val[1] = vld1q_s8(ptr + 16);
+    res.val[2] = vld1q_s8(ptr + 32);
+    res.val[3] = vld1q_s8(ptr + 48);
+
+    return res;
+}
+
+// NOTE: not tested
+inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
+    int8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+// NOTE: not tested
+inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
+    uint8x16_t res;
+
+    res[ 0] = a[b[ 0]];
+    res[ 1] = a[b[ 1]];
+    res[ 2] = a[b[ 2]];
+    res[ 3] = a[b[ 3]];
+    res[ 4] = a[b[ 4]];
+    res[ 5] = a[b[ 5]];
+    res[ 6] = a[b[ 6]];
+    res[ 7] = a[b[ 7]];
+    res[ 8] = a[b[ 8]];
+    res[ 9] = a[b[ 9]];
+    res[10] = a[b[10]];
+    res[11] = a[b[11]];
+    res[12] = a[b[12]];
+    res[13] = a[b[13]];
+    res[14] = a[b[14]];
+    res[15] = a[b[15]];
+
+    return res;
+}
+
+#else
+
+#define ggml_int16x8x2_t int16x8x2_t
+#define ggml_uint8x16x2_t uint8x16x2_t
+#define ggml_uint8x16x4_t uint8x16x4_t
+#define ggml_int8x16x2_t int8x16x2_t
+#define ggml_int8x16x4_t int8x16x4_t
+
+#define ggml_vld1q_s16_x2 vld1q_s16_x2
+#define ggml_vld1q_u8_x2 vld1q_u8_x2
+#define ggml_vld1q_u8_x4 vld1q_u8_x4
+#define ggml_vld1q_s8_x2 vld1q_s8_x2
+#define ggml_vld1q_s8_x4 vld1q_s8_x4
+#define ggml_vqtbl1q_s8 vqtbl1q_s8
+#define ggml_vqtbl1q_u8 vqtbl1q_u8
+
+#endif // !defined(__aarch64__)
+
+#if !defined(__ARM_FEATURE_DOTPROD)
+
+inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
+    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
+    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
+
+    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
+}
+
+#else
+
+#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
+
+#endif // !defined(__ARM_FEATURE_DOTPROD)
+
+#endif // defined(__ARM_NEON)
+
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    ggml_fp16_internal_t tmp;
+    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
+    return (float)tmp;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    ggml_fp16_t res;
+    ggml_fp16_internal_t tmp = f;
+    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
+    return res;
+}
+
+#else
+
+#ifdef __wasm_simd128__
+#include <wasm_simd128.h>
+#else
+#ifdef __POWER9_VECTOR__
+#include <altivec.h>
+#undef bool
+#define bool _Bool
+#else
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#else
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
+#if !defined(__riscv)
+#include <immintrin.h>
+#endif
+#endif
+#endif
+#endif
+#endif
+
+#ifdef __riscv_v_intrinsic
+#include <riscv_vector.h>
+#endif
+
+#if defined(__loongarch64)
+#if defined(__loongarch_asx)
+#include <lasxintrin.h>
+#endif
+#if defined(__loongarch_sx)
+#include <lsxintrin.h>
+#endif
+#endif
+
+#if defined(__loongarch_asx)
+
+typedef union {
+    int32_t i;
+    float f;
+} ft_union;
+
+/* float type data load instructions */
+static __m128 __lsx_vreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
+}
+
+static __m256 __lasx_xvreplfr2vr_s(float val) {
+    ft_union fi_tmpval = {.f = val};
+    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
+}
+#endif
+
+#ifdef __F16C__
+
+#ifdef _MSC_VER
+#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+#else
+#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+#endif
+
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+/* the inline asm below is about 12% faster than the lookup method */
+#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    register float f;
+    register double d;
+    __asm__(
+        "mtfprd %0,%2\n"
+        "xscvhpdp %0,%0\n"
+        "frsp %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */ "=f"(f):
+        /* in */ "r"(h));
+    return f;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    register double d;
+    register ggml_fp16_t r;
+    __asm__( /* xscvdphp can work on double or single precision */
+        "xscvdphp %0,%2\n"
+        "mffprd %1,%0\n" :
+        /* temp */ "=d"(d),
+        /* out */ "=r"(r):
+        /* in */ "f"(f));
+    return r;
+}
+
+#else
+
+// FP16 <-> FP32
+// ref: https://github.com/Maratyszcza/FP16
+
+static inline float fp32_from_bits(uint32_t w) {
+    union {
+        uint32_t as_bits;
+        float as_value;
+    } fp32;
+    fp32.as_bits = w;
+    return fp32.as_value;
+}
+
+static inline uint32_t fp32_to_bits(float f) {
+    union {
+        float as_value;
+        uint32_t as_bits;
+    } fp32;
+    fp32.as_value = f;
+    return fp32.as_bits;
+}
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+    const uint32_t w = (uint32_t) h << 16;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    const uint32_t two_w = w + w;
+
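+    // Descriptive note (editorial addition): two_w holds the fp16 exponent and
+    // mantissa in the high bits, with the sign shifted out. The next step rebiases
+    // the exponent from fp16 (bias 15) to fp32 (bias 127): adding 0xE0 (224) to the
+    // exponent field and then scaling by 0x1.0p-112f gives a net +112, and also
+    // carries fp16 Inf/NaN (exponent 31) to fp32 Inf/NaN.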
+    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float exp_scale = 0x1.0p-112f;
+#else
+    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
+#endif
+    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
+    const uint32_t magic_mask = UINT32_C(126) << 23;
+    const float magic_bias = 0.5f;
+    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+    const uint32_t result = sign |
+        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+    return fp32_from_bits(result);
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
+    const float scale_to_inf = 0x1.0p+112f;
+    const float scale_to_zero = 0x1.0p-110f;
+#else
+    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
+    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
+#endif
+    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+    const uint32_t w = fp32_to_bits(f);
+    const uint32_t shl1_w = w + w;
+    const uint32_t sign = w & UINT32_C(0x80000000);
+    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+    if (bias < UINT32_C(0x71000000)) {
+        bias = UINT32_C(0x71000000);
+    }
+
+    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+    const uint32_t bits = fp32_to_bits(base);
+    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+    const uint32_t nonsign = exp_bits + mantissa_bits;
+    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
+}
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+
+#endif // __F16C__
+
+#endif // defined(__ARM_NEON) && !defined(_MSC_VER)
+
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif // __ARM_FEATURE_SVE
+
+// precomputed f32 table for f16 (256 KB)
+// defined in ggml.c, initialized in ggml_init()
+extern float ggml_table_f32_f16[1 << 16];
+
+// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
+// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
+#if !defined(GGML_FP16_TO_FP32)
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+    uint16_t s;
+    memcpy(&s, &f, sizeof(uint16_t));
+    return ggml_table_f32_f16[s];
+}
+
+#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
+#endif
+
+#if !defined(GGML_FP32_TO_FP16)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index cb7f7728b..833984190 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -1,15 +1,17 @@
 #pragma once
 
-#include "ggml.h"
-
 // GGML internal header
 
+#include "ggml.h"
+
 #include <assert.h>
 #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
-#include <stddef.h>
 #include <stdbool.h>
-#include <string.h> // memcpy
-#include <math.h>   // fabsf
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 #undef MIN
 #undef MAX
@@ -17,96 +19,6 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
-#if defined(_MSC_VER)
-
-#define m512bh(p) p
-#define m512i(p) p
-
-#else
-
-#define m512bh(p) (__m512bh)(p)
-#define m512i(p) (__m512i)(p)
-
-#endif
-
-/**
- * Converts brain16 to float32.
- *
- * The bfloat16 floating point format has the following structure:
- *
- *       ┌sign
- *       │
- *       │   ┌exponent
- *       │   │
- *       │   │      ┌mantissa
- *       │   │      │
- *       │┌──┴───┐┌─┴───┐
- *     0b0000000000000000 brain16
- *
- * Since bf16 has the same number of exponent bits as a 32bit float,
- * encoding and decoding numbers becomes relatively straightforward.
- *
- *       ┌sign
- *       │
- *       │   ┌exponent
- *       │   │
- *       │   │      ┌mantissa
- *       │   │      │
- *       │┌──┴───┐┌─┴───────────────────┐
- *     0b00000000000000000000000000000000 IEEE binary32
- *
- * For comparison, the standard fp16 format has fewer exponent bits.
- *
- *       ┌sign
- *       │
- *       │  ┌exponent
- *       │  │
- *       │  │    ┌mantissa
- *       │  │    │
- *       │┌─┴─┐┌─┴──────┐
- *     0b0000000000000000 IEEE binary16
- *
- * @see IEEE 754-2008
- */
-static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
-    union {
-        float f;
-        uint32_t i;
-    } u;
-    u.i = (uint32_t)h.bits << 16;
-    return u.f;
-}
-
-/**
- * Converts float32 to brain16.
- *
- * This is binary identical with Google Brain float conversion.
- * Floats shall round to nearest even, and NANs shall be quiet.
- * Subnormals aren't flushed to zero, except perhaps when used.
- * This code should vectorize nicely if using modern compilers.
- */
-static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
-    ggml_bf16_t h;
-    union {
-        float f;
-        uint32_t i;
-    } u;
-    u.f = s;
-    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
-        h.bits = (u.i >> 16) | 64; /* force to quiet */
-        return h;
-    }
-    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
-    return h;
-}
-
-#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
-#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
@@ -121,520 +33,6 @@ extern "C" {
 #endif
 #endif
 
-// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
-#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __FMA__
-#define __FMA__
-#endif
-#ifndef __F16C__
-#define __F16C__
-#endif
-#endif
-
-// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
-#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#ifndef __SSSE3__
-#define __SSSE3__
-#endif
-#endif
-
-#if defined(__ARM_FEATURE_SVE)
-#include <arm_sve.h>
-#include <sys/prctl.h>
-#endif
-
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#if defined(__ARM_NEON)
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#ifdef _MSC_VER
-
-typedef uint16_t ggml_fp16_internal_t;
-
-#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
-
-#else
-
-typedef __fp16 ggml_fp16_internal_t;
-
-#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
-
-#endif // _MSC_VER
-
-#if !defined(__aarch64__)
-
-// 32-bit ARM compatibility
-
-// vaddlvq_s16
-// vpaddq_s16
-// vpaddq_s32
-// vaddvq_s32
-// vaddvq_f32
-// vmaxvq_f32
-// vcvtnq_s32_f32
-// vzip1_u8
-// vzip2_u8
-
-inline static int32_t vaddlvq_s16(int16x8_t v) {
-    int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
-    return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
-}
-
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-    return vcombine_s16(a0, b0);
-}
-
-inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
-    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
-    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
-    return vcombine_s32(a0, b0);
-}
-
-inline static int32_t vaddvq_s32(int32x4_t v) {
-    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-inline static float vmaxvq_f32(float32x4_t v) {
-    return
-        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
-inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
-    int32x4_t res;
-
-    res[0] = roundf(vgetq_lane_f32(v, 0));
-    res[1] = roundf(vgetq_lane_f32(v, 1));
-    res[2] = roundf(vgetq_lane_f32(v, 2));
-    res[3] = roundf(vgetq_lane_f32(v, 3));
-
-    return res;
-}
-
-inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[0]; res[1] = b[0];
-    res[2] = a[1]; res[3] = b[1];
-    res[4] = a[2]; res[5] = b[2];
-    res[6] = a[3]; res[7] = b[3];
-
-    return res;
-}
-
-inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[4]; res[1] = b[4];
-    res[2] = a[5]; res[3] = b[5];
-    res[4] = a[6]; res[5] = b[6];
-    res[6] = a[7]; res[7] = b[7];
-
-    return res;
-}
-
-// vld1q_s16_x2
-// vld1q_u8_x2
-// vld1q_u8_x4
-// vld1q_s8_x2
-// vld1q_s8_x4
-// TODO: double-check these work correctly
-
-typedef struct ggml_int16x8x2_t {
-    int16x8_t val[2];
-} ggml_int16x8x2_t;
-
-inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
-    ggml_int16x8x2_t res;
-
-    res.val[0] = vld1q_s16(ptr + 0);
-    res.val[1] = vld1q_s16(ptr + 8);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x2_t {
-    uint8x16_t val[2];
-} ggml_uint8x16x2_t;
-
-inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
-    ggml_uint8x16x2_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x4_t {
-    uint8x16_t val[4];
-} ggml_uint8x16x4_t;
-
-inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
-    ggml_uint8x16x4_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-    res.val[2] = vld1q_u8(ptr + 32);
-    res.val[3] = vld1q_u8(ptr + 48);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x2_t {
-    int8x16_t val[2];
-} ggml_int8x16x2_t;
-
-inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
-    ggml_int8x16x2_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x4_t {
-    int8x16_t val[4];
-} ggml_int8x16x4_t;
-
-inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
-    ggml_int8x16x4_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-    res.val[2] = vld1q_s8(ptr + 32);
-    res.val[3] = vld1q_s8(ptr + 48);
-
-    return res;
-}
-
-// NOTE: not tested
-inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
-    int8x16_t res;
-
-    res[ 0] = a[b[ 0]];
-    res[ 1] = a[b[ 1]];
-    res[ 2] = a[b[ 2]];
-    res[ 3] = a[b[ 3]];
-    res[ 4] = a[b[ 4]];
-    res[ 5] = a[b[ 5]];
-    res[ 6] = a[b[ 6]];
-    res[ 7] = a[b[ 7]];
-    res[ 8] = a[b[ 8]];
-    res[ 9] = a[b[ 9]];
-    res[10] = a[b[10]];
-    res[11] = a[b[11]];
-    res[12] = a[b[12]];
-    res[13] = a[b[13]];
-    res[14] = a[b[14]];
-    res[15] = a[b[15]];
-
-    return res;
-}
-
-// NOTE: not tested
-inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
-    uint8x16_t res;
-
-    res[ 0] = a[b[ 0]];
-    res[ 1] = a[b[ 1]];
-    res[ 2] = a[b[ 2]];
-    res[ 3] = a[b[ 3]];
-    res[ 4] = a[b[ 4]];
-    res[ 5] = a[b[ 5]];
-    res[ 6] = a[b[ 6]];
-    res[ 7] = a[b[ 7]];
-    res[ 8] = a[b[ 8]];
-    res[ 9] = a[b[ 9]];
-    res[10] = a[b[10]];
-    res[11] = a[b[11]];
-    res[12] = a[b[12]];
-    res[13] = a[b[13]];
-    res[14] = a[b[14]];
-    res[15] = a[b[15]];
-
-    return res;
-}
-
-#else
-
-#define ggml_int16x8x2_t int16x8x2_t
-#define ggml_uint8x16x2_t uint8x16x2_t
-#define ggml_uint8x16x4_t uint8x16x4_t
-#define ggml_int8x16x2_t int8x16x2_t
-#define ggml_int8x16x4_t int8x16x4_t
-
-#define ggml_vld1q_s16_x2 vld1q_s16_x2
-#define ggml_vld1q_u8_x2 vld1q_u8_x2
-#define ggml_vld1q_u8_x4 vld1q_u8_x4
-#define ggml_vld1q_s8_x2 vld1q_s8_x2
-#define ggml_vld1q_s8_x4 vld1q_s8_x4
-#define ggml_vqtbl1q_s8 vqtbl1q_s8
-#define ggml_vqtbl1q_u8 vqtbl1q_u8
-
-#endif // !defined(__aarch64__)
-
-#if !defined(__ARM_FEATURE_DOTPROD)
-
-inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
-    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
-    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-
-    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
-}
-
-#else
-
-#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
-
-#endif // !defined(__ARM_FEATURE_DOTPROD)
-
-#endif // defined(__ARM_NEON)
-
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    ggml_fp16_internal_t tmp;
-    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
-    return (float)tmp;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    ggml_fp16_t res;
-    ggml_fp16_internal_t tmp = f;
-    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
-    return res;
-}
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#ifdef __POWER9_VECTOR__
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#if defined(__loongarch64)
-#if defined(__loongarch_asx)
-#include <lasxintrin.h>
-#endif
-#if defined(__loongarch_sx)
-#include <lsxintrin.h>
-#endif
-#endif
-
-#if defined(__loongarch_asx)
-
-typedef union {
-    int32_t i;
-    float f;
-} ft_union;
-
-/* float type data load instructions */
-static __m128 __lsx_vreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
-}
-
-static __m256 __lasx_xvreplfr2vr_s(float val) {
-    ft_union fi_tmpval = {.f = val};
-    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
-}
-#endif
-
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    register float f;
-    register double d;
-    __asm__(
-        "mtfprd %0,%2\n"
-        "xscvhpdp %0,%0\n"
-        "frsp %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */ "=f"(f):
-        /* in */ "r"(h));
-    return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    register double d;
-    register ggml_fp16_t r;
-    __asm__( /* xscvdphp can work on double or single precision */
-        "xscvdphp %0,%2\n"
-        "mffprd %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */ "=r"(r):
-        /* in */ "f"(f));
-    return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32;
-    fp32.as_bits = w;
-    return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-    union {
-        float as_value;
-        uint32_t as_bits;
-    } fp32;
-    fp32.as_value = f;
-    return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    const uint32_t w = (uint32_t) h << 16;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    const uint32_t two_w = w + w;
-
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
-    }
-
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
-
-#ifdef __ARM_FEATURE_SVE
-#include <arm_sve.h>
-#endif // __ARM_FEATURE_SVE
-
-// precomputed f32 table for f16 (256 KB)
-// defined in ggml.c, initialized in ggml_init()
-extern float ggml_table_f32_f16[1 << 16];
-
-// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
-// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
-// This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32)
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return ggml_table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#endif
-
-#if !defined(GGML_FP32_TO_FP16)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-#endif
-
-enum ggml_cgraph_eval_order {
-    GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
-    GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
-    GGML_CGRAPH_EVAL_ORDER_COUNT
-};
-
 // bitset
 
 typedef uint32_t ggml_bitset_t;
 
@@ -761,6 +159,12 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
 
 // computation graph
 
+enum ggml_cgraph_eval_order {
+    GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+    GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+    GGML_CGRAPH_EVAL_ORDER_COUNT
+};
+
 struct ggml_cgraph {
     int size;
     int n_nodes;
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7c1ec8d54..8bffce860 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3,6 +3,7 @@
 
 #include "ggml-quants.h"
 #include "ggml-impl.h"
+#include "ggml-cpu-impl.h"
 
 #include <math.h>
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 490c8d602..3a8aadae8 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2,6 +2,7 @@
 #define _USE_MATH_DEFINES // For M_PI on MSVC
 
 #include "ggml-impl.h"
+#include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
 #include "ggml.h"
 #include "ggml-aarch64.h"
diff --git a/ggml/src/llamafile/sgemm.cpp b/ggml/src/llamafile/sgemm.cpp
index 1be6fdb57..0193a463a 100644
--- a/ggml/src/llamafile/sgemm.cpp
+++ b/ggml/src/llamafile/sgemm.cpp
@@ -50,6 +50,7 @@
 
 #include "sgemm.h"
 #include "ggml-impl.h"
+#include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
 
 #ifdef _MSC_VER