From 168324a388c86334605694f2dc0f7025267af2f4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 11 Jan 2025 17:52:45 +0200 Subject: [PATCH 01/15] cmake : enable -Wshadow for C++ code [no ci] --- cmake/common.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/common.cmake b/cmake/common.cmake index 0f54871e4..c64ddbc3d 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -13,7 +13,7 @@ function(llama_add_compile_flags) list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration) - list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) + list(APPEND CXX_FLAGS -Wshadow -Wmissing-declarations -Wmissing-noreturn) list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) From 0bebe45a25614401c372959770f89bab01165c47 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 12:15:19 +0200 Subject: [PATCH 02/15] llama : de-shadow (wip) [no ci] --- examples/gguf/gguf.cpp | 12 +++++++----- src/llama-kv-cache.h | 6 +++--- src/llama.cpp | 42 +++++++++++++++++++++--------------------- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index f31989c8c..d928db8fe 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -204,13 +204,15 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) { __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data); // print first 10 elements - const float * data = (const float *) cur->data; + { + const float * data = (const float *) cur->data; - printf("%s data[:10] : ", name); - for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) { - printf("%f ", data[j]); + printf("%s data[:10] : ", name); + for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) { + printf("%f ", data[j]); + } + printf("\n\n"); } - printf("\n\n"); // check data if (check_data) { diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index dca6f3998..2645fd23b 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -58,12 +58,12 @@ struct llama_kv_cache { std::vector bufs; size_t total_size() const { - size_t size = 0; + size_t size_all = 0; for (const auto & buf : bufs) { - size += ggml_backend_buffer_get_size(buf.get()); + size_all += ggml_backend_buffer_get_size(buf.get()); } - return size; + return size_all; } // TODO: better data structures to reduce the cost of this operation diff --git a/src/llama.cpp b/src/llama.cpp index daf1b7c97..83822668e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1174,14 +1174,15 @@ struct llm_build_context { ggml_set_input(lctx.inp_K_shift); for (int il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_head_kv_i = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(il); + struct ggml_tensor * rope_factors = build_rope_factors(il); struct ggml_tensor * k = ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_head_kv, n_ctx, + n_embd_head_k, n_head_kv_i, n_ctx, ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i), 0); struct ggml_tensor * tmp; @@ -1231,18 +1232,18 @@ struct llm_build_context { } for (int il = 0; il < n_layer; ++il) { - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - 
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(il); ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); + n_embd_k_gqa_i, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i*i)); ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); + n_embd_k_gqa_i, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i), + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i*id)); ggml_tensor * view_v_src; ggml_tensor * view_v_dst; @@ -1250,22 +1251,22 @@ struct llm_build_context { if (flash_attn) { // NOTE: the V cache is not transposed when using flash attention view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); + n_embd_v_gqa_i, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i*i)); view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); + n_embd_v_gqa_i, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i*id)); } else { view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, + nm, n_embd_v_gqa_i, ggml_row_size(kv_self.v_l[il]->type, kv_self.size), ggml_row_size(kv_self.v_l[il]->type, i)); view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, + nm, n_embd_v_gqa_i, ggml_row_size(kv_self.v_l[il]->type, kv_self.size), ggml_row_size(kv_self.v_l[il]->type, id)); } @@ -1459,7 +1460,6 @@ struct llm_build_context { } struct ggml_tensor * llm_build_inp_embd_enc() { - const int64_t n_embd = hparams.n_embd; lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc); ggml_set_input(lctx.inp_embd_enc); cb(lctx.inp_embd_enc, "embd_enc", -1); From 0127774ae4410abf58ef816bf5a8deaa194afbd0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 12:17:24 +0200 Subject: [PATCH 03/15] llama : remove unused mutable n_tokens [no ci] --- src/llama.cpp | 60 --------------------------------------------------- 1 file changed, 60 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 83822668e..ed99094be 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1476,9 +1476,6 @@ struct llm_build_context { struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -1553,7 +1550,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -1642,9 +1638,6 @@ struct llm_build_context { 
struct ggml_cgraph * build_deci() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -1730,7 +1723,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -2141,9 +2133,6 @@ struct llm_build_context { struct ggml_cgraph * build_grok() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -2218,7 +2207,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -2300,9 +2288,6 @@ struct llm_build_context { struct ggml_cgraph * build_dbrx() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -2370,7 +2355,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -3553,9 +3537,6 @@ struct llm_build_context { struct ggml_cgraph * build_qwen2moe() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3620,7 +3601,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -5440,9 +5420,6 @@ struct llm_build_context { struct ggml_cgraph * build_olmo() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5513,7 +5490,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = 
ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -5564,9 +5540,6 @@ struct llm_build_context { struct ggml_cgraph * build_olmo2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5637,7 +5610,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -5692,9 +5664,6 @@ struct llm_build_context { struct ggml_cgraph * build_olmoe() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5764,7 +5733,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -6085,9 +6053,6 @@ struct llm_build_context { struct ggml_cgraph * build_arctic() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6146,7 +6111,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -6219,9 +6183,6 @@ struct llm_build_context { struct ggml_cgraph * build_deepseek() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -6295,7 +6256,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -6376,9 +6336,6 @@ struct llm_build_context { struct ggml_cgraph * build_deepseek2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - bool is_lite = (hparams.n_layer == 27); // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. 
@@ -6527,7 +6484,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -6757,9 +6713,6 @@ struct llm_build_context { struct ggml_cgraph * build_t5_enc() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -6833,7 +6786,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -6889,9 +6841,6 @@ struct llm_build_context { struct ggml_cgraph * build_t5_dec() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7033,7 +6982,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); inpCA = ggml_get_rows(ctx0, inpCA, inp_out_ids); @@ -7421,9 +7369,6 @@ struct llm_build_context { struct ggml_cgraph * build_exaone() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7497,7 +7442,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -7779,9 +7723,6 @@ struct llm_build_context { struct ggml_cgraph * build_chameleon() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - // mutable variable, needed during the last layer of the computation to skip unused tokens - int32_t n_tokens = this->n_tokens; - const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -7878,7 +7819,6 @@ struct llm_build_context { if (il == n_layer - 1) { // skip computing output for unused tokens struct ggml_tensor * inp_out_ids = build_inp_out_ids(); - n_tokens = n_outputs; cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } From 32e7b9dc995a27d52bcf7cb2c77c87a534b2f1ef Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 12:30:54 +0200 Subject: [PATCH 04/15] llama : de-shadow (cont) [no ci] --- src/llama-vocab.cpp | 17 +++++++++++----- 
src/llama.cpp | 47 +++++++++++++++++++++------------------------ 2 files changed, 34 insertions(+), 30 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index d0fb85cea..cd943b97c 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -24,25 +24,30 @@ struct naive_trie { naive_trie() : has_value(false), value(0) { } - void insert(const char * key, size_t len, int32_t value = 0) { + + void insert(const char * key, size_t len, int32_t val = 0) { if (len == 0) { - this->has_value = true; - this->value = value; + has_value = true; + value = val; + return; } + char c = key[0]; auto res = children.find(c); if (res != children.end()) { - res->second.insert(key + 1, len - 1, value); + res->second.insert(key + 1, len - 1, val); } else { auto res = children.insert(std::make_pair(c, naive_trie())); - res.first->second.insert(key + 1, len - 1, value); + res.first->second.insert(key + 1, len - 1, val); } } + std::pair get_longest_prefix(const char * key, size_t len, size_t offset = 0) const { if (len == 0 || offset == len) { return std::make_pair(key, offset); } + char c = key[offset]; auto res = children.find(c); if (res != children.end()) { @@ -51,6 +56,7 @@ struct naive_trie { return std::make_pair(key, offset); } + const struct naive_trie * traverse(const char c) const { auto res = children.find(c); if (res != children.end()) { @@ -59,6 +65,7 @@ struct naive_trie { return NULL; } + std::map children; bool has_value; llama_token value; diff --git a/src/llama.cpp b/src/llama.cpp index ed99094be..d907c2d6e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1656,10 +1656,10 @@ struct llm_build_context { const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv_i = hparams.n_head_kv(il); + const int64_t n_head_i = hparams.n_head(il); - if (n_head == 0) { + if (n_head_i == 0) { // attention-free layer of Llama-3_1-Nemotron-51B cur = inpL; } else { @@ -1670,11 +1670,11 @@ struct llm_build_context { cb(cur, "attn_norm", il); } - if (n_head > 0 && n_head_kv == 0) { + if (n_head_i > 0 && n_head_kv_i == 0) { // "linear attention" of Llama-3_1-Nemotron-51B cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); cb(cur, "wo", il); - } else if (n_head > 0) { + } else if (n_head_i > 0) { // self-attention // rope freq factors for llama3; may return nullptr for llama2 and other models struct ggml_tensor * rope_factors = build_rope_factors(il); @@ -1702,14 +1702,14 @@ struct llm_build_context { } Qcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head_i, n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv_i, n_tokens), inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -1734,7 +1734,7 @@ struct llm_build_context { // modified to support attention-free layer of Llama-3_1-Nemotron-51B struct ggml_tensor * ffn_inp = cur; - if (n_head > 0) { + if (n_head_i > 0) { ffn_inp = 
ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); } @@ -2643,7 +2643,7 @@ struct llm_build_context { // iterate layers for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur = inpL; + cur = inpL; struct ggml_tensor * Qcur; struct ggml_tensor * Kcur; @@ -4717,8 +4717,6 @@ struct llm_build_context { struct ggml_cgraph * build_gemma() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - const int64_t n_embd_head_k = hparams.n_embd_head_k; - struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -4825,8 +4823,6 @@ struct llm_build_context { struct ggml_cgraph * build_gemma2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); - const int64_t n_embd_head_k = hparams.n_embd_head_k; - struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -4962,6 +4958,7 @@ struct llm_build_context { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false); const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5800,9 +5797,9 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { - const int64_t n_head = hparams.n_head(il); - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_head_qkv = 2*n_head_kv + n_head; + const int64_t n_head_i = hparams.n_head(il); + const int64_t n_head_kv_i = hparams.n_head_kv(il); + const int64_t n_head_qkv_i = 2*n_head_kv_i + n_head_i; cur = inpL; struct ggml_tensor * residual = cur; @@ -5818,15 +5815,15 @@ struct llm_build_context { cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv, n_tokens); + cur = ggml_reshape_3d(ctx0, cur, n_embd_head_k, n_head_qkv_i, n_tokens); - struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, cur->nb[1], cur->nb[2], 0)); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_i, n_tokens, cur->nb[1], cur->nb[2], 0)); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head)); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*n_head_i)); cb(Kcur, "Kcur", il); - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head+n_head_kv))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv_i, n_tokens, cur->nb[1], cur->nb[2], cur->nb[1]*(n_head_i+n_head_kv_i))); cb(Vcur, "Vcur", il); Qcur = llm_build_norm(ctx0, Qcur, hparams, @@ -5851,7 +5848,7 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv, n_tokens); + Vcur = ggml_reshape_2d(ctx0, Vcur, n_embd_head * n_head_kv_i, n_tokens); cb(Qcur, "Vcur", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, @@ -7495,9 +7492,9 @@ struct llm_build_context { // Token shift state dimensions should be 2 * n_emb GGML_ASSERT(n_embd == hparams.n_embd_k_s() / 2); - const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_seqs = ubatch.n_seqs; const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; + GGML_ASSERT(n_seqs != 0); GGML_ASSERT(ubatch.equal_seqs); 
GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); @@ -7608,9 +7605,9 @@ struct llm_build_context { GGML_ASSERT(n_embd == hparams.n_embd_k_s()); - const int64_t n_seqs = ubatch.n_seqs; + const int64_t n_seqs = ubatch.n_seqs; const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const int64_t n_tokens = ubatch.n_tokens; + GGML_ASSERT(n_seqs != 0); GGML_ASSERT(ubatch.equal_seqs); GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs); From 82caffa74e4e101df3adba878ecb99f6e25e3d84 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 13:22:16 +0200 Subject: [PATCH 05/15] llama : de-shadow libllama [no ci] --- ci/run.sh | 3 + src/llama-batch.cpp | 83 ++++++++++----------- src/llama-batch.h | 4 +- src/llama-context.cpp | 4 +- src/llama-grammar.cpp | 2 +- src/llama-mmap.cpp | 10 +-- src/llama-model-loader.cpp | 16 ++-- src/llama-model-loader.h | 2 +- src/llama-model.cpp | 147 +++++++++++++++++++------------------ src/llama-model.h | 2 - src/llama-quant.cpp | 39 +++++----- src/llama-sampling.cpp | 20 ++--- src/llama-vocab.cpp | 28 +++---- 13 files changed, 181 insertions(+), 179 deletions(-) diff --git a/ci/run.sh b/ci/run.sh index abf08a4ff..bd3420e48 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -13,6 +13,9 @@ # # with SYCL support # GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # +# # with METAL support +# GG_BUILD_METAL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt +# # # with VULKAN support # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt # diff --git a/src/llama-batch.cpp b/src/llama-batch.cpp index 01d5ca57f..e92e5ba9d 100644 --- a/src/llama-batch.cpp +++ b/src/llama-batch.cpp @@ -7,9 +7,9 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) { // clear empty sequences // the previous ubatch is assumed to be gone, // so nothing should refer to values in these sequences anymore. 
- for (size_t i = seq.size(); i-- > 0;) { - if (seq[i].length == 0) { - seq.pop_back(); + for (size_t i = seqs.size(); i-- > 0;) { + if (seqs[i].length == 0) { + seqs.pop_back(); } else { break; } @@ -36,48 +36,48 @@ llama_ubatch llama_sbatch::reserve_ubatch(size_t n_ubatch, bool has_embd) { } void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length) { - GGML_ASSERT(batch != nullptr); + GGML_ASSERT(batch_ptr != nullptr); GGML_ASSERT(length <= seq.length); // Can only add sequences of equal lengths to a batch, // otherwise it isn't clear to which sequence a token belongs GGML_ASSERT(seq.n_seq_id == 0 || ubatch.n_seqs == 0 || length == (size_t) ubatch.n_tokens / ubatch.n_seqs); GGML_ASSERT((seq.n_seq_id != 0) == ubatch.equal_seqs); // NOTE: loops are separated for cache-friendliness - if (batch->token) { + if (batch_ptr->token) { if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { - ubatch.token[ubatch.n_tokens + i] = batch->token[ids[seq.offset + i]]; + ubatch.token[ubatch.n_tokens + i] = batch_ptr->token[ids[seq.offset + i]]; } } else { // simple split - ubatch.token = batch->token + seq.offset; + ubatch.token = batch_ptr->token + seq.offset; } } else { ubatch.token = nullptr; } - if (batch->embd) { + if (batch_ptr->embd) { if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { memcpy( ubatch.embd + (n_embd * (ubatch.n_tokens + i)), - batch->embd + (n_embd * ids[seq.offset + i]), + batch_ptr->embd + (n_embd * ids[seq.offset + i]), n_embd * sizeof(float) ); } } else { // simple split - ubatch.embd = batch->embd + (n_embd * seq.offset); + ubatch.embd = batch_ptr->embd + (n_embd * seq.offset); } } else { ubatch.embd = nullptr; } if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { - ubatch.pos[ubatch.n_tokens + i] = batch->pos[ids[seq.offset + i]]; + ubatch.pos[ubatch.n_tokens + i] = batch_ptr->pos[ids[seq.offset + i]]; } } else { // simple split - ubatch.pos = batch->pos + seq.offset; + ubatch.pos = batch_ptr->pos + seq.offset; } if (ubatch.equal_seqs) { ubatch.n_seq_id[ubatch.n_seqs] = seq.n_seq_id; @@ -86,15 +86,15 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s } } else { // simple split - if (batch->n_seq_id) { - ubatch.n_seq_id = batch->n_seq_id + seq.offset; + if (batch_ptr->n_seq_id) { + ubatch.n_seq_id = batch_ptr->n_seq_id + seq.offset; } else { for (size_t i = 0; i < length; ++i) { ubatch.n_seq_id[ubatch.n_seqs + i] = 1; } } - if (batch->seq_id) { - ubatch.seq_id = batch->seq_id + seq.offset; + if (batch_ptr->seq_id) { + ubatch.seq_id = batch_ptr->seq_id + seq.offset; } } if (logits_all) { @@ -102,17 +102,17 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s ubatch.output[ubatch.n_tokens + i] = 1; out_ids.push_back(ids[seq.offset + i]); } - } else if (batch->logits) { + } else if (batch_ptr->logits) { if (ubatch.equal_seqs) { for (size_t i = 0; i < length; ++i) { size_t id = ids[seq.offset + i]; - int8_t is_output = batch->logits[id]; + int8_t is_output = batch_ptr->logits[id]; ubatch.output[ubatch.n_tokens + i] = is_output; if (is_output) { out_ids.push_back(id); } } } else { // simple split - ubatch.output = batch->logits + seq.offset; + ubatch.output = batch_ptr->logits + seq.offset; for (size_t i = 0; i < length; ++i) { if (ubatch.output[i] != 0) { out_ids.push_back(seq.offset + i); } } @@ -139,12 +139,12 @@ void llama_sbatch::add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & s llama_ubatch llama_sbatch::split_simple(size_t 
n_ubatch) { n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch; - llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr); + llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr); ubatch.equal_seqs = false; - if (!seq.empty()) { - llama_sbatch_seq & s = seq[0]; + if (!seqs.empty()) { + llama_sbatch_seq & s = seqs[0]; size_t length = s.length < n_ubatch ? s.length : n_ubatch; - GGML_ASSERT(seq.size() == 1 && s.n_seq_id == 0); // don't mix with other splits + GGML_ASSERT(seqs.size() == 1 && s.n_seq_id == 0); // don't mix with other splits add_seq_to_ubatch(ubatch, s, length); } return ubatch; @@ -152,15 +152,15 @@ llama_ubatch llama_sbatch::split_simple(size_t n_ubatch) { llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) { n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch; - llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr); - if (!seq.empty()) { + llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr); + if (!seqs.empty()) { size_t length = 0; size_t n_tokens_in_ubatch = 0; - GGML_ASSERT(seq[0].n_seq_id > 0); // should not be mixed with simple splits + GGML_ASSERT(seqs[0].n_seq_id > 0); // should not be mixed with simple splits // smallest first, because it's easier to split this way; // starting from the end to pop in constant time. - for (size_t i = seq.size(); i-- > 0;) { - llama_sbatch_seq & s = seq[i]; + for (size_t i = seqs.size(); i-- > 0;) { + llama_sbatch_seq & s = seqs[i]; GGML_ASSERT(s.length > 0); if (length == 0) { length = s.length < n_ubatch ? s.length : n_ubatch; @@ -179,9 +179,9 @@ llama_ubatch llama_sbatch::split_equal(size_t n_ubatch) { llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) { n_ubatch = n_tokens < n_ubatch ? n_tokens : n_ubatch; - llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch->embd != nullptr); - if (!seq.empty()) { - llama_sbatch_seq & s = seq[seq.size() - 1]; + llama_ubatch ubatch = reserve_ubatch(n_ubatch, /* has_embd */ batch_ptr->embd != nullptr); + if (!seqs.empty()) { + llama_sbatch_seq & s = seqs.back(); size_t length = s.length < n_ubatch ? s.length : n_ubatch; GGML_ASSERT(s.n_seq_id > 0); // should not be mixed with simple splits add_seq_to_ubatch(ubatch, s, length); @@ -189,23 +189,24 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) { return ubatch; } -void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) { +void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd_cur, bool simple_split, bool logits_all_cur) { GGML_ASSERT(batch.n_tokens >= 0); - this->batch = &batch; - this->n_embd = n_embd; - this->logits_all = logits_all; + + batch_ptr = &batch; + n_embd = n_embd_cur; + logits_all = logits_all_cur; n_tokens = batch.n_tokens; ids.resize(n_tokens); out_ids.clear(); - // TODO: reserve out_ids and seq + // TODO: reserve out_ids and seqs for (size_t i = 0; i < n_tokens; ++i) { ids[i] = i; } if (simple_split) { - seq.resize(1); - llama_sbatch_seq & s = seq[0]; + seqs.resize(1); + llama_sbatch_seq & s = seqs[0]; s.n_seq_id = 0; s.seq_id = nullptr; s.offset = 0; @@ -259,11 +260,11 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim } } llama_sbatch_seq new_seq = {n_seqs, seq_ids, i, 1}; - seq.push_back(new_seq); - last_seq = &seq.back(); + seqs.push_back(new_seq); + last_seq = &seqs.back(); } // keep shared prompts first at the end, then sort by length descending. 
- std::sort(seq.begin(), seq.end(), + std::sort(seqs.begin(), seqs.end(), [](llama_sbatch_seq & a, llama_sbatch_seq & b) { if (a.n_seq_id == b.n_seq_id) { return a.length > b.length; diff --git a/src/llama-batch.h b/src/llama-batch.h index 773c3808b..572eb79fd 100644 --- a/src/llama-batch.h +++ b/src/llama-batch.h @@ -45,9 +45,9 @@ struct llama_sbatch { std::vector ids; // batch indices of the output std::vector out_ids; - std::vector seq; + std::vector seqs; - const llama_batch * batch = nullptr; + const llama_batch * batch_ptr = nullptr; // buffers for the ubatch std::vector ubatch_token; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 671d2a81a..c761a4a21 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -916,8 +916,8 @@ struct llama_data_write { write(&n_seq_id, sizeof(n_seq_id)); if (n_seq_id) { - for (auto seq_id : cell.seq_id) { - write(&seq_id, sizeof(seq_id)); + for (auto sid : cell.seq_id) { + write(&sid, sizeof(sid)); } } } diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index bebe4e9a3..bea7d0b1a 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -490,7 +490,7 @@ const char * llama_grammar_parser::parse_sequence( pos = parse_space(pos + 1, is_nested); if (is_digit_char(*pos)) { - const char * int_end = parse_int(pos); + int_end = parse_int(pos); max_times = std::stoul(std::string(pos, int_end - pos)); pos = parse_space(int_end, is_nested); } diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 57c6e4f51..db4c4bcbe 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -454,8 +454,8 @@ struct llama_mlock::impl { return (size_t) sysconf(_SC_PAGESIZE); } - bool raw_lock(const void * addr, size_t size) const { - if (!mlock(addr, size)) { + bool raw_lock(const void * addr_cur, size_t size_cur) const { + if (!mlock(addr_cur, size_cur)) { return true; } @@ -475,12 +475,12 @@ struct llama_mlock::impl { if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { suggest = false; } - if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) { + if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size_cur)) { suggest = false; } LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", - size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); + size_cur, size, errmsg, suggest ? 
MLOCK_SUGGESTION : ""); return false; } @@ -535,7 +535,7 @@ struct llama_mlock::impl { return (size_t) 65536; } - bool raw_lock(const void * addr, size_t len) const { + bool raw_lock(const void * addr_cur, size_t size_cur) const { LLAMA_LOG_WARN("warning: mlock not supported on this system\n"); return false; } diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 53175f0e0..a781b2884 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -413,7 +413,7 @@ namespace GGUFMeta { template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); -llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) { +llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap_cur, bool check_tensors_cur, const struct llama_model_kv_override * param_overrides_p) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); @@ -626,11 +626,11 @@ llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, if (!llama_mmap::SUPPORTED) { LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__); - use_mmap = false; + use_mmap_cur = false; } - this->use_mmap = use_mmap; - this->check_tensors = check_tensors; + use_mmap = use_mmap_cur; + check_tensors = check_tensors_cur; } std::string llama_model_loader::get_arch_name() const { @@ -887,15 +887,15 @@ bool llama_model_loader::load_all_data( // If the backend is supported, create pinned memory buffers and events for synchronisation. for (size_t idx = 0; idx < n_buffers; ++idx) { - auto * buf = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); - if (!buf) { + auto * buf_new = ggml_backend_buft_alloc_buffer(host_buft, buffer_size); + if (!buf_new) { LLAMA_LOG_DEBUG("%s: failed to allocate host buffer for async uploads for device %s\n", func, ggml_backend_dev_name(dev)); return nullptr; } - host_buffers.emplace_back(buf); - host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf)); + host_buffers.emplace_back(buf_new); + host_ptrs.emplace_back(ggml_backend_buffer_get_base(buf_new)); auto * event = ggml_backend_event_new(dev); if (!event) { diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index b63d158d9..4814bbdc9 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -90,7 +90,7 @@ struct llama_model_loader { size_t size_data = 0; std::vector> mmaps_used; - llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p); + llama_model_loader(const std::string & fname, bool use_mmap_cur, bool check_tensors_cur, const struct llama_model_kv_override * param_overrides_p); template typename std::enable_if::value, bool>::type diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f90f5e746..1229d8738 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -340,7 +340,8 @@ struct llama_model::impl { size_t n_bytes = 0; - std::string desc_str; + std::string name_str = "n/a"; + std::string desc_str = "n/a"; // model memory mapped files llama_mmaps mappings; @@ -390,17 +391,17 @@ void llama_model::load_hparams(llama_model_loader & ml) { // get metadata as string for (int i = 0; i < gguf_get_n_kv(ctx); i++) { - enum gguf_type type = gguf_get_kv_type(ctx, i); - if (type == 
GGUF_TYPE_ARRAY) { + gguf_type type_cur = gguf_get_kv_type(ctx, i); + if (type_cur == GGUF_TYPE_ARRAY) { continue; } - const char * name = gguf_get_key(ctx, i); - const std::string value = gguf_kv_to_str(ctx, i); - gguf_kv.emplace(name, value); + const char * name_cur = gguf_get_key(ctx, i); + const std::string value_cur = gguf_kv_to_str(ctx, i); + gguf_kv.emplace(name_cur, value_cur); } // get general kv - ml.get_key(LLM_KV_GENERAL_NAME, name, false); + ml.get_key(LLM_KV_GENERAL_NAME, pimpl->name_str, false); // everything past this point is not vocab-related if (hparams.vocab_only) { @@ -1333,13 +1334,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) { auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { auto it = ctx_map.find(buft); if (it == ctx_map.end()) { - ggml_init_params params = { + ggml_init_params params_cur = { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - ggml_context * ctx = ggml_init(params); + ggml_context * ctx = ggml_init(params_cur); if (!ctx) { throw std::runtime_error(format("failed to create ggml context")); } @@ -1557,31 +1558,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i); - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i); - const int64_t n_ff = hparams.n_ff(i); - const int64_t n_head = hparams.n_head(i); - const int64_t n_head_kv = hparams.n_head_kv(i); + const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(i); + const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(i); + const int64_t n_embd_gqa_i = hparams.n_embd_v_gqa(i); + const int64_t n_ff_i = hparams.n_ff(i); + const int64_t n_head_i = hparams.n_head(i); + const int64_t n_head_kv_i = hparams.n_head_kv(i); - if (n_head_kv == 0 && n_head > 0) { + if (n_head_kv_i == 0 && n_head_i > 0) { // linear attention for DeciLMCausalModel layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); } - else if (n_head_kv > 0) { + else if (n_head_kv_i > 0) { layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head_i}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa_i}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa_i}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head_i, n_embd}, 0); } // optional bias tensors layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); - layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); - layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa_i}, TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, 
"bias", i), {n_embd_gqa_i}, TENSOR_NOT_REQUIRED); layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); @@ -1594,14 +1595,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); } - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_i}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff_i, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_i}, 0); // optional MLP bias - layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff_i}, TENSOR_NOT_REQUIRED); layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); - layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff_i}, TENSOR_NOT_REQUIRED); } } break; case LLM_ARCH_MINICPM3: @@ -2653,23 +2654,23 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); for (int i = 0; i < n_layer; ++i) { - const int64_t n_head = hparams.n_head(i); - const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head; - const int64_t n_ff = hparams.n_ff(i); + const int64_t n_head_i = hparams.n_head(i); + const int64_t n_head_qkv_i = 2*hparams.n_head_kv(i) + n_head_i; + const int64_t n_ff_i = hparams.n_ff(i); auto & layer = layers[i]; layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv_i*n_embd_head_k}, 0); layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head_i*n_embd_head_k, n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff_i}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff_i, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff_i}, 0); } } break; case LLM_ARCH_GPTNEOX: @@ -3167,11 +3168,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm_b = 
create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); - const int time_mix_extra_dim = hparams.time_mix_extra_dim; + const int time_mix_extra_dim = hparams.time_mix_extra_dim; const int time_decay_extra_dim = hparams.time_decay_extra_dim; - const int head_size = hparams.wkv_head_size; - const int attn_hidden_size = n_embd; - const int n_head_kv = hparams.n_head_kv(); + const int head_size = hparams.wkv_head_size; + const int attn_hidden_size = n_embd; + int attn_key_value_size; if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) { attn_key_value_size = attn_hidden_size; @@ -3254,7 +3255,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // posnet { - const int64_t n_embd = hparams.posnet.n_embd; + const int64_t n_embd_cur = hparams.posnet.n_embd; for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) { auto & layer = layers[i].posnet; @@ -3274,39 +3275,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) { case 3: case 4: { - layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0); - layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0); + layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd_cur}, 0); + layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd_cur}, 0); - layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0); - layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0); + layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd_cur, n_embd_cur}, 0); + layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd_cur}, 0); - layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0); - layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0); + layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd_cur}, 0); + layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd_cur}, 0); - layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0); - layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0); + layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd_cur, n_embd_cur}, 0); + layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd_cur}, 0); } break; case 2: { - layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0); - layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd_cur}, 0); + layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd_cur}, 0); - layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0); - layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0); + layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd_cur, n_embd_cur}, 0); + layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd_cur}, 0); - layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0); - layer.attn_k_b = 
create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0); + layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd_cur, n_embd_cur}, 0); + layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd_cur}, 0); - layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0); - layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0); + layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd_cur, n_embd_cur}, 0); + layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd_cur}, 0); - layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0); - layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0); + layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd_cur, n_embd_cur}, 0); + layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd_cur}, 0); } break; case 5: { - layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0); - layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0); + layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd_cur}, 0); + layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd_cur}, 0); } break; default: GGML_ABORT("unknown posnet layer"); }; @@ -3320,29 +3321,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // convnext { - const int64_t n_embd = hparams.convnext.n_embd; + const int64_t n_embd_cur = hparams.convnext.n_embd; for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) { auto & layer = layers[i].convnext; - layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0); - layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0); + layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd_cur}, 0); + layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd_cur}, 0); - layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0); - layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0); + layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd_cur}, 0); + layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd_cur}, 0); - layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0); + layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd_cur, n_ff}, 0); layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0); - layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0); - layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0); + layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd_cur}, 0); + layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd_cur}, 0); - layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0); + layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd_cur}, 0); } // output - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); + output_norm = 
create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd_cur}, 0); + output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd_cur}, 0); } output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0); @@ -3601,7 +3602,7 @@ void llama_model::print_info() const { } // general kv - LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str()); + LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, pimpl->name_str.c_str()); if (arch == LLM_ARCH_DEEPSEEK) { LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); diff --git a/src/llama-model.h b/src/llama-model.h index 4cc8abb75..39e26fae7 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -290,8 +290,6 @@ struct llama_model { llm_type type = LLM_TYPE_UNKNOWN; llm_arch arch = LLM_ARCH_UNKNOWN; - std::string name = "n/a"; - llama_hparams hparams = {}; llama_vocab vocab; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d4947a780..6c59e1730 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -423,8 +423,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * int64_t counter = 0; size_t new_size = 0; bool valid = true; - auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size, - nrows, n_per_row, imatrix]() { + auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix]() { const int64_t nrows_per_chunk = chunk_size / n_per_row; size_t local_size = 0; while (true) { @@ -437,6 +436,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * break; } lock.unlock(); + const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk); size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix); local_size += this_size; @@ -445,7 +445,7 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * const size_t row_size = ggml_row_size(new_type, n_per_row); void * this_data = (char *) new_data + first_row * row_size; if (!ggml_validate_row_data(new_type, this_data, this_size)) { - std::unique_lock lock(mutex); + lock.lock(); valid = false; break; } @@ -589,15 +589,15 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } // make a list of weights - std::vector tensors; - tensors.reserve(ml.weights_map.size()); + std::vector tensor_weights; + tensor_weights.reserve(ml.weights_map.size()); for (const auto & it : ml.weights_map) { - tensors.push_back(&it.second); + tensor_weights.push_back(&it.second); } // keep_split requires that the weights are sorted by split index if (params->keep_split) { - std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) { + std::sort(tensor_weights.begin(), tensor_weights.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) { if (a->idx == b->idx) { return a->offs < b->offs; } @@ -605,8 +605,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: }); } - for (const auto * it : tensors) { - const struct ggml_tensor * tensor = it->tensor; + for (const auto * tw : tensor_weights) { + const ggml_tensor * tensor = tw->tensor; const std::string name = ggml_get_name(tensor); @@ -650,17 +650,17 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: 
// Assume split index is continuous if (params->keep_split) { - for (const auto * it : tensors) { - n_split = std::max(uint16_t(it->idx + 1), n_split); + for (const auto * tw : tensor_weights) { + n_split = std::max(uint16_t(tw->idx + 1), n_split); } } std::vector ctx_outs(n_split); ctx_outs[0] = std::move(ctx_out); - // populate the original tensors so we get an initial meta data - for (const auto * it : tensors) { - uint16_t i_split = params->keep_split ? it->idx : 0; - struct ggml_tensor * tensor = it->tensor; + // populate the original tensor_weights so we get an initial meta data + for (const auto * tw : tensor_weights) { + uint16_t i_split = params->keep_split ? tw->idx : 0; + ggml_tensor * tensor = tw->tensor; if (!ctx_outs[i_split]) { ctx_outs[i_split].reset(gguf_init_empty()); } @@ -707,12 +707,11 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const auto tn = LLM_TN(model.arch); new_ofstream(0); - for (const auto * it : tensors) { - const auto & weight = *it; - struct ggml_tensor * tensor = weight.tensor; - if (weight.idx != cur_split && params->keep_split) { + for (const auto * tw : tensor_weights) { + ggml_tensor * tensor = tw->tensor; + if (tw->idx != cur_split && params->keep_split) { close_ofstream(); - new_ofstream(weight.idx); + new_ofstream(tw->idx); } const std::string name = ggml_get_name(tensor); diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index b3a12386e..711de388e 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -412,8 +412,8 @@ static void llama_sampler_chain_accept(struct llama_sampler * smpl, llama_token time_meas tm(chain->t_sample_us, chain->params.no_perf); - for (auto * smpl : chain->samplers) { - llama_sampler_accept(smpl, token); + for (auto * cur : chain->samplers) { + llama_sampler_accept(cur, token); } chain->n_sample++; @@ -424,16 +424,16 @@ static void llama_sampler_chain_apply(struct llama_sampler * smpl, llama_token_d time_meas tm(chain->t_sample_us, chain->params.no_perf); - for (auto * smpl : chain->samplers) { - llama_sampler_apply(smpl, cur_p); + for (auto * cur : chain->samplers) { + llama_sampler_apply(cur, cur_p); } } static void llama_sampler_chain_reset(struct llama_sampler * smpl) { auto * chain = (llama_sampler_chain *) smpl->ctx; - for (auto * smpl : chain->samplers) { - llama_sampler_reset(smpl); + for (auto * cur : chain->samplers) { + llama_sampler_reset(cur); } chain->t_sample_us = 0; @@ -445,8 +445,8 @@ static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampl auto * result = llama_sampler_chain_init(chain_src->params); - for (auto * smpl : chain_src->samplers) { - llama_sampler_chain_add(result, llama_sampler_clone(smpl)); + for (auto * cur : chain_src->samplers) { + llama_sampler_chain_add(result, llama_sampler_clone(cur)); } return result; @@ -455,8 +455,8 @@ static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampl static void llama_sampler_chain_free(struct llama_sampler * smpl) { auto * chain = (llama_sampler_chain *) smpl->ctx; - for (auto * smpl : chain->samplers) { - llama_sampler_free(smpl); + for (auto * cur : chain->samplers) { + llama_sampler_free(cur); } delete chain; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index cd943b97c..df6bcdf6a 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -34,12 +34,12 @@ struct naive_trie { } char c = key[0]; - auto res = children.find(c); - if (res != children.end()) { - res->second.insert(key + 1, len - 1, val); + auto child = children.find(c); + 
if (child != children.end()) { + child->second.insert(key + 1, len - 1, val); } else { - auto res = children.insert(std::make_pair(c, naive_trie())); - res.first->second.insert(key + 1, len - 1, val); + auto child_new = children.insert(std::make_pair(c, naive_trie())); + child_new.first->second.insert(key + 1, len - 1, val); } } @@ -49,18 +49,18 @@ struct naive_trie { } char c = key[offset]; - auto res = children.find(c); - if (res != children.end()) { - return res->second.get_longest_prefix(key, len, offset + 1); + auto child = children.find(c); + if (child != children.end()) { + return child->second.get_longest_prefix(key, len, offset + 1); } return std::make_pair(key, offset); } const struct naive_trie * traverse(const char c) const { - auto res = children.find(c); - if (res != children.end()) { - return &res->second; + auto child = children.find(c); + if (child != children.end()) { + return &child->second; } return NULL; @@ -1285,7 +1285,7 @@ struct llama_vocab::impl { llama_token_attr token_get_attr(llama_token id) const; - void init_tokenizer(enum llama_vocab_type type); + void init_tokenizer(); void tokenizer_st_partition(std::forward_list & buffer, bool parse_special) const; @@ -1675,7 +1675,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } GGML_ASSERT(id_to_token.size() == token_to_id.size()); - init_tokenizer(type); + init_tokenizer(); // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' if (type == LLAMA_VOCAB_TYPE_SPM) { @@ -2116,7 +2116,7 @@ llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const { return id_to_token.at(id).attr; } -void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) { +void llama_vocab::impl::init_tokenizer() { LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type); switch (type) { From 9a735ae6d84b4bf76a8444d72fabef8ad353abcf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 14:25:32 +0200 Subject: [PATCH 06/15] examplse : de-shadow ggml-ci --- common/arg.cpp | 26 +++---- common/arg.h | 12 ++-- common/common.cpp | 10 +-- common/console.cpp | 32 ++++----- common/log.cpp | 8 +-- .../convert-llama2c-to-ggml.cpp | 6 +- examples/gbnf-validator/gbnf-validator.cpp | 7 -- examples/imatrix/imatrix.cpp | 24 +++---- examples/infill/infill.cpp | 12 ++-- examples/llama-bench/llama-bench.cpp | 8 +-- examples/llava/clip.cpp | 39 +++++----- examples/perplexity/perplexity.cpp | 26 +++---- examples/run/CMakeLists.txt | 1 + examples/server/server.cpp | 72 +++++++++---------- examples/server/utils.hpp | 8 +-- examples/speculative/speculative.cpp | 20 +++--- 16 files changed, 152 insertions(+), 159 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 27886b84e..b551596df 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -17,19 +17,19 @@ using json = nlohmann::ordered_json; -common_arg & common_arg::set_examples(std::initializer_list examples) { - this->examples = std::move(examples); +common_arg & common_arg::set_examples(std::initializer_list vals) { + examples = std::move(vals); return *this; } -common_arg & common_arg::set_excludes(std::initializer_list excludes) { - this->excludes = std::move(excludes); +common_arg & common_arg::set_excludes(std::initializer_list vals) { + excludes = std::move(vals); return *this; } -common_arg & common_arg::set_env(const char * env) { - help = help + "\n(env: " + env + ")"; - this->env = env; +common_arg & common_arg::set_env(const char * val) { + help = help + "\n(env: " + val + ")"; + env = val; 
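
The three setters being rewritten here are typical of this de-shadow pass: a parameter that shares its name with a data member forces an explicit this-> on every assignment and trips GCC's -Wshadow, which also covers member shadowing. Renaming the parameter removes both problems. Below is a reduced sketch of the warning and the fix, with an int member standing in for the real fields and purely hypothetical names; it is an illustration, not part of the patch:

    // setter_demo.cpp (hypothetical) - compile with: g++ -Wshadow -Wall -c setter_demo.cpp
    struct setter_demo {
        int examples = 0;

        // GCC warns under -Wshadow: the parameter hides the member, so the
        // assignment only reaches the member through an explicit this->
        setter_demo & set_shadowed(int examples) {
            this->examples = examples;
            return *this;
        }

        // the style adopted in this series: rename the parameter, drop this->
        setter_demo & set_renamed(int vals) {
            examples = vals;
            return *this;
        }
    };
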
return *this; } @@ -46,8 +46,10 @@ bool common_arg::is_exclude(enum llama_example ex) { return excludes.find(ex) != excludes.end(); } -bool common_arg::get_value_from_env(std::string & output) { - if (env == nullptr) return false; +bool common_arg::get_value_from_env(std::string & output) const { + if (env == nullptr) { + return false; + } char * value = std::getenv(env); if (value) { output = value; @@ -56,7 +58,7 @@ bool common_arg::get_value_from_env(std::string & output) { return false; } -bool common_arg::has_value_from_env() { +bool common_arg::has_value_from_env() const { return env != nullptr && std::getenv(env); } @@ -87,7 +89,7 @@ static std::vector break_str_into_lines(std::string input, size_t m return result; } -std::string common_arg::to_string() { +std::string common_arg::to_string() const { // params for printing to console const static int n_leading_spaces = 40; const static int n_char_per_line_help = 70; // TODO: detect this based on current console @@ -192,8 +194,6 @@ static std::string get_all_kv_cache_types() { // static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { - std::string arg; - const std::string arg_prefix = "--"; common_params & params = ctx_arg.params; std::unordered_map arg_to_options; diff --git a/common/arg.h b/common/arg.h index 49ab8667b..d88efa462 100644 --- a/common/arg.h +++ b/common/arg.h @@ -53,15 +53,15 @@ struct common_arg { void (*handler)(common_params & params, const std::string &, const std::string &) ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} - common_arg & set_examples(std::initializer_list examples); - common_arg & set_excludes(std::initializer_list excludes); - common_arg & set_env(const char * env); + common_arg & set_examples(std::initializer_list vals); + common_arg & set_excludes(std::initializer_list vals); + common_arg & set_env(const char * val); common_arg & set_sparam(); bool in_example(enum llama_example ex); bool is_exclude(enum llama_example ex); - bool get_value_from_env(std::string & output); - bool has_value_from_env(); - std::string to_string(); + bool get_value_from_env(std::string & output) const; + bool has_value_from_env() const; + std::string to_string() const; }; struct common_params_context { diff --git a/common/common.cpp b/common/common.cpp index 39bfb0c2e..16cc3f41c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -763,9 +763,11 @@ bool fs_create_directory_with_parents(const std::string & path) { return true; #else // if the path already exists, check whether it's a directory - struct stat info; - if (stat(path.c_str(), &info) == 0) { - return S_ISDIR(info.st_mode); + { + struct stat info; + if (stat(path.c_str(), &info) == 0) { + return S_ISDIR(info.st_mode); + } } size_t pos_slash = 1; // skip leading slashes for directory creation @@ -796,7 +798,7 @@ bool fs_create_directory_with_parents(const std::string & path) { } std::string fs_get_cache_directory() { - std::string cache_directory = ""; + std::string cache_directory; auto ensure_trailing_slash = [](std::string p) { // Make sure to add trailing slash if (p.back() != DIRECTORY_SEPARATOR) { diff --git a/common/console.cpp b/common/console.cpp index 078a8d678..8d3c8fa5f 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -43,7 +43,7 @@ namespace console { static bool simple_io = true; static display_t current_display = reset; - static FILE* out = stdout; + static FILE* fout = stdout; #if defined (_WIN32) static void* hConsole; @@ -110,7 
+110,7 @@ namespace console { tty = fopen("/dev/tty", "w+"); if (tty != nullptr) { - out = tty; + fout = tty; } } @@ -126,7 +126,7 @@ namespace console { // Restore settings on POSIX systems if (!simple_io) { if (tty != nullptr) { - out = stdout; + fout = stdout; fclose(tty); tty = nullptr; } @@ -145,19 +145,19 @@ namespace console { fflush(stdout); switch(display) { case reset: - fprintf(out, ANSI_COLOR_RESET); + fprintf(fout, ANSI_COLOR_RESET); break; case prompt: - fprintf(out, ANSI_COLOR_YELLOW); + fprintf(fout, ANSI_COLOR_YELLOW); break; case user_input: - fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN); + fprintf(fout, ANSI_BOLD ANSI_COLOR_GREEN); break; case error: - fprintf(out, ANSI_BOLD ANSI_COLOR_RED); + fprintf(fout, ANSI_BOLD ANSI_COLOR_RED); } current_display = display; - fflush(out); + fflush(fout); } } @@ -233,7 +233,7 @@ namespace console { return; } #endif - putc('\b', out); + putc('\b', fout); } static int estimateWidth(char32_t codepoint) { @@ -274,7 +274,7 @@ namespace console { #else // We can trust expectedWidth if we've got one if (expectedWidth >= 0 || tty == nullptr) { - fwrite(utf8_codepoint, length, 1, out); + fwrite(utf8_codepoint, length, 1, fout); return expectedWidth; } @@ -311,7 +311,7 @@ namespace console { pop_cursor(); put_codepoint(&ch, 1, 1); #else - fprintf(out, "\b%c", ch); + fprintf(fout, "\b%c", ch); #endif } @@ -353,7 +353,7 @@ namespace console { } static bool readline_advanced(std::string & line, bool multiline_input) { - if (out != stdout) { + if (fout != stdout) { fflush(stdout); } @@ -364,7 +364,7 @@ namespace console { char32_t input_char; while (true) { - fflush(out); // Ensure all output is displayed before waiting for input + fflush(fout); // Ensure all output is displayed before waiting for input input_char = getchar32(); if (input_char == '\r' || input_char == '\n') { @@ -432,7 +432,7 @@ namespace console { line.pop_back(); if (last == '\\') { line += '\n'; - fputc('\n', out); + fputc('\n', fout); has_more = !has_more; } else { // llama will just eat the single space, it won't act as a space @@ -447,11 +447,11 @@ namespace console { has_more = false; } else { line += '\n'; - fputc('\n', out); + fputc('\n', fout); } } - fflush(out); + fflush(fout); return has_more; } diff --git a/common/log.cpp b/common/log.cpp index 04c7c0ed1..7a94bf7f9 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -338,16 +338,16 @@ public: resume(); } - void set_prefix(bool prefix) { + void set_prefix(bool val) { std::lock_guard lock(mtx); - this->prefix = prefix; + prefix = val; } - void set_timestamps(bool timestamps) { + void set_timestamps(bool val) { std::lock_guard lock(mtx); - this->timestamps = timestamps; + timestamps = val; } }; diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index bdf0eed2a..ef0b22a3d 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -471,12 +471,12 @@ struct my_llama_file { GGML_ASSERT(ret == 0); // same } - void read_raw(void * ptr, size_t size) { - if (size == 0) { + void read_raw(void * ptr, size_t size_cur) { + if (size_cur == 0) { return; } errno = 0; - std::size_t ret = std::fread(ptr, size, 1, fp); + std::size_t ret = std::fread(ptr, size_cur, 1, fp); if (ferror(fp)) { die_fmt("fread failed: %s", strerror(errno)); } diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index 17a0e27c4..12e7e762d 100644 --- 
a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -60,13 +60,6 @@ int main(int argc, char** argv) { const std::string grammar_filename = argv[1]; const std::string input_filename = argv[2]; - // Read the GBNF grammar file - FILE* grammar_file = fopen(grammar_filename.c_str(), "r"); - if (!grammar_file) { - fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str()); - return 1; - } - std::string grammar_str; { std::ifstream grammar_file(grammar_filename); diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index b5f3feb9f..d4d3fc7c8 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -294,7 +294,7 @@ void IMatrixCollector::save_imatrix(int ncall) const { bool IMatrixCollector::load_imatrix(const char * fname) { std::ifstream in(fname, std::ios::binary); if (!in) { - LOG_ERR("%s: failed to open %s\n",__func__, fname); + LOG_ERR("%s: failed to open %s\n", __func__, fname); return false; } int n_entries; @@ -308,7 +308,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { std::vector name_as_vec(len+1); in.read((char *)name_as_vec.data(), len); if (in.fail()) { - LOG_ERR("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); + LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname); return false; } name_as_vec[len] = 0; @@ -319,7 +319,7 @@ bool IMatrixCollector::load_imatrix(const char * fname) { int nval; in.read((char *)&nval, sizeof(nval)); if (in.fail() || nval < 1) { - LOG_ERR("%s: failed reading number of values for entry %d\n",__func__,i); + LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i); m_stats = {}; return false; } @@ -332,15 +332,15 @@ bool IMatrixCollector::load_imatrix(const char * fname) { std::vector tmp(nval); in.read((char*)tmp.data(), nval*sizeof(float)); if (in.fail()) { - LOG_ERR("%s: failed reading data for entry %d\n",__func__,i); + LOG_ERR("%s: failed reading data for entry %d\n", __func__, i); m_stats = {}; return false; } // Recreate the state as expected by save_imatrix(), and corerct for weighted sum. 
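
The rename in the lines that follow, where the inner accumulation loop switches from i to j, is the other recurring shadow in this series: a nested counter reusing the name of the enclosing loop variable. The original code works as written, but the inner declaration hides the entry index, which is exactly the situation -Wshadow is meant to flag. A reduced sketch with hypothetical types rather than the real imatrix statistics structures:

    #include <cstddef>
    #include <vector>

    struct entry_sums {
        std::vector<float> values;
    };

    // accumulate one chunk of per-value sums into every entry
    static void accumulate(std::vector<entry_sums> & entries, const std::vector<float> & tmp) {
        for (std::size_t i = 0; i < entries.size(); ++i) {
            entries[i].values.resize(tmp.size());
            // writing "for (std::size_t i = ..." here would compile, but it
            // shadows the entry index above and draws a -Wshadow warning
            for (std::size_t j = 0; j < tmp.size(); ++j) {
                entries[i].values[j] += tmp[j];
            }
        }
    }
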
- for (int i = 0; i < nval; i++) { - e.values[i] += tmp[i]; - e.counts[i] += ncall; + for (int j = 0; j < nval; j++) { + e.values[j] += tmp[j]; + e.counts[j] += ncall; } e.ncall += ncall; @@ -488,12 +488,10 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { logits.reserve((size_t)n_ctx * n_vocab); } - for (int i = 0; i < n_chunk; ++i) { - const int start = i * n_ctx; + for (int ich = 0; ich < n_chunk; ++ich) { + const int start = ich * n_ctx; const int end = start + n_ctx; - std::vector logits; - const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache @@ -537,7 +535,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { const auto t_end = std::chrono::high_resolution_clock::now(); - if (i == 0) { + if (ich == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); @@ -555,7 +553,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) { workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); count += n_ctx - first - 1; - LOG("[%d]%.4lf,", i + 1, std::exp(nll / count)); + LOG("[%d]%.4lf,", ich + 1, std::exp(nll / count)); fflush(stdout); logits.clear(); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 489a208b6..f8d099591 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -462,14 +462,14 @@ int main(int argc, char ** argv) { } // tokenize new prefix and suffix - std::vector inp_pfx = common_tokenize(ctx, params.input_prefix, false); - std::vector inp_sfx = common_tokenize(ctx, params.input_suffix, false); + std::vector inp_pfx_cur = common_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx_cur = common_tokenize(ctx, params.input_suffix, false); - inp_pfx.insert(inp_pfx.begin(), llama_vocab_fim_pre(vocab)); - inp_sfx.insert(inp_sfx.begin(), llama_vocab_fim_suf(vocab)); + inp_pfx_cur.insert(inp_pfx_cur.begin(), llama_vocab_fim_pre(vocab)); + inp_sfx_cur.insert(inp_sfx_cur.begin(), llama_vocab_fim_suf(vocab)); - embd_inp = params.spm_infill ? inp_sfx : inp_pfx; - embd_end = params.spm_infill ? inp_pfx : inp_sfx; + embd_inp = params.spm_infill ? inp_sfx_cur : inp_pfx_cur; + embd_end = params.spm_infill ? 
inp_pfx_cur : inp_sfx_cur; if (add_bos) { embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab)); } diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index a3b4c5ac8..faa8d5f87 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -548,11 +548,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { GGML_ASSERT(split_arg.size() <= llama_max_devices()); std::vector tensor_split(llama_max_devices()); - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - tensor_split[i] = std::stof(split_arg[i]); + for (size_t is = 0; is < llama_max_devices(); ++is) { + if (is < split_arg.size()) { + tensor_split[is] = std::stof(split_arg[is]); } else { - tensor_split[i] = 0.0f; + tensor_split[is] = 0.0f; } } params.tensor_split.push_back(tensor_split); diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 7a8a3156b..dc827e814 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1039,41 +1039,40 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } { // attention - int hidden_size = 4096; - const int d_head = 128; - int n_head = hidden_size/d_head; + int hidden_size_cur = 4096; int num_query = 96; if (ctx->minicpmv_version == 2) { - hidden_size = 4096; - n_head = hidden_size/d_head; + hidden_size_cur = 4096; num_query = 96; } else if (ctx->minicpmv_version == 3) { - hidden_size = 3584; - n_head = hidden_size/d_head; + hidden_size_cur = 3584; num_query = 64; } + const int d_head_cur = 128; + const int n_head_cur = hidden_size_cur/d_head_cur; + struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); - Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); + Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head_cur)); struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); // permute - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); + Q = ggml_reshape_4d(ctx0, Q, d_head_cur, n_head_cur, num_query, batch_size); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + Q = ggml_reshape_3d(ctx0, Q, d_head_cur, num_query, n_head_cur * batch_size); + K = ggml_reshape_4d(ctx0, K, d_head_cur, n_head_cur, num_positions, batch_size); K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + K = ggml_reshape_3d(ctx0, K, d_head_cur, num_positions, n_head_cur * batch_size); + V = ggml_reshape_4d(ctx0, V, d_head_cur, n_head_cur, num_positions, batch_size); V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head_cur, n_head_cur * batch_size); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); KQ = ggml_soft_max_inplace(ctx0, KQ); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); + KQV = ggml_reshape_4d(ctx0, KQV, d_head_cur, num_query, n_head_cur, batch_size); KQV = ggml_permute(ctx0, 
KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); + KQV = ggml_cont_3d(ctx0, KQV, hidden_size_cur, num_query, batch_size); embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); } @@ -1113,12 +1112,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { struct ggml_context * meta = NULL; - struct gguf_init_params params = { + struct gguf_init_params params_meta = { /*.no_alloc = */ true, /*.ctx = */ &meta, }; - struct gguf_context * ctx = gguf_init_from_file(fname, params); + struct gguf_context * ctx = gguf_init_from_file(fname, params_meta); if (!ctx) { throw std::runtime_error(format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, fname)); } @@ -1310,13 +1309,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { // load tensors { std::vector read_buf; - struct ggml_init_params params = { + struct ggml_init_params params_data = { /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - new_clip->ctx_data = ggml_init(params); + new_clip->ctx_data = ggml_init(params_data); if (!new_clip->ctx_data) { LOG_ERR("%s: ggml_init() failed\n", __func__); clip_free(new_clip); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 9bf6c5743..c9239ecda 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -348,8 +348,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params LOG_INF("%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); - for (int i = 0; i < n_chunk; ++i) { - const int start = i * params.ppl_stride; + for (int ich = 0; ich < n_chunk; ++ich) { + const int start = ich * params.ppl_stride; const int end = start + calc_chunk; const int num_batches = (calc_chunk + n_batch - 1) / n_batch; @@ -400,7 +400,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params const auto t_end = std::chrono::high_resolution_clock::now(); - if (i == 0) { + if (ich == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); @@ -427,9 +427,9 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params } // perplexity is e^(average negative log-likelihood) if (params.ppl_output_type == 0) { - LOG("[%d]%.4lf,", i + 1, std::exp(nll / count)); + LOG("[%d]%.4lf,", ich + 1, std::exp(nll / count)); } else { - LOG("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count)); + LOG("%8d %.4lf\n", ich*params.ppl_stride, std::exp(nll / count)); } } LOG("\n"); @@ -659,7 +659,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector & batch_logits, int n_batch, int n_vocab) { int prev_outputs = 0; - for (int i = 0; i < (int) batch.n_tokens; i += n_batch) { + for (int i = 0; i < batch.n_tokens; i += n_batch) { const int n_tokens = std::min(n_batch, batch.n_tokens - i); llama_batch batch_view = { @@ -679,8 +679,8 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector< } int n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - n_outputs += batch_view.logits[i] != 0; + for (int 
iv = 0; iv < n_tokens; ++iv) { + n_outputs += batch_view.logits[iv] != 0; } memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float)); @@ -1752,14 +1752,14 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { auto kld_ptr = kld_values.data(); auto p_diff_ptr = p_diff_values.data(); - for (int i = 0; i < n_chunk; ++i) { - const int start = i * n_ctx; + for (int ich = 0; ich < n_chunk; ++ich) { + const int start = ich * n_ctx; const int end = start + n_ctx; const auto t_start = std::chrono::high_resolution_clock::now(); if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) { - LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, i); + LOG_ERR("%s: failed reading log-probs for chunk %d\n", __func__, ich); return; } @@ -1804,7 +1804,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { const auto t_end = std::chrono::high_resolution_clock::now(); - if (i == 0) { + if (ich == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total); int total_seconds = (int)(t_total * n_chunk); @@ -1824,7 +1824,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) { p_diff_ptr += n_ctx - 1 - first; kld_ptr += n_ctx - 1 - first; - LOG("%4d", i+1); + LOG("%4d", ich + 1); auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); const double ppl_val = exp(log_ppl.first); diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt index 0686d6305..22b43524b 100644 --- a/examples/run/CMakeLists.txt +++ b/examples/run/CMakeLists.txt @@ -3,3 +3,4 @@ add_executable(${TARGET} run.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) +target_compile_options(${TARGET} PRIVATE -Wno-shadow) # TMP diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 64c0c4ef6..aa8b54680 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -122,9 +122,9 @@ struct slot_params { samplers.emplace_back(common_sampler_type_to_str(sampler)); } - json lora = json::array(); - for (size_t i = 0; i < this->lora.size(); ++i) { - lora.push_back({{"id", i}, {"scale", this->lora[i].scale}}); + json json_lora = json::array(); + for (size_t i = 0; i < lora.size(); ++i) { + json_lora.push_back({{"id", i}, {"scale", lora[i].scale}}); } return json { @@ -167,7 +167,7 @@ struct slot_params { {"speculative.p_min", speculative.p_min}, {"timings_per_token", timings_per_token}, {"post_sampling_probs", post_sampling_probs}, - {"lora", lora}, + {"lora", json_lora}, }; } }; @@ -1641,7 +1641,7 @@ struct server_context { llama_context_params cparams_dft; - llama_batch batch = {}; + llama_batch batch_main = {}; bool clean_kv_cache = true; bool add_bos_token = true; @@ -1676,7 +1676,7 @@ struct server_context { llama_batch_free(slot.batch_spec); } - llama_batch_free(batch); + llama_batch_free(batch_main); } bool load_model(const common_params & params) { @@ -1797,7 +1797,7 @@ struct server_context { const int32_t n_batch = llama_n_batch(ctx); // only a single seq_id per token is needed - batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); + batch_main = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); } metrics.init(); @@ -2655,7 +2655,7 @@ struct server_context 
{ } // start populating the batch for this iteration - common_batch_clear(batch); + common_batch_clear(batch_main); // track if given slot can be batched with slots already in the batch server_slot * slot_batched = nullptr; @@ -2673,9 +2673,9 @@ struct server_context { continue; } - slot.i_batch = batch.n_tokens; + slot.i_batch = batch_main.n_tokens; - common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true); + common_batch_add(batch_main, slot.sampled, slot.n_past, { slot.id }, true); slot.n_past += 1; @@ -2692,7 +2692,7 @@ struct server_context { int32_t n_ubatch = llama_n_ubatch(ctx); // next, batch any pending prompts without exceeding n_batch - if (params_base.cont_batching || batch.n_tokens == 0) { + if (params_base.cont_batching || batch_main.n_tokens == 0) { for (auto & slot : slots) { // check if we can batch this slot with the previous one if (slot.is_processing()) { @@ -2858,7 +2858,7 @@ struct server_context { // non-causal tasks require to fit the entire prompt in the physical batch if (slot.is_non_causal()) { // cannot fit the prompt in the current batch - will try next iter - if (batch.n_tokens + slot.n_prompt_tokens > n_batch) { + if (batch_main.n_tokens + slot.n_prompt_tokens > n_batch) { continue; } } @@ -2878,11 +2878,11 @@ struct server_context { slot.cache_tokens.resize(slot.n_past); // add prompt tokens for processing in the current batch - while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { + while (slot.n_past < slot.n_prompt_tokens && batch_main.n_tokens < n_batch) { // without pooling, we want to output the embeddings for all the tokens in the batch const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE; - common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd); + common_batch_add(batch_main, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd); if (slot.params.cache_prompt) { slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); @@ -2892,13 +2892,13 @@ struct server_context { slot.n_past++; } - SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); + SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch_main.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens); // entire prompt has been processed if (slot.n_past == slot.n_prompt_tokens) { slot.state = SLOT_STATE_DONE_PROMPT; - GGML_ASSERT(batch.n_tokens > 0); + GGML_ASSERT(batch_main.n_tokens > 0); common_sampler_reset(slot.smpl); @@ -2908,27 +2908,27 @@ struct server_context { } // extract the logits only for the last token - batch.logits[batch.n_tokens - 1] = true; + batch_main.logits[batch_main.n_tokens - 1] = true; slot.n_decoded = 0; - slot.i_batch = batch.n_tokens - 1; + slot.i_batch = batch_main.n_tokens - 1; - SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens); + SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch_main.n_tokens); } } - if (batch.n_tokens >= n_batch) { + if (batch_main.n_tokens >= n_batch) { break; } } } - if (batch.n_tokens == 0) { + if (batch_main.n_tokens == 0) { SRV_WRN("%s", "no tokens to decode\n"); return; } - SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); + SRV_DBG("decoding batch, n_tokens = %d\n", batch_main.n_tokens); if (slot_batched) { // make sure we're 
in the right embedding mode @@ -2938,17 +2938,17 @@ struct server_context { } // process the created batch of tokens - for (int32_t i = 0; i < batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); + for (int32_t i_batch = 0; i_batch < batch_main.n_tokens; i_batch += n_batch) { + const int32_t n_tokens = std::min(n_batch, batch_main.n_tokens - i_batch); llama_batch batch_view = { n_tokens, - batch.token + i, + batch_main.token + i_batch, nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, + batch_main.pos + i_batch, + batch_main.n_seq_id + i_batch, + batch_main.seq_id + i_batch, + batch_main.logits + i_batch, }; const int ret = llama_decode(ctx, batch_view); @@ -2957,7 +2957,7 @@ struct server_context { if (ret != 0) { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it via the context size - SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); + SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i_batch = %d, n_batch = %d, ret = %d\n", i_batch, n_batch, ret); for (auto & slot : slots) { slot.release(); send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size."); @@ -2967,15 +2967,15 @@ struct server_context { // retry with half the batch size to try to find a free slot in the KV cache n_batch /= 2; - i -= n_batch; + i_batch -= n_batch; - SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); + SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i_batch = %d, n_batch = %d, ret = %d\n", i_batch, n_batch, ret); continue; // continue loop of n_batch } for (auto & slot : slots) { - if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { + if (slot.i_batch < (int) i_batch || slot.i_batch >= (int) (i_batch + n_tokens)) { continue; // continue loop of slots } @@ -3001,7 +3001,7 @@ struct server_context { continue; // continue loop of slots } - const int tok_idx = slot.i_batch - i; + const int tok_idx = slot.i_batch - i_batch; llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx); @@ -3687,8 +3687,8 @@ int main(int argc, char ** argv) { } else { // multiple results (multitask) json arr = json::array(); - for (auto & res : results) { - arr.push_back(res->to_json()); + for (auto & result : results) { + arr.push_back(result->to_json()); } res_ok(res, arr); } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 699480f90..fab0850e5 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -129,15 +129,15 @@ static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_ if (p.is_string()) { auto s = p.template get(); - llama_tokens p; + llama_tokens ids; if (first) { - p = common_tokenize(vocab, s, add_special, parse_special); + ids = common_tokenize(vocab, s, add_special, parse_special); first = false; } else { - p = common_tokenize(vocab, s, false, parse_special); + ids = common_tokenize(vocab, s, false, parse_special); } - prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); + prompt_tokens.insert(prompt_tokens.end(), ids.begin(), ids.end()); } else { if (first) { 
first = false; diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index c7ccea50d..56700db3f 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -544,26 +544,26 @@ int main(int argc, char ** argv) { for (int is = 0; is < (int) sa.size(); ++is) { const llama_token id = cur_p->data[is].id; - const int s = sa[is]; + const int sd = sa[is]; - common_sampler_accept(drafts[s].smpl, id, true); + common_sampler_accept(drafts[sd].smpl, id, true); - drafts[s].tokens.push_back(id); - // save cur_p.data into drafts[s].dists - drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size}); + drafts[sd].tokens.push_back(id); + // save cur_p.data into drafts[sd].dists + drafts[sd].dists.push_back({cur_p->data, cur_p->data + cur_p->size}); // add unique drafted tokens to the target batch - drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens); + drafts[sd].i_batch_tgt.push_back(batch_tgt.n_tokens); - common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true); + common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { sd }, true); // add the token to the batch for batched decoding with the draft model - drafts[s].i_batch_dft = batch_dft.n_tokens; + drafts[sd].i_batch_dft = batch_dft.n_tokens; - common_batch_add(batch_dft, id, n_past_cur, { s }, true); + common_batch_add(batch_dft, id, n_past_cur, { sd }, true); if (batch_tgt.n_tokens > n_draft) { - drafts[s].drafting = false; + drafts[sd].drafting = false; } } } From e159e7751c5e358da439745141584d57f2056e40 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 14:35:29 +0200 Subject: [PATCH 07/15] cmake : disable -Wshadow for GCC ggml-ci --- cmake/common.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/common.cmake b/cmake/common.cmake index c64ddbc3d..bbc9c412e 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -13,7 +13,12 @@ function(llama_add_compile_flags) list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration) - list(APPEND CXX_FLAGS -Wshadow -Wmissing-declarations -Wmissing-noreturn) + list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) + + # GCC -Wshadow is way too agressive + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + list(APPEND CXX_FLAGS -Wshadow) + endif() list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) From 34889bf8102e806289613e566e835420d7ea3d70 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 15:11:52 +0200 Subject: [PATCH 08/15] cmake : cont ggml-ci --- cmake/common.cmake | 2 +- examples/run/CMakeLists.txt | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cmake/common.cmake b/cmake/common.cmake index bbc9c412e..5dee785c3 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -16,7 +16,7 @@ function(llama_add_compile_flags) list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) # GCC -Wshadow is way too agressive - if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") list(APPEND CXX_FLAGS -Wshadow) endif() diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt index 22b43524b..5e9c57bbc 100644 --- a/examples/run/CMakeLists.txt +++ b/examples/run/CMakeLists.txt @@ -3,4 +3,7 @@ add_executable(${TARGET} run.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) 
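
The change just below keeps the temporary -Wno-shadow opt-out for this example but guards it behind the same compiler check that enables the warning globally: at this point in the series -Wshadow is only added for Clang, so the per-target escape hatch only needs to exist for Clang as well. A sketch of the same arrangement for a hypothetical target, not the exact flag set used by llama.cpp:

    # demo-tool is a hypothetical target used only to illustrate the pattern
    add_executable(demo-tool demo.cpp)

    if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
        # the project-wide flags add -Wshadow for Clang, so the opt-out is
        # scoped to Clang too and can be deleted once the target is cleaned up
        target_compile_options(demo-tool PRIVATE -Wno-shadow)
    endif()
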
target_compile_features(${TARGET} PRIVATE cxx_std_17) -target_compile_options(${TARGET} PRIVATE -Wno-shadow) # TMP + +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + target_compile_options(${TARGET} PRIVATE -Wno-shadow) # TMP +endif() From 439e68c1e5889a01116ba6eec1c03c9fe11bfaa0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 15:29:33 +0200 Subject: [PATCH 09/15] cmake : re-enable GCC -Wshadow ggml-ci --- cmake/common.cmake | 7 ++++-- common/arg.h | 34 ++++++++++++++-------------- examples/export-lora/export-lora.cpp | 8 +++---- examples/gguf-split/gguf-split.cpp | 16 ++++++------- examples/run/CMakeLists.txt | 9 ++++++-- examples/server/server.cpp | 2 +- src/llama-adapter.h | 2 +- src/llama-arch.cpp | 2 +- src/llama-arch.h | 2 +- src/llama-context.h | 4 ++-- src/llama-impl.cpp | 2 +- src/llama-model-loader.h | 2 +- src/llama-model.cpp | 2 +- src/llama-quant.cpp | 6 ++--- src/llama-vocab.cpp | 14 ++++++------ src/llama.cpp | 26 ++++++++++----------- 16 files changed, 73 insertions(+), 65 deletions(-) diff --git a/cmake/common.cmake b/cmake/common.cmake index 5dee785c3..45bac7af8 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -15,9 +15,12 @@ function(llama_add_compile_flags) list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) - # GCC -Wshadow is way too agressive - if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") list(APPEND CXX_FLAGS -Wshadow) + + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND CXX_FLAGS -Wshadow -Wshadow-field-in-constructor) + endif() endif() list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) diff --git a/common/arg.h b/common/arg.h index d88efa462..eff9e6e1f 100644 --- a/common/arg.h +++ b/common/arg.h @@ -25,33 +25,33 @@ struct common_arg { void (*handler_int) (common_params & params, int) = nullptr; common_arg( - const std::initializer_list & args, - const char * value_hint, - const std::string & help, + const std::initializer_list & args_, + const char * value_hint_, + const std::string & help_, void (*handler)(common_params & params, const std::string &) - ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {} + ) : args(args_), value_hint(value_hint_), help(help_), handler_string(handler) {} common_arg( - const std::initializer_list & args, - const char * value_hint, - const std::string & help, + const std::initializer_list & args_, + const char * value_hint_, + const std::string & help_, void (*handler)(common_params & params, int) - ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {} + ) : args(args_), value_hint(value_hint_), help(help_), handler_int(handler) {} common_arg( - const std::initializer_list & args, - const std::string & help, + const std::initializer_list & args_, + const std::string & help_, void (*handler)(common_params & params) - ) : args(args), help(help), handler_void(handler) {} + ) : args(args_), help(help_), handler_void(handler) {} // support 2 values for arg common_arg( - const std::initializer_list & args, - const char * value_hint, - const char * value_hint_2, - const std::string & help, + const std::initializer_list & args_, + const char * value_hint_, + const char * value_hint_2_, + const std::string & help_, void (*handler)(common_params & params, const std::string &, const std::string &) - ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {} + ) : args(args_), 
value_hint(value_hint_), value_hint_2(value_hint_2_), help(help_), handler_str_str(handler) {} common_arg & set_examples(std::initializer_list vals); common_arg & set_excludes(std::initializer_list vals); @@ -69,7 +69,7 @@ struct common_params_context { common_params & params; std::vector options; void(*print_usage)(int, char **) = nullptr; - common_params_context(common_params & params) : params(params) {} + common_params_context(common_params & params_) : params(params_) {} }; // parse input arguments from CLI diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 99063b5d5..592cffbf4 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -66,7 +66,7 @@ struct file_input { float alpha; float scale; - file_input(std::string & fname, float scale): f_in(fname, std::ios::binary), scale(scale) { + file_input(std::string & fname, float scale_): f_in(fname, std::ios::binary), scale(scale_) { if (!f_in.is_open()) { throw std::runtime_error("failed to open input gguf from " + fname); } @@ -131,7 +131,7 @@ struct lora_merge_ctx { std::string & base_fname, std::vector & lora_files, std::string & outfile, - int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) { + int n_threads_) : base_model(base_fname, 0), n_threads(n_threads_), fout(outfile, std::ios::binary) { fout.exceptions(std::ofstream::failbit); // fail fast on write errors if (gguf_find_key(base_model.ctx_gguf, LLM_KV_SPLIT_COUNT) >= 0) { @@ -157,7 +157,7 @@ struct lora_merge_ctx { allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); } - void check_metadata_lora(file_input * adapter) { + void check_metadata_lora(const file_input * adapter) const { auto general_type = get_kv_str(adapter->ctx_gguf, "general.type"); if (general_type != "adapter") { throw std::runtime_error("expect general.type to be 'adapter', but got: " + general_type); @@ -175,7 +175,7 @@ struct lora_merge_ctx { } } - ggml_type get_out_tensor_type(struct ggml_tensor * t) { + static ggml_type get_out_tensor_type(struct ggml_tensor * t) { if (t->type == GGML_TYPE_F32) { return GGML_TYPE_F32; } else { diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index ef3ceb686..3b9ae6a58 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -204,14 +204,14 @@ struct split_strategy { // temporary buffer for reading in tensor data std::vector read_buf; - split_strategy(const split_params & params, - std::ifstream & f_input, - struct gguf_context * ctx_gguf, - struct ggml_context * ctx_meta) : - params(params), - f_input(f_input), - ctx_gguf(ctx_gguf), - ctx_meta(ctx_meta), + split_strategy(const split_params & params_, + std::ifstream & f_input_, + struct gguf_context * ctx_gguf_, + struct ggml_context * ctx_meta_) : + params(params_), + f_input(f_input_), + ctx_gguf(ctx_gguf_), + ctx_meta(ctx_meta_), n_tensors(gguf_get_n_tensors(ctx_gguf)) { // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt index 5e9c57bbc..8735c9dc2 100644 --- a/examples/run/CMakeLists.txt +++ b/examples/run/CMakeLists.txt @@ -4,6 +4,11 @@ install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) -if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - 
target_compile_options(${TARGET} PRIVATE -Wno-shadow) # TMP +# TMP +if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + target_compile_options(${TARGET} PRIVATE -Wno-shadow) + + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + target_compile_options(${TARGET} PRIVATE -Wno-shadow-field-in-constructor) + endif() endif() diff --git a/examples/server/server.cpp b/examples/server/server.cpp index aa8b54680..0c0f066ca 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -200,7 +200,7 @@ struct server_task { // used by SERVER_TASK_TYPE_SET_LORA std::vector set_lora; - server_task(server_task_type type) : type(type) {} + server_task(server_task_type type_) : type(type_) {} static slot_params params_from_json_cmpl( const llama_context * ctx, diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 603fa08f6..7cfc49689 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -55,7 +55,7 @@ struct llama_adapter_lora_weight { } llama_adapter_lora_weight() = default; - llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {} + llama_adapter_lora_weight(struct ggml_tensor * a_, struct ggml_tensor * b_) : a(a_), b(b_) {} }; struct llama_adapter_lora { diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 5c1f14cfd..17d7939af 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1443,7 +1443,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, }; -LLM_KV::LLM_KV(llm_arch arch) : arch(arch) {} +LLM_KV::LLM_KV(llm_arch arch_) : arch(arch_) {} std::string LLM_KV::operator()(llm_kv kv) const { return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch)); diff --git a/src/llama-arch.h b/src/llama-arch.h index 349844790..d6a79db1e 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -374,7 +374,7 @@ struct LLM_TN_IMPL { }; struct LLM_TN { - LLM_TN(llm_arch arch) : arch(arch) {} + LLM_TN(llm_arch arch_) : arch(arch_) {} llm_arch arch; diff --git a/src/llama-context.h b/src/llama-context.h index a9268b292..70c3d0ad7 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -15,8 +15,8 @@ #include struct llama_context { - llama_context(const llama_model & model) - : model(model) + llama_context(const llama_model & model_) + : model(model_) , t_start_us(model.t_start_us) , t_load_us(model.t_load_us) {} diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 6ec709dd3..37cf7cdb7 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -17,7 +17,7 @@ struct llama_logger_state { static llama_logger_state g_logger_state; -time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {} +time_meas::time_meas(int64_t & t_acc_, bool disable) : t_start_us(disable ? 
-1 : ggml_time_us()), t_acc(t_acc_) {} time_meas::~time_meas() { if (t_start_us >= 0) { diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index 4814bbdc9..2164da710 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -31,7 +31,7 @@ struct llama_model_loader { ggml_tensor * tensor; - llama_tensor_weight(const llama_file * file, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { + llama_tensor_weight(const llama_file * file, uint16_t idx_, const struct gguf_context * gguf_ctx, ggml_tensor * tensor_) : idx(idx_), tensor(tensor_) { const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor)); if (tensor_idx < 0) { throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor))); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1229d8738..01a3afa40 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -369,7 +369,7 @@ struct llama_model::impl { std::vector dev_layer; }; -llama_model::llama_model(const struct llama_model_params & params) : params(params), pimpl(std::make_unique()) { +llama_model::llama_model(const struct llama_model_params & params_) : params(params_), pimpl(std::make_unique()) { } llama_model::~llama_model() {} diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 6c59e1730..75899d142 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -41,9 +41,9 @@ struct quantize_state_impl { // used to figure out if a model shares tok_embd with the output weight bool has_output = false; - quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params) - : model(model) - , params(params) + quantize_state_impl(const llama_model & model_, const llama_model_quantize_params * params_) + : model(model_) + , params(params_) {} }; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index df6bcdf6a..ef108b991 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -115,7 +115,7 @@ struct llm_tokenizer_spm : llm_tokenizer { }; struct llm_tokenizer_spm_session { - llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {} + llm_tokenizer_spm_session(const llama_vocab & vocab_) : vocab(vocab_) {} void tokenize(const std::string & text, std::vector & output) { // split string into utf8 chars @@ -415,7 +415,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { }; struct llm_tokenizer_bpe_session { - llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {} + llm_tokenizer_bpe_session(const llama_vocab & vocab_, const llm_tokenizer_bpe & tokenizer_) : vocab(vocab_), tokenizer(tokenizer_) {} static void append(const llama_token token_id, std::vector & output) { output.push_back(token_id); @@ -603,7 +603,7 @@ struct llm_tokenizer_wpm : llm_tokenizer { }; struct llm_tokenizer_wpm_session { - llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {} + llm_tokenizer_wpm_session(const llama_vocab & vocab_) : vocab(vocab_) {} void tokenize(const std::string & text, std::vector & output) { // normalize and split by whitespace @@ -782,7 +782,7 @@ struct llm_tokenizer_ugm : llm_tokenizer { }; struct llm_tokenizer_ugm_session { - llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {} + llm_tokenizer_ugm_session(const llama_vocab & vocab_, const llm_tokenizer_ugm & tokenizer_) : vocab(vocab_), tokenizer(tokenizer_) {} /* This implementation is based on 
SentencePiece optimized Viterbi algorithm for * unigram language models. The general idea is to: @@ -949,7 +949,7 @@ private: */ struct xcda_array_view { public: - xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) { + xcda_array_view(const uint32_t * xcda_array_, size_t xcda_array_size_) : xcda_array(xcda_array_), xcda_array_size(xcda_array_size_) { } uint32_t get_base(size_t index) { uint32_t packed_node = get_node(index); @@ -1135,7 +1135,7 @@ struct llm_tokenizer_rwkv : llm_tokenizer { }; struct llm_tokenizer_rwkv_session { - llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {} + llm_tokenizer_rwkv_session(const llama_vocab & vocab_, const llm_tokenizer_rwkv & tokenizer_) : vocab(vocab_), tokenizer(tokenizer_) {} void tokenize(const std::string & text, std::vector & output) { uint32_t position = 0; @@ -1262,7 +1262,7 @@ struct llama_vocab::impl { std::vector precompiled_charsmap; - impl(const llama_vocab & vocab) : vocab(vocab) { + impl(const llama_vocab & vocab_) : vocab(vocab_) { } ~impl() = default; diff --git a/src/llama.cpp b/src/llama.cpp index d907c2d6e..094ed0024 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1089,16 +1089,16 @@ struct llm_build_context { // TODO: consider making the entire interface noexcept llm_build_context( - llama_context & lctx, - const llama_ubatch & ubatch, - const llm_build_cb & cb, + llama_context & lctx_, + const llama_ubatch & ubatch_, + const llm_build_cb & cb_, bool worst_case) : - model (lctx.model), - lctx (lctx), + model (lctx_.model), + lctx (lctx_), hparams (model.hparams), - cparams (lctx.cparams), - ubatch (ubatch), - kv_self (lctx.kv_self), + cparams (lctx_.cparams), + ubatch (ubatch_), + kv_self (lctx_.kv_self), n_embd (hparams.n_embd), n_layer (hparams.n_layer), n_rot (hparams.n_rot), @@ -1119,17 +1119,17 @@ struct llm_build_context { beta_slow (cparams.yarn_beta_slow), norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), - n_tokens (ubatch.n_tokens), + n_tokens (ubatch_.n_tokens), n_kv (worst_case ? kv_self.size : kv_self.n), - n_outputs (worst_case ? n_tokens : lctx.n_outputs), - n_outputs_enc (worst_case ? n_tokens : lctx.embd_enc.size() / hparams.n_embd), + n_outputs (worst_case ? n_tokens : lctx_.n_outputs), + n_outputs_enc (worst_case ? n_tokens : lctx_.embd_enc.size() / hparams.n_embd), kv_head (worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head), n_ctx_orig (cparams.n_ctx_orig_yarn), flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), - cb (cb), - buf_compute_meta (lctx.buf_compute_meta) { + cb (cb_), + buf_compute_meta (lctx_.buf_compute_meta) { // all initializations should be done in init() } From f65e3d324d641b8f4f110687e998ebfee0a94586 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 15:34:48 +0200 Subject: [PATCH 10/15] ggml : ggml_backend_graph_copy -> ggml_backend_graph_copy_init --- ggml/include/ggml-backend.h | 2 +- ggml/src/ggml-backend.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 7221a0830..ce4fb4652 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -331,7 +331,7 @@ extern "C" { }; // Copy a graph to a different backend - GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); + GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy_init(ggml_backend_t backend, struct ggml_cgraph * graph); GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index dba7be33b..cbc57a2d3 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1724,7 +1724,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_ } } -struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { +struct ggml_backend_graph_copy ggml_backend_graph_copy_init(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size); struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0])); @@ -1812,7 +1812,7 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) { } bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { - struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph); + struct ggml_backend_graph_copy copy = ggml_backend_graph_copy_init(backend2, graph); if (copy.buffer == NULL) { return false; } From 10eb87409ec0797ec79dec87f1004b380e094cfd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 16:09:49 +0200 Subject: [PATCH 11/15] shadow : cont gcc ggml-ci --- common/arg.cpp | 936 +++++++++++------------ common/json-schema-to-grammar.cpp | 4 +- common/log.cpp | 4 +- examples/batched-bench/batched-bench.cpp | 8 +- examples/llava/clip.cpp | 8 +- examples/llava/clip.h | 6 +- examples/llava/llava.cpp | 6 +- examples/server/server.cpp | 16 +- examples/simple-chat/simple-chat.cpp | 3 +- src/llama-model.cpp | 13 +- src/llama-quant.cpp | 8 +- src/llama-vocab.cpp | 8 +- 12 files changed, 509 insertions(+), 511 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index b551596df..d1faccee1 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -383,8 +383,8 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e } exit(0); } - } catch (const std::invalid_argument 
& ex) { - fprintf(stderr, "%s\n", ex.what()); + } catch (const std::invalid_argument & e) { + fprintf(stderr, "%s\n", e.what()); ctx_arg.params = params_org; return false; } @@ -438,8 +438,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-h", "--help", "--usage"}, "print usage and exit", - [](common_params & params) { - params.usage = true; + [](common_params & cur) { + cur.usage = true; } )); add_opt(common_arg( @@ -454,50 +454,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--verbose-prompt"}, string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), - [](common_params & params) { - params.verbose_prompt = true; + [](common_params & cur) { + cur.verbose_prompt = true; } )); add_opt(common_arg( {"--no-display-prompt"}, string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [](common_params & params) { - params.display_prompt = false; + [](common_params & cur) { + cur.display_prompt = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-co", "--color"}, string_format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"), - [](common_params & params) { - params.use_color = true; + [](common_params & cur) { + cur.use_color = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-t", "--threads"}, "N", string_format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads), - [](common_params & params, int value) { - params.cpuparams.n_threads = value; - if (params.cpuparams.n_threads <= 0) { - params.cpuparams.n_threads = std::thread::hardware_concurrency(); + [](common_params & cur, int value) { + cur.cpuparams.n_threads = value; + if (cur.cpuparams.n_threads <= 0) { + cur.cpuparams.n_threads = std::thread::hardware_concurrency(); } } ).set_env("LLAMA_ARG_THREADS")); add_opt(common_arg( {"-tb", "--threads-batch"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads)", - [](common_params & params, int value) { - params.cpuparams_batch.n_threads = value; - if (params.cpuparams_batch.n_threads <= 0) { - params.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + [](common_params & cur, int value) { + cur.cpuparams_batch.n_threads = value; + if (cur.cpuparams_batch.n_threads <= 0) { + cur.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } } )); add_opt(common_arg( {"-C", "--cpu-mask"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")", - [](common_params & params, const std::string & mask) { - params.cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.cpuparams.cpumask)) { + [](common_params & cur, const std::string & mask) { + cur.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, cur.cpuparams.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -505,9 +505,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-Cr", "--cpu-range"}, "lo-hi", "range of CPUs for affinity. 
Complements --cpu-mask", - [](common_params & params, const std::string & range) { - params.cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.cpuparams.cpumask)) { + [](common_params & cur, const std::string & range) { + cur.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, cur.cpuparams.cpumask)) { throw std::invalid_argument("invalid range"); } } @@ -515,33 +515,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--cpu-strict"}, "<0|1>", string_format("use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu), - [](common_params & params, const std::string & value) { - params.cpuparams.strict_cpu = std::stoul(value); + [](common_params & cur, const std::string & value) { + cur.cpuparams.strict_cpu = std::stoul(value); } )); add_opt(common_arg( {"--prio"}, "N", string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), - [](common_params & params, int prio) { + [](common_params & cur, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } - params.cpuparams.priority = (enum ggml_sched_priority) prio; + cur.cpuparams.priority = (enum ggml_sched_priority) prio; } )); add_opt(common_arg( {"--poll"}, "<0...100>", string_format("use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll), - [](common_params & params, const std::string & value) { - params.cpuparams.poll = std::stoul(value); + [](common_params & cur, const std::string & value) { + cur.cpuparams.poll = std::stoul(value); } )); add_opt(common_arg( {"-Cb", "--cpu-mask-batch"}, "M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.cpuparams_batch.cpumask)) { + [](common_params & cur, const std::string & mask) { + cur.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, cur.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -549,9 +549,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-Crb", "--cpu-range-batch"}, "lo-hi", "ranges of CPUs for affinity. 
Complements --cpu-mask-batch", - [](common_params & params, const std::string & range) { - params.cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.cpuparams_batch.cpumask)) { + [](common_params & cur, const std::string & range) { + cur.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, cur.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid range"); } } @@ -559,95 +559,95 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--cpu-strict-batch"}, "<0|1>", "use strict CPU placement (default: same as --cpu-strict)", - [](common_params & params, int value) { - params.cpuparams_batch.strict_cpu = value; + [](common_params & cur, int value) { + cur.cpuparams_batch.strict_cpu = value; } )); add_opt(common_arg( {"--prio-batch"}, "N", string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams_batch.priority), - [](common_params & params, int prio) { + [](common_params & cur, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } - params.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + cur.cpuparams_batch.priority = (enum ggml_sched_priority) prio; } )); add_opt(common_arg( {"--poll-batch"}, "<0|1>", "use polling to wait for work (default: same as --poll)", - [](common_params & params, int value) { - params.cpuparams_batch.poll = value; + [](common_params & cur, int value) { + cur.cpuparams_batch.poll = value; } )); add_opt(common_arg( {"-lcs", "--lookup-cache-static"}, "FNAME", "path to static lookup cache to use for lookup decoding (not updated by generation)", - [](common_params & params, const std::string & value) { - params.lookup_cache_static = value; + [](common_params & cur, const std::string & value) { + cur.lookup_cache_static = value; } ).set_examples({LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-lcd", "--lookup-cache-dynamic"}, "FNAME", "path to dynamic lookup cache to use for lookup decoding (updated by generation)", - [](common_params & params, const std::string & value) { - params.lookup_cache_dynamic = value; + [](common_params & cur, const std::string & value) { + cur.lookup_cache_dynamic = value; } ).set_examples({LLAMA_EXAMPLE_LOOKUP})); add_opt(common_arg( {"-c", "--ctx-size"}, "N", string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx), - [](common_params & params, int value) { - params.n_ctx = value; + [](common_params & cur, int value) { + cur.n_ctx = value; } ).set_env("LLAMA_ARG_CTX_SIZE")); add_opt(common_arg( {"-n", "--predict", "--n-predict"}, "N", string_format("number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict), - [](common_params & params, int value) { - params.n_predict = value; + [](common_params & cur, int value) { + cur.n_predict = value; } ).set_env("LLAMA_ARG_N_PREDICT")); add_opt(common_arg( {"-b", "--batch-size"}, "N", string_format("logical maximum batch size (default: %d)", params.n_batch), - [](common_params & params, int value) { - params.n_batch = value; + [](common_params & cur, int value) { + cur.n_batch = value; } ).set_env("LLAMA_ARG_BATCH")); add_opt(common_arg( {"-ub", "--ubatch-size"}, "N", string_format("physical maximum batch size (default: %d)", params.n_ubatch), - [](common_params & params, int value) { - params.n_ubatch = value; + [](common_params & cur, int value) { + cur.n_ubatch = value; } ).set_env("LLAMA_ARG_UBATCH")); add_opt(common_arg( 
{"--keep"}, "N", string_format("number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep), - [](common_params & params, int value) { - params.n_keep = value; + [](common_params & cur, int value) { + cur.n_keep = value; } )); add_opt(common_arg( {"--no-context-shift"}, string_format("disables context shift on inifinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), - [](common_params & params) { - params.ctx_shift = false; + [](common_params & cur) { + cur.ctx_shift = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), - [](common_params & params, int value) { - params.n_chunks = value; + [](common_params & cur, int value) { + cur.n_chunks = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"-fa", "--flash-attn"}, string_format("enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled"), - [](common_params & params) { - params.flash_attn = true; + [](common_params & cur) { + cur.flash_attn = true; } ).set_env("LLAMA_ARG_FLASH_ATTN")); add_opt(common_arg( @@ -655,115 +655,115 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ex == LLAMA_EXAMPLE_MAIN ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt" : "prompt to start generation with", - [](common_params & params, const std::string & value) { - params.prompt = value; + [](common_params & cur, const std::string & value) { + cur.prompt = value; } ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--no-perf"}, string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? 
"true" : "false"), - [](common_params & params) { - params.no_perf = true; - params.sampling.no_perf = true; + [](common_params & cur) { + cur.no_perf = true; + cur.sampling.no_perf = true; } ).set_env("LLAMA_ARG_NO_PERF")); add_opt(common_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } - // store the external file name in params - params.prompt_file = value; - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); + // store the external file name in cur + cur.prompt_file = value; + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(cur.prompt)); + if (!cur.prompt.empty() && cur.prompt.back() == '\n') { + cur.prompt.pop_back(); } } ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--in-file"}, "FNAME", "an input file (repeat to specify multiple files)", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } - params.in_files.push_back(value); + cur.in_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"-bf", "--binary-file"}, "FNAME", "binary file containing the prompt (default: none)", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } - // store the external file name in params - params.prompt_file = value; + // store the external file name in cur + cur.prompt_file = value; std::ostringstream ss; ss << file.rdbuf(); - params.prompt = ss.str(); - fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), value.c_str()); + cur.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", cur.prompt.size(), value.c_str()); } ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-e", "--escape"}, string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false"), - [](common_params & params) { - params.escape = true; + [](common_params & cur) { + cur.escape = true; } )); add_opt(common_arg( {"--no-escape"}, "do not process escape sequences", - [](common_params & params) { - params.escape = false; + [](common_params & cur) { + cur.escape = false; } )); add_opt(common_arg( {"-ptc", "--print-token-count"}, "N", string_format("print token count every N tokens (default: %d)", params.n_print), - [](common_params & params, int value) { - params.n_print = value; + [](common_params & cur, int value) { + cur.n_print = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache"}, "FNAME", "file to cache prompt state for faster startup (default: none)", - [](common_params & params, const std::string & value) { - params.path_prompt_cache = value; + [](common_params & cur, const std::string & value) { + cur.path_prompt_cache = value; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache-all"}, "if specified, saves user input and generations to cache as well\n", - [](common_params & params) { - params.prompt_cache_all = true; + [](common_params & cur) { + cur.prompt_cache_all = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--prompt-cache-ro"}, "if specified, uses the prompt cache but does not update it", - [](common_params & params) { - params.prompt_cache_ro = true; + [](common_params & cur) { + cur.prompt_cache_ro = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-r", "--reverse-prompt"}, "PROMPT", "halt generation at PROMPT, return control in interactive mode\n", - [](common_params & params, const std::string & value) { - params.antiprompt.emplace_back(value); + [](common_params & cur, const std::string & value) { + cur.antiprompt.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-sp", "--special"}, string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"), - [](common_params & params) { - params.special = true; + [](common_params & cur) { + cur.special = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( @@ -775,60 +775,60 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "(default: %s)", params.conversation ? "true" : "false" ), - [](common_params & params) { - params.conversation = true; + [](common_params & cur) { + cur.conversation = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-i", "--interactive"}, string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"), - [](common_params & params) { - params.interactive = true; + [](common_params & cur) { + cur.interactive = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-if", "--interactive-first"}, string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false"), - [](common_params & params) { - params.interactive_first = true; + [](common_params & cur) { + cur.interactive_first = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-mli", "--multiline-input"}, "allows you to write or paste multiple lines without ending each in '\\'", - [](common_params & params) { - params.multiline_input = true; + [](common_params & cur) { + cur.multiline_input = true; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--in-prefix-bos"}, "prefix BOS to user inputs, preceding the `--in-prefix` string", - [](common_params & params) { - params.input_prefix_bos = true; - params.enable_chat_template = false; + [](common_params & cur) { + cur.input_prefix_bos = true; + cur.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"--in-prefix"}, "STRING", "string to prefix user inputs with (default: empty)", - [](common_params & params, const std::string & value) { - params.input_prefix = value; - params.enable_chat_template = false; + [](common_params & cur, const std::string & value) { + cur.input_prefix = value; + cur.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--in-suffix"}, "STRING", "string to suffix after user inputs with (default: empty)", - [](common_params & params, const std::string & value) { - params.input_suffix = value; - params.enable_chat_template = false; + [](common_params & cur, const std::string & value) { + cur.input_suffix = value; + cur.enable_chat_template = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--no-warmup"}, "skip warming up the model with an empty run", - [](common_params & params) { - params.warmup = false; + [](common_params & cur) { + cur.warmup = false; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( @@ -837,154 +837,154 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? 
"enabled" : "disabled" ), - [](common_params & params) { - params.spm_infill = true; + [](common_params & cur) { + cur.spm_infill = true; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--samplers"}, "SAMPLERS", string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()), - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { const auto sampler_names = string_split(value, ';'); - params.sampling.samplers = common_sampler_types_from_names(sampler_names, true); + cur.sampling.samplers = common_sampler_types_from_names(sampler_names, true); } ).set_sparam()); add_opt(common_arg( {"-s", "--seed"}, "SEED", string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED), - [](common_params & params, const std::string & value) { - params.sampling.seed = std::stoul(value); + [](common_params & cur, const std::string & value) { + cur.sampling.seed = std::stoul(value); } ).set_sparam()); add_opt(common_arg( {"--sampling-seq", "--sampler-seq"}, "SEQUENCE", string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), - [](common_params & params, const std::string & value) { - params.sampling.samplers = common_sampler_types_from_chars(value); + [](common_params & cur, const std::string & value) { + cur.sampling.samplers = common_sampler_types_from_chars(value); } ).set_sparam()); add_opt(common_arg( {"--ignore-eos"}, "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)", - [](common_params & params) { - params.sampling.ignore_eos = true; + [](common_params & cur) { + cur.sampling.ignore_eos = true; } ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", string_format("temperature (default: %.1f)", (double)params.sampling.temp), - [](common_params & params, const std::string & value) { - params.sampling.temp = std::stof(value); - params.sampling.temp = std::max(params.sampling.temp, 0.0f); + [](common_params & cur, const std::string & value) { + cur.sampling.temp = std::stof(value); + cur.sampling.temp = std::max(cur.sampling.temp, 0.0f); } ).set_sparam()); add_opt(common_arg( {"--top-k"}, "N", string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k), - [](common_params & params, int value) { - params.sampling.top_k = value; + [](common_params & cur, int value) { + cur.sampling.top_k = value; } ).set_sparam()); add_opt(common_arg( {"--top-p"}, "N", string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p), - [](common_params & params, const std::string & value) { - params.sampling.top_p = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.top_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--min-p"}, "N", string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p), - [](common_params & params, const std::string & value) { - params.sampling.min_p = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.min_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--xtc-probability"}, "N", string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability), - [](common_params & params, const std::string & value) { - params.sampling.xtc_probability = std::stof(value); + [](common_params & cur, 
const std::string & value) { + cur.sampling.xtc_probability = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--xtc-threshold"}, "N", string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold), - [](common_params & params, const std::string & value) { - params.sampling.xtc_threshold = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.xtc_threshold = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--typical"}, "N", string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p), - [](common_params & params, const std::string & value) { - params.sampling.typ_p = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.typ_p = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--repeat-last-n"}, "N", string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n), - [](common_params & params, int value) { + [](common_params & cur, int value) { if (value < -1) { throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value)); } - params.sampling.penalty_last_n = value; - params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); + cur.sampling.penalty_last_n = value; + cur.sampling.n_prev = std::max(cur.sampling.n_prev, cur.sampling.penalty_last_n); } ).set_sparam()); add_opt(common_arg( {"--repeat-penalty"}, "N", string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat), - [](common_params & params, const std::string & value) { - params.sampling.penalty_repeat = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.penalty_repeat = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--presence-penalty"}, "N", string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present), - [](common_params & params, const std::string & value) { - params.sampling.penalty_present = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.penalty_present = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--frequency-penalty"}, "N", string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq), - [](common_params & params, const std::string & value) { - params.sampling.penalty_freq = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.penalty_freq = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dry-multiplier"}, "N", string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier), - [](common_params & params, const std::string & value) { - params.sampling.dry_multiplier = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.dry_multiplier = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dry-base"}, "N", string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base), - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { float potential_base = std::stof(value); if (potential_base >= 1.0f) { - params.sampling.dry_base = potential_base; + cur.sampling.dry_base = potential_base; } } ).set_sparam()); 
add_opt(common_arg( {"--dry-allowed-length"}, "N", string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length), - [](common_params & params, int value) { - params.sampling.dry_allowed_length = value; + [](common_params & cur, int value) { + cur.sampling.dry_allowed_length = value; } ).set_sparam()); add_opt(common_arg( {"--dry-penalty-last-n"}, "N", string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n), - [](common_params & params, int value) { + [](common_params & cur, int value) { if (value < -1) { throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value)); } - params.sampling.dry_penalty_last_n = value; + cur.sampling.dry_penalty_last_n = value; } ).set_sparam()); add_opt(common_arg( @@ -998,55 +998,55 @@ common_params_context common_params_parser_init(common_params & params, llama_ex std::string formatted_b = (b == "\n") ? "\\n" : b; return a + ", '" + formatted_b + "'"; }).c_str()), - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { static bool defaults_cleared = false; if (!defaults_cleared) { - params.sampling.dry_sequence_breakers.clear(); + cur.sampling.dry_sequence_breakers.clear(); defaults_cleared = true; } if (value == "none") { - params.sampling.dry_sequence_breakers.clear(); + cur.sampling.dry_sequence_breakers.clear(); } else { - params.sampling.dry_sequence_breakers.emplace_back(value); + cur.sampling.dry_sequence_breakers.emplace_back(value); } } ).set_sparam()); add_opt(common_arg( {"--dynatemp-range"}, "N", string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range), - [](common_params & params, const std::string & value) { - params.sampling.dynatemp_range = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.dynatemp_range = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--dynatemp-exp"}, "N", string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent), - [](common_params & params, const std::string & value) { - params.sampling.dynatemp_exponent = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.dynatemp_exponent = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--mirostat"}, "N", string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n" "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat), - [](common_params & params, int value) { - params.sampling.mirostat = value; + [](common_params & cur, int value) { + cur.sampling.mirostat = value; } ).set_sparam()); add_opt(common_arg( {"--mirostat-lr"}, "N", string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta), - [](common_params & params, const std::string & value) { - params.sampling.mirostat_eta = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.mirostat_eta = std::stof(value); } ).set_sparam()); add_opt(common_arg( {"--mirostat-ent"}, "N", string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau), - [](common_params & params, const std::string & value) { - params.sampling.mirostat_tau = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.sampling.mirostat_tau 
= std::stof(value); } ).set_sparam()); add_opt(common_arg( @@ -1054,7 +1054,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "modifies the likelihood of token appearing in the completion,\n" "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::stringstream ss(value); llama_token key; char sign; @@ -1062,7 +1062,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex try { if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - params.sampling.logit_bias.push_back({key, bias}); + cur.sampling.logit_bias.push_back({key, bias}); } else { throw std::invalid_argument("invalid input format"); } @@ -1074,14 +1074,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--grammar"}, "GRAMMAR", string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()), - [](common_params & params, const std::string & value) { - params.sampling.grammar = value; + [](common_params & cur, const std::string & value) { + cur.sampling.grammar = value; } ).set_sparam()); add_opt(common_arg( {"--grammar-file"}, "FNAME", "file to read grammar from", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream file(value); if (!file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); @@ -1089,130 +1089,130 @@ common_params_context common_params_parser_init(common_params & params, llama_ex std::copy( std::istreambuf_iterator(file), std::istreambuf_iterator(), - std::back_inserter(params.sampling.grammar) + std::back_inserter(cur.sampling.grammar) ); } ).set_sparam()); add_opt(common_arg( {"-j", "--json-schema"}, "SCHEMA", "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead", - [](common_params & params, const std::string & value) { - params.sampling.grammar = json_schema_to_grammar(json::parse(value)); + [](common_params & cur, const std::string & value) { + cur.sampling.grammar = json_schema_to_grammar(json::parse(value)); } ).set_sparam()); add_opt(common_arg( {"--pooling"}, "{none,mean,cls,last,rank}", "pooling type for embeddings, use model default if unspecified", - [](common_params & params, const std::string & value) { - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; } - else if (value == "rank") { params.pooling_type = LLAMA_POOLING_TYPE_RANK; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "none") { cur.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { cur.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { cur.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else if (value == "last") { cur.pooling_type = LLAMA_POOLING_TYPE_LAST; } + else if (value == "rank") { cur.pooling_type = LLAMA_POOLING_TYPE_RANK; } else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_POOLING")); add_opt(common_arg( {"--attention"}, "{causal,non-causal}", "attention type for embeddings, use model default if unspecified", - [](common_params & params, const std::string & value) { - /**/ if (value == "causal") { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } - else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "causal") { cur.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; } + else if (value == "non-causal") { cur.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; } else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--rope-scaling"}, "{none,linear,yarn}", "RoPE frequency scaling method, defaults to linear unless specified by the model", - [](common_params & params, const std::string & value) { - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "none") { cur.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { cur.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { cur.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { throw std::invalid_argument("invalid value"); } } ).set_env("LLAMA_ARG_ROPE_SCALING_TYPE")); add_opt(common_arg( {"--rope-scale"}, "N", "RoPE context scaling factor, expands context by a factor of N", - [](common_params & params, const std::string & value) { - params.rope_freq_scale = 1.0f / std::stof(value); + [](common_params & cur, const std::string & value) { + cur.rope_freq_scale = 1.0f / std::stof(value); } ).set_env("LLAMA_ARG_ROPE_SCALE")); add_opt(common_arg( 
{"--rope-freq-base"}, "N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)", - [](common_params & params, const std::string & value) { - params.rope_freq_base = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.rope_freq_base = std::stof(value); } ).set_env("LLAMA_ARG_ROPE_FREQ_BASE")); add_opt(common_arg( {"--rope-freq-scale"}, "N", "RoPE frequency scaling factor, expands context by a factor of 1/N", - [](common_params & params, const std::string & value) { - params.rope_freq_scale = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.rope_freq_scale = std::stof(value); } ).set_env("LLAMA_ARG_ROPE_FREQ_SCALE")); add_opt(common_arg( {"--yarn-orig-ctx"}, "N", string_format("YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx), - [](common_params & params, int value) { - params.yarn_orig_ctx = value; + [](common_params & cur, int value) { + cur.yarn_orig_ctx = value; } ).set_env("LLAMA_ARG_YARN_ORIG_CTX")); add_opt(common_arg( {"--yarn-ext-factor"}, "N", string_format("YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor), - [](common_params & params, const std::string & value) { - params.yarn_ext_factor = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.yarn_ext_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_EXT_FACTOR")); add_opt(common_arg( {"--yarn-attn-factor"}, "N", string_format("YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor), - [](common_params & params, const std::string & value) { - params.yarn_attn_factor = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.yarn_attn_factor = std::stof(value); } ).set_env("LLAMA_ARG_YARN_ATTN_FACTOR")); add_opt(common_arg( {"--yarn-beta-slow"}, "N", string_format("YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow), - [](common_params & params, const std::string & value) { - params.yarn_beta_slow = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.yarn_beta_slow = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_SLOW")); add_opt(common_arg( {"--yarn-beta-fast"}, "N", string_format("YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast), - [](common_params & params, const std::string & value) { - params.yarn_beta_fast = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.yarn_beta_fast = std::stof(value); } ).set_env("LLAMA_ARG_YARN_BETA_FAST")); add_opt(common_arg( {"-gan", "--grp-attn-n"}, "N", string_format("group-attention factor (default: %d)", params.grp_attn_n), - [](common_params & params, int value) { - params.grp_attn_n = value; + [](common_params & cur, int value) { + cur.grp_attn_n = value; } ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( {"-gaw", "--grp-attn-w"}, "N", string_format("group-attention width (default: %d)", params.grp_attn_w), - [](common_params & params, int value) { - params.grp_attn_w = value; + [](common_params & cur, int value) { + cur.grp_attn_w = value; } ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN})); add_opt(common_arg( {"-dkvc", "--dump-kv-cache"}, "verbose print of the KV cache", - [](common_params & params) { - params.dump_kv_cache = true; + [](common_params & cur) { + cur.dump_kv_cache = true; } )); 
add_opt(common_arg( {"-nkvo", "--no-kv-offload"}, "disable KV offload", - [](common_params & params) { - params.no_kv_offload = true; + [](common_params & cur) { + cur.no_kv_offload = true; } ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); add_opt(common_arg( @@ -1224,8 +1224,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex get_all_kv_cache_types().c_str(), ggml_type_name(params.cache_type_k) ), - [](common_params & params, const std::string & value) { - params.cache_type_k = kv_cache_type_from_str(value); + [](common_params & cur, const std::string & value) { + cur.cache_type_k = kv_cache_type_from_str(value); } ).set_env("LLAMA_ARG_CACHE_TYPE_K")); add_opt(common_arg( @@ -1237,157 +1237,157 @@ common_params_context common_params_parser_init(common_params & params, llama_ex get_all_kv_cache_types().c_str(), ggml_type_name(params.cache_type_v) ), - [](common_params & params, const std::string & value) { - params.cache_type_v = kv_cache_type_from_str(value); + [](common_params & cur, const std::string & value) { + cur.cache_type_v = kv_cache_type_from_str(value); } ).set_env("LLAMA_ARG_CACHE_TYPE_V")); add_opt(common_arg( {"--perplexity", "--all-logits"}, string_format("return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false"), - [](common_params & params) { - params.logits_all = true; + [](common_params & cur) { + cur.logits_all = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--hellaswag"}, "compute HellaSwag score over random tasks from datafile supplied with -f", - [](common_params & params) { - params.hellaswag = true; + [](common_params & cur) { + cur.hellaswag = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--hellaswag-tasks"}, "N", string_format("number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks), - [](common_params & params, int value) { - params.hellaswag_tasks = value; + [](common_params & cur, int value) { + cur.hellaswag_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--winogrande"}, "compute Winogrande score over random tasks from datafile supplied with -f", - [](common_params & params) { - params.winogrande = true; + [](common_params & cur) { + cur.winogrande = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--winogrande-tasks"}, "N", string_format("number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks), - [](common_params & params, int value) { - params.winogrande_tasks = value; + [](common_params & cur, int value) { + cur.winogrande_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--multiple-choice"}, "compute multiple choice score over random tasks from datafile supplied with -f", - [](common_params & params) { - params.multiple_choice = true; + [](common_params & cur) { + cur.multiple_choice = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--multiple-choice-tasks"}, "N", string_format("number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks), - [](common_params & params, int value) { - params.multiple_choice_tasks = value; + [](common_params & cur, int value) { + cur.multiple_choice_tasks = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--kl-divergence"}, "computes KL-divergence to logits provided via --kl-divergence-base", - [](common_params & params) { 
- params.kl_divergence = true; + [](common_params & cur) { + cur.kl_divergence = true; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--save-all-logits", "--kl-divergence-base"}, "FNAME", "set logits file", - [](common_params & params, const std::string & value) { - params.logits_file = value; + [](common_params & cur, const std::string & value) { + cur.logits_file = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--ppl-stride"}, "N", string_format("stride for perplexity calculation (default: %d)", params.ppl_stride), - [](common_params & params, int value) { - params.ppl_stride = value; + [](common_params & cur, int value) { + cur.ppl_stride = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"--ppl-output-type"}, "<0|1>", string_format("output type for perplexity calculation (default: %d)", params.ppl_output_type), - [](common_params & params, int value) { - params.ppl_output_type = value; + [](common_params & cur, int value) { + cur.ppl_output_type = value; } ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"-dt", "--defrag-thold"}, "N", string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), - [](common_params & params, const std::string & value) { - params.defrag_thold = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.defrag_thold = std::stof(value); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); add_opt(common_arg( {"-np", "--parallel"}, "N", string_format("number of parallel sequences to decode (default: %d)", params.n_parallel), - [](common_params & params, int value) { - params.n_parallel = value; + [](common_params & cur, int value) { + cur.n_parallel = value; } ).set_env("LLAMA_ARG_N_PARALLEL")); add_opt(common_arg( {"-ns", "--sequences"}, "N", string_format("number of sequences to decode (default: %d)", params.n_sequences), - [](common_params & params, int value) { - params.n_sequences = value; + [](common_params & cur, int value) { + cur.n_sequences = value; } ).set_examples({LLAMA_EXAMPLE_PARALLEL})); add_opt(common_arg( {"-cb", "--cont-batching"}, string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), - [](common_params & params) { - params.cont_batching = true; + [](common_params & cur) { + cur.cont_batching = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); add_opt(common_arg( {"-nocb", "--no-cont-batching"}, "disable continuous batching", - [](common_params & params) { - params.cont_batching = false; + [](common_params & cur) { + cur.cont_batching = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); add_opt(common_arg( {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md", - [](common_params & params, const std::string & value) { - params.mmproj = value; + [](common_params & cur, const std::string & value) { + cur.mmproj = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. 
Specify multiple times for batching", - [](common_params & params, const std::string & value) { - params.image.emplace_back(value); + [](common_params & cur, const std::string & value) { + cur.image.emplace_back(value); } ).set_examples({LLAMA_EXAMPLE_LLAVA})); if (llama_supports_rpc()) { add_opt(common_arg( {"--rpc"}, "SERVERS", "comma separated list of RPC servers", - [](common_params & params, const std::string & value) { - params.rpc_servers = value; + [](common_params & cur, const std::string & value) { + cur.rpc_servers = value; } ).set_env("LLAMA_ARG_RPC")); } add_opt(common_arg( {"--mlock"}, "force system to keep model in RAM rather than swapping or compressing", - [](common_params & params) { - params.use_mlock = true; + [](common_params & cur) { + cur.use_mlock = true; } ).set_env("LLAMA_ARG_MLOCK")); add_opt(common_arg( {"--no-mmap"}, "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](common_params & params) { - params.use_mmap = false; + [](common_params & cur) { + cur.use_mmap = false; } ).set_env("LLAMA_ARG_NO_MMAP")); add_opt(common_arg( @@ -1398,10 +1398,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "- numactl: use the CPU map provided by numactl\n" "if run without this previously, it is recommended to drop the system page cache before using this\n" "see https://github.com/ggerganov/llama.cpp/issues/1437", - [](common_params & params, const std::string & value) { - /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "distribute" || value == "") { cur.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { cur.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { cur.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { throw std::invalid_argument("invalid value"); } } ).set_env("LLAMA_ARG_NUMA")); @@ -1409,8 +1409,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-dev", "--device"}, "", "comma-separated list of devices to use for offloading (none = don't offload)\n" "use --list-devices to see a list of available devices", - [](common_params & params, const std::string & value) { - params.devices = parse_device_list(value); + [](common_params & cur, const std::string & value) { + cur.devices = parse_device_list(value); } ).set_env("LLAMA_ARG_DEVICE")); add_opt(common_arg( @@ -1432,8 +1432,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", "number of layers to store in VRAM", - [](common_params & params, int value) { - params.n_gpu_layers = value; + [](common_params & cur, int value) { + cur.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n"); fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n"); @@ -1447,14 +1447,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "- none: use one GPU only\n" "- layer (default): split layers and KV across GPUs\n" "- row: split rows across GPUs", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { 
std::string arg_next = value; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; + cur.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; + cur.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_MODE_ROW; + cur.split_mode = LLAMA_SPLIT_MODE_ROW; } else { throw std::invalid_argument("invalid value"); } @@ -1466,7 +1466,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-ts", "--tensor-split"}, "N0,N1,N2,...", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::string arg_next = value; // split string by , and / @@ -1480,9 +1480,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } for (size_t i = 0; i < llama_max_devices(); ++i) { if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); + cur.tensor_split[i] = std::stof(split_arg[i]); } else { - params.tensor_split[i] = 0.0f; + cur.tensor_split[i] = 0.0f; } } if (!llama_supports_gpu_offload()) { @@ -1493,8 +1493,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-mg", "--main-gpu"}, "INDEX", string_format("the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu), - [](common_params & params, int value) { - params.main_gpu = value; + [](common_params & cur, int value) { + cur.main_gpu = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: llama.cpp was compiled without support for GPU offload. Setting the main GPU has no effect.\n"); } @@ -1503,16 +1503,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--check-tensors"}, string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"), - [](common_params & params) { - params.check_tensors = true; + [](common_params & cur) { + cur.check_tensors = true; } )); add_opt(common_arg( {"--override-kv"}, "KEY=TYPE:VALUE", "advanced option to override model metadata by key. may be specified multiple times.\n" "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false", - [](common_params & params, const std::string & value) { - if (!string_parse_kv_override(value.c_str(), params.kv_overrides)) { + [](common_params & cur, const std::string & value) { + if (!string_parse_kv_override(value.c_str(), cur.kv_overrides)) { throw std::runtime_error(string_format("error: Invalid type for KV override: %s\n", value.c_str())); } } @@ -1520,47 +1520,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--lora"}, "FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)", - [](common_params & params, const std::string & value) { - params.lora_adapters.push_back({ std::string(value), 1.0, nullptr }); + [](common_params & cur, const std::string & value) { + cur.lora_adapters.push_back({ std::string(value), 1.0, nullptr }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"--lora-scaled"}, "FNAME", "SCALE", "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)", - [](common_params & params, const std::string & fname, const std::string & scale) { - params.lora_adapters.push_back({ fname, std::stof(scale), nullptr }); + [](common_params & cur, const std::string & fname, const std::string & scale) { + cur.lora_adapters.push_back({ fname, std::stof(scale), nullptr }); } // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"--control-vector"}, "FNAME", "add a control vector\nnote: this argument can be repeated to add multiple control vectors", - [](common_params & params, const std::string & value) { - params.control_vectors.push_back({ 1.0f, value, }); + [](common_params & cur, const std::string & value) { + cur.control_vectors.push_back({ 1.0f, value, }); } )); add_opt(common_arg( {"--control-vector-scaled"}, "FNAME", "SCALE", "add a control vector with user defined scaling SCALE\n" "note: this argument can be repeated to add multiple scaled control vectors", - [](common_params & params, const std::string & fname, const std::string & scale) { - params.control_vectors.push_back({ std::stof(scale), fname }); + [](common_params & cur, const std::string & fname, const std::string & scale) { + cur.control_vectors.push_back({ std::stof(scale), fname }); } )); add_opt(common_arg( {"--control-vector-layer-range"}, "START", "END", "layer range to apply the control vector(s) to, start and end inclusive", - [](common_params & params, const std::string & start, const std::string & end) { - params.control_vector_layer_start = std::stoi(start); - params.control_vector_layer_end = std::stoi(end); + [](common_params & cur, const std::string & start, const std::string & end) { + cur.control_vector_layer_start = std::stoi(start); + cur.control_vector_layer_end = std::stoi(end); } )); add_opt(common_arg( {"-a", "--alias"}, "STRING", "set alias for model name (to be used by REST API)", - [](common_params & params, const std::string & value) { - params.model_alias = value; + [](common_params & cur, const std::string & value) { + cur.model_alias = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ALIAS")); add_opt(common_arg( @@ -1571,89 
+1571,89 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "model path (default: `models/$filename` with filename from `--hf-file` " "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH ), - [](common_params & params, const std::string & value) { - params.model = value; + [](common_params & cur, const std::string & value) { + cur.model = value; } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); add_opt(common_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", - [](common_params & params, const std::string & value) { - params.model_url = value; + [](common_params & cur, const std::string & value) { + cur.model_url = value; } ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(common_arg( {"-hfr", "--hf-repo"}, "REPO", "Hugging Face model repository (default: unused)", - [](common_params & params, const std::string & value) { - params.hf_repo = value; + [](common_params & cur, const std::string & value) { + cur.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file (default: unused)", - [](common_params & params, const std::string & value) { - params.hf_file = value; + [](common_params & cur, const std::string & value) { + cur.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); add_opt(common_arg( {"-hfrv", "--hf-repo-v"}, "REPO", "Hugging Face model repository for the vocoder model (default: unused)", - [](common_params & params, const std::string & value) { - params.vocoder.hf_repo = value; + [](common_params & cur, const std::string & value) { + cur.vocoder.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO_V")); add_opt(common_arg( {"-hffv", "--hf-file-v"}, "FILE", "Hugging Face model file for the vocoder model (default: unused)", - [](common_params & params, const std::string & value) { - params.vocoder.hf_file = value; + [](common_params & cur, const std::string & value) { + cur.vocoder.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE_V")); add_opt(common_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", - [](common_params & params, const std::string & value) { - params.hf_token = value; + [](common_params & cur, const std::string & value) { + cur.hf_token = value; } ).set_env("HF_TOKEN")); add_opt(common_arg( {"--context-file"}, "FNAME", "file to load context from (repeat to specify multiple files)", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream file(value, std::ios::binary); if (!file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); } - params.context_files.push_back(value); + cur.context_files.push_back(value); } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--chunk-size"}, "N", string_format("minimum length of embedded text chunks (default: %d)", params.chunk_size), - [](common_params & params, int value) { - params.chunk_size = value; + [](common_params & cur, int value) { + cur.chunk_size = value; } ).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--chunk-separator"}, "STRING", string_format("separator between chunks (default: '%s')", params.chunk_separator.c_str()), - [](common_params & params, const std::string & value) { - params.chunk_separator = value; + [](common_params & cur, const std::string & value) { + cur.chunk_separator = value; } 
).set_examples({LLAMA_EXAMPLE_RETRIEVAL})); add_opt(common_arg( {"--junk"}, "N", string_format("number of times to repeat the junk text (default: %d)", params.n_junk), - [](common_params & params, int value) { - params.n_junk = value; + [](common_params & cur, int value) { + cur.n_junk = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( {"--pos"}, "N", string_format("position of the passkey in the junk text (default: %d)", params.i_pos), - [](common_params & params, int value) { - params.i_pos = value; + [](common_params & cur, int value) { + cur.i_pos = value; } ).set_examples({LLAMA_EXAMPLE_PASSKEY})); add_opt(common_arg( @@ -1664,152 +1664,152 @@ common_params_context common_params_parser_init(common_params & params, llama_ex : ex == LLAMA_EXAMPLE_CVECTOR_GENERATOR ? params.cvector_outfile.c_str() : params.out_file.c_str()), - [](common_params & params, const std::string & value) { - params.out_file = value; - params.cvector_outfile = value; - params.lora_outfile = value; + [](common_params & cur, const std::string & value) { + cur.out_file = value; + cur.cvector_outfile = value; + cur.lora_outfile = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_CVECTOR_GENERATOR, LLAMA_EXAMPLE_EXPORT_LORA})); add_opt(common_arg( {"-ofreq", "--output-frequency"}, "N", string_format("output the imatrix every N iterations (default: %d)", params.n_out_freq), - [](common_params & params, int value) { - params.n_out_freq = value; + [](common_params & cur, int value) { + cur.n_out_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--save-frequency"}, "N", string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq), - [](common_params & params, int value) { - params.n_save_freq = value; + [](common_params & cur, int value) { + cur.n_save_freq = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--process-output"}, string_format("collect data for the output tensor (default: %s)", params.process_output ? "true" : "false"), - [](common_params & params) { - params.process_output = true; + [](common_params & cur) { + cur.process_output = true; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--no-ppl"}, string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](common_params & params) { - params.compute_ppl = false; + [](common_params & cur) { + cur.compute_ppl = false; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"--chunk", "--from-chunk"}, "N", string_format("start processing the input from chunk N (default: %d)", params.i_chunk), - [](common_params & params, int value) { - params.i_chunk = value; + [](common_params & cur, int value) { + cur.i_chunk = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( {"-pps"}, string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false"), - [](common_params & params) { - params.is_pp_shared = true; + [](common_params & cur) { + cur.is_pp_shared = true; } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( {"-npp"}, "n0,n1,...", "number of prompt tokens", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { auto p = string_split(value, ','); - params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); + cur.n_pp.insert(cur.n_pp.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( {"-ntg"}, "n0,n1,...", "number of text generation tokens", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { auto p = string_split(value, ','); - params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); + cur.n_tg.insert(cur.n_tg.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( {"-npl"}, "n0,n1,...", "number of parallel prompts", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { auto p = string_split(value, ','); - params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); + cur.n_pl.insert(cur.n_pl.end(), p.begin(), p.end()); } ).set_examples({LLAMA_EXAMPLE_BENCH})); add_opt(common_arg( {"--embd-normalize"}, "N", string_format("normalisation for embeddings (default: %d) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)", params.embd_normalize), - [](common_params & params, int value) { - params.embd_normalize = value; + [](common_params & cur, int value) { + cur.embd_normalize = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--embd-output-format"}, "FORMAT", "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix", - [](common_params & params, const std::string & value) { - params.embd_out = value; + [](common_params & cur, const std::string & value) { + cur.embd_out = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--embd-separator"}, "STRING", "separator of embeddings (default \\n) for example \"<#sep#>\"", - [](common_params & params, const std::string & value) { - params.embd_sep = value; + [](common_params & cur, const std::string & value) { + cur.embd_sep = value; } ).set_examples({LLAMA_EXAMPLE_EMBEDDING})); add_opt(common_arg( {"--host"}, "HOST", string_format("ip address to listen (default: %s)", params.hostname.c_str()), - [](common_params & params, const std::string & value) { - params.hostname = value; + [](common_params & cur, const std::string & value) { + cur.hostname = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_HOST")); add_opt(common_arg( {"--port"}, "PORT", string_format("port to listen (default: %d)", params.port), - [](common_params & params, int value) { - params.port = value; + [](common_params & cur, int value) { + cur.port = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT")); add_opt(common_arg( {"--path"}, "PATH", string_format("path to serve static files from (default: %s)", params.public_path.c_str()), - [](common_params & params, const std::string & value) { - params.public_path = value; + [](common_params & cur, const std::string & value) { + cur.public_path = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STATIC_PATH")); add_opt(common_arg( {"--no-webui"}, string_format("Disable the Web UI (default: %s)", params.webui ? 
"enabled" : "disabled"), - [](common_params & params) { - params.webui = false; + [](common_params & cur) { + cur.webui = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI")); add_opt(common_arg( {"--embedding", "--embeddings"}, string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), - [](common_params & params) { - params.embedding = true; + [](common_params & cur) { + cur.embedding = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_EMBEDDINGS")); add_opt(common_arg( {"--reranking", "--rerank"}, string_format("enable reranking endpoint on server (default: %s)", params.reranking ? "enabled" : "disabled"), - [](common_params & params) { - params.reranking = true; + [](common_params & cur) { + cur.reranking = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_RERANKING")); add_opt(common_arg( {"--api-key"}, "KEY", "API key to use for authentication (default: none)", - [](common_params & params, const std::string & value) { - params.api_keys.push_back(value); + [](common_params & cur, const std::string & value) { + cur.api_keys.push_back(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); add_opt(common_arg( {"--api-key-file"}, "FNAME", "path to file containing API keys (default: none)", - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { std::ifstream key_file(value); if (!key_file) { throw std::runtime_error(string_format("error: failed to open file '%s'\n", value.c_str())); @@ -1817,7 +1817,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex std::string key; while (std::getline(key_file, key)) { if (!key.empty()) { - params.api_keys.push_back(key); + cur.api_keys.push_back(key); } } key_file.close(); @@ -1826,75 +1826,75 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--ssl-key-file"}, "FNAME", "path to file a PEM-encoded SSL private key", - [](common_params & params, const std::string & value) { - params.ssl_file_key = value; + [](common_params & cur, const std::string & value) { + cur.ssl_file_key = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_KEY_FILE")); add_opt(common_arg( {"--ssl-cert-file"}, "FNAME", "path to file a PEM-encoded SSL certificate", - [](common_params & params, const std::string & value) { - params.ssl_file_cert = value; + [](common_params & cur, const std::string & value) { + cur.ssl_file_cert = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE")); add_opt(common_arg( {"-to", "--timeout"}, "N", string_format("server read/write timeout in seconds (default: %d)", params.timeout_read), - [](common_params & params, int value) { - params.timeout_read = value; - params.timeout_write = value; + [](common_params & cur, int value) { + cur.timeout_read = value; + cur.timeout_write = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TIMEOUT")); add_opt(common_arg( {"--threads-http"}, "N", string_format("number of threads used to process HTTP requests (default: %d)", params.n_threads_http), - [](common_params & params, int value) { - params.n_threads_http = value; + [](common_params & cur, int value) { + cur.n_threads_http = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_THREADS_HTTP")); add_opt(common_arg( {"--cache-reuse"}, "N", string_format("min chunk 
size to attempt reusing from the cache via KV shifting (default: %d)", params.n_cache_reuse), - [](common_params & params, int value) { - params.n_cache_reuse = value; + [](common_params & cur, int value) { + cur.n_cache_reuse = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE")); add_opt(common_arg( {"--metrics"}, string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), - [](common_params & params) { - params.endpoint_metrics = true; + [](common_params & cur) { + cur.endpoint_metrics = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_METRICS")); add_opt(common_arg( {"--slots"}, string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [](common_params & params) { - params.endpoint_slots = true; + [](common_params & cur) { + cur.endpoint_slots = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); add_opt(common_arg( {"--props"}, string_format("enable changing global properties via POST /props (default: %s)", params.endpoint_props ? "enabled" : "disabled"), - [](common_params & params) { - params.endpoint_props = true; + [](common_params & cur) { + cur.endpoint_props = true; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); add_opt(common_arg( {"--no-slots"}, "disables slots monitoring endpoint", - [](common_params & params) { - params.endpoint_slots = false; + [](common_params & cur) { + cur.endpoint_slots = false; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); add_opt(common_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", - [](common_params & params, const std::string & value) { - params.slot_save_path = value; + [](common_params & cur, const std::string & value) { + cur.slot_save_path = value; // if doesn't end with DIRECTORY_SEPARATOR, add it - if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { - params.slot_save_path += DIRECTORY_SEPARATOR; + if (!cur.slot_save_path.empty() && cur.slot_save_path[cur.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + cur.slot_save_path += DIRECTORY_SEPARATOR; } } ).set_examples({LLAMA_EXAMPLE_SERVER})); @@ -1905,7 +1905,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "if suffix/prefix are specified, template will be disabled\n" "list of built-in templates:\n%s", list_builtin_chat_templates().c_str() ), - [](common_params & params, const std::string & value) { + [](common_params & cur, const std::string & value) { if (!common_chat_verify_template(value)) { throw std::runtime_error(string_format( "error: the supplied chat template is not supported: %s\n" @@ -1913,73 +1913,73 @@ common_params_context common_params_parser_init(common_params & params, llama_ex value.c_str() )); } - params.chat_template = value; + cur.chat_template = value; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); add_opt(common_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), - [](common_params & params, const std::string & value) { - params.slot_prompt_similarity = std::stof(value); + [](common_params & cur, const std::string & value) 
{ + cur.slot_prompt_similarity = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--lora-init-without-apply"}, string_format("load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"), - [](common_params & params) { - params.lora_init_without_apply = true; + [](common_params & cur) { + cur.lora_init_without_apply = true; } ).set_examples({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"--simple-io"}, "use basic IO for better compatibility in subprocesses and limited consoles", - [](common_params & params) { - params.simple_io = true; + [](common_params & cur) { + cur.simple_io = true; } ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL})); add_opt(common_arg( {"--positive-file"}, "FNAME", string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()), - [](common_params & params, const std::string & value) { - params.cvector_positive_file = value; + [](common_params & cur, const std::string & value) { + cur.cvector_positive_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--negative-file"}, "FNAME", string_format("negative prompts file, one prompt per line (default: '%s')", params.cvector_negative_file.c_str()), - [](common_params & params, const std::string & value) { - params.cvector_negative_file = value; + [](common_params & cur, const std::string & value) { + cur.cvector_negative_file = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--pca-batch"}, "N", string_format("batch size used for PCA. Larger batch runs faster, but uses more memory (default: %d)", params.n_pca_batch), - [](common_params & params, int value) { - params.n_pca_batch = value; + [](common_params & cur, int value) { + cur.n_pca_batch = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--pca-iter"}, "N", string_format("number of iterations used for PCA (default: %d)", params.n_pca_iterations), - [](common_params & params, int value) { - params.n_pca_iterations = value; + [](common_params & cur, int value) { + cur.n_pca_iterations = value; } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--method"}, "{pca, mean}", "dimensionality reduction method to be used (default: pca)", - [](common_params & params, const std::string & value) { - /**/ if (value == "pca") { params.cvector_dimre_method = DIMRE_METHOD_PCA; } - else if (value == "mean") { params.cvector_dimre_method = DIMRE_METHOD_MEAN; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "pca") { cur.cvector_dimre_method = DIMRE_METHOD_PCA; } + else if (value == "mean") { cur.cvector_dimre_method = DIMRE_METHOD_MEAN; } else { throw std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_CVECTOR_GENERATOR})); add_opt(common_arg( {"--output-format"}, "{md,jsonl}", "output format for batched-bench results (default: md)", - [](common_params & params, const std::string & value) { - /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } - else if (value == "md") { params.batched_bench_output_jsonl = false; } + [](common_params & cur, const std::string & value) { + /**/ if (value == "jsonl") { cur.batched_bench_output_jsonl = true; } + else if (value == "md") { cur.batched_bench_output_jsonl = false; } else { std::invalid_argument("invalid value"); } } ).set_examples({LLAMA_EXAMPLE_BENCH})); @@ 
-2007,16 +2007,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-v", "--verbose", "--log-verbose"}, "Set verbosity level to infinity (i.e. log all messages, useful for debugging)", - [](common_params & params) { - params.verbosity = INT_MAX; + [](common_params & cur) { + cur.verbosity = INT_MAX; common_log_set_verbosity_thold(INT_MAX); } )); add_opt(common_arg( {"-lv", "--verbosity", "--log-verbosity"}, "N", "Set the verbosity threshold. Messages with a higher verbosity will be ignored.", - [](common_params & params, int value) { - params.verbosity = value; + [](common_params & cur, int value) { + cur.verbosity = value; common_log_set_verbosity_thold(value); } ).set_env("LLAMA_LOG_VERBOSITY")); @@ -2039,29 +2039,29 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-td", "--threads-draft"}, "N", "number of threads to use during generation (default: same as --threads)", - [](common_params & params, int value) { - params.speculative.cpuparams.n_threads = value; - if (params.speculative.cpuparams.n_threads <= 0) { - params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency(); + [](common_params & cur, int value) { + cur.speculative.cpuparams.n_threads = value; + if (cur.speculative.cpuparams.n_threads <= 0) { + cur.speculative.cpuparams.n_threads = std::thread::hardware_concurrency(); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-tbd", "--threads-batch-draft"}, "N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)", - [](common_params & params, int value) { - params.speculative.cpuparams_batch.n_threads = value; - if (params.speculative.cpuparams_batch.n_threads <= 0) { - params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); + [](common_params & cur, int value) { + cur.speculative.cpuparams_batch.n_threads = value; + if (cur.speculative.cpuparams_batch.n_threads <= 0) { + cur.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency(); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-Cd", "--cpu-mask-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.speculative.cpuparams.mask_valid = true; - if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) { + [](common_params & cur, const std::string & mask) { + cur.speculative.cpuparams.mask_valid = true; + if (!parse_cpu_mask(mask, cur.speculative.cpuparams.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -2069,9 +2069,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-Crd", "--cpu-range-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft", - [](common_params & params, const std::string & range) { - params.speculative.cpuparams.mask_valid = true; - if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) { + [](common_params & cur, const std::string & range) { + cur.speculative.cpuparams.mask_valid = true; + if (!parse_cpu_range(range, cur.speculative.cpuparams.cpumask)) { throw std::invalid_argument("invalid range"); } } @@ -2079,33 +2079,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--cpu-strict-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: same as --cpu-strict)", - [](common_params & params, int value) { - params.speculative.cpuparams.strict_cpu = value; + [](common_params & cur, int value) { + cur.speculative.cpuparams.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--prio-draft"}, "N", string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority), - [](common_params & params, int prio) { + [](common_params & cur, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } - params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio; + cur.speculative.cpuparams.priority = (enum ggml_sched_priority) prio; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--poll-draft"}, "<0|1>", "Use polling to wait for draft model work (default: same as --poll])", - [](common_params & params, int value) { - params.speculative.cpuparams.poll = value; + [](common_params & cur, int value) { + cur.speculative.cpuparams.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"-Cbd", "--cpu-mask-batch-draft"}, "M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)", - [](common_params & params, const std::string & mask) { - params.speculative.cpuparams_batch.mask_valid = true; - if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) { + [](common_params & cur, const std::string & mask) { + cur.speculative.cpuparams_batch.mask_valid = true; + if (!parse_cpu_mask(mask, cur.speculative.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -2113,9 +2113,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi", "Ranges of CPUs for affinity. 
Complements --cpu-mask-draft-batch)", - [](common_params & params, const std::string & range) { - params.speculative.cpuparams_batch.mask_valid = true; - if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) { + [](common_params & cur, const std::string & range) { + cur.speculative.cpuparams_batch.mask_valid = true; + if (!parse_cpu_range(range, cur.speculative.cpuparams_batch.cpumask)) { throw std::invalid_argument("invalid cpumask"); } } @@ -2123,75 +2123,75 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--cpu-strict-batch-draft"}, "<0|1>", "Use strict CPU placement for draft model (default: --cpu-strict-draft)", - [](common_params & params, int value) { - params.speculative.cpuparams_batch.strict_cpu = value; + [](common_params & cur, int value) { + cur.speculative.cpuparams_batch.strict_cpu = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--prio-batch-draft"}, "N", string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority), - [](common_params & params, int prio) { + [](common_params & cur, int prio) { if (prio < 0 || prio > 3) { throw std::invalid_argument("invalid value"); } - params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio; + cur.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--poll-batch-draft"}, "<0|1>", "Use polling to wait for draft model work (default: --poll-draft)", - [](common_params & params, int value) { - params.speculative.cpuparams_batch.poll = value; + [](common_params & cur, int value) { + cur.speculative.cpuparams_batch.poll = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE})); add_opt(common_arg( {"--draft-max", "--draft", "--draft-n"}, "N", string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max), - [](common_params & params, int value) { - params.speculative.n_max = value; + [](common_params & cur, int value) { + cur.speculative.n_max = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX")); add_opt(common_arg( {"--draft-min", "--draft-n-min"}, "N", string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min), - [](common_params & params, int value) { - params.speculative.n_min = value; + [](common_params & cur, int value) { + cur.speculative.n_min = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN")); add_opt(common_arg( {"--draft-p-split"}, "P", string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split), - [](common_params & params, const std::string & value) { - params.speculative.p_split = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.speculative.p_split = std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT")); add_opt(common_arg( {"--draft-p-min"}, "P", string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min), - [](common_params & params, const std::string & value) { - params.speculative.p_min = std::stof(value); + [](common_params & cur, const std::string & value) { + cur.speculative.p_min = 
std::stof(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN")); add_opt(common_arg( {"-cd", "--ctx-size-draft"}, "N", string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx), - [](common_params & params, int value) { - params.speculative.n_ctx = value; + [](common_params & cur, int value) { + cur.speculative.n_ctx = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT")); add_opt(common_arg( {"-devd", "--device-draft"}, "", "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" "use --list-devices to see a list of available devices", - [](common_params & params, const std::string & value) { - params.speculative.devices = parse_device_list(value); + [](common_params & cur, const std::string & value) { + cur.speculative.devices = parse_device_list(value); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", - [](common_params & params, int value) { - params.speculative.n_gpu_layers = value; + [](common_params & cur, int value) { + cur.speculative.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n"); @@ -2202,16 +2202,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", - [](common_params & params, const std::string & value) { - params.speculative.model = value; + [](common_params & cur, const std::string & value) { + cur.speculative.model = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); add_opt(common_arg( {"-mv", "--model-vocoder"}, "FNAME", "vocoder model for audio generation (default: unused)", - [](common_params & params, const std::string & value) { - params.vocoder.model = value; + [](common_params & cur, const std::string & value) { + cur.vocoder.model = value; } ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); @@ -2219,11 +2219,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"--tts-oute-default"}, string_format("use default OuteTTS models (note: can download weights from the internet)"), - [](common_params & params) { - params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF"; - params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; - params.vocoder.hf_repo = "ggml-org/WavTokenizer"; - params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf"; + [](common_params & cur) { + cur.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF"; + cur.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; + cur.vocoder.hf_repo = "ggml-org/WavTokenizer"; + cur.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf"; } ).set_examples({LLAMA_EXAMPLE_TTS})); diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index dadc18c8b..5bf67ecc1 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -579,8 +579,8 @@ private: seq.back().second = false; } else { std::string literal; - auto is_non_literal = [&](char c) { - return 
NON_LITERAL_SET.find(c) != NON_LITERAL_SET.end(); + auto is_non_literal = [&](char ch) { + return NON_LITERAL_SET.find(ch) != NON_LITERAL_SET.end(); }; while (i < length) { if (sub_pattern[i] == '\\' && i < length - 1) { diff --git a/common/log.cpp b/common/log.cpp index 7a94bf7f9..76715d629 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -255,8 +255,8 @@ public: thrd = std::thread([this]() { while (true) { { - std::unique_lock lock(mtx); - cv.wait(lock, [this]() { return head != tail; }); + std::unique_lock lock_thrd(mtx); + cv.wait(lock_thrd, [this]() { return head != tail; }); cur = entries[head]; diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 0659ab6f1..b17d6bc57 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -62,7 +62,7 @@ int main(int argc, char ** argv) { llama_batch batch = llama_batch_init(n_kv_max, 0, 1); // decode in batches of ctx_params.n_batch tokens - auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) { + auto decode_helper = [&ctx, &batch](int32_t n_batch) { for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); @@ -94,7 +94,7 @@ int main(int argc, char ** argv) { common_batch_add(batch, 0, i, { 0 }, false); } - if (!decode_helper(ctx, batch, ctx_params.n_batch)) { + if (!decode_helper(ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -134,7 +134,7 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); - if (!decode_helper(ctx, batch, ctx_params.n_batch)) { + if (!decode_helper(ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } @@ -156,7 +156,7 @@ int main(int argc, char ** argv) { common_batch_add(batch, 0, pp + i, { j }, true); } - if (!decode_helper(ctx, batch, ctx_params.n_batch)) { + if (!decode_helper(ctx_params.n_batch)) { LOG_ERR("%s: llama_decode() failed\n", __func__); return 1; } diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index dc827e814..2e8812f03 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -2082,7 +2082,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } else if (ctx->has_qwen2vl_merger) { clip_image_u8 * resized = clip_image_u8_init(); - auto patch_size = clip_patch_size(ctx) * 2; + auto patch_size = clip_get_patch_size(ctx) * 2; int nx = ceil((float)img->nx / patch_size) * patch_size; int ny = ceil((float)img->ny / patch_size) * patch_size; bicubic_resize(*img, *resized, nx, ny); @@ -2293,15 +2293,15 @@ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); } -int32_t clip_image_size(const struct clip_ctx * ctx) { +int32_t clip_get_image_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.image_size; } -int32_t clip_patch_size(const struct clip_ctx * ctx) { +int32_t clip_get_patch_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.patch_size; } -int32_t clip_hidden_size(const struct clip_ctx * ctx) { +int32_t clip_get_hidden_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.hidden_size; } diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 1603edd26..3b60f161d 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -47,9 +47,9 @@ CLIP_API void clip_free(struct clip_ctx * ctx); 
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); -CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); -CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); -CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx); +CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx); +CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx); // TODO: should be enum, not string CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index c598caf3d..1978ce180 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -105,8 +105,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector struct ggml_context * ctx; } model; - const int32_t image_size = clip_image_size(ctx_clip); - const int32_t patch_size = clip_patch_size(ctx_clip); + const int32_t image_size = clip_get_image_size(ctx_clip); + const int32_t patch_size = clip_get_patch_size(ctx_clip); int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches) @@ -353,7 +353,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli img_res_v.size = 0; img_res_v.data = nullptr; - const int32_t image_size = clip_image_size(ctx_clip); + const int32_t image_size = clip_get_image_size(ctx_clip); struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 0c0f066ca..ab8d6c6b4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3702,8 +3702,8 @@ int main(int argc, char ** argv) { ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool { json res_json = result->to_json(); if (res_json.is_array()) { - for (const auto & res : res_json) { - if (!server_sent_event(sink, "data", res)) { + for (const auto & item : res_json) { + if (!server_sent_event(sink, "data", item)) { return false; } } @@ -3973,9 +3973,9 @@ int main(int argc, char ** argv) { std::unordered_set task_ids = server_task::get_list_id(tasks); ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - for (auto & res : results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); + for (auto & result : results) { + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + responses.push_back(result->to_json()); } }, [&](const json & error_data) { res_error(res, error_data); @@ -4063,9 +4063,9 @@ int main(int argc, char ** argv) { std::unordered_set task_ids = server_task::get_list_id(tasks); ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - for (auto & res : results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); + for (auto & result : results) { + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + responses.push_back(result->to_json()); } }, [&](const json & error_data) { res_error(res, error_data); diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index e8eda9c22..2b2d906e5 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -110,9 +110,8 @@ int main(int argc, char ** argv) { 
llama_token new_token_id; while (true) { // check if we have enough space in the context to evaluate this batch - int n_ctx = llama_n_ctx(ctx); int n_ctx_used = llama_get_kv_cache_used_cells(ctx); - if (n_ctx_used + batch.n_tokens > n_ctx) { + if (n_ctx_used + batch.n_tokens > (int) llama_n_ctx(ctx)) { printf("\033[0m\n"); fprintf(stderr, "context size exceeded\n"); exit(0); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 01a3afa40..9026fbcf5 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -311,9 +311,9 @@ static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_m ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type"); if (ggml_backend_split_buffer_type_fn) { size_t dev_index = [&]() { - auto * reg = ggml_backend_dev_backend_reg(dev); - for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) { - if (ggml_backend_reg_dev_get(reg, i) == dev) { + ggml_backend_reg_t reg_dev = ggml_backend_dev_backend_reg(dev); + for (size_t i = 0; i < ggml_backend_reg_dev_count(reg_dev); ++i) { + if (ggml_backend_reg_dev_get(reg_dev, i) == dev) { return i; } } @@ -1304,7 +1304,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { - return {cpu_dev, &pimpl->cpu_buft_list}; + return { cpu_dev, &pimpl->cpu_buft_list }; } const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin(); auto * dev = devices.at(layer_gpu); @@ -1453,7 +1453,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // avoid using a host buffer when using mmap auto * buft_dev = ggml_backend_buft_get_device(buft); if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) { - auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); buft = ggml_backend_dev_buffer_type(cpu_dev); } @@ -3697,8 +3696,8 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const { const struct ggml_tensor * llama_model::get_tensor(const char * name) const { auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(), - [name](const std::pair & it) { - return it.first == name; + [name](const std::pair & entry) { + return entry.first == name; }); if (it == tensors_by_name.end()) { return nullptr; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 75899d142..c1e751e70 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -130,17 +130,17 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2; }; const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); - auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { + auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name_layer) { if (n_expert > 1) { // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work // for getting the current layer as I initially thought, and we need to resort to parsing the // tensor name. 
- if (sscanf(name, "blk.%d.", &i_layer) != 1) { - throw std::runtime_error(format("Failed to determine layer for tensor %s", name)); + if (sscanf(name_layer, "blk.%d.", &i_layer) != 1) { + throw std::runtime_error(format("Failed to determine layer for tensor %s", name_layer)); } if (i_layer < 0 || i_layer >= n_layer) { - throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer)); + throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name_layer, n_layer)); } } return std::make_pair(i_layer, n_layer); diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index ef108b991..b03f40485 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -2496,15 +2496,15 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t // copy piece chars to output text buffer // skip up to 'lstrip' leading spaces before copying - auto _try_copy = [=] (const char * token, size_t size) -> int32_t { - for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) { - token++; + auto _try_copy = [=] (const char * text, size_t size) -> int32_t { + for (int32_t i = 0; i < lstrip && size && *text == ' '; ++i) { + text++; size--; } if (length < (int32_t)size) { return -(int32_t) size; } - memcpy(buf, token, size); + memcpy(buf, text, size); return (int32_t) size; }; From a59ee7c4eb3efa39718af405dc1fad43bdca6dce Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 16:19:18 +0200 Subject: [PATCH 12/15] common : cont ggml-ci --- cmake/common.cmake | 2 +- common/common.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/common.cmake b/cmake/common.cmake index 45bac7af8..5a39cbf78 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -19,7 +19,7 @@ function(llama_add_compile_flags) list(APPEND CXX_FLAGS -Wshadow) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - list(APPEND CXX_FLAGS -Wshadow -Wshadow-field-in-constructor) + list(APPEND CXX_FLAGS -Wshadow-field-in-constructor) endif() endif() diff --git a/common/common.cpp b/common/common.cpp index 16cc3f41c..447fb03ea 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1208,7 +1208,7 @@ static bool common_download_file(const std::string & url, const std::string & pa { typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { - common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; + common_load_model_from_url_headers * cur = (common_load_model_from_url_headers *) userdata; static std::regex header_regex("([^:]+): (.*)\r\n"); static std::regex etag_regex("ETag", std::regex_constants::icase); @@ -1220,9 +1220,9 @@ static bool common_download_file(const std::string & url, const std::string & pa const std::string & key = match[1]; const std::string & value = match[2]; if (std::regex_match(key, match, etag_regex)) { - headers->etag = value; + cur->etag = value; } else if (std::regex_match(key, match, last_modified_regex)) { - headers->last_modified = value; + cur->last_modified = value; } } return n_items; From 36803b1902195f3489ede64644fc5d2e8d51ea77 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 16:53:44 +0200 Subject: [PATCH 13/15] common : cont ggml-ci --- common/common.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 447fb03ea..e83537306 100644 
--- a/common/common.cpp +++ b/common/common.cpp @@ -1294,18 +1294,18 @@ static bool common_download_file(const std::string & url, const std::string & pa curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L); // helper function to hide password in URL - auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { - std::size_t protocol_pos = url.find("://"); + auto llama_download_hide_password_in_url = [](const std::string & url_full) -> std::string { + std::size_t protocol_pos = url_full.find("://"); if (protocol_pos == std::string::npos) { - return url; // Malformed URL + return url_full; // Malformed URL } - std::size_t at_pos = url.find('@', protocol_pos + 3); + std::size_t at_pos = url_full.find('@', protocol_pos + 3); if (at_pos == std::string::npos) { - return url; // No password in URL + return url_full; // No password in URL } - return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos); + return url_full.substr(0, protocol_pos + 3) + "********" + url_full.substr(at_pos); }; // start the download From afd40ea206540e96f0a9dce45d43ac51dc966f1a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 17:22:16 +0200 Subject: [PATCH 14/15] minor : better names ggml-ci --- .../convert-llama2c-to-ggml.cpp | 6 +++--- src/llama-mmap.cpp | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index ef0b22a3d..e597fa279 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -471,12 +471,12 @@ struct my_llama_file { GGML_ASSERT(ret == 0); // same } - void read_raw(void * ptr, size_t size_cur) { - if (size_cur == 0) { + void read_raw(void * raw_addr, size_t raw_size) { + if (raw_size == 0) { return; } errno = 0; - std::size_t ret = std::fread(ptr, size_cur, 1, fp); + std::size_t ret = std::fread(raw_addr, raw_size, 1, fp); if (ferror(fp)) { die_fmt("fread failed: %s", strerror(errno)); } diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index db4c4bcbe..7f43bccda 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -454,8 +454,8 @@ struct llama_mlock::impl { return (size_t) sysconf(_SC_PAGESIZE); } - bool raw_lock(const void * addr_cur, size_t size_cur) const { - if (!mlock(addr_cur, size_cur)) { + bool raw_lock(const void * lock_addr, size_t lock_len) const { + if (!mlock(lock_addr, lock_len)) { return true; } @@ -475,12 +475,12 @@ struct llama_mlock::impl { if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { suggest = false; } - if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size_cur)) { + if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + lock_len)) { suggest = false; } LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", - size_cur, size, errmsg, suggest ? MLOCK_SUGGESTION : ""); + lock_len, size, errmsg, suggest ? 
MLOCK_SUGGESTION : ""); return false; } @@ -535,7 +535,7 @@ struct llama_mlock::impl { return (size_t) 65536; } - bool raw_lock(const void * addr_cur, size_t size_cur) const { + bool raw_lock(const void * lock_addr, size_t lock_len) const { LLAMA_LOG_WARN("warning: mlock not supported on this system\n"); return false; } From a97b3621cf40f264f2f73b41d87ec70ee8b79c17 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 12 Jan 2025 17:57:51 +0200 Subject: [PATCH 15/15] ggml : ggml_backend_graph_copy -> ggml_backend_graph_copy_state ggml-ci --- ggml/include/ggml-backend.h | 6 +++--- ggml/src/ggml-backend.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index ce4fb4652..df6faa4b9 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -323,7 +323,7 @@ extern "C" { // Utils // - struct ggml_backend_graph_copy { + struct ggml_backend_graph_copy_state { ggml_backend_buffer_t buffer; struct ggml_context * ctx_allocated; struct ggml_context * ctx_unallocated; @@ -331,8 +331,8 @@ extern "C" { }; // Copy a graph to a different backend - GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy_init(ggml_backend_t backend, struct ggml_cgraph * graph); - GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); + GGML_API struct ggml_backend_graph_copy_state ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); + GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy_state copy); typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index cbc57a2d3..8f15805ba 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1724,7 +1724,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_ } } -struct ggml_backend_graph_copy ggml_backend_graph_copy_init(ggml_backend_t backend, struct ggml_cgraph * graph) { +struct ggml_backend_graph_copy_state ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size); struct ggml_tensor ** node_copies = (ggml_tensor **) calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT bool * node_init = (bool *) calloc(hash_set.size, sizeof(node_init[0])); @@ -1805,14 +1805,14 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy_init(ggml_backend_t backe }; } -void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) { +void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy_state copy) { ggml_backend_buffer_free(copy.buffer); ggml_free(copy.ctx_allocated); ggml_free(copy.ctx_unallocated); } bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { - struct ggml_backend_graph_copy copy = ggml_backend_graph_copy_init(backend2, graph); + struct ggml_backend_graph_copy_state copy = ggml_backend_graph_copy(backend2, graph); if (copy.buffer == NULL) { return false; }
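
For reference, a minimal caller-side sketch of the renamed API after PATCH 15/15. It mirrors the call pattern visible in ggml_backend_compare_graph_backend above; the struct and function names are taken from the hunks, while the surrounding backend and graph setup is assumed to exist elsewhere — this is an illustration, not part of the patch.

    // copy an existing graph to another backend, check the allocation, and release the copy
    struct ggml_backend_graph_copy_state copy = ggml_backend_graph_copy(backend, graph);
    if (copy.buffer == NULL) {
        // the copy could not be allocated on the target backend
        return false;
    }
    // ... run or inspect the copied graph on the target backend ...
    ggml_backend_graph_copy_free(copy); // frees copy.buffer, copy.ctx_allocated and copy.ctx_unallocated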