From 55b008cdec66cd50a96cfab9701193f25af1fca8 Mon Sep 17 00:00:00 2001
From: Michael Klimenko <mklimenko29@gmail.com>
Date: Sat, 27 Jan 2024 22:29:31 +0100
Subject: [PATCH] Add additional C++ modernization fixes

	Change bind to lambdas
	Change push_back to emplace_back
	Replace for with range-based for
	Use auto to avoid duplication
	Use bool values instead of 0
	Use pass-by-value with std::move
---
 common/common.cpp                             |  10 +-
 common/grammar-parser.cpp                     |   6 +-
 common/sampling.cpp                           |   6 +-
 common/train.cpp                              |   4 +-
 examples/batched-bench/batched-bench.cpp      |  10 +-
 examples/benchmark/benchmark-matmult.cpp      |   2 +-
 examples/embedding/embedding.cpp              |   4 +-
 examples/export-lora/export-lora.cpp          |  16 +--
 examples/finetune/finetune.cpp                |  32 ++---
 examples/imatrix/imatrix.cpp                  |   4 +-
 examples/infill/infill.cpp                    |   8 +-
 examples/llama-bench/llama-bench.cpp          |  34 +++---
 examples/llava/clip.cpp                       |  10 +-
 examples/lookahead/lookahead.cpp              |  24 ++--
 examples/main/main.cpp                        |  24 ++--
 examples/perplexity/perplexity.cpp            |  16 +--
 examples/server/httplib.h                     | 114 +++++++++---------
 examples/server/server.cpp                    |  42 +++----
 examples/server/utils.hpp                     |  15 +--
 examples/tokenize/tokenize.cpp                |   6 +-
 .../train-text-from-scratch.cpp               |  17 ++-
 llama.cpp                                     |  38 +++---
 tests/test-backend-ops.cpp                    |  10 +-
 tests/test-grad0.cpp                          |  12 +-
 tests/test-grammar-parser.cpp                 |  22 ++--
 tests/test-llama-grammar.cpp                  |   6 +-
 tests/test-quantize-fns.cpp                   |   2 +-
 tests/test-quantize-perf.cpp                  |   2 +-
 28 files changed, 237 insertions(+), 259 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index f8fdcfe23..7a9583b65 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -511,7 +511,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
+            params.lora_adapter.emplace_back(argv[i], 1.0f);
             params.use_mmap = false;
         } else if (arg == "--lora-scaled") {
             if (++i >= argc) {
@@ -523,7 +523,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
+            params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
             params.use_mmap = false;
         } else if (arg == "--lora-base") {
             if (++i >= argc) {
@@ -875,7 +875,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     }
 
     if (!params.kv_overrides.empty()) {
-        params.kv_overrides.emplace_back(llama_model_kv_override());
+        params.kv_overrides.emplace_back();
         params.kv_overrides.back().key[0] = 0;
     }
 
@@ -1335,8 +1335,8 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
     std::string piece;
     std::string result;
 
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
+    for (int token : tokens) {
+        piece = llama_token_to_piece(ctx, token);
 
         result += piece;
     }
diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp
index bf89a96f3..c32cdadb0 100644
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -296,9 +296,9 @@ namespace grammar_parser {
 
     static bool is_char_element(llama_grammar_element elem) {
         switch (elem.type) {
-            case LLAMA_GRETYPE_CHAR:           return true;
-            case LLAMA_GRETYPE_CHAR_NOT:       return true;
-            case LLAMA_GRETYPE_CHAR_ALT:       return true;
+            case LLAMA_GRETYPE_CHAR:
+            case LLAMA_GRETYPE_CHAR_NOT:
+            case LLAMA_GRETYPE_CHAR_ALT:
             case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
             default:                           return false;
         }
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 8a93b4ecb..711d647f8 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,7 +1,7 @@
 #include "sampling.h"
 
 struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
-    struct llama_sampling_context * result = new llama_sampling_context();
+    auto result = new llama_sampling_context();
 
     result->params  = params;
     result->grammar = nullptr;
@@ -197,8 +197,8 @@ static llama_token llama_sampling_sample_impl(
     }
 
     // apply params.logit_bias map
-    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-        logits[it->first] += it->second;
+    for (const auto & bias : params.logit_bias) { // bias.first indexes logits, bias.second is the amount added
+        logits[bias.first] += bias.second;
     }
 
     if (ctx_cfg) {
diff --git a/common/train.cpp b/common/train.cpp
index b309808b5..c51a1839a 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -18,7 +18,7 @@ struct random_uniform_distribution {
 };
 
 struct train_state  * init_train_state() {
-    struct train_state * state = new struct train_state;
+    auto state = new struct train_state;
     state->train_its     = 0;
     state->train_samples = 0;
     state->train_tokens  = 0;
@@ -1379,7 +1379,7 @@ void finish_processing_train_args(struct train_params_common * params) {
 }
 
 void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel) {
-    struct train_opt_callback_data * data   = (struct train_opt_callback_data *) vdata;
+    auto data   = (struct train_opt_callback_data *) vdata;
     struct train_params_common     * params = data->params;
     struct train_state             * train  = data->train;
     struct ggml_opt_context        * opt    = train->opt;
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 7924db267..ed5037451 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -164,13 +164,9 @@ int main(int argc, char ** argv) {
     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
     LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
 
-    for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
-        for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
-            for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
-                const int pp = n_pp[i_pp];
-                const int tg = n_tg[i_tg];
-                const int pl = n_pl[i_pl];
-
+    for (int pp : n_pp) {
+        for (int tg : n_tg) {
+            for (int pl : n_pl) {
                 const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
 
                 if (n_ctx_req > n_kv_max) {
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index e89f3de2f..07c51313e 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -143,7 +143,7 @@ int main(int argc, char ** argv)  {
     struct ggml_init_params params = {
         /*.mem_size   =*/ ctx_size,
         /*.mem_buffer =*/ NULL,
-        /* no_alloc   =*/ 0
+        /* no_alloc   =*/ false
     };
 
     ctx = ggml_init(params);
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 3295cd240..35a0cb912 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -64,8 +64,8 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "\n");
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+        for (int i : embd_inp) {
+            fprintf(stderr, "%6d -> '%s'\n", i, llama_token_to_piece(ctx, i).c_str());
         }
         fprintf(stderr, "\n");
     }
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 14a4e97d8..182976563 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -225,7 +225,7 @@ static void free_lora(struct lora_data * lora) {
 }
 
 static struct lora_data * load_lora(struct lora_info * info) {
-    struct lora_data * result = new struct lora_data;
+    auto result = new struct lora_data;
     result->info = *info;
     result->ctx = NULL;
     result->lora_r     = 1;
@@ -370,9 +370,9 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
 static void export_lora(struct export_lora_params * params) {
     // load all loras
     std::vector<struct lora_data *> loras;
-    for (size_t i = 0; i < params->lora.size(); ++i) {
-        struct lora_data * lora = load_lora(&params->lora[i]);
-        if (lora != NULL) {
+    for (auto & info : params->lora) {
+        auto * lora = load_lora(&info); // only non-null results are collected below
+        if (lora != nullptr) {
             loras.push_back(lora);
         }
     }
@@ -431,8 +431,8 @@ static void export_lora(struct export_lora_params * params) {
         fin.read_raw(data.data(), data.size());
 
         // apply all loras
-        for (size_t k = 0; k < loras.size(); ++k) {
-            apply_lora(tensor, loras[k], params->n_threads);
+        for (auto& lora : loras) {
+            apply_lora(tensor, lora, params->n_threads);
         }
 
         // write tensor data + padding
@@ -455,8 +455,8 @@ static void export_lora(struct export_lora_params * params) {
     gguf_free(gguf_in);
 
     // free loras
-    for (size_t i = 0; i < loras.size(); ++i) {
-        free_lora(loras[i]);
+    for (auto& lora : loras) {
+        free_lora(lora);
     }
 }
 
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 2a326a2c4..4dc588be6 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -379,8 +379,7 @@ static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora)
     ggml_allocr_alloc(alloc, lora->norm_b);
     ggml_allocr_alloc(alloc, lora->output_a);
     ggml_allocr_alloc(alloc, lora->output_b);
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
+    for (auto& layer : lora->layers) {
         ggml_allocr_alloc(alloc, layer.attention_norm_a);
         ggml_allocr_alloc(alloc, layer.attention_norm_b);
         ggml_allocr_alloc(alloc, layer.wq_a);
@@ -406,8 +405,7 @@ static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora)
     ggml_allocr_alloc(alloc, lora->norm_b->grad);
     ggml_allocr_alloc(alloc, lora->output_a->grad);
     ggml_allocr_alloc(alloc, lora->output_b->grad);
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
+    for (auto& layer : lora->layers) {
         ggml_allocr_alloc(alloc, layer.attention_norm_a->grad);
         ggml_allocr_alloc(alloc, layer.attention_norm_b->grad);
         ggml_allocr_alloc(alloc, layer.wq_a->grad);
@@ -803,9 +801,9 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
 
     // allocating checkpoints in one block to reduce memory fragmentation
     // note: they will be freed in reverse order
-    for (unsigned int i = 0; i < checkpoints.size(); ++i) {
-        if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
-            ggml_allocr_alloc(alloc, checkpoints[i]);
+    for (auto& checkpoint : checkpoints) {
+        if (checkpoint->data == NULL && checkpoint->view_src == NULL) {
+            ggml_allocr_alloc(alloc, checkpoint);
         }
     }
 
@@ -872,8 +870,7 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
     copy_tensor_by_name(lora->output_a,         f_ggml_ctx, ggml_get_name(lora->output_a));
     copy_tensor_by_name(lora->output_b,         f_ggml_ctx, ggml_get_name(lora->output_b));
 
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
+    for (auto& layer : lora->layers) {
         copy_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a));
         copy_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b));
         copy_tensor_by_name(layer.wq_a,             f_ggml_ctx, ggml_get_name(layer.wq_a));
@@ -940,9 +937,7 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
     gguf_add_tensor(fctx, lora->output_a);
     gguf_add_tensor(fctx, lora->output_b);
 
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
-
+    for (auto& layer : lora->layers) {
         gguf_add_tensor(fctx, layer.attention_norm_a);
         gguf_add_tensor(fctx, layer.attention_norm_b);
         gguf_add_tensor(fctx, layer.wq_a);
@@ -1476,7 +1471,7 @@ struct save_train_files_data {
 };
 
 static void save_train_files(void * vdata, struct train_state * train) {
-    struct save_train_files_data * data   = (struct save_train_files_data *) vdata;
+    auto data   = (struct save_train_files_data *) vdata;
 
     int64_t iter = train->opt->iter;
 
@@ -1499,8 +1494,7 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) {
     nx += ggml_nelements(lora->output_a);
     nx += ggml_nelements(lora->output_b);
 
-    for (uint32_t i = 0; i < lora->layers.size(); ++i) {
-        auto & layer = lora->layers[i];
+    for (auto& layer : lora->layers) {
         nx += ggml_nelements(layer.attention_norm_a);
         nx += ggml_nelements(layer.attention_norm_b);
         nx += ggml_nelements(layer.wq_a);
@@ -1817,12 +1811,12 @@ int main(int argc, char ** argv) {
 
     std::vector<size_t> token_noccurs;
     token_noccurs.resize(model.hparams.n_vocab, 0);
-    for (unsigned int i = 0; i < train_tokens.size(); ++i) {
-        ++token_noccurs[train_tokens[i]];
+    for (int train_token : train_tokens) {
+        ++token_noccurs[train_token];
     }
     int n_unique_tokens = 0;
-    for (unsigned int i = 0; i < token_noccurs.size(); ++i) {
-        if (token_noccurs[i] == 0) continue;
+    for (const size_t token_noccur : token_noccurs) { // same element type as the std::vector<size_t> above
+        if (token_noccur == 0) continue;
         ++n_unique_tokens;
     }
     printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens);
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index ea06fcdbf..4cf05f7c8 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -216,8 +216,8 @@ static std::vector<float> softmax(const std::vector<float>& logits) {
         sum_exp += exp_logit;
         probs[i] = exp_logit;
     }
-    for (size_t i = 0; i < probs.size(); i++) {
-        probs[i] /= sum_exp;
+    for (float& prob : probs) {
+        prob = static_cast<float>(prob / sum_exp); // divide at sum_exp's own precision, then narrow the quotient once
     }
     return probs;
 }
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index 72fb133b4..bebed4433 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -313,16 +313,16 @@ int main(int argc, char ** argv) {
         LOG_TEE("\n");
         LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+        for (int i : embd_inp) {
+            LOG_TEE("%6d -> '%s'\n", i, llama_token_to_piece(ctx, i).c_str());
         }
 
         if (ctx_guidance) {
             LOG_TEE("\n");
             LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
             LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
+            for (int i : guidance_inp) {
+                LOG_TEE("%6d -> '%s'\n", i, llama_token_to_piece(ctx, i).c_str());
             }
         }
 
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 5053d3f52..4cf432496 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -745,7 +745,7 @@ const std::string test::cpu_info     = get_cpu_info();
 const std::string test::gpu_info     = get_gpu_info();
 
 struct printer {
-    virtual ~printer() {}
+    virtual ~printer() = default;
 
     FILE * fout;
     virtual void print_header(const cmd_params & params) { (void) params; }
@@ -891,43 +891,43 @@ struct markdown_printer : public printer {
 
     void print_header(const cmd_params & params) override {
         // select fields to print
-        fields.push_back("model");
-        fields.push_back("size");
-        fields.push_back("params");
-        fields.push_back("backend");
+        fields.emplace_back("model");
+        fields.emplace_back("size");
+        fields.emplace_back("params");
+        fields.emplace_back("backend");
         bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
         if (!is_cpu_backend) {
-            fields.push_back("n_gpu_layers");
+            fields.emplace_back("n_gpu_layers");
         }
         if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
-            fields.push_back("n_threads");
+            fields.emplace_back("n_threads");
         }
         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
-            fields.push_back("n_batch");
+            fields.emplace_back("n_batch");
         }
         if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
-            fields.push_back("type_k");
+            fields.emplace_back("type_k");
         }
         if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
-            fields.push_back("type_v");
+            fields.emplace_back("type_v");
         }
         if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
-            fields.push_back("main_gpu");
+            fields.emplace_back("main_gpu");
         }
         if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
-            fields.push_back("split_mode");
+            fields.emplace_back("split_mode");
         }
         if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
-            fields.push_back("mul_mat_q");
+            fields.emplace_back("mul_mat_q");
         }
         if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
-            fields.push_back("no_kv_offload");
+            fields.emplace_back("no_kv_offload");
         }
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
-            fields.push_back("tensor_split");
+            fields.emplace_back("tensor_split");
         }
-        fields.push_back("test");
-        fields.push_back("t/s");
+        fields.emplace_back("test");
+        fields.emplace_back("t/s");
 
         fprintf(fout, "|");
         for (const auto & field : fields) {
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 3d43c9d99..c4a18748f 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -800,7 +800,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
     buffer_size += n_tensors * 128 /* CLIP PADDING */;
 
-    clip_ctx * new_clip = new clip_ctx;
+    auto* new_clip = new clip_ctx;
 
     // update projector type
     {
@@ -1416,13 +1416,13 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
         printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
 
         int64_t sum_all = 0;
-        for (size_t i = 0; i < hist_all.size(); ++i) {
-            sum_all += hist_all[i];
+        for (auto i : hist_all) {
+            sum_all += i;
         }
 
         printf("%s: hist: ", __func__);
-        for (size_t i = 0; i < hist_all.size(); ++i) {
-            printf("%5.3f ", hist_all[i] / (float)sum_all);
+        for (auto i : hist_all) {
+            printf("%5.3f ", i / (float)sum_all);
         }
         printf("\n");
     }
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index ba949ff6e..d0adfec12 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -132,7 +132,7 @@ int main(int argc, char ** argv) {
 
         for (int i = 0; i < W; i++) {
             // there are different ways to init these tokens
-            if (0) {
+            if (false) {
                 // initialize randomly from the prompt tokens
                 tokens_j[j][i] = all[1 + rand() % (all.size() - 1)];
             } else {
@@ -268,10 +268,10 @@ int main(int argc, char ** argv) {
 
             // if no active ngrams are left, it means the sampled token does not pass the verification
             if (v > 0) {
-                for (int g = 0; g < (int) ngrams_cur.size(); g++) {
-                    if (ngrams_cur[g].active) {
-                        i_batch = ngrams_cur[g].i_batch[v];
-                        seq_id_best = ngrams_cur[g].seq_id;
+                for (auto& g : ngrams_cur) {
+                    if (g.active) {
+                        i_batch = g.i_batch[v];
+                        seq_id_best = g.seq_id;
 
                         ++n_accept;
                         break;
@@ -316,20 +316,20 @@ int main(int argc, char ** argv) {
             }
 
             // verify across active n-grams
-            for (int g = 0; g < (int) ngrams_cur.size(); g++) {
-                if (ngrams_cur[g].active) {
+            for (auto& g : ngrams_cur) {
+                if (g.active) {
                     if (v == N - 1) {
-                        ngrams_cur[g].active = false;
+                        g.active = false;
                     } else {
-                        if (id != ngrams_cur[g].tokens[v + 1]) {
-                            ngrams_cur[g].active = false;
+                        if (id != g.tokens[v + 1]) {
+                            g.active = false;
                         }
                     }
                 }
             }
 
             // print known n-grams starting with token id (debug)
-            if (0 && v == 0) {
+            if (false && v == 0) {
                 if (ngrams_observed.cnt[id] > 0) {
                     printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
                 }
@@ -367,7 +367,7 @@ int main(int argc, char ** argv) {
                 } else {
                     for (int i = 0; i < W; i++) {
                         // there are different ways to init these tokens
-                        if (0) {
+                        if (false) {
                             // random init
                             tokens_j[N - 2][i] = all[1 + rand() % (all.size() - 1)];
                         } else {
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 58b7f807a..4d52cf284 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -344,12 +344,12 @@ int main(int argc, char ** argv) {
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
         params.interactive_first = true;
-        params.antiprompt.push_back("### Instruction:\n\n");
+        params.antiprompt.emplace_back("### Instruction:\n\n");
     }
     // similar for chatml mode
     else if (params.chatml) {
         params.interactive_first = true;
-        params.antiprompt.push_back("<|im_start|>user\n");
+        params.antiprompt.emplace_back("<|im_start|>user\n");
     }
 
     // enable interactive mode if interactive start is specified
@@ -361,16 +361,16 @@ int main(int argc, char ** argv) {
         LOG_TEE("\n");
         LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+        for (int i : embd_inp) {
+            LOG_TEE("%6d -> '%s'\n", i, llama_token_to_piece(ctx, i).c_str());
         }
 
         if (ctx_guidance) {
             LOG_TEE("\n");
             LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
             LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
+            for (int i : guidance_inp) {
+                LOG_TEE("%6d -> '%s'\n", i, llama_token_to_piece(ctx, i).c_str());
             }
         }
 
@@ -405,8 +405,8 @@ int main(int argc, char ** argv) {
                 LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
                 if (params.verbose_prompt) {
                     auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
-                    for (int i = 0; i < (int) tmp.size(); i++) {
-                        LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                    for (int i : tmp) {
+                        LOG_TEE("%6d -> '%s'\n", i, llama_token_to_piece(ctx, i).c_str());
                     }
                 }
             }
@@ -420,8 +420,8 @@ int main(int argc, char ** argv) {
             LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
             if (params.verbose_prompt) {
                 auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
-                for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                for (int i : tmp) {
+                    LOG_TEE("%6d -> '%s'\n", i, llama_token_to_piece(ctx, i).c_str());
                 }
             }
         }
@@ -430,8 +430,8 @@ int main(int argc, char ** argv) {
             LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
             if (params.verbose_prompt) {
                 auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
-                for (int i = 0; i < (int) tmp.size(); i++) {
-                    LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+                for (int i : tmp) {
+                    LOG_TEE("%6d -> '%s'\n", i, llama_token_to_piece(ctx, i).c_str());
                 }
             }
         }
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index a14a23313..cf87cf5a9 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -94,8 +94,8 @@ static std::vector<float> softmax(const std::vector<float>& logits) {
         sum_exp += exp_logit;
         probs[i] = exp_logit;
     }
-    for (size_t i = 0; i < probs.size(); i++) {
-        probs[i] /= sum_exp;
+    for (float& prob : probs) {
+        prob = static_cast<float>(prob / sum_exp); // divide at sum_exp's own precision, then narrow the quotient once
     }
     return probs;
 }
@@ -881,7 +881,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             size_t li = hs_cur.common_prefix;
             for (int s = 0; s < 4; ++s) {
                 for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) {
-                    eval_pairs.push_back(std::make_pair(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]));
+                    eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]);
                 }
                 ++li;
             }
@@ -997,7 +997,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string&
             printf("%s: no _ in <%s>\n", __func__, sentence.c_str());
             continue;
         }
-        std::istringstream stream(answer.c_str());
+        std::istringstream stream(answer);
         int i_answer; stream >> i_answer;
         if (stream.fail() || i_answer < 1 || i_answer > 2) {
             printf("%s: failed to parse answer <%s>\n", __func__, answer.c_str());
@@ -1158,13 +1158,13 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0;
             size_t li = n_base1 - 1;
             for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) {
-                eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[0][j+1]));
+                eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]);
             }
             const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix;
             const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0;
             li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1;
             for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) {
-                eval_pairs.push_back(std::make_pair(task.i_batch + li++, task.seq_tokens[1][j+1]));
+                eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]);
             }
         }
         compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
@@ -1221,7 +1221,7 @@ static bool deserialize_string(std::istream & in, std::string & str) {
     uint32_t size;
     if (!in.read((char *)&size, sizeof(size)).fail()) {
         str.resize(size);
-        if (!in.read((char *)&str[0], size).fail()) return true;
+        if (!in.read(&str[0], size).fail()) return true; // &str[0] is a mutable buffer; data() is const before C++17
     }
     return false;
 }
@@ -1523,7 +1523,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
             size_t li = cur_task.common_prefix;
             for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
                 for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
-                    eval_pairs.push_back(std::make_pair(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]));
+                    eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]);
                 }
                 ++li;
             }
diff --git a/examples/server/httplib.h b/examples/server/httplib.h
index e495d8299..a8244fd64 100644
--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
@@ -223,6 +223,7 @@ using socket_t = int;
 #include <string>
 #include <sys/stat.h>
 #include <thread>
+#include <utility>
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
 #ifdef _WIN32
@@ -705,7 +706,7 @@ public:
   Server &set_file_request_handler(Handler handler);
 
   Server &set_error_handler(HandlerWithResponse handler);
-  Server &set_error_handler(Handler handler);
+  Server &set_error_handler(const Handler& handler);
   Server &set_exception_handler(ExceptionHandler handler);
   Server &set_pre_routing_handler(HandlerWithResponse handler);
   Server &set_post_routing_handler(Handler handler);
@@ -781,7 +782,7 @@ private:
   bool dispatch_request(Request &req, Response &res, const Handlers &handlers);
   bool
   dispatch_request_for_content_reader(Request &req, Response &res,
-                                      ContentReader content_reader,
+                                      const ContentReader& content_reader,
                                       const HandlersForContentReader &handlers);
 
   bool parse_request_line(const char *s, Request &req);
@@ -804,7 +805,7 @@ private:
                                      MultipartContentHeader multipart_header,
                                      ContentReceiver multipart_receiver);
   bool read_content_core(Stream &strm, Request &req, Response &res,
-                         ContentReceiver receiver,
+                         const ContentReceiver& receiver,
                          MultipartContentHeader multipart_header,
                          ContentReceiver multipart_receiver);
 
@@ -910,8 +911,8 @@ public:
   explicit ClientImpl(const std::string &host, int port);
 
   explicit ClientImpl(const std::string &host, int port,
-                      const std::string &client_cert_path,
-                      const std::string &client_key_path);
+                      std::string client_cert_path,
+                      std::string client_key_path);
 
   virtual ~ClientImpl();
 
@@ -937,7 +938,7 @@ public:
   Result Get(const std::string &path, ResponseHandler response_handler,
              ContentReceiver content_receiver, Progress progress);
   Result Get(const std::string &path, const Headers &headers,
-             ResponseHandler response_handler, ContentReceiver content_receiver,
+             ResponseHandler response_handler, const ContentReceiver& content_receiver,
              Progress progress);
 
   Result Get(const std::string &path, const Params &params,
@@ -946,8 +947,8 @@ public:
              const Headers &headers, ContentReceiver content_receiver,
              Progress progress = nullptr);
   Result Get(const std::string &path, const Params &params,
-             const Headers &headers, ResponseHandler response_handler,
-             ContentReceiver content_receiver, Progress progress = nullptr);
+             const Headers &headers, const ResponseHandler& response_handler,
+             const ContentReceiver& content_receiver, const Progress& progress = nullptr);
 
   Result Head(const std::string &path);
   Result Head(const std::string &path, const Headers &headers);
@@ -1790,7 +1791,7 @@ void hosted_at(const std::string &hostname, std::vector<std::string> &addrs);
 
 std::string append_query_params(const std::string &path, const Params &params);
 
-std::pair<std::string, std::string> make_range_header(Ranges ranges);
+std::pair<std::string, std::string> make_range_header(const Ranges& ranges);
 
 std::pair<std::string, std::string>
 make_basic_authentication_header(const std::string &username,
@@ -1808,12 +1809,12 @@ void read_file(const std::string &path, std::string &out);
 std::string trim_copy(const std::string &s);
 
 void split(const char *b, const char *e, char d,
-           std::function<void(const char *, const char *)> fn);
+           const std::function<void(const char *, const char *)>& fn);
 
 bool process_client_socket(socket_t sock, time_t read_timeout_sec,
                            time_t read_timeout_usec, time_t write_timeout_sec,
                            time_t write_timeout_usec,
-                           std::function<bool(Stream &)> callback);
+                           const std::function<bool(Stream &)>& callback);
 
 socket_t create_client_socket(
     const std::string &host, const std::string &ip, int port,
@@ -2231,8 +2232,8 @@ inline void read_file(const std::string &path, std::string &out) {
   fs.seekg(0, std::ios_base::end);
   auto size = fs.tellg();
   fs.seekg(0);
-  out.resize(static_cast<size_t>(size));
-  fs.read(&out[0], static_cast<std::streamsize>(size));
+  out.resize(size);
+  fs.read(&out[0], size);
 }
 
 inline std::string file_extension(const std::string &path) {
@@ -2261,7 +2262,7 @@ inline std::string trim_copy(const std::string &s) {
 }
 
 inline void split(const char *b, const char *e, char d,
-                  std::function<void(const char *, const char *)> fn) {
+                  const std::function<void(const char *, const char *)>& fn) {
   size_t i = 0;
   size_t beg = 0;
 
@@ -2624,7 +2625,7 @@ inline bool process_client_socket(socket_t sock, time_t read_timeout_sec,
                                   time_t read_timeout_usec,
                                   time_t write_timeout_sec,
                                   time_t write_timeout_usec,
-                                  std::function<bool(Stream &)> callback) {
+                                  const std::function<bool(Stream &)>& callback) {
   SocketStream strm(sock, read_timeout_sec, read_timeout_usec,
                     write_timeout_sec, write_timeout_usec);
   return callback(strm);
@@ -3037,7 +3038,7 @@ find_content_type(const std::string &path,
   case "svg"_t: return "image/svg+xml";
   case "webp"_t: return "image/webp";
   case "ico"_t: return "image/x-icon";
-  case "tif"_t: return "image/tiff";
+  case "tif"_t:
   case "tiff"_t: return "image/tiff";
   case "jpg"_t:
   case "jpeg"_t: return "image/jpeg";
@@ -3488,13 +3489,13 @@ inline bool read_headers(Stream &strm, Headers &headers) {
 }
 
 inline bool read_content_with_length(Stream &strm, uint64_t len,
-                                     Progress progress,
-                                     ContentReceiverWithProgress out) {
+                                     const Progress& progress,
+                                     const ContentReceiverWithProgress& out) {
   char buf[CPPHTTPLIB_RECV_BUFSIZ];
 
   uint64_t r = 0;
   while (r < len) {
-    auto read_len = static_cast<size_t>(len - r);
+    const auto read_len = static_cast<size_t>(len - r); // size_t so both (std::min) arguments below share one type
     auto n = strm.read(buf, (std::min)(read_len, CPPHTTPLIB_RECV_BUFSIZ));
     if (n <= 0) { return false; }
 
@@ -3513,7 +3514,7 @@ inline void skip_content_with_length(Stream &strm, uint64_t len) {
   char buf[CPPHTTPLIB_RECV_BUFSIZ];
   uint64_t r = 0;
   while (r < len) {
-    auto read_len = static_cast<size_t>(len - r);
+    const auto read_len = static_cast<size_t>(len - r); // size_t so both (std::min) arguments below share one type
     auto n = strm.read(buf, (std::min)(read_len, CPPHTTPLIB_RECV_BUFSIZ));
     if (n <= 0) { return; }
     r += static_cast<uint64_t>(n);
@@ -3521,7 +3522,7 @@ inline void skip_content_with_length(Stream &strm, uint64_t len) {
 }
 
 inline bool read_content_without_length(Stream &strm,
-                                        ContentReceiverWithProgress out) {
+                                        const ContentReceiverWithProgress& out) {
   char buf[CPPHTTPLIB_RECV_BUFSIZ];
   uint64_t r = 0;
   for (;;) {
@@ -3983,12 +3984,12 @@ inline bool parse_range_header(const std::string &s, Ranges &ranges) try {
       if (std::regex_match(b, e, cm, re_another_range)) {
         ssize_t first = -1;
         if (!cm.str(1).empty()) {
-          first = static_cast<ssize_t>(std::stoll(cm.str(1)));
+          first = std::stoll(cm.str(1));
         }
 
         ssize_t last = -1;
         if (!cm.str(2).empty()) {
-          last = static_cast<ssize_t>(std::stoll(cm.str(2)));
+          last = std::stoll(cm.str(2));
         }
 
         if (first != -1 && last != -1 && first > last) {
@@ -4254,9 +4255,8 @@ inline std::string make_multipart_data_boundary() {
 
 inline bool is_multipart_boundary_chars_valid(const std::string &boundary) {
   auto valid = true;
-  for (size_t i = 0; i < boundary.size(); i++) {
-    auto c = boundary[i];
-    if (!std::isalnum(c) && c != '-' && c != '_') {
+  for (char c : boundary) { // isalnum is undefined for negative char values, hence the cast
+    if (!std::isalnum(static_cast<unsigned char>(c)) && c != '-' && c != '_') {
       valid = false;
       break;
     }
@@ -4707,7 +4707,7 @@ inline bool parse_www_authenticate(const Response &res,
         s = s.substr(pos + 1);
         auto beg = std::sregex_iterator(s.begin(), s.end(), re);
         for (auto i = beg; i != std::sregex_iterator(); ++i) {
-          auto m = *i;
+          const auto& m = *i;
           auto key = s.substr(static_cast<size_t>(m.position(1)),
                               static_cast<size_t>(m.length(1)));
           auto val = m.length(2) > 0
@@ -4802,7 +4802,7 @@ inline std::string append_query_params(const std::string &path,
 }
 
 // Header utilities
-inline std::pair<std::string, std::string> make_range_header(Ranges ranges) {
+inline std::pair<std::string, std::string> make_range_header(const Ranges& ranges) {
   std::string field = "bytes=";
   auto i = 0;
   for (auto r : ranges) {
@@ -4949,7 +4949,7 @@ inline void Response::set_content_provider(
   set_header("Content-Type", content_type);
   content_length_ = in_length;
   if (in_length > 0) { content_provider_ = std::move(provider); }
-  content_provider_resource_releaser_ = resource_releaser;
+  content_provider_resource_releaser_ = std::move(resource_releaser);
   is_chunked_content_provider_ = false;
 }
 
@@ -4959,7 +4959,7 @@ inline void Response::set_content_provider(
   set_header("Content-Type", content_type);
   content_length_ = 0;
   content_provider_ = detail::ContentProviderAdapter(std::move(provider));
-  content_provider_resource_releaser_ = resource_releaser;
+  content_provider_resource_releaser_ = std::move(resource_releaser);
   is_chunked_content_provider_ = false;
 }
 
@@ -4969,7 +4969,7 @@ inline void Response::set_chunked_content_provider(
   set_header("Content-Type", content_type);
   content_length_ = 0;
   content_provider_ = detail::ContentProviderAdapter(std::move(provider));
-  content_provider_resource_releaser_ = resource_releaser;
+  content_provider_resource_releaser_ = std::move(resource_releaser);
   is_chunked_content_provider_ = true;
 }
 
@@ -5010,7 +5010,7 @@ inline SocketStream::SocketStream(socket_t sock, time_t read_timeout_sec,
       write_timeout_sec_(write_timeout_sec),
       write_timeout_usec_(write_timeout_usec), read_buff_(read_buff_size_, 0) {}
 
-inline SocketStream::~SocketStream() {}
+inline SocketStream::~SocketStream() = default;
 
 inline bool SocketStream::is_readable() const {
   return select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0;
@@ -5101,7 +5101,7 @@ inline ssize_t BufferStream::read(char *ptr, size_t size) {
 #else
   auto len_read = buffer.copy(ptr, size, position);
 #endif
-  position += static_cast<size_t>(len_read);
+  position += len_read;
   return static_cast<ssize_t>(len_read);
 }
 
@@ -5131,7 +5131,7 @@ inline Server::Server()
 #endif
 }
 
-inline Server::~Server() {}
+inline Server::~Server() = default;
 
 inline Server &Server::Get(const std::string &pattern, Handler handler) {
   get_handlers_.push_back(
@@ -5241,7 +5241,7 @@ inline Server &Server::set_error_handler(HandlerWithResponse handler) {
   return *this;
 }
 
-inline Server &Server::set_error_handler(Handler handler) {
+inline Server &Server::set_error_handler(const Handler& handler) {
   error_handler_ = [handler](const Request &req, Response &res) {
     handler(req, res);
     return HandlerResponse::Handled;
@@ -5618,7 +5618,7 @@ inline bool Server::read_content_with_content_receiver(
 }
 
 inline bool Server::read_content_core(Stream &strm, Request &req, Response &res,
-                                      ContentReceiver receiver,
+                                      const ContentReceiver& receiver,
                                       MultipartContentHeader multipart_header,
                                       ContentReceiver multipart_receiver) {
   detail::MultipartFormDataParser multipart_form_data_parser;
@@ -5688,7 +5688,7 @@ inline bool Server::handle_file_request(const Request &req, Response &res,
               detail::find_content_type(path, file_extension_and_mimetype_map_);
           if (type) { res.set_header("Content-Type", type); }
           for (const auto &kv : entry.headers) {
-            res.set_header(kv.first.c_str(), kv.second);
+            res.set_header(kv.first, kv.second);
           }
           res.status = req.has_header("Range") ? 206 : 200;
           if (!head && file_request_handler_) {
@@ -6024,7 +6024,7 @@ inline void Server::apply_ranges(const Request &req, Response &res,
 }
 
 inline bool Server::dispatch_request_for_content_reader(
-    Request &req, Response &res, ContentReader content_reader,
+    Request &req, Response &res, const ContentReader& content_reader,
     const HandlersForContentReader &handlers) {
   for (const auto &x : handlers) {
     const auto &pattern = x.first;
@@ -6202,11 +6202,11 @@ inline ClientImpl::ClientImpl(const std::string &host, int port)
     : ClientImpl(host, port, std::string(), std::string()) {}
 
 inline ClientImpl::ClientImpl(const std::string &host, int port,
-                              const std::string &client_cert_path,
-                              const std::string &client_key_path)
+                              std::string client_cert_path,
+                              std::string client_key_path)
     : host_(host), port_(port),
       host_and_port_(adjust_host_string(host) + ":" + std::to_string(port)),
-      client_cert_path_(client_cert_path), client_key_path_(client_key_path) {}
+      client_cert_path_(std::move(client_cert_path)), client_key_path_(std::move(client_key_path)) {}
 
 inline ClientImpl::~ClientImpl() {
   std::lock_guard<std::mutex> guard(socket_mutex_);
@@ -6579,7 +6579,7 @@ inline bool ClientImpl::redirect(Request &req, Response &res, Error &error) {
       return false;
 #endif
     } else {
-      ClientImpl cli(next_host.c_str(), next_port);
+      ClientImpl cli(next_host, next_port);
       cli.copy_settings(*this);
       return detail::redirect(cli, req, res, path, location, error);
     }
@@ -7056,7 +7056,7 @@ inline Result ClientImpl::Get(const std::string &path,
 
 inline Result ClientImpl::Get(const std::string &path, const Headers &headers,
                               ResponseHandler response_handler,
-                              ContentReceiver content_receiver,
+                              const ContentReceiver& content_receiver,
                               Progress progress) {
   Request req;
   req.method = "GET";
@@ -7078,27 +7078,27 @@ inline Result ClientImpl::Get(const std::string &path, const Params &params,
   if (params.empty()) { return Get(path, headers); }
 
   std::string path_with_query = append_query_params(path, params);
-  return Get(path_with_query.c_str(), headers, progress);
+  return Get(path_with_query, headers, std::move(progress));
 }
 
 inline Result ClientImpl::Get(const std::string &path, const Params &params,
                               const Headers &headers,
                               ContentReceiver content_receiver,
                               Progress progress) {
-  return Get(path, params, headers, nullptr, content_receiver, progress);
+  return Get(path, params, headers, nullptr, std::move(content_receiver), std::move(progress));
 }
 
 inline Result ClientImpl::Get(const std::string &path, const Params &params,
                               const Headers &headers,
-                              ResponseHandler response_handler,
-                              ContentReceiver content_receiver,
-                              Progress progress) {
+                              const ResponseHandler& response_handler,
+                              const ContentReceiver& content_receiver,
+                              const Progress& progress) {
   if (params.empty()) {
     return Get(path, headers, response_handler, content_receiver, progress);
   }
 
   std::string path_with_query = append_query_params(path, params);
-  return Get(path_with_query.c_str(), headers, response_handler,
+  return Get(path_with_query, headers, response_handler,
              content_receiver, progress);
 }
 
@@ -7201,7 +7201,7 @@ inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
   const auto &content_type =
       detail::serialize_multipart_formdata_get_content_type(boundary);
   const auto &body = detail::serialize_multipart_formdata(items, boundary);
-  return Post(path, headers, body, content_type.c_str());
+  return Post(path, headers, body, content_type);
 }
 
 inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
@@ -7214,7 +7214,7 @@ inline Result ClientImpl::Post(const std::string &path, const Headers &headers,
   const auto &content_type =
       detail::serialize_multipart_formdata_get_content_type(boundary);
   const auto &body = detail::serialize_multipart_formdata(items, boundary);
-  return Post(path, headers, body, content_type.c_str());
+  return Post(path, headers, body, content_type);
 }
 
 inline Result
@@ -8361,7 +8361,7 @@ inline Client::Client(const std::string &host, int port,
     : cli_(detail::make_unique<ClientImpl>(host, port, client_cert_path,
                                            client_key_path)) {}
 
-inline Client::~Client() {}
+inline Client::~Client() = default;
 
 inline bool Client::is_valid() const {
   return cli_ != nullptr && cli_->is_valid();
@@ -8421,19 +8421,19 @@ inline Result Client::Get(const std::string &path, const Headers &headers,
 }
 inline Result Client::Get(const std::string &path, const Params &params,
                           const Headers &headers, Progress progress) {
-  return cli_->Get(path, params, headers, progress);
+  return cli_->Get(path, params, headers, std::move(progress));
 }
 inline Result Client::Get(const std::string &path, const Params &params,
                           const Headers &headers,
                           ContentReceiver content_receiver, Progress progress) {
-  return cli_->Get(path, params, headers, content_receiver, progress);
+  return cli_->Get(path, params, headers, std::move(content_receiver), std::move(progress));
 }
 inline Result Client::Get(const std::string &path, const Params &params,
                           const Headers &headers,
                           ResponseHandler response_handler,
                           ContentReceiver content_receiver, Progress progress) {
-  return cli_->Get(path, params, headers, response_handler, content_receiver,
-                   progress);
+  return cli_->Get(path, params, headers, std::move(response_handler), std::move(content_receiver),
+                   std::move(progress));
 }
 
 inline Result Client::Head(const std::string &path) { return cli_->Head(path); }
@@ -8754,7 +8754,7 @@ inline void Client::enable_server_certificate_verification(bool enabled) {
 }
 #endif
 
-inline void Client::set_logger(Logger logger) { cli_->set_logger(logger); }
+inline void Client::set_logger(Logger logger) { cli_->set_logger(std::move(logger)); }
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
 inline void Client::set_ca_cert_path(const std::string &ca_cert_file_path,
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7dcb5950d..ad47e3703 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1202,9 +1202,8 @@ struct llama_server_context
                 (json)(slot.images[image_idx].prefix_prompt);
 
             std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
-            for (int i = 0; i < (int) append_tokens.size(); ++i)
-            {
-                llama_batch_add(batch, append_tokens[i], slot.n_past, { slot.id }, true);
+            for (const llama_token append_token : append_tokens) {
+                llama_batch_add(batch, append_token, slot.n_past, { slot.id }, true);
                 slot.n_past += 1;
             }
         }
@@ -2034,7 +2033,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
+            params.lora_adapter.emplace_back(argv[i], 1.0f);
             params.use_mmap = false;
         }
         else if (arg == "--lora-scaled")
@@ -2050,7 +2049,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
+            params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
             params.use_mmap = false;
         }
         else if (arg == "--lora-base")
@@ -2192,7 +2191,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
     }
     if (!params.kv_overrides.empty()) {
-        params.kv_overrides.emplace_back(llama_model_kv_override());
+        params.kv_overrides.emplace_back();
         params.kv_overrides.back().key[0] = 0;
     }
 
@@ -2626,12 +2625,11 @@ int main(int argc, char **argv)
                             if (!llama_result.error) {
                                 std::vector<json> result_array = format_partial_response_oaicompat( llama_result);
 
-                                for (auto it = result_array.begin(); it != result_array.end(); ++it)
-                                {
-                                    if (!it->empty()) {
+                                for (auto& it : result_array) {
+                                    if (!it.empty()) {
                                         const std::string str =
                                             "data: " +
-                                            it->dump(-1, ' ', false, json::error_handler_t::replace) +
+                                            it.dump(-1, ' ', false, json::error_handler_t::replace) +
                                             "\n\n";
                                         LOG_VERBOSE("data stream", {{"to_send", str}});
                                         if (!sink.write(str.c_str(), str.size())) {
@@ -2824,19 +2822,17 @@ int main(int argc, char **argv)
     }*/
     //);
 
-    llama.queue_tasks.on_new_task(std::bind(
-        &llama_server_context::process_single_task, &llama, std::placeholders::_1));
-    llama.queue_tasks.on_finish_multitask(std::bind(
-        &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1));
-    llama.queue_tasks.on_all_tasks_finished(std::bind(
-        &llama_server_context::run_on_all_tasks_finished, &llama));
-    llama.queue_results.on_multitask_update(std::bind(
-        &llama_server_queue::update_multitask,
-        &llama.queue_tasks,
-        std::placeholders::_1,
-        std::placeholders::_2,
-        std::placeholders::_3
-    ));
+    // both queues are members of `llama`, so callbacks capturing it by reference cannot outlive it
+    llama.queue_tasks.on_new_task([&llama](task_server & task) {
+        llama.process_single_task(task);
+    });
+    llama.queue_tasks.on_finish_multitask([&llama](task_multi & multitask) {
+        llama.on_finish_multitask(multitask);
+    });
+    llama.queue_tasks.on_all_tasks_finished([&llama]() { llama.run_on_all_tasks_finished(); });
+    llama.queue_results.on_multitask_update([&llama](int multitask_id, int subtask_id, task_result & result) {
+        llama.queue_tasks.update_multitask(multitask_id, subtask_id, result);
+    });
     llama.queue_tasks.start_loop();
 
     t.join();
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index b6d6d27c5..b65bc007f 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <string>
+#include <utility>
 #include <vector>
 #include <set>
 #include <mutex>
@@ -171,10 +172,10 @@ inline std::string format_chatml(std::vector<json> messages)
 {
     std::ostringstream chatml_msgs;
 
-    for (auto it = messages.begin(); it != messages.end(); ++it) {
+    for (auto& message : messages) {
         chatml_msgs << "<|im_start|>"
-                    << json_value(*it, "role",    std::string("user")) << '\n';
-        chatml_msgs << json_value(*it, "content", std::string(""))
+                    << json_value(message, "role",    std::string("user")) << '\n';
+        chatml_msgs << json_value(message, "content", std::string(""))
                     << "<|im_end|>\n";
     }
 
@@ -225,17 +226,17 @@ struct llama_server_queue {
 
     // Register function to process a new task
     void on_new_task(std::function<void(task_server&)> callback) {
-        callback_new_task = callback;
+        callback_new_task = std::move(callback);
     }
 
     // Register function to process a multitask
     void on_finish_multitask(std::function<void(task_multi&)> callback) {
-        callback_finish_multitask = callback;
+        callback_finish_multitask = std::move(callback);
     }
 
     // Register the function to be called when the batch of tasks is finished
     void on_all_tasks_finished(std::function<void(void)> callback) {
-        callback_all_task_finished = callback;
+        callback_all_task_finished = std::move(callback);
     }
 
     // Call when the state of one slot is changed
@@ -378,7 +379,7 @@ struct llama_server_response {
 
     // Register the function to update multitask
     void on_multitask_update(callback_multitask_t callback) {
-        callback_update_multitask = callback;
+        callback_update_multitask = std::move(callback);
     }
 
     // Send a new result to a waiting task_id
diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp
index 4ff8e3fa7..da3aaa48e 100644
--- a/examples/tokenize/tokenize.cpp
+++ b/examples/tokenize/tokenize.cpp
@@ -32,11 +32,11 @@ int main(int argc, char ** argv) {
 
     tokens = ::llama_tokenize(model, prompt, add_bos, true);
 
-    for (int i = 0; i < (int) tokens.size(); i++) {
+    for (int token : tokens) {
         if (printing_ids) {
-            printf("%d\n", tokens[i]);
+            printf("%d\n", token);
         } else {
-            printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
+            printf("%6d -> '%s'\n", token, llama_token_to_piece(ctx, token).c_str());
         }
     }
 
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 49eaf3e61..0c9a3d41c 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -151,8 +151,7 @@ static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * mode
     ggml_allocr_alloc(alloc, model->tok_embeddings);
     ggml_allocr_alloc(alloc, model->norm);
     ggml_allocr_alloc(alloc, model->output);
-    for (uint32_t i = 0; i < model->layers.size(); ++i) {
-        auto & layer = model->layers[i];
+    for (auto& layer : model->layers) {
         ggml_allocr_alloc(alloc, layer.attention_norm);
         ggml_allocr_alloc(alloc, layer.wq);
         ggml_allocr_alloc(alloc, layer.wk);
@@ -166,8 +165,7 @@ static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * mode
     ggml_allocr_alloc(alloc, model->tok_embeddings->grad);
     ggml_allocr_alloc(alloc, model->norm->grad);
     ggml_allocr_alloc(alloc, model->output->grad);
-    for (uint32_t i = 0; i < model->layers.size(); ++i) {
-        auto & layer = model->layers[i];
+    for (auto& layer : model->layers) {
         ggml_allocr_alloc(alloc, layer.attention_norm->grad);
         ggml_allocr_alloc(alloc, layer.wq->grad);
         ggml_allocr_alloc(alloc, layer.wk->grad);
@@ -453,9 +451,9 @@ static struct ggml_tensor * llama_build_train_graphs(
 
         // allocating checkpoints in one block to reduce memory fragmentation
         // note: they will be freed in reverse order
-        for (int i = 0; i < (int) checkpoints.size(); ++i) {
-            if (checkpoints[i]->data == NULL && checkpoints[i]->view_src == NULL) {
-                ggml_allocr_alloc(alloc, checkpoints[i]);
+        for (auto& checkpoint : checkpoints) {
+            if (checkpoint->data == NULL && checkpoint->view_src == NULL) {
+                ggml_allocr_alloc(alloc, checkpoint);
             }
         }
 
@@ -925,7 +923,7 @@ struct save_train_files_data {
 };
 
 static void save_train_files(void * vdata, struct train_state * train) {
-    struct save_train_files_data * data   = (struct save_train_files_data *) vdata;
+    auto * data = static_cast<save_train_files_data *>(vdata); // vdata is the opaque user pointer handed to this callback
     int64_t iter = train->opt->iter;
 
     if (strlen(data->fn_checkpoint_out) > 0) {
@@ -945,8 +943,7 @@ static int64_t get_parameter_count(struct my_llama_model* model) {
     nx += ggml_nelements(model->norm);
     nx += ggml_nelements(model->output);
 
-    for (uint32_t i = 0; i < model->layers.size(); ++i) {
-        auto & layer = model->layers[i];
+    for (auto& layer : model->layers) {
         nx += ggml_nelements(layer.attention_norm);
         nx += ggml_nelements(layer.wq);
         nx += ggml_nelements(layer.wk);
diff --git a/llama.cpp b/llama.cpp
index 096eb4ac0..4ba83d40b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1092,7 +1092,7 @@ struct llama_mlock {
 
     bool failed_already = false;
 
-    llama_mlock() {}
+    llama_mlock() = default;
     llama_mlock(const llama_mlock &) = delete;
 
     ~llama_mlock() {
@@ -2958,7 +2958,7 @@ static void llm_load_hparams(
 }
 
 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string& raw_text, bool bos, bool special = false);
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 
 static void llm_load_vocab(
@@ -7111,8 +7111,8 @@ struct llm_tokenizer_bpe {
                 const auto token = vocab.token_to_id.find(str);
 
                 if (token == vocab.token_to_id.end()) {
-                    for (auto j = str.begin(); j != str.end(); ++j) {
-                        std::string byte_str(1, *j);
+                    for (char j : str) {
+                        std::string byte_str(1, j);
                         auto token_multibyte = vocab.token_to_id.find(byte_str);
                         if (token_multibyte == vocab.token_to_id.end()) {
                             throw std::runtime_error("ERROR: byte not found in vocab");
@@ -7172,8 +7172,8 @@ private:
         bpe_encoded_words.reserve(text.size());
 
         auto cps = codepoints_from_utf8(text);
-        for (size_t i = 0; i < cps.size(); ++i)
-            text_utf.emplace_back(codepoint_to_utf8(cps[i]));
+        for (unsigned int cp : cps)
+            text_utf.emplace_back(codepoint_to_utf8(cp));
 
         for (int i = 0; i < (int)text_utf.size(); i++) {
             const std::string & utf_char = text_utf[i];
@@ -7344,7 +7344,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
         const auto & special_id    = st.second;
 
         // for each text fragment
-        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
+        auto it = buffer.begin();
         while (it != buffer.end()) {
             auto & fragment = (*it);
 
@@ -7431,7 +7431,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
     }
 }
 
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string& raw_text, bool bos, bool special) {
     std::vector<llama_vocab::id> output;
 
     // OG tokenizer behavior:
@@ -7887,7 +7887,7 @@ void llama_grammar_free(struct llama_grammar * grammar) {
 }
 
 struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
-    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+    auto * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
 
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
@@ -8095,8 +8095,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }
 
     // Calculate absolute value of second derivatives
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = std::abs(second_derivatives[i]);
+    for (float & second_derivative : second_derivatives) {
+        second_derivative = std::abs(second_derivative);
     }
 
     // Normalize the second derivatives
@@ -9412,8 +9412,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
             if (tot_count > 0) {
                 LLAMA_LOG_INFO(" | hist: ");
-                for (size_t i = 0; i < hist_cur.size(); i++) {
-                    LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
+                for (const auto count : hist_cur) {
+                    LLAMA_LOG_INFO("%5.3f ", count / float(nelements));
                 }
             }
             LLAMA_LOG_INFO("\n");
@@ -9448,14 +9448,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // print histogram for all tensors
     {
         int64_t sum_all = 0;
-        for (size_t i = 0; i < hist_all.size(); i++) {
-            sum_all += hist_all[i];
+        for (const auto count : hist_all) {
+            sum_all += count;
         }
 
         if (sum_all > 0) {
             LLAMA_LOG_INFO("%s: hist: ", __func__);
-            for (size_t i = 0; i < hist_all.size(); i++) {
-                LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all));
+            for (const auto count : hist_all) {
+                LLAMA_LOG_INFO("%5.3f ", count / float(sum_all));
             }
             LLAMA_LOG_INFO("\n");
         }
@@ -9859,7 +9859,7 @@ struct llama_model * llama_load_model_from_file(
               struct llama_model_params   params) {
     ggml_time_init();
 
-    llama_model * model = new llama_model;
+    auto * model = new llama_model;
 
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
@@ -9905,7 +9905,7 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }
 
-    llama_context * ctx = new llama_context(*model);
+    auto * ctx = new llama_context(*model);
 
     const auto & hparams = model->hparams;
     auto       & cparams = ctx->cparams;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 5ec0ed335..a2f5c62aa 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -256,7 +256,7 @@ enum test_mode {
 };
 
 struct test_case {
-    virtual ~test_case() {}
+    virtual ~test_case() = default;
 
     virtual std::string op_desc(ggml_tensor * t) {
         return ggml_op_desc(t);
@@ -281,9 +281,9 @@ struct test_case {
     virtual size_t op_size(ggml_tensor * t) {
         size_t size = ggml_nbytes(t);
         // add source tensors
-        for (int i = 0; i < GGML_MAX_SRC; i++) {
-            if (t->src[i] != NULL) {
-                size += ggml_nbytes(t->src[i]);
+        for (auto * src : t->src) {
+            if (src != nullptr) {
+                size += ggml_nbytes(src);
             }
         }
         return size;
@@ -416,7 +416,7 @@ struct test_case {
         };
 
         auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
-            callback_userdata * ud = (callback_userdata *) user_data;
+            auto * ud = static_cast<callback_userdata *>(user_data);
             const char * bn1 = ggml_backend_name(ud->backend1);
             const char * bn2 = ggml_backend_name(ud->backend2);
 
diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp
index 8ff76c891..0f555df09 100644
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@@ -583,7 +583,7 @@ int main(int argc, const char ** argv) {
         }
 
         // mean, not yet fully implemented
-        if(0)
+        if(false)
         {
             srand(seed);
             const int nargs = 1;
@@ -601,7 +601,7 @@ int main(int argc, const char ** argv) {
         }
 
         // argmax
-        if (0)
+        if (false)
         {
             srand(seed);
             const int nargs = 1;
@@ -732,7 +732,7 @@ int main(int argc, const char ** argv) {
         }
 
         // tanh, not yet fully implemented
-        if(0)
+        if(false)
         {
             srand(seed);
             const int nargs = 1;
@@ -787,7 +787,7 @@ int main(int argc, const char ** argv) {
         }
 
         // elu, not yet fully implemented
-        if(0)
+        if(false)
         {
             srand(seed);
             const int nargs = 1;
@@ -822,7 +822,7 @@ int main(int argc, const char ** argv) {
         }
 
         // gelu, not yet fully implemented
-        if(0)
+        if(false)
         {
             srand(seed);
             const int nargs = 1;
@@ -1559,7 +1559,7 @@ int main(int argc, const char ** argv) {
         }
 
         // flash_attn f16, not yet fully implemented
-        if(0)
+        if(false)
         {
             srand(seed);
             const int nargs = 3;
diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp
index a0b5b043d..ca1664eeb 100644
--- a/tests/test-grammar-parser.cpp
+++ b/tests/test-grammar-parser.cpp
@@ -29,10 +29,9 @@ term  ::= [0-9]+)""";
     };
 
     uint32_t index = 0;
-    for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it)
-    {
-        std::string key = it->first;
-        uint32_t value = it->second;
+    for (const auto& symbol_id : parsed_grammar.symbol_ids) {
+        std::string key = symbol_id.first;
+        uint32_t value = symbol_id.second;
         std::pair<std::string, uint32_t> expected_pair = expected[index];
 
         // pretty print error message before asserting
@@ -88,9 +87,7 @@ term  ::= [0-9]+)""";
     for (auto rule : parsed_grammar.rules)
     {
         // compare rule to expected rule
-        for (uint32_t i = 0; i < rule.size(); i++)
-        {
-            llama_grammar_element element = rule[i];
+        for (auto element : rule) {
             llama_grammar_element expected_element = expected_rules[index];
 
             // pretty print error message before asserting
@@ -135,10 +132,9 @@ term  ::= [0-9]+)""";
     };
 
     index = 0;
-    for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it)
-    {
-        std::string key = it->first;
-        uint32_t value = it->second;
+    for (const auto& symbol_id : parsed_grammar.symbol_ids) {
+        std::string key = symbol_id.first;
+        uint32_t value = symbol_id.second;
         std::pair<std::string, uint32_t> expected_pair = expected[index];
 
         // pretty print error message before asserting
@@ -227,9 +223,7 @@ term  ::= [0-9]+)""";
     for (auto rule : parsed_grammar.rules)
     {
         // compare rule to expected rule
-        for (uint32_t i = 0; i < rule.size(); i++)
-        {
-            llama_grammar_element element = rule[i];
+        for (auto element : rule) {
             llama_grammar_element expected_element = expected_rules[index];
 
             // pretty print error message before asserting
diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp
index 78fc41117..e90aafe15 100644
--- a/tests/test-llama-grammar.cpp
+++ b/tests/test-llama-grammar.cpp
@@ -98,14 +98,14 @@ int main()
         },
     };
 
-    for (auto pair : expected)
+    for (const auto& pair : expected)
     {
         parsed_grammar.symbol_ids[pair.first] = pair.second;
     }
 
-    for (auto rule : expected_rules)
+    for (const auto& rule : expected_rules)
     {
-        parsed_grammar.rules.push_back({});
+        parsed_grammar.rules.emplace_back();
         for (auto element : rule)
         {
             parsed_grammar.rules.back().push_back(element);
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
index 31a78c632..4ccc66271 100644
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -126,7 +126,7 @@ int main(int argc, char * argv[]) {
     bool failed = false;
 
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
-        ggml_type type = (ggml_type) i;
+        auto type = static_cast<ggml_type>(i);
         ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
 
         // deprecated - skip
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index 09d410b7f..24e288720 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -269,7 +269,7 @@ int main(int argc, char * argv[]) {
     struct ggml_context * ctx = ggml_init(ggml_params);
 
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
-        ggml_type type = (ggml_type) i;
+        auto type = static_cast<ggml_type>(i);
         ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
         if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
             continue;