diff --git a/common/sampling.cpp b/common/sampling.cpp
index 8ff2009af..7e8cd4c81 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,7 +1,7 @@
 #include "sampling.h"
 
 struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
-    auto result = new llama_sampling_context();
+    auto * result = new llama_sampling_context();
 
     result->params  = params;
     result->grammar = nullptr;
@@ -197,8 +197,8 @@ static llama_token llama_sampling_sample_impl(
     }
 
     // apply params.logit_bias map
-    for (const auto & logit_bia : params.logit_bias) {
-        logits[logit_bia.first] += logit_bia.second;
+    for (const auto & logit_bias : params.logit_bias) {
+        logits[logit_bias.first] += logit_bias.second;
     }
 
     if (ctx_cfg) {
diff --git a/common/train.cpp b/common/train.cpp
index 59be89ce9..b84d43daa 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -18,7 +18,7 @@ struct random_uniform_distribution {
 };
 
 struct train_state  * init_train_state() {
-    auto state = new struct train_state;
+    auto * state = new struct train_state;
     state->train_its     = 0;
     state->train_samples = 0;
     state->train_tokens  = 0;
@@ -46,12 +46,12 @@ void free_train_state(struct train_state  * state) {
 struct random_normal_distribution * init_random_normal_distribution(
     int seed, float mean, float std, float min, float max
 ) {
-    auto rnd = new random_normal_distribution{std::mt19937(seed), std::normal_distribution<float>{mean, std}, min, max};
+    auto * rnd = new random_normal_distribution{std::mt19937(seed), std::normal_distribution<float>{mean, std}, min, max};
     return rnd;
 }
 
 struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max) {
-    auto rnd = new random_uniform_distribution{std::mt19937(seed), std::uniform_real_distribution<float>{min, max}};
+    auto * rnd = new random_uniform_distribution{std::mt19937(seed), std::uniform_real_distribution<float>{min, max}};
     return rnd;
 }
 
@@ -1379,7 +1379,7 @@ void finish_processing_train_args(struct train_params_common * params) {
 }
 
 void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel) {
-    auto data   = (struct train_opt_callback_data *) vdata;
+    auto * data = (struct train_opt_callback_data *) vdata;
     struct train_params_common     * params = data->params;
     struct train_state             * train  = data->train;
     struct ggml_opt_context        * opt    = train->opt;
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 182976563..00da15710 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -225,7 +225,7 @@ static void free_lora(struct lora_data * lora) {
 }
 
 static struct lora_data * load_lora(struct lora_info * info) {
-    auto result = new struct lora_data;
+    auto * result = new struct lora_data;
     result->info = *info;
     result->ctx = NULL;
     result->lora_r     = 1;
@@ -370,8 +370,8 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
 static void export_lora(struct export_lora_params * params) {
     // load all loras
     std::vector<struct lora_data *> loras;
-    for (auto& i : params->lora) {
-        auto lora = load_lora(&i);
+    for (auto & i : params->lora) {
+        auto * lora = load_lora(&i);
         if (lora) {
             loras.push_back(lora);
         }
@@ -431,7 +431,7 @@ static void export_lora(struct export_lora_params * params) {
         fin.read_raw(data.data(), data.size());
 
         // apply all loras
-        for (auto& lora : loras) {
+        for (auto * lora : loras) {
             apply_lora(tensor, lora, params->n_threads);
         }
 
@@ -455,7 +455,7 @@ static void export_lora(struct export_lora_params * params) {
     gguf_free(gguf_in);
 
     // free loras
-    for (auto& lora : loras) {
+    for (auto * lora : loras) {
         free_lora(lora);
     }
 }
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 4dc588be6..e4bc0bde7 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -379,7 +379,7 @@ static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora)
     ggml_allocr_alloc(alloc, lora->norm_b);
     ggml_allocr_alloc(alloc, lora->output_a);
     ggml_allocr_alloc(alloc, lora->output_b);
-    for (auto& layer : lora->layers) {
+    for (auto & layer : lora->layers) {
         ggml_allocr_alloc(alloc, layer.attention_norm_a);
         ggml_allocr_alloc(alloc, layer.attention_norm_b);
         ggml_allocr_alloc(alloc, layer.wq_a);
@@ -405,7 +405,7 @@ static void alloc_lora(struct ggml_allocr * alloc, struct my_llama_lora * lora)
     ggml_allocr_alloc(alloc, lora->norm_b->grad);
     ggml_allocr_alloc(alloc, lora->output_a->grad);
     ggml_allocr_alloc(alloc, lora->output_b->grad);
-    for (auto& layer : lora->layers) {
+    for (auto & layer : lora->layers) {
         ggml_allocr_alloc(alloc, layer.attention_norm_a->grad);
         ggml_allocr_alloc(alloc, layer.attention_norm_b->grad);
         ggml_allocr_alloc(alloc, layer.wq_a->grad);
@@ -801,7 +801,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
 
     // allocating checkpoints in one block to reduce memory fragmentation
     // note: they will be freed in reverse order
-    for (auto& checkpoint : checkpoints) {
+    for (auto * checkpoint : checkpoints) {
         if (checkpoint->data == NULL && checkpoint->view_src == NULL) {
             ggml_allocr_alloc(alloc, checkpoint);
         }
@@ -870,7 +870,7 @@ static void load_llama_lora_gguf(struct gguf_context * fctx, struct ggml_context
     copy_tensor_by_name(lora->output_a,         f_ggml_ctx, ggml_get_name(lora->output_a));
     copy_tensor_by_name(lora->output_b,         f_ggml_ctx, ggml_get_name(lora->output_b));
 
-    for (auto& layer : lora->layers) {
+    for (auto & layer : lora->layers) {
         copy_tensor_by_name(layer.attention_norm_a, f_ggml_ctx, ggml_get_name(layer.attention_norm_a));
         copy_tensor_by_name(layer.attention_norm_b, f_ggml_ctx, ggml_get_name(layer.attention_norm_b));
         copy_tensor_by_name(layer.wq_a,             f_ggml_ctx, ggml_get_name(layer.wq_a));
@@ -937,7 +937,7 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
     gguf_add_tensor(fctx, lora->output_a);
     gguf_add_tensor(fctx, lora->output_b);
 
-    for (auto& layer : lora->layers) {
+    for (auto & layer : lora->layers) {
         gguf_add_tensor(fctx, layer.attention_norm_a);
         gguf_add_tensor(fctx, layer.attention_norm_b);
         gguf_add_tensor(fctx, layer.wq_a);
@@ -1471,7 +1471,7 @@ struct save_train_files_data {
 };
 
 static void save_train_files(void * vdata, struct train_state * train) {
-    auto data   = (struct save_train_files_data *) vdata;
+    auto * data = (struct save_train_files_data *) vdata;
 
     int64_t iter = train->opt->iter;
 
@@ -1494,7 +1494,7 @@ static int64_t get_parameter_count(struct my_llama_lora* lora) {
     nx += ggml_nelements(lora->output_a);
     nx += ggml_nelements(lora->output_b);
 
-    for (auto& layer : lora->layers) {
+    for (auto & layer : lora->layers) {
         nx += ggml_nelements(layer.attention_norm_a);
         nx += ggml_nelements(layer.attention_norm_b);
         nx += ggml_nelements(layer.wq_a);
@@ -1815,7 +1815,7 @@ int main(int argc, char ** argv) {
         ++token_noccurs[train_token];
     }
     int n_unique_tokens = 0;
-    for (unsigned long long token_noccur : token_noccurs) {
+    for (size_t token_noccur : token_noccurs) {
         if (token_noccur == 0) continue;
         ++n_unique_tokens;
     }
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index bfb7b4579..4825bdade 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -216,7 +216,7 @@ static std::vector<float> softmax(const std::vector<float>& logits) {
         sum_exp += exp_logit;
         probs[i] = exp_logit;
     }
-    for (float& prob : probs) {
+    for (float & prob : probs) {
         prob /= float(sum_exp);
     }
     return probs;
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 94f12d04b..f96d9916f 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1011,21 +1011,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                 vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
                 vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
             } catch (std::runtime_error & e) {
-                static_cast<void>(e);
+                GGML_UNUSED(e);
             }
             try {
                 // Yi-type llava
                 vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
                 vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
             } catch (std::runtime_error & e) {
-                static_cast<void>(e);
+                GGML_UNUSED(e);
             }
             try {
                 // Yi-type llava
                 vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
                 vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
             } catch (std::runtime_error & e) {
-                static_cast<void>(e);
+                GGML_UNUSED(e);
             }
         }
         else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index f5488eded..3a20e132c 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -268,7 +268,7 @@ int main(int argc, char ** argv) {
 
             // if no active ngrams are left, it means the sampled token does not pass the verification
             if (v > 0) {
-                for (auto& g : ngrams_cur) {
+                for (auto & g : ngrams_cur) {
                     if (g.active) {
                         i_batch = g.i_batch[v];
                         seq_id_best = g.seq_id;
@@ -316,7 +316,7 @@ int main(int argc, char ** argv) {
             }
 
             // verify across active n-grams
-            for (auto& g : ngrams_cur) {
+            for (auto & g : ngrams_cur) {
                 if (g.active) {
                     if (v == N - 1) {
                         g.active = false;
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 9389fc41e..1e6768651 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -94,7 +94,7 @@ static std::vector<float> softmax(const std::vector<float>& logits) {
         sum_exp += exp_logit;
         probs[i] = exp_logit;
     }
-    for (float& prob : probs) {
+    for (float & prob : probs) {
         prob /= float(sum_exp);
     }
     return probs;
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 0c9a3d41c..958da8792 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -151,7 +151,7 @@ static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * mode
     ggml_allocr_alloc(alloc, model->tok_embeddings);
     ggml_allocr_alloc(alloc, model->norm);
     ggml_allocr_alloc(alloc, model->output);
-    for (auto& layer : model->layers) {
+    for (auto & layer : model->layers) {
         ggml_allocr_alloc(alloc, layer.attention_norm);
         ggml_allocr_alloc(alloc, layer.wq);
         ggml_allocr_alloc(alloc, layer.wk);
@@ -165,7 +165,7 @@ static void alloc_model(struct ggml_allocr * alloc, struct my_llama_model * mode
     ggml_allocr_alloc(alloc, model->tok_embeddings->grad);
     ggml_allocr_alloc(alloc, model->norm->grad);
     ggml_allocr_alloc(alloc, model->output->grad);
-    for (auto& layer : model->layers) {
+    for (auto & layer : model->layers) {
         ggml_allocr_alloc(alloc, layer.attention_norm->grad);
         ggml_allocr_alloc(alloc, layer.wq->grad);
         ggml_allocr_alloc(alloc, layer.wk->grad);
@@ -451,7 +451,7 @@ static struct ggml_tensor * llama_build_train_graphs(
 
         // allocating checkpoints in one block to reduce memory fragmentation
         // note: they will be freed in reverse order
-        for (auto& checkpoint : checkpoints) {
+        for (auto * checkpoint : checkpoints) {
             if (checkpoint->data == NULL && checkpoint->view_src == NULL) {
                 ggml_allocr_alloc(alloc, checkpoint);
             }
@@ -923,7 +923,7 @@ struct save_train_files_data {
 };
 
 static void save_train_files(void * vdata, struct train_state * train) {
-    auto data   = (struct save_train_files_data *) vdata;
+    auto * data = (struct save_train_files_data *) vdata;
     int64_t iter = train->opt->iter;
 
     if (strlen(data->fn_checkpoint_out) > 0) {
@@ -943,7 +943,7 @@ static int64_t get_parameter_count(struct my_llama_model* model) {
     nx += ggml_nelements(model->norm);
     nx += ggml_nelements(model->output);
 
-    for (auto& layer : model->layers) {
+    for (auto & layer : model->layers) {
         nx += ggml_nelements(layer.attention_norm);
         nx += ggml_nelements(layer.wq);
         nx += ggml_nelements(layer.wk);
diff --git a/llama.cpp b/llama.cpp
index 6aec726ef..f73f8f84f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3004,7 +3004,7 @@ static void llm_load_hparams(
 }
 
 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string& raw_text, bool bos, bool special = false);
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool special = false);
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 
 static void llm_load_vocab(
@@ -7374,7 +7374,7 @@ private:
         bpe_encoded_words.reserve(text.size());
 
         auto cps = codepoints_from_utf8(text);
-        for (unsigned int cp : cps)
+        for (uint32_t cp : cps)
             text_utf.emplace_back(codepoint_to_utf8(cp));
 
         for (int i = 0; i < (int)text_utf.size(); i++) {
@@ -7633,7 +7633,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
     }
 }
 
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string& raw_text, bool bos, bool special) {
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool special) {
     std::vector<llama_vocab::id> output;
 
     // OG tokenizer behavior:
@@ -8089,7 +8089,7 @@ void llama_grammar_free(struct llama_grammar * grammar) {
 }
 
 struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
-    auto result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+    auto * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
 
     // redirect elements in stacks to point to new rules
     for (size_t is = 0; is < result->stacks.size(); is++) {
@@ -8337,7 +8337,7 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
     }
 
     // Calculate absolute value of second derivatives
-    for (float& second_derivative : second_derivatives) {
+    for (float & second_derivative : second_derivatives) {
         second_derivative = std::abs(second_derivative);
     }
 
@@ -9654,7 +9654,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
             if (tot_count > 0) {
                 LLAMA_LOG_INFO(" | hist: ");
-                for (long long i : hist_cur) {
+                for (int64_t i : hist_cur) {
                     LLAMA_LOG_INFO("%5.3f ", i / float(nelements));
                 }
             }
@@ -10101,7 +10101,7 @@ struct llama_model * llama_load_model_from_file(
               struct llama_model_params   params) {
     ggml_time_init();
 
-    auto model = new llama_model;
+    auto * model = new llama_model;
 
     unsigned cur_percentage = 0;
     if (params.progress_callback == NULL) {
@@ -10147,7 +10147,7 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }
 
-    auto ctx = new llama_context(*model);
+    auto * ctx = new llama_context(*model);
 
     const auto & hparams = model->hparams;
     auto       & cparams = ctx->cparams;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 028683abb..699f50ae3 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -64,7 +64,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
             }
         }
         ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, int(size/tensor->ne[0]),
-            static_cast<int>(tensor->ne[0]), hist, im);
+            int(tensor->ne[0]), hist, im);
         ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.
@@ -288,9 +288,9 @@ struct test_case {
     virtual size_t op_size(ggml_tensor * t) {
         size_t size = ggml_nbytes(t);
         // add source tensors
-        for (auto& el : t->src) {
-            if (el) {
-                size += ggml_nbytes(el);
+        for (auto * src : t->src) {
+            if (src) {
+                size += ggml_nbytes(src);
             }
         }
         return size;
@@ -423,7 +423,7 @@ struct test_case {
         };
 
         auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
-            auto ud = (callback_userdata *) user_data;
+            auto * ud = (callback_userdata *) user_data;
             const char * bn1 = ggml_backend_name(ud->backend1);
             const char * bn2 = ggml_backend_name(ud->backend2);
 
diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp
index ca1664eeb..3b108917e 100644
--- a/tests/test-grammar-parser.cpp
+++ b/tests/test-grammar-parser.cpp
@@ -29,7 +29,7 @@ term  ::= [0-9]+)""";
     };
 
     uint32_t index = 0;
-    for (auto& symbol_id : parsed_grammar.symbol_ids) {
+    for (auto & symbol_id : parsed_grammar.symbol_ids) {
         std::string key = symbol_id.first;
         uint32_t value = symbol_id.second;
         std::pair<std::string, uint32_t> expected_pair = expected[index];
@@ -132,7 +132,7 @@ term  ::= [0-9]+)""";
     };
 
     index = 0;
-    for (auto& symbol_id : parsed_grammar.symbol_ids) {
+    for (auto & symbol_id : parsed_grammar.symbol_ids) {
         std::string key = symbol_id.first;
         uint32_t value = symbol_id.second;
         std::pair<std::string, uint32_t> expected_pair = expected[index];
diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp
index e90aafe15..f81679413 100644
--- a/tests/test-llama-grammar.cpp
+++ b/tests/test-llama-grammar.cpp
@@ -103,7 +103,7 @@ int main()
         parsed_grammar.symbol_ids[pair.first] = pair.second;
     }
 
-    for (const auto& rule : expected_rules)
+    for (const auto & rule : expected_rules)
     {
         parsed_grammar.rules.emplace_back();
         for (auto element : rule)