diff --git a/common/common.cpp b/common/common.cpp
index 35b22de0f..e938dee16 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,28 +1,20 @@
 #include "common.h"
-#include "ggml.h"
 #include "llama.h"
-#include "log.h"
-#include "sampling.h"
 
 #include <algorithm>
-#include <cctype>
-#include <chrono>
-#include <cinttypes>
+#include <cassert>
 #include <cmath>
-#include <cstdlib>
+#include <cstring>
 #include <ctime>
-#include <exception>
 #include <fstream>
 #include <iterator>
+#include <iostream>
 #include <regex>
 #include <sstream>
-#include <stdexcept>
 #include <string>
-#include <thread>
-#include <unordered_map>
 #include <unordered_set>
-#include <utility>
 #include <vector>
+#include <cinttypes>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -40,7 +32,9 @@
 #include <fcntl.h>
 #include <io.h>
 #else
+#include <sys/ioctl.h>
 #include <sys/stat.h>
+#include <unistd.h>
 #endif
 
 #if defined(_MSC_VER)
diff --git a/common/common.h b/common/common.h
index c763be8b0..72a49b890 100644
--- a/common/common.h
+++ b/common/common.h
@@ -3,18 +3,19 @@
 #pragma once
 
 #include "llama.h"
+
 #include "sampling.h"
 
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
 
 #include <cmath>
-#include <cstdint>
-#include <cstdio>
-#include <random>
 #include <string>
-#include <tuple>
 #include <vector>
+#include <random>
+#include <thread>
+#include <unordered_map>
+#include <tuple>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -67,7 +68,7 @@ struct gpt_params {
     int32_t yarn_orig_ctx                   = 0;    // YaRN original context length
     int8_t  rope_scaling_type               = LLAMA_ROPE_SCALING_UNSPECIFIED;
 
-    // sampling parameters
+    // sampling parameters
     struct llama_sampling_params sparams;
 
     std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
diff --git a/common/console.cpp b/common/console.cpp
index 69f1419f1..f65cbc6ed 100644
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -14,13 +14,14 @@
 #define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004
 #endif
 #else
-#include <clocale>
-#include <cstdio>
-#include <cwchar>
-
+#include <climits>
 #include <sys/ioctl.h>
-#include <termios.h>
 #include <unistd.h>
+#include <wchar.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <termios.h>
 #endif
 
 #define ANSI_COLOR_RED     "\x1b[31m"
diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp
index ef56e909d..ff51cc803 100644
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -1,5 +1,6 @@
 #include "grammar-parser.h"
 #include <cstdint>
+#include <cwchar>
 #include <string>
 #include <utility>
 #include <stdexcept>
diff --git a/common/grammar-parser.h b/common/grammar-parser.h
index b603764b3..9037d7272 100644
--- a/common/grammar-parser.h
+++ b/common/grammar-parser.h
@@ -10,14 +10,11 @@
 // space ::= [ \t\n]*
 
 #pragma once
-
 #include "llama.h"
-
-#include <cstdint>
-#include <cstdio>
-#include <map>
-#include <string>
 #include <vector>
+#include <map>
+#include <cstdint>
+#include <string>
 
 namespace grammar_parser {
     struct parse_state {
diff --git a/common/sampling.cpp b/common/sampling.cpp
index a5b684ee3..1317024c2 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,11 +1,5 @@
-#include "common.h"
 #include "sampling.h"
 
-#include <algorithm>
-#include <cstdio>
-#include <map>
-#include <utility>
-
 struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
     struct llama_sampling_context * result = new llama_sampling_context();
 
diff --git a/common/sampling.h b/common/sampling.h
index 03909efbc..7c9b8dcf2 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -1,12 +1,12 @@
 #pragma once
 
-#include "grammar-parser.h"
 #include "llama.h"
 
-#include <cstdint>
+#include "grammar-parser.h"
+
 #include <string>
-#include <unordered_map>
 #include <vector>
+#include <unordered_map>
 
 // sampling parameters
 typedef struct llama_sampling_params {
@@ -56,6 +56,8 @@ struct llama_sampling_context {
     std::vector<llama_token_data> cur;
 };
 
+#include "common.h"
+
 // Create a new sampling context instance.
 struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
 
diff --git a/common/train.cpp b/common/train.cpp
index d1f5505cb..bc15b7a03 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -1,19 +1,9 @@
-#include "common.h"
-#include "ggml.h"
-#include "llama.h"
 #include "train.h"
+#include "common.h"
 
-#include <algorithm>
-#include <cerrno>
-#include <cmath>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <functional>
-#include <locale>
 #include <random>
 #include <sstream>
+#include <functional>
 
 struct random_normal_distribution {
     std::mt19937 gen;
diff --git a/common/train.h b/common/train.h
index ccac6b7d6..d86c93cc4 100644
--- a/common/train.h
+++ b/common/train.h
@@ -2,14 +2,13 @@
 
 #pragma once
 
-#include "llama.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <random>
 #include <string>
+#include <random>
 #include <vector>
 
+#include "ggml.h"
+#include "llama.h"
+
 typedef std::string mt19937_state;
 
 struct train_state {
diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index 35404f9a3..8155101d0 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -1,13 +1,11 @@
 #include "ggml.h"
 #include "train.h"
 
-#include <algorithm>
+#include <vector>
 #include <cassert>
-#include <cmath>
-#include <cstdint>
-#include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <random>
 #include <vector>
 
 #if defined(_MSC_VER)
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 955d815cc..533c55c17 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -1,11 +1,10 @@
 #include "common.h"
-#include "ggml.h"
 #include "llama.h"
 
 #include <algorithm>
-#include <cstdint>
+#include <cmath>
 #include <cstdio>
-#include <cstdlib>
+#include <string>
 #include <vector>
 
 // mutates the input string
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index f91872798..22a4265df 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -1,11 +1,9 @@
 #include "common.h"
-#include "ggml.h"
 #include "llama.h"
 
 #include <algorithm>
-#include <cstdint>
+#include <cmath>
 #include <cstdio>
-#include <cstdlib>
 #include <string>
 #include <vector>
 
diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp
index cce2b9916..679b382e1 100644
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
@@ -1,14 +1,29 @@
 #include "common.h"
 #include "llama.h"
 
-#include <algorithm>
 #include <cassert>
+#include <cinttypes>
+#include <cmath>
 #include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
 #include <iostream>
 #include <string>
-#include <tuple>
 #include <vector>
 
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#include <signal.h>
+#endif
+
 // Used for debugging to print out beam tokens.
 struct ostream_beam_view {
     llama_context * ctx;
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 58bf9a814..76e3f57cc 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -1,11 +1,20 @@
 #include "common.h"
 #include "ggml.h"
 
-#include <cinttypes>
+#include <locale.h>
+#include <assert.h>
+#include <math.h>
+#include <cstring>
 #include <cstdio>
-#include <cstdlib>
+#include <cinttypes>
+#include <unordered_map>
+#include <queue>
+#include <string.h>
+#include <cassert>
+#include <fstream>
 #include <string>
-#include <vector>
+#include <iterator>
+#include <algorithm>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 7ecc15cb5..cae3bf3c3 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -1,22 +1,19 @@
-#include "common.h"
 #include "ggml.h"
 #include "llama.h"
+#include "common.h"
 
-#include <algorithm>
-#include <cassert>
-#include <cerrno>
-#include <climits>
-#include <cmath>
-#include <cstdarg>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <sstream>
-#include <string>
 #include <unordered_map>
-#include <utility>
 #include <vector>
+#include <cassert>
+#include <climits>
+#include <cstring>
+#include <cstdarg>
+#include <ctime>
+#include <random>
+#include <stdexcept>
+#include <sstream>
+#include <algorithm>
+#include <string>
 
 // GGUF keys & tensor names.
 
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 86f874a52..3295cd240 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,12 +1,7 @@
 #include "common.h"
 #include "llama.h"
 
-#include <algorithm>
-#include <cstdio>
 #include <ctime>
-#include <random>
-#include <tuple>
-#include <vector>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index a2406de5a..d803cfd5c 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -1,16 +1,11 @@
+
 #include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
 
-#include <algorithm>
-#include <cerrno>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
+#include <vector>
 #include <string>
 #include <thread>
-#include <vector>
 
 static const size_t tensor_alignment = 32;
 
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 710ddba96..649a3b7c1 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1,19 +1,19 @@
-#include "common.h"
-#include "ggml-alloc.h"
 #include "ggml.h"
+#include "ggml-alloc.h"
 #include "llama.h"
+#include "common.h"
 #include "train.h"
-
-#include <algorithm>
-#include <cerrno>
-#include <cmath>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <ctime>
-#include <string>
+#include <unordered_map>
 #include <vector>
+#include <cassert>
+#include <climits>
+#include <cstring>
+#include <cstdarg>
+#include <ctime>
+#include <random>
+#include <stdexcept>
+#include <algorithm>
+#include <string>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index f9caffacb..62f5ce3c1 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -1,17 +1,19 @@
 #include "common.h"
+
 #include "console.h"
 #include "llama.h"
-#include "sampling.h"
+#include "grammar-parser.h"
 
-#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
 #include <cstdio>
-#include <cstdlib>
+#include <cstring>
 #include <ctime>
 #include <fstream>
-#include <random>
+#include <iostream>
 #include <sstream>
 #include <string>
-#include <tuple>
 #include <vector>
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 0df6c4c45..9bd82d565 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1,30 +1,26 @@
-#include "ggml.h"
-#include "llama.h"
-#include "common.h"
-#include "ggml-cuda.h"
-
 #include <algorithm>
 #include <array>
 #include <cassert>
-#include <cctype>
 #include <chrono>
 #include <cinttypes>
 #include <clocale>
 #include <cmath>
 #include <cstdio>
-#include <cstdlib>
 #include <cstring>
 #include <ctime>
 #include <iterator>
 #include <map>
-#include <memory>
 #include <numeric>
 #include <regex>
 #include <sstream>
 #include <string>
-#include <utility>
 #include <vector>
 
+#include "ggml.h"
+#include "llama.h"
+#include "common.h"
+#include "ggml-cuda.h"
+
 // utils
 static uint64_t get_time_ns() {
     using clock = std::chrono::high_resolution_clock;
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 03a8e9c46..61932e659 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2,23 +2,21 @@
 // so there might be still unnecessary artifacts hanging around
 // I'll gradually clean and extend it
 
-#include "clip.h"
-#include "ggml-alloc.h"
-#include "ggml.h"
-
-#include <algorithm>
-#include <climits>
+#include <cassert>
 #include <cmath>
-#include <cstdarg>
-#include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
+#include <iostream>
+#include <map>
 #include <regex>
 #include <stdexcept>
-#include <string>
 #include <vector>
 
+#include "clip.h"
+#include "ggml.h"
+#include "ggml-alloc.h"
+
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
 
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index 106df0d1a..3d7261e29 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -1,8 +1,7 @@
 #ifndef CLIP_H
 #define CLIP_H
 
-#include <stddef.h>
-#include <stdint.h>
+#include "ggml.h"
 
 struct clip_ctx;
 
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 2c5a86398..f0974d5bc 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -1,14 +1,11 @@
 #include "clip.h"
-#include "common.h"
-#include "ggml.h"
-#include "llama.h"
 #include "llava-utils.h"
+#include "common.h"
+#include "llama.h"
 
-#include <cstdint>
 #include <cstdio>
 #include <cstdlib>
-#include <cstring>
-#include <string>
+#include <vector>
 
 static void show_additional_info(int /*argc*/, char ** argv) {
     printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index ce68efae4..8d985c82a 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -1,17 +1,18 @@
 #include "common.h"
+
 #include "console.h"
 #include "llama.h"
-#include "sampling.h"
 
-#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
 #include <cstdio>
-#include <cstdlib>
+#include <cstring>
 #include <ctime>
 #include <fstream>
-#include <random>
+#include <iostream>
 #include <sstream>
 #include <string>
-#include <tuple>
 #include <vector>
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 9b107e5f1..a78df305f 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -2,20 +2,13 @@
 // The clients submite requests to the server and they are processed in parallel.
 
 #include "common.h"
-#include "ggml.h"
 #include "llama.h"
-#include "sampling.h"
 
-#include <algorithm>
-#include <cctype>
-#include <cstdint>
+#include <cmath>
 #include <cstdio>
-#include <cstdlib>
-#include <ctime>
-#include <istream>
 #include <string>
-#include <tuple>
 #include <vector>
+#include <ctime>
 
 // trim whitespace from the beginning and end of a string
 static std::string trim(const std::string & str) {
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index f8e8f8fc5..de60c5227 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1,20 +1,13 @@
 #include "common.h"
 #include "llama.h"
 
-#include <algorithm>
-#include <chrono>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <iterator>
-#include <mutex>
-#include <random>
 #include <sstream>
-#include <string>
 #include <thread>
-#include <tuple>
-#include <utility>
+#include <mutex>
 #include <vector>
 
 #if defined(_MSC_VER)
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index d5c3c3591..271282477 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -8,16 +8,15 @@
 #include <cinttypes>
 #include <cmath>
 #include <cstdio>
-#include <cstdlib>
 #include <cstring>
-#include <iterator>
-#include <mutex>
+#include <map>
 #include <numeric>
 #include <regex>
 #include <string>
-#include <thread>
-#include <utility>
+#include <unordered_map>
 #include <vector>
+#include <thread>
+#include <mutex>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index ce6863a2b..d27ea5e91 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -1,14 +1,10 @@
 #include "common.h"
 #include "llama.h"
 
-#include <cctype>
-#include <cstdint>
 #include <cstdio>
-#include <cstdlib>
 #include <cstring>
-#include <exception>
-#include <string>
 #include <vector>
+#include <string>
 
 struct quant_option {
     std::string name;
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 622101449..48d801110 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -1,11 +1,9 @@
 #include "common.h"
 #include "llama.h"
 
-#include <cstdint>
-#include <cstdio>
-#include <string>
-#include <tuple>
 #include <vector>
+#include <cstdio>
+#include <chrono>
 
 int main(int argc, char ** argv) {
     gpt_params params;
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0da14c9b7..fd755327a 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,7 +1,6 @@
 #include "common.h"
-#include "ggml.h"
 #include "llama.h"
-#include "sampling.h"
+#include "grammar-parser.h"
 
 #include "../llava/clip.h"
 
@@ -21,28 +20,10 @@
 #include "completion.js.hpp"
 #include "json-schema-to-grammar.mjs.hpp"
 
-#include <algorithm>
-#include <cctype>
-#include <chrono>
-#include <cmath>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <ctime>
-#include <exception>
-#include <functional>
-#include <istream>
-#include <iterator>
-#include <mutex>
-#include <numeric>
-#include <stdexcept>
-#include <string>
+#include <cstddef>
 #include <thread>
-#include <tuple>
-#include <unordered_map>
-#include <utility>
-#include <vector>
+#include <mutex>
+#include <chrono>
 
 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 52b0de48f..374aef6f1 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -1,8 +1,9 @@
 #include "common.h"
-#include "ggml.h"
 #include "llama.h"
 
+#include <cmath>
 #include <cstdio>
+#include <string>
 #include <vector>
 
 int main(int argc, char ** argv) {
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 3e65d7e6e..798684f66 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -1,13 +1,9 @@
 #include "common.h"
-#include "ggml.h"
 #include "llama.h"
-#include "sampling.h"
 
-#include <algorithm>
+#include <cmath>
 #include <cstdio>
-#include <cstring>
 #include <string>
-#include <tuple>
 #include <vector>
 
 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  100
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index 42e5c6b0f..2a257e632 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1,18 +1,19 @@
-#include "common.h"
-#include "ggml-alloc.h"
 #include "ggml.h"
-#include "llama.h"
+#include "ggml-alloc.h"
+#include "common.h"
 #include "train.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdint>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <ctime>
-#include <string>
+#include "llama.h"
+#include <unordered_map>
 #include <vector>
+#include <cassert>
+#include <climits>
+#include <cstring>
+#include <cstdarg>
+#include <ctime>
+#include <random>
+#include <stdexcept>
+#include <algorithm>
+#include <string>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
diff --git a/ggml-alloc.c b/ggml-alloc.c
index c9fd6e54d..34eba3f83 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -1,9 +1,8 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 #include "ggml.h"
-
 #include <assert.h>
-#include <stdint.h>
+#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
diff --git a/ggml-alloc.h b/ggml-alloc.h
index 4ebba6a6b..e38758878 100644
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@@ -2,9 +2,6 @@
 
 #include "ggml.h"
 
-#include <stdbool.h>
-#include <stddef.h>
-
 #ifdef  __cplusplus
 extern "C" {
 #endif
diff --git a/ggml-backend.c b/ggml-backend.c
index 89af304d3..ca8d83daf 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1,5 +1,8 @@
 #include "ggml-backend.h"
+#include "ggml-alloc.h"
 
+#include <assert.h>
+#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
diff --git a/ggml-backend.h b/ggml-backend.h
index 12618036b..da134b0db 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -2,9 +2,6 @@
 
 #include "ggml.h"
 
-#include <stdbool.h>
-#include <stddef.h>
-
 #ifdef  __cplusplus
 extern "C" {
 #endif
diff --git a/ggml-impl.h b/ggml-impl.h
index 8a9fb7388..5ec18a50c 100644
--- a/ggml-impl.h
+++ b/ggml-impl.h
@@ -1,9 +1,9 @@
 #pragma once
 
-// GGML internal header
-
 #include "ggml.h"
 
+// GGML internal header
+
 #include <assert.h>
 #include <stddef.h>
 #include <stdbool.h>
diff --git a/ggml-quants.c b/ggml-quants.c
index 39f2c27b3..740be6dc5 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1,11 +1,10 @@
 #include "ggml-quants.h"
 #include "ggml-impl.h"
 
+#include <math.h>
+#include <string.h>
 #include <assert.h>
 #include <float.h>
-#include <math.h>
-#include <stdbool.h>
-#include <string.h>
 
 #ifdef __ARM_NEON
 
diff --git a/ggml-quants.h b/ggml-quants.h
index f782d54c8..70c12c274 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -1,11 +1,11 @@
 #pragma once
 
+#include "ggml-impl.h"
+
 // GGML internal header
 
-#include "ggml.h"
-
-#include <assert.h>
 #include <stdint.h>
+#include <stddef.h>
 
 #define QK4_0 32
 typedef struct {
diff --git a/ggml.c b/ggml.c
index fca45ad14..605a27940 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1,7 +1,6 @@
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 #define _USE_MATH_DEFINES // For M_PI on MSVC
 
-#include "ggml.h"
 #include "ggml-impl.h"
 #include "ggml-quants.h"
 
@@ -21,7 +20,9 @@
 #include <inttypes.h>
 #include <stdio.h>
 #include <float.h>
+#include <limits.h>
 #include <stdarg.h>
+#include <signal.h>
 
 #ifdef GGML_USE_METAL
 #include <unistd.h>
@@ -84,14 +85,15 @@ static int sched_yield (void) {
     return 0;
 }
 #else
-
 #include <pthread.h>
-#include <sched.h>
 #include <stdatomic.h>
-#include <sys/stat.h>
 
 typedef void * thread_ret_t;
 
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
 #endif
 
 #ifdef GGML_USE_CPU_HBM
diff --git a/ggml.h b/ggml.h
index 5b27b7ad2..70eb25a6b 100644
--- a/ggml.h
+++ b/ggml.h
@@ -300,6 +300,7 @@ extern "C" {
     GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
     GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
 
+    struct ggml_object;
     struct ggml_context;
 
     enum ggml_type {
diff --git a/llama.cpp b/llama.cpp
index a6353fcec..518aa5b98 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -51,34 +51,29 @@
 #include <algorithm>
 #include <array>
 #include <cassert>
-#include <cerrno>
 #include <cinttypes>
 #include <climits>
 #include <cmath>
 #include <cstdarg>
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
-#include <cstdlib>
 #include <cstring>
 #include <ctime>
-#include <exception>
 #include <forward_list>
 #include <fstream>
 #include <functional>
 #include <initializer_list>
-#include <iterator>
-#include <limits>
 #include <map>
 #include <memory>
 #include <mutex>
 #include <numeric>
 #include <queue>
 #include <random>
+#include <regex>
 #include <set>
 #include <sstream>
-#include <stdexcept>
 #include <thread>
-#include <type_traits>
 #include <unordered_map>
 
 #if defined(_MSC_VER)
diff --git a/llama.h b/llama.h
index dd16407c8..3f1becd76 100644
--- a/llama.h
+++ b/llama.h
@@ -2,14 +2,12 @@
 #define LLAMA_H
 
 #include "ggml.h"
-
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
 #else
 #define LLAMA_MAX_DEVICES 1
 #endif // GGML_USE_CUBLAS
-
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -761,9 +759,8 @@ extern "C" {
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef LLAMA_API_INTERNAL
 
-#include <string>
-#include <utility>
 #include <vector>
+#include <string>
 
 struct ggml_tensor;
 
diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp
index 05e85ee30..111770d55 100644
--- a/pocs/vdot/q8dot.cpp
+++ b/pocs/vdot/q8dot.cpp
@@ -1,13 +1,16 @@
-#include "ggml.h"
-
-#include <algorithm>
-#include <chrono>
-#include <cmath>
-#include <cstdint>
 #include <cstdio>
-#include <cstdlib>
-#include <random>
+#include <type_traits>
 #include <vector>
+#include <random>
+#include <chrono>
+#include <cstdlib>
+#include <cmath>
+#include <cassert>
+#include <cstring>
+#include <array>
+#include <type_traits>
+
+#include <ggml.h>
 
 constexpr int kVecSize = 1 << 16;
 
diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp
index 5b0ebb706..e96372c4b 100644
--- a/pocs/vdot/vdot.cpp
+++ b/pocs/vdot/vdot.cpp
@@ -1,15 +1,14 @@
-#include "ggml.h"
-
-#include <algorithm>
-#include <cassert>
-#include <chrono>
-#include <cmath>
-#include <cstdint>
 #include <cstdio>
-#include <cstdlib>
-#include <random>
-#include <utility>
 #include <vector>
+#include <random>
+#include <chrono>
+#include <cstdlib>
+#include <cmath>
+#include <cassert>
+#include <cstring>
+#include <array>
+
+#include <ggml.h>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp
index 49a92ed23..0a559b27a 100644
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
@@ -1,11 +1,10 @@
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 #include "ggml.h"
 
-#include <cassert>
 #include <cmath>
-#include <cstdint>
 #include <cstdio>
 #include <cstdlib>
+#include <cassert>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp
index b4d825b62..a0b5b043d 100644
--- a/tests/test-grammar-parser.cpp
+++ b/tests/test-grammar-parser.cpp
@@ -2,16 +2,10 @@
 #undef NDEBUG
 #endif
 
-#include "grammar-parser.h"
 #include "llama.h"
+#include "grammar-parser.h"
 
 #include <cassert>
-#include <cstdint>
-#include <cstdio>
-#include <map>
-#include <string>
-#include <utility>
-#include <vector>
 
 int main()
 {
diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp
index cbbe95bd3..73dd33dd2 100644
--- a/tests/test-llama-grammar.cpp
+++ b/tests/test-llama-grammar.cpp
@@ -6,12 +6,6 @@
 #include "grammar-parser.h"
 
 #include <cassert>
-#include <cstdint>
-#include <cstdio>
-#include <map>
-#include <string>
-#include <utility>
-#include <vector>
 
 int main()
 {
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
index 3b88b4989..a2459a286 100644
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -2,9 +2,10 @@
 
 #include "ggml.h"
 
-#include <cmath>
-#include <cstdint>
-#include <cstdio>
+#undef NDEBUG
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
 #include <string>
 #include <vector>
 
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index b4260e548..88fac0e23 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -2,12 +2,14 @@
 
 #include "ggml.h"
 
+#undef NDEBUG
 #include <algorithm>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
+#include <assert.h>
 #include <functional>
+#include <inttypes.h>
+#include <math.h>
 #include <memory>
+#include <stdio.h>
 #include <string>
 #include <vector>
 
diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp
index 2fda30ccb..26c1f42dc 100644
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@@ -1,10 +1,9 @@
 #include "ggml.h"
 
-#include <cassert>
 #include <cmath>
-#include <cstdint>
 #include <cstdio>
 #include <cstdlib>
+#include <cassert>
 #include <vector>
 
 #if defined(_MSC_VER)
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 63ae06d30..32e58941c 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -1,9 +1,15 @@
 #include "ggml.h"
 #include "llama.h"
 
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
 #include <cmath>
-#include <cstdio>
+#include <numeric>
+#include <cassert>
 #include <vector>
+#include <algorithm>
 
 static void dump(const llama_token_data_array * candidates) {
     for (size_t i = 0; i < candidates->size; i++) {
diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp
index d2f983baa..a4e9d2b91 100644
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@@ -1,14 +1,12 @@
+#include "llama.h"
 #include "common.h"
 #include "console.h"
-#include "llama.h"
 
 #include <cstdio>
-#include <fstream>
-#include <iterator>
-#include <map>
 #include <string>
-#include <utility>
+#include <map>
 #include <vector>
+#include <fstream>
 
 // generate using test-tokenizer-0-falcon.py
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp
index c00a668de..39c8d188c 100644
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@@ -1,14 +1,12 @@
+#include "llama.h"
 #include "common.h"
 #include "console.h"
-#include "llama.h"
 
 #include <cstdio>
-#include <fstream>
-#include <iterator>
-#include <map>
 #include <string>
-#include <utility>
+#include <map>
 #include <vector>
+#include <fstream>
 
 // generate using test-tokenizer-0-llama.py
 static const std::map<std::string, std::vector<llama_token>> & k_tests() {
diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp
index cc924da5c..386530f23 100644
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -1,14 +1,16 @@
-#include "common.h"
-#include "console.h"
-#include "ggml.h"
 #include "llama.h"
+#include "common.h"
 #include "unicode.h"
+#include "console.h"
 
-#include <cstdint>
+#include <cassert>
 #include <cstdio>
-#include <stdexcept>
+#include <cstring>
 #include <string>
+#include <codecvt>
+#include <map>
 #include <vector>
+#include <locale>
 
 int main(int argc, char **argv) {
     if (argc < 2) {
diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp
index 95832f899..4b58fe495 100644
--- a/tests/test-tokenizer-1-llama.cpp
+++ b/tests/test-tokenizer-1-llama.cpp
@@ -1,13 +1,16 @@
-#include "common.h"
-#include "console.h"
-#include "ggml.h"
 #include "llama.h"
+#include "common.h"
 #include "unicode.h"
+#include "console.h"
 
-#include <cstdint>
+#include <cassert>
 #include <cstdio>
+#include <cstring>
 #include <string>
+#include <codecvt>
+#include <map>
 #include <vector>
+#include <locale>
 
 int main(int argc, char **argv) {
     if (argc < 2) {