From 63fd76fbb06f9b723ca11505352387a3148b1814 Mon Sep 17 00:00:00 2001
From: uint256_t
Date: Tue, 14 Mar 2023 01:33:43 +0900
Subject: [PATCH 1/9] Reduce model loading time (#43)

* Use buffering

* Use vector

* Minor

---------

Co-authored-by: Georgi Gerganov
---
 main.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/main.cpp b/main.cpp
index d068761e3..ee0952f74 100644
--- a/main.cpp
+++ b/main.cpp
@@ -87,7 +87,10 @@ struct llama_model {
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
+    std::vector<char> f_buf(1024*1024);
+
     auto fin = std::ifstream(fname, std::ios::binary);
+    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
         return false;
@@ -325,6 +328,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
 
         fin = std::ifstream(fname_part, std::ios::binary);
+        fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
         fin.seekg(file_offset);
 
         // load weights
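
[Note on PATCH 1; not part of the patch] The speed-up comes from handing the std::ifstream a large user-supplied buffer through pubsetbuf(), which in practice has to be installed before the first read and has to outlive the stream. A minimal standalone sketch of the same pattern, assuming a C++11 compiler and a hypothetical file name "model.bin" (both are illustrative, not taken from the patch):

// buffered_read_sketch.cpp: illustrative only, mirrors the pubsetbuf() pattern of PATCH 1
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <vector>

int main() {
    // the buffer has to stay alive for as long as the stream uses it
    std::vector<char> buf(1024*1024);

    auto fin = std::ifstream("model.bin", std::ios::binary); // hypothetical file name
    fin.rdbuf()->pubsetbuf(buf.data(), buf.size());          // install before any read
    if (!fin) {
        fprintf(stderr, "failed to open model.bin\n");
        return 1;
    }

    // subsequent small reads are served from the large buffer instead of tiny file reads
    int32_t magic = 0;
    fin.read(reinterpret_cast<char *>(&magic), sizeof(magic));
    fprintf(stdout, "first 4 bytes: 0x%08x\n", (unsigned) magic);
    return 0;
}
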
From 84d9015c4a91ab586ba65d5bd31a8482baf46ba1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 13 Mar 2023 18:36:44 +0200
Subject: [PATCH 2/9] Use vdotq_s32 to improve performance (#67)

* 10% performance boost on ARM

* Back to original change
---
 ggml.c | 32 +++++++++----------------------
 1 file changed, 9 insertions(+), 23 deletions(-)

diff --git a/ggml.c b/ggml.c
index fbd7b9339..7b0b456f8 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1360,34 +1360,20 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
         const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);
 
         // dot product into int16x8_t
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
+        // assume that vdotq_s32 is always available, if not, should check for __ARM_FEATURE_DOTPROD
+        int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
+        int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);
 
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
-
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
-
-        const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h);
-        const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h);
-
-        const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h);
-        const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h);
-
-        const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
-        const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);
+        p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);
+        p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);
 
         // scalar
 #if defined(__ARM_FEATURE_QRDMX)
-        sum0 += d0_0*d1_0*vaddvq_s16(p_0);
-        sum1 += d0_1*d1_1*vaddvq_s16(p_1);
+        sum0 += d0_0*d1_0*vaddvq_s32(p_0);
+        sum1 += d0_1*d1_1*vaddvq_s32(p_1);
 #else
-        sum0 += d0_0*d1_0*(vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));
-        sum1 += d0_1*d1_1*(vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));
+        sum0 += d0_0*d1_0*(vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3));
+        sum1 += d0_1*d1_1*(vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
 #endif
     }
 
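
[Note on PATCH 2; not part of the patch] vdotq_s32() only exists on Arm cores that implement the dot-product extension, which is what the in-code comment about __ARM_FEATURE_DOTPROD is warning about. A hedged sketch of how the accumulation could be guarded so that older NEON targets keep a vmull_s8-based fallback (the helper name is illustrative); the fallback distributes the partial sums across lanes differently than vdotq_s32, but the across-lane total, which is what the surrounding code reduces with vaddvq_s32, is the same:

// dot_guard_sketch.c: illustrative only, for AArch64 NEON targets
#include <arm_neon.h>

// accumulate the dot product of two packs of 16 signed bytes into 32-bit lanes
static inline int32x4_t dot_i8x16(int32x4_t acc, int8x16_t a, int8x16_t b) {
#if defined(__ARM_FEATURE_DOTPROD)
    // one instruction per 16 bytes on cores with the dot-product extension
    return vdotq_s32(acc, a, b);
#else
    // widen to 16-bit products, then pairwise-add into 32-bit lanes
    const int16x8_t lo = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
    const int16x8_t hi = vmull_s8(vget_high_s8(a), vget_high_s8(b));
    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(lo), vpaddlq_s16(hi)));
#endif
}
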
From 671d5cac15241b495006f56482bf2d6967dca91f Mon Sep 17 00:00:00 2001
From: Pavol Rusnak
Date: Mon, 13 Mar 2023 17:39:56 +0100
Subject: [PATCH 3/9] Use fprintf for diagnostic output (#48)

keep printf only for printing model output

one can now use ./main ... 2>dev/null to suppress any diagnostic output
---
 main.cpp | 92 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 46 insertions(+), 46 deletions(-)

diff --git a/main.cpp b/main.cpp
index ee0952f74..c96f9edc9 100644
--- a/main.cpp
+++ b/main.cpp
@@ -85,7 +85,7 @@ struct llama_model {
 
 // load the model's weights from a file
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     std::vector<char> f_buf(1024*1024);
 
@@ -127,16 +127,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
         n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
 
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_mult  = %d\n", __func__, hparams.n_mult);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
-        printf("%s: f16     = %d\n", __func__, hparams.f16);
-        printf("%s: n_ff    = %d\n", __func__, n_ff);
-        printf("%s: n_parts = %d\n", __func__, n_parts);
+        fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd  = %d\n", __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult  = %d\n", __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head  = %d\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot   = %d\n", __func__, hparams.n_rot);
+        fprintf(stderr, "%s: f16     = %d\n", __func__, hparams.f16);
+        fprintf(stderr, "%s: n_ff    = %d\n", __func__, n_ff);
+        fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
     }
 
     // load vocab
@@ -161,7 +161,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             vocab.id_to_token[i] = word;
 
             //if (i < 30000) {
-            //    printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
+            //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
             //}
         }
     }
@@ -220,7 +220,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
         ctx_size += (5 + 10*n_layer)*256; // object overhead
 
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
 
     // create the ggml context
@@ -307,7 +307,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
 
-        printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
+        fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
     }
 
     const size_t file_offset = fin.tellg();
@@ -325,7 +325,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             fname_part += "." + std::to_string(i);
         }
 
-        printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
+        fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
 
         fin = std::ifstream(fname_part, std::ios::binary);
         fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
@@ -336,7 +336,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         int n_tensors = 0;
         size_t total_size = 0;
 
-        printf("%s: ", __func__);
+        fprintf(stderr, "%s: ", __func__);
 
         while (true) {
             int32_t n_dims;
@@ -436,7 +436,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
             if (0) {
                 static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                printf("%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
+                fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
             }
 
             size_t bpe = 0;
@@ -499,16 +499,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
                 total_size += ggml_nbytes(tensor)/n_parts;
             }
 
-            //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+            //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
             if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
+                fprintf(stderr, ".");
+                fflush(stderr);
             }
         }
 
-        printf(" done\n");
+        fprintf(stderr, " done\n");
 
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+        fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
     }
 
     fin.close();
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); if (++n_tensors % 8 == 0) { - printf("."); - fflush(stdout); + fprintf(stderr, "."); + fflush(stderr); } } - printf(" done\n"); + fprintf(stderr, " done\n"); - printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); + fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); } fin.close(); @@ -552,7 +552,7 @@ bool llama_eval( if (mem_per_token > 0 && mem_per_token*N > buf_size) { const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead - //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); // reallocate buf_size = buf_size_new; @@ -744,7 +744,7 @@ bool llama_eval( if (mem_per_token == 0) { mem_per_token = ggml_used_mem(ctx0)/N; } - //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0)); ggml_free(ctx0); @@ -780,7 +780,7 @@ int main(int argc, char ** argv) { params.seed = time(NULL); } - printf("%s: seed = %d\n", __func__, params.seed); + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.prompt.empty()) { @@ -822,13 +822,13 @@ int main(int argc, char ** argv) { // tokenize the reverse prompt std::vector antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false); - printf("\n"); - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + fprintf(stderr, "\n"); + fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); for (int i = 0; i < (int) embd_inp.size(); i++) { - printf("%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str()); } - printf("\n"); + fprintf(stderr, "\n"); if (params.interactive) { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; @@ -838,19 +838,19 @@ int main(int argc, char ** argv) { sigaction(SIGINT, &sigint_action, NULL); #endif - printf("%s: interactive mode on.\n", __func__); + fprintf(stderr, "%s: interactive mode on.\n", __func__); if(antiprompt_inp.size()) { - printf("%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str()); - printf("%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size()); + fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str()); + fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size()); for (int i = 0; i < (int) antiprompt_inp.size(); i++) { - printf("%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str()); + fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str()); } - printf("\n"); + fprintf(stderr, "\n"); } } - printf("sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); - printf("\n\n"); + fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, 
@@ -864,7 +864,7 @@ int main(int argc, char ** argv) {
 
 
     if (params.interactive) {
-        printf("== Running in interactive mode. ==\n"
+        fprintf(stderr, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
                " - Press Ctrl+C to interject at any time.\n"
 #endif
@@ -892,7 +892,7 @@ int main(int argc, char ** argv) {
             const int64_t t_start_us = ggml_time_us();
 
             if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
+                fprintf(stderr, "Failed to predict\n");
                 return 1;
             }
 
@@ -1005,7 +1005,7 @@ int main(int argc, char ** argv) {
 
         // end of text token
         if (embd.back() == 2) {
-            printf(" [end of text]\n");
+            fprintf(stderr, " [end of text]\n");
             break;
        }
     }
 
@@ -1015,12 +1015,12 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_main_end_us = ggml_time_us();
 
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+        fprintf(stderr, "\n\n");
+        fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        fprintf(stderr, "%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        fprintf(stderr, "%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        fprintf(stderr, "%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }
 
     ggml_free(model.ctx);
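
[Note on PATCH 3; not part of the patch] The point of the change is that generated text keeps going to stdout while every diagnostic goes to stderr, so the shell can separate the two streams: ./main ... 2>/dev/null discards the diagnostics, 2>load.log keeps them in a file (the log file name is illustrative), and piping stdout stays clean either way. A minimal sketch of the convention:

// streams_sketch.cpp: illustrative only
#include <cstdio>

int main() {
    fprintf(stderr, "loading model ...\n");     // diagnostic: shown on the terminal, easy to discard
    fprintf(stdout, "Hello from the model\n");  // payload: survives 2>/dev/null and pipes
    fflush(stdout);
    return 0;
}
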
From 41be0a3b3d76ee4f254dc81b42bd8ed26ee324e7 Mon Sep 17 00:00:00 2001
From: Thomas Klausner
Date: Mon, 13 Mar 2023 17:40:54 +0100
Subject: [PATCH 4/9] Add NetBSD support. (#90)

---
 Makefile  | 4 ++++
 ggml.c    | 2 +-
 utils.cpp | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 8388c290d..1601079a4 100644
--- a/Makefile
+++ b/Makefile
@@ -48,6 +48,10 @@ ifeq ($(UNAME_S),FreeBSD)
     CFLAGS   += -pthread
     CXXFLAGS += -pthread
 endif
+ifeq ($(UNAME_S),NetBSD)
+    CFLAGS   += -pthread
+    CXXFLAGS += -pthread
+endif
 ifeq ($(UNAME_S),Haiku)
     CFLAGS   += -pthread
     CXXFLAGS += -pthread
diff --git a/ggml.c b/ggml.c
index 7b0b456f8..58a4c9b6d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2,7 +2,7 @@
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__)
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
 #include <alloca.h>
 #endif
 
diff --git a/utils.cpp b/utils.cpp
index b340bd61b..54217f02f 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -11,7 +11,7 @@
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__)
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
 #include <alloca.h>
 #endif
 
From ed6849cc07a8973e5d31947b9df2df2da975ac96 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebasti=C3=A1n=20A?=
Date: Mon, 13 Mar 2023 14:12:33 -0300
Subject: [PATCH 5/9] Initial support for CMake (#75)

---
 CMakeLists.txt | 123 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..04ee2bc19
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,123 @@
+cmake_minimum_required(VERSION 3.8)
+project("llama.cpp")
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED true)
+set(CMAKE_C_STANDARD 11)
+
+option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
+option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
+
+option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
+option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
+option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
+
+if (APPLE)
+    option(LLAMA_NO_ACCELERATE "llama: disable Accelerate framework" OFF)
+    option(LLAMA_NO_AVX "llama: disable AVX" OFF)
+    option(LLAMA_NO_AVX2 "llama: disable AVX2" OFF)
+    option(LLAMA_NO_FMA "llama: disable FMA" OFF)
+endif()
+
+if (NOT MSVC)
+    if (LLAMA_SANITIZE_THREAD)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
+    endif()
+
+    if (LLAMA_SANITIZE_ADDRESS)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+    endif()
+
+    if (LLAMA_SANITIZE_UNDEFINED)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
+    endif()
+endif()
+
+if (APPLE AND NOT LLAMA_NO_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
+        set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
+endif()
+
+if (LLAMA_ALL_WARNINGS)
+    if (NOT MSVC)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
+            -Wall                           \
+            -Wextra                         \
+            -Wpedantic                      \
+            -Wshadow                        \
+            -Wcast-qual                     \
+            -Wstrict-prototypes             \
+            -Wpointer-arith                 \
+            -Wno-unused-function            \
+        ")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
+            -Wall                               \
+            -Wextra                             \
+            -Wpedantic                          \
+            -Wcast-qual                         \
+        ")
+    else()
+        # todo : msvc
+    endif()
+endif()
+
+message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+
+if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+    message(STATUS "ARM detected")
+else()
+    message(STATUS "x86 detected")
+    if (MSVC)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
+        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
+    else()
+        if(NOT LLAMA_NO_AVX)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
+        endif()
+        if(NOT LLAMA_NO_AVX2)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
+        endif()
+        if(NOT LLAMA_NO_FMA)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
+        endif()
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
+    endif()
+endif()
+
+# if (LLAMA_PERF)
+#     set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_PERF)
+# endif()
+
+add_executable(llama
+    main.cpp
+    utils.cpp
+    utils.h)
+
+add_executable(quantize
+    quantize.cpp
+    utils.cpp
+    utils.h)
+
+add_library(ggml
+    ggml.c
+    ggml.h)
+
+target_compile_definitions(ggml PUBLIC ${LLAMA_EXTRA_FLAGS})
+target_compile_definitions(llama PUBLIC ${LLAMA_EXTRA_FLAGS})
+target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS})
+
+target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
+target_include_directories(ggml PUBLIC .)
+target_link_libraries(quantize PRIVATE ggml)
+target_link_libraries(llama PRIVATE ggml)
From 4497ad819c0010a8b19ffeaf8c0428eb7558d3e0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 13 Mar 2023 19:15:08 +0200
Subject: [PATCH 6/9] Print system information

---
 main.cpp | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/main.cpp b/main.cpp
index c96f9edc9..6dc9ae980 100644
--- a/main.cpp
+++ b/main.cpp
@@ -765,6 +765,26 @@ void sigint_handler(int signo) {
 }
 #endif
 
+const char * llama_print_system_info(void) {
+    static std::string s;
+
+    s = "";
+    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
+    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
+    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
+    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
+    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
+    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
+    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+    s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
+    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+
+    return s.c_str();
+}
+
 int main(int argc, char ** argv) {
     ggml_time_init();
     const int64_t t_main_start_us = ggml_time_us();
@@ -807,6 +827,13 @@ int main(int argc, char ** argv) {
         t_load_us = ggml_time_us() - t_start_us;
     }
 
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+
     int n_past = 0;
 
     int64_t t_sample_us = 0;
@@ -834,7 +861,7 @@ int main(int argc, char ** argv) {
         struct sigaction sigint_action;
         sigint_action.sa_handler = sigint_handler;
         sigemptyset (&sigint_action.sa_mask);
-        sigint_action.sa_flags = 0; 
+        sigint_action.sa_flags = 0;
         sigaction(SIGINT, &sigint_action, NULL);
 #endif
 
@@ -967,7 +994,7 @@ int main(int argc, char ** argv) {
                 is_interacting = true;
             }
             if (is_interacting) {
-                // currently being interactive 
+                // currently being interactive
                 bool another_line=true;
                 while (another_line) {
                     fflush(stdout);
@@ -999,7 +1026,7 @@ int main(int argc, char ** argv) {
                     input_noecho = true; // do not echo this again
                 }
 
-                is_interacting = false; 
+                is_interacting = false;
             }
         }
 
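
[Note on PATCH 6; not part of the patch] llama_print_system_info() returns s.c_str() where s is a function-local static std::string, so the pointer remains valid after the function returns and can be passed straight to fprintf, but the underlying buffer is rebuilt on every call and shared between callers; results should be printed or copied right away, and the function is not safe to call concurrently from several threads. A small self-contained sketch of the same idiom (names are illustrative):

// static_string_return_sketch.cpp: illustrative only, same idiom as llama_print_system_info()
#include <cstdio>
#include <string>

// returns a pointer into a function-local static buffer, like PATCH 6 does
static const char * describe(int value) {
    static std::string s;
    s = "value = " + std::to_string(value);
    return s.c_str();
}

int main() {
    fprintf(stderr, "%s\n", describe(1));   // safe: consumed before the next call

    const std::string kept = describe(2);   // copying is the safe way to keep the text around
    fprintf(stderr, "%s\n", describe(3));   // this call overwrites the shared buffer
    fprintf(stderr, "kept earlier: %s\n", kept.c_str());
    return 0;
}
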
From 7ec903d3c162417c11463f14ad5b773a918fb7f1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 13 Mar 2023 19:21:51 +0200
Subject: [PATCH 7/9] Update contribution section, hot topics, limitations, etc.

---
 README.md | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 65be1a687..e936282f4 100644
--- a/README.md
+++ b/README.md
@@ -5,11 +5,6 @@
 
 Inference of [Facebook's LLaMA](https://github.com/facebookresearch/llama) model in pure C/C++
 
-**Hot topics**
-
-- Running on Windows: https://github.com/ggerganov/llama.cpp/issues/22
-- Fix Tokenizer / Unicode support: https://github.com/ggerganov/llama.cpp/issues/11
-
 ## Description
 
 The main goal is to run the model using 4-bit quantization on a MacBook
@@ -23,14 +18,14 @@
 
 This was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022) - I have no idea if it works correctly.
 Please do not make conclusions about the models based on the results from this implementation.
-For all I know, it can be completely wrong. This project is for educational purposes and is not going to be maintained properly.
-New features will probably be added mostly through community contributions, if any.
+For all I know, it can be completely wrong. This project is for educational purposes.
+New features will probably be added mostly through community contributions.
 
 Supported platforms:
 
 - [X] Mac OS
 - [X] Linux
-- [ ] Windows (soon)
+- [X] Windows (via CMake)
 
 ---
 
@@ -179,10 +174,6 @@ Note the use of `--color` to distinguish between user input and generated text.
 
 ## Limitations
 
-- Not sure if my tokenizer is correct. There are a few places where we might have a mistake:
-    - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/convert-pth-to-ggml.py#L79-L87
-    - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/utils.h#L65-L69
-  In general, it seems to work, but I think it fails for unicode character support. Hopefully, someone can help with that
 - I don't know yet how much the quantization affects the quality of the generated text
 - Probably the token sampling can be improved
 - The Accelerate framework is actually currently unused since I found that for tensor shapes typical for the Decoder,
@@ -192,16 +183,15 @@
 ### Contributing
 
-- There are 2 git branches: [master](https://github.com/ggerganov/llama.cpp/commits/master) and [dev](https://github.com/ggerganov/llama.cpp/commits/dev)
-- Contributors can open PRs to either one
-- Collaborators can push straight into `dev`, but need to open a PR to get stuff to `master`
+- Contributors can open PRs
+- Collaborators can push to branches in the `llama.cpp` repo
 - Collaborators will be invited based on contributions
-- `dev` branch is considered unstable
-- `master` branch is considered stable and approved. 3-rd party projects should use the `master` branch
 
-General principles to follow when writing code:
+### Coding guide-lines
 
 - Avoid adding third-party dependencies, extra files, extra headers, etc.
 - Always consider cross-compatibility with other operating systems and architectures
 - Avoid fancy looking modern STL constructs, use basic for loops, avoid templates, keep it simple
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
+- Clean-up any tailing whitespaces, use 4 spaces indentation, brackets on same line, `int * var`
+- Look at the [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks
 

From c09a9cfb06c87d114615c105adda91b0e6273b69 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 13 Mar 2023 21:22:15 +0200
Subject: [PATCH 8/9] CMake build in Release by default (#75)

---
 CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04ee2bc19..ca3be38a5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,11 @@ set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 
+if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
 option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
 option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
 

From 2f700a27381e558a4eb5a3f8fd56757f4c7a417c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebasti=C3=A1n=20A?=
Date: Mon, 13 Mar 2023 17:29:10 -0300
Subject: [PATCH 9/9] Add windows to the CI (#98)

---
 .github/workflows/build.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c10e671c5..1a068ae75 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -33,6 +33,20 @@ jobs:
         run: |
           make
 
+  windows-latest:
+    runs-on: windows-latest
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v1
+
+      - name: Build
+        run: |
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release
+
   # ubuntu-latest-gcc:
   #   runs-on: ubuntu-latest
   #