From 4122dffff958cd137175b58f1f27c0913528d7ba Mon Sep 17 00:00:00 2001 From: Erik Scholz Date: Wed, 22 Mar 2023 17:37:10 +0100 Subject: [PATCH 01/23] cmake: make llama an actual library (#392) --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 400cecf9c..d952afb4f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -217,6 +217,7 @@ add_library(utils OBJECT target_include_directories(utils PUBLIC .) target_compile_features(utils PUBLIC cxx_std_11) # don't bump +target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS}) add_library(ggml OBJECT ggml.c @@ -226,12 +227,13 @@ target_include_directories(ggml PUBLIC .) target_compile_features(ggml PUBLIC c_std_11) # don't bump target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS}) -add_library(llama OBJECT +add_library(llama llama.cpp llama.h) target_include_directories(llama PUBLIC .) target_compile_features(llama PUBLIC cxx_std_11) # don't bump +target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS}) # # Executables From 305ba6f0e6daa3796aad9dd18053a1945dd4cc58 Mon Sep 17 00:00:00 2001 From: tjohnman Date: Wed, 22 Mar 2023 18:16:35 +0100 Subject: [PATCH 02/23] Don't force immediate interactive without `-i` (#354) * Don't force immediate interactive without -i Sometimes we might want to use a reverse prompt but we want to let the model generate tokens right after the initial prompt. So we don't force user input mode if the -i flag wasn't specified and instead let it run until we encounter the reverse prompt. This gives use some more flexibility, since it doesn't force the user to enter a newline if they want to let the model generate text right after the initial prompt and only be asked for input if the reverse prompt is encountered. The `--interactive-first` flag is reintroduced to force the old behavior. `-r` behaves like `-i` plus introduces a reverse prompt (it can be specified more than once). * Update help output. 
--------- Co-authored-by: Johnman --- main.cpp | 6 +++++- utils.cpp | 5 ++++- utils.h | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/main.cpp b/main.cpp index fbb43a8cc..4569ef2a1 100644 --- a/main.cpp +++ b/main.cpp @@ -254,6 +254,10 @@ int main(int argc, char ** argv) { params.interactive = true; } + if (params.interactive_start) { + params.interactive = true; + } + fprintf(stderr, "\n"); fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); @@ -296,7 +300,7 @@ int main(int argc, char ** argv) { #endif " - Press Return to return control to LLaMa.\n" " - If you want to submit another line, end your input in '\\'.\n\n"); - is_interacting = true; + is_interacting = params.interactive_start; } int input_consumed = 0; diff --git a/utils.cpp b/utils.cpp index 3909c974f..1d5309c3a 100644 --- a/utils.cpp +++ b/utils.cpp @@ -63,6 +63,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.model = argv[++i]; } else if (arg == "-i" || arg == "--interactive") { params.interactive = true; + } else if (arg == "--interactive-first") { + params.interactive_start = true; } else if (arg == "-ins" || arg == "--instruct") { params.instruct = true; } else if (arg == "--color") { @@ -96,9 +98,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " -i, --interactive run in interactive mode\n"); + fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n"); fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n"); fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n"); - fprintf(stderr, " in interactive mode, poll user input upon seeing PROMPT (can be\n"); + fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n"); fprintf(stderr, " specified more than once for multiple prompts).\n"); fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n"); fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n"); diff --git a/utils.h b/utils.h index 3f970eabb..b0de556c9 100644 --- a/utils.h +++ b/utils.h @@ -38,7 +38,7 @@ struct gpt_params { bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode - bool interactive_start = false; // reverse prompt immediately + bool interactive_start = false; // wait for user input immediately bool instruct = false; // instruction mode (used for Alpaca models) bool ignore_eos = false; // do not stop generating after eos bool perplexity = false; // compute perplexity over the prompt From 97940520e8fd49c56bb29b71cc350190b723513f Mon Sep 17 00:00:00 2001 From: Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com> Date: Wed, 22 Mar 2023 18:20:25 +0100 Subject: [PATCH 03/23] fix: add POSIX functionality for Linux compilation (#51) * fix: add POSIX functionality for Linux compilation * fix: older standard for compatibility --- ggml.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml.c b/ggml.c index d00544577..7ea9f6228 100644 --- a/ggml.c +++ b/ggml.c @@ -1,3 +1,6 @@ +// Defines CLOCK_MONOTONIC on Linux +#define _POSIX_C_SOURCE 199309L + #include "ggml.h" #if 
defined(_MSC_VER) || defined(__MINGW32__) From 69c92298a9e36dc2363b3bf50452976ce49487b3 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Wed, 22 Mar 2023 17:29:06 +0000 Subject: [PATCH 04/23] Deduplicate q4 quantization functions (#383) * Deduplicate q4 quantization functions * Use const; add basic test * Re-enable quantization test * Disable AVX2 flags in CI --------- Co-authored-by: Georgi Gerganov --- .github/workflows/build.yml | 2 +- ggml.c | 171 ++++++++++++++---------------------- ggml.h | 4 +- tests/CMakeLists.txt | 13 ++- tests/test-quantize.c | 42 +++++++++ 5 files changed, 119 insertions(+), 113 deletions(-) create mode 100644 tests/test-quantize.c diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5882fc747..6ce9cc726 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -89,7 +89,7 @@ jobs: run: | mkdir build cd build - cmake .. + cmake -DLLAMA_AVX2=OFF .. cmake --build . --config Release ctest --output-on-failure diff --git a/ggml.c b/ggml.c index 7ea9f6228..0e4b1466c 100644 --- a/ggml.c +++ b/ggml.c @@ -403,9 +403,55 @@ static inline __m128i packNibbles( __m256i bytes ) // method 5 // blocks of QK elements // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) + +// reference implementation for deterministic creation of model files +static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) { + assert(k % QK == 0); + const int nb = k / QK; + + const size_t bs = sizeof(float) + QK/2; + + uint8_t * restrict pd = ((uint8_t *)y + 0*bs); + uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float)); + + uint8_t pp[QK/2]; + + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + for (int l = 0; l < QK; l++) { + const float v = x[i*QK + l]; + amax = MAX(amax, fabsf(v)); + } + + const float d = amax / ((1 << 3) - 1); + const float id = d ? 1.0f/d : 0.0f; + + *(float *)pd = d; + pd += bs; + + for (int l = 0; l < QK; l += 2) { + const float v0 = x[i*QK + l + 0]*id; + const float v1 = x[i*QK + l + 1]*id; + + const uint8_t vi0 = ((int8_t) (round(v0))) + 8; + const uint8_t vi1 = ((int8_t) (round(v1))) + 8; + + assert(vi0 >= 0 && vi0 < 16); + assert(vi1 >= 0 && vi1 < 16); + + pp[l/2] = vi0 | (vi1 << 4); + } + + memcpy(pb, pp, sizeof(pp)); + pb += bs; + } +} + void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { assert(k % QK == 0); +#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) const int nb = k / QK; const size_t bs = sizeof(float) + QK/2; @@ -413,6 +459,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float)); uint8_t pp[QK/2]; +#endif #if __ARM_NEON #if QK == 32 @@ -569,36 +616,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { #endif #else // scalar - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - - for (int l = 0; l < QK; l++) { - const float v = x[i*QK + l]; - amax = MAX(amax, fabsf(v)); - } - - const float d = amax / ((1 << 3) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - *(float *)pd = d; - pd += bs; - - for (int l = 0; l < QK; l += 2) { - const float v0 = x[i*QK + l + 0]*id; - const float v1 = x[i*QK + l + 1]*id; - - const uint8_t vi0 = ((int8_t) (round(v0))) + 8; - const uint8_t vi1 = ((int8_t) (round(v1))) + 8; - - assert(vi0 >= 0 && vi0 < 16); - assert(vi1 >= 0 && vi1 < 16); - - pp[l/2] = vi0 | (vi1 << 4); - } - - memcpy(pb, pp, sizeof(pp)); - pb += bs; - } + quantize_row_q4_0_reference(x, y, k); #endif } @@ -10705,119 +10723,60 @@ enum ggml_opt_result ggml_opt( //////////////////////////////////////////////////////////////////////////////// -size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) { +size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist) { const int nb = k / qk; const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2); const size_t row_size = nb*bs; assert(k % qk == 0); - const size_t pp_size = qk / 2; - uint8_t * pp = (uint8_t *) alloca(pp_size); - char * pdst = (char *) dst; for (int j = 0; j < n; j += k) { uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); + quantize_row_q4_0_reference(src + j, pd, k); + for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max + for (int l = 0; l < qk; l += 2) { + const uint8_t vi0 = pb[l/2] & 0xF; + const uint8_t vi1 = pb[l/2] >> 4; - { - for (int l = 0; l < qk; l++) { - const float v = src[j + i*qk + l]; - amax = MAX(amax, fabsf(v)); - } - - const float d = amax / ((1 << 3) - 1); - const float id = d ? 1.0f/d : 0.0f; - - *(float *) pd = d; - pd += bs; - - for (int l = 0; l < qk; l += 2) { - const float v0 = (src[j + i*qk + l + 0])*id; - const float v1 = (src[j + i*qk + l + 1])*id; - - const uint8_t vi0 = ((int8_t) (round(v0))) + 8; - const uint8_t vi1 = ((int8_t) (round(v1))) + 8; - - assert(vi0 >= 0 && vi0 < 16); - assert(vi1 >= 0 && vi1 < 16); - - hist[vi0]++; - hist[vi1]++; - - pp[l/2] = vi0 | (vi1 << 4); - } - - memcpy(pb, pp, pp_size); - pb += bs; + hist[vi0]++; + hist[vi1]++; } + pb += bs; } } return (n/k)*row_size; } -size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) { +size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist) { const int nb = k / qk; const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2); const size_t row_size = nb*bs; assert(k % qk == 0); - const size_t pp_size = qk / 2; - uint8_t * pp = (uint8_t *) alloca(pp_size); - char * pdst = (char *) dst; for (int j = 0; j < n; j += k) { uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); - uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float)); - //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb); + quantize_row_q4_1(src + j, pd, k); for (int i = 0; i < nb; i++) { - float min = FLT_MAX; - float max = -FLT_MAX; + for (int l = 0; l < qk; l += 2) { + const uint8_t vi0 = pb[l/2] & 0xF; + const uint8_t vi1 = pb[l/2] >> 4; - { - for (int l = 0; l < qk; l++) { - const float v = src[j + i*qk + l]; - if (v < min) min = v; - if (v > max) max = v; - } - - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - *(float *) pd = d; - *(float *) pm = min; - pd += bs; - pm += bs; - - for (int l = 0; l < qk; l += 2) { - const float v0 = (src[j + i*qk + l + 0] - min)*id; - const float v1 = (src[j + i*qk + l + 1] - min)*id; - - const uint8_t vi0 = round(v0); - const uint8_t vi1 = round(v1); - - assert(vi0 >= 0 && vi0 < 16); - assert(vi1 >= 0 && vi1 < 16); - - hist[vi0]++; - hist[vi1]++; - - pp[l/2] = vi0 | (vi1 << 4); - } - - memcpy(pb, pp, pp_size); - pb += bs; + hist[vi0]++; + hist[vi1]++; } + pb += bs; } } diff --git a/ggml.h b/ggml.h index 48b6cc028..c7e6814a8 100644 --- a/ggml.h +++ b/ggml.h @@ -745,8 +745,8 @@ enum ggml_opt_result ggml_opt( // quantization // -size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist); -size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist); +size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist); +size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist); // // system info diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4990c3432..6a4170f80 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,4 +1,9 @@ -set(TEST_TARGET test-tokenizer-0) -add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) -target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils) -add_test(NAME ${TEST_TARGET} COMMAND $ ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) +function(llama_add_test source) + get_filename_component(TEST_TARGET ${source} NAME_WE) + add_executable(${TEST_TARGET} ${source}) + target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils) + add_test(NAME ${TEST_TARGET} COMMAND $ ${ARGN}) +endfunction() + +llama_add_test(test-quantize.c) +llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) diff --git a/tests/test-quantize.c b/tests/test-quantize.c new file mode 100644 index 000000000..d59ecb8ab --- /dev/null +++ b/tests/test-quantize.c @@ -0,0 +1,42 @@ +#include "ggml.h" +#undef NDEBUG +#include +#include + +int main(void) { + #define QK 32 + float src[QK]; + uint8_t dst[24]; + int64_t hist[16]; + + for (int i = 0; i < QK; i++) { + src[i] = (float)(i + 1); + } + + size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist); + assert(size == 20); + float max_result = ((float *)dst)[0]; + float max_expected = src[31] / ((1 << 3) - 1); + assert(max_result == max_expected); + for (int i = 0; i < QK; i++) { + uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF); + uint8_t q4_expected = roundf(src[i] / max_expected) + 8; + assert(q4_result == q4_expected); + } + + size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist); + assert(size == 24); + float delta_result = ((float *)dst)[0]; + float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1); + assert(delta_result == delta_expected); + float min_result = ((float *)dst)[1]; + float min_expected = src[0]; + assert(min_result == min_expected); + for (int i = 0; i < QK; i++) { + uint8_t q4_result = (i % 2) ? 
(dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF); + uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected); + assert(q4_result == q4_expected); + } + + return 0; +} From ee8a7887865a893be208e0a92d6d94d2cb66a789 Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Wed, 22 Mar 2023 19:06:18 +0000 Subject: [PATCH 05/23] Update issue template so people will use it (#404) --- .github/ISSUE_TEMPLATE/custom.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/custom.md b/.github/ISSUE_TEMPLATE/custom.md index fe6a97bea..72224624c 100644 --- a/.github/ISSUE_TEMPLATE/custom.md +++ b/.github/ISSUE_TEMPLATE/custom.md @@ -1,7 +1,7 @@ --- -name: Custom issue template -about: Used to report user-related issues with the software -title: "[User] I encountered a problem .." +name: Issue and enhancement template +about: Used to report issues and request enhancements for llama.cpp +title: "[User] Insert summary of your issue or enhancement.." labels: '' assignees: '' @@ -18,11 +18,11 @@ Please answer the following questions for yourself before submitting an issue. # Expected Behavior -Please provide a detailed written description of what you were trying to do, and what you expected `lamma.cpp` to do. +Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do. # Current Behavior -Please provide a detailed written description of what `lamma.cpp` did, instead. +Please provide a detailed written description of what `llama.cpp` did, instead. # Environment and Context From f7dc43bc0d759732815856183246f167111587ad Mon Sep 17 00:00:00 2001 From: tjohnman Date: Thu, 23 Mar 2023 01:30:23 +0100 Subject: [PATCH 06/23] Fix instruct mode broken by PR #354 (#409) Co-authored-by: Johnman --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index 4569ef2a1..431c94b52 100644 --- a/main.cpp +++ b/main.cpp @@ -300,7 +300,7 @@ int main(int argc, char ** argv) { #endif " - Press Return to return control to LLaMa.\n" " - If you want to submit another line, end your input in '\\'.\n\n"); - is_interacting = params.interactive_start; + is_interacting = params.interactive_start || params.instruct; } int input_consumed = 0; From e4412b45e395981068d2850d3fa04cc16c77d70d Mon Sep 17 00:00:00 2001 From: anzz1 Date: Thu, 23 Mar 2023 04:20:34 +0200 Subject: [PATCH 07/23] CI: CMake: Separate build and test steps (#376) * CI: Separate Build and Test steps (CMake) * CI: Make sure build passes before running tests (CMake) * CI: Standardise step id names --- .github/workflows/build.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6ce9cc726..e9826a735 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -41,19 +41,27 @@ jobs: steps: - name: Clone + id: checkout uses: actions/checkout@v1 - name: Dependencies + id: depends run: | sudo apt-get update sudo apt-get install build-essential - name: Build + id: cmake_build run: | mkdir build cd build cmake .. cmake --build . --config Release + + - name: Test + id: cmake_test + run: | + cd build ctest --output-on-failure macOS-latest-make: @@ -79,18 +87,26 @@ jobs: steps: - name: Clone + id: checkout uses: actions/checkout@v1 - name: Dependencies + id: depends run: | brew update - name: Build + id: cmake_build run: | mkdir build cd build cmake -DLLAMA_AVX2=OFF .. cmake --build . 
--config Release + + - name: Test + id: cmake_test + run: | + cd build ctest --output-on-failure windows-latest-cmake: @@ -108,6 +124,11 @@ jobs: cd build cmake .. cmake --build . --config Release + + - name: Test + id: cmake_test + run: | + cd build ctest -C Release --output-on-failure - name: Get commit hash From 03ace14cfd68a1289ac3b76563534c8ee72a2e53 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Mar 2023 09:48:51 +0200 Subject: [PATCH 08/23] Add link to recent podcast about whisper.cpp and llama.cpp --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b5a113c91..f53595f88 100644 --- a/README.md +++ b/README.md @@ -337,6 +337,7 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models - Collaborators will be invited based on contributions - Any help with managing issues and PRs is very appreciated! - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) +- A bit of backstory for those who are interested interested: [Changelog podcast](https://changelog.com/podcast/532) ### Coding guidelines From 93208cfb929c2323e5d2ac6bf354e278040e70ed Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Mar 2023 10:46:58 +0200 Subject: [PATCH 09/23] Adjust repetition penalty .. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f53595f88..f8743e280 100644 --- a/README.md +++ b/README.md @@ -337,7 +337,7 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models - Collaborators will be invited based on contributions - Any help with managing issues and PRs is very appreciated! - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205) -- A bit of backstory for those who are interested interested: [Changelog podcast](https://changelog.com/podcast/532) +- A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532) ### Coding guidelines From 8eea5ae0e5f31238a97c79ea9103c27647380e37 Mon Sep 17 00:00:00 2001 From: anzz1 Date: Thu, 23 Mar 2023 12:26:19 +0200 Subject: [PATCH 10/23] Delete SHA256SUMS for now (#416) Delete this for now to avoid confusion since it contains some wrong checksums from the old tokenizer format Re-add after #374 is resolved --- SHA256SUMS | 53 ----------------------------------------------------- 1 file changed, 53 deletions(-) delete mode 100644 SHA256SUMS diff --git a/SHA256SUMS b/SHA256SUMS deleted file mode 100644 index 532beaea2..000000000 --- a/SHA256SUMS +++ /dev/null @@ -1,53 +0,0 @@ -700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth -abe4aec2cdc297e2916011f66c7efd6fb4424e0e84315503005b5c118358cc22 models/7B/ggml-model-f16.bin -f495fa02a0b5ef265e1864d9680eede7fd23a60b0a2f93edba8091e2a4ca68b9 models/7B/ggml-model-q4_0.bin -7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json -745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth -d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth -a6bd0537c6873f36c47292df0b6f794e1135f5aafb89c3343bcc9e93264bf167 models/13B/ggml-model-f16.bin -0fb0951b90f2ec46c1f2f2372af5dacb4614b27e9fb6c10c69fbec58d7dd0e36 models/13B/ggml-model-f16.bin.1 -1c218ba37ae61e15e35efd9949c78d6edf553b6280824c263cad56ae0b9d5a8f models/13B/ggml-model-q4_0.bin 
-c37a20c2ab9fa74b006b389085660269ee06110d1e45a494eb57d4602c9bcdb2 models/13B/ggml-model-q4_0.bin.1 -4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json -e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth -4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth -24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth -1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth -def20ea508f4e36793719f857471e85b85f96e497a2cbffbbaa1b60e2b18202c models/30B/ggml-model-f16.bin -b37040aa67fa8608cb2d8e0719132cf3e267fd35ec1e2f0d37dbc9fa43d674f1 models/30B/ggml-model-f16.bin.1 -e7f263557e99069fe29003262ea5fa9ed885dbe79069083e6eb569b328cf30d3 models/30B/ggml-model-f16.bin.2 -2ad6a23af05eb720f202f63d130f4fc5de9b6d2efc95b921be003209a56695aa models/30B/ggml-model-f16.bin.3 -7de31d005e6d02ebd9603b2cf5329ad2f832b65d08873a098c5cafc4046cb9ed models/30B/ggml-model-q4_0.bin -f91feef9f30f9a023616db2e91297ca6d5d5d7b9eb351e452a82115c46f7da9e models/30B/ggml-model-q4_0.bin.1 -66f3a0916ac7a81839153eb061fa861030ed1892477c2f7af2ce4f98d2f6d06f models/30B/ggml-model-q4_0.bin.2 -e3c587ba97f83d2088b001bcda3026571065649ee3090bef6743a51390b01d3b models/30B/ggml-model-q4_0.bin.3 -2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json -135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth -9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth -e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770 models/65B/consolidated.02.pth -73176ffb426b40482f2aa67ae1217ef79fbbd1fff5482bae5060cdc5a24ab70e models/65B/consolidated.03.pth -882e6431d0b08a8bc66261a0d3607da21cbaeafa96a24e7e59777632dbdac225 models/65B/consolidated.04.pth -a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/consolidated.05.pth -72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth -d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth -7eba2625260cd91f8de901fd9704a1aa39448425514a335a0d3878de4ab9dc77 models/65B/ggml-model-f16.bin -f6aa886575df0785d4231f30cc776d499ccde18857818effc0378c65b178e0b5 models/65B/ggml-model-f16.bin.1 -076037141682f5d7537955058c4740ab27f285aa4588915f830874a589c0693d models/65B/ggml-model-f16.bin.2 -7853d96d2903ad7de2b2a89c4acf5a33a2f8e3c24ac39c9df6b44cdb42bf530a models/65B/ggml-model-f16.bin.3 -b16b7b941abb3bc03a14df1656140855e9360a5371c83e919b9da83a72362314 models/65B/ggml-model-f16.bin.4 -5291270216f888697695acb78ef28df0c080f9e85d3245c92fb9992d1fde6678 models/65B/ggml-model-f16.bin.5 -0685ee77715f34686841006f8f94d3e7eaf148b97cecc9d3eee72808b0f7989c models/65B/ggml-model-f16.bin.6 -00d993d73bb21d7c29388ffe0dced008cbaa0d391831dea77d7eb8f0b5c404b9 models/65B/ggml-model-f16.bin.7 -4e398f05842206e08cdc5e7bb4f6c7c34b9dc373435ece6f261b14b7b4fe9b89 models/65B/ggml-model-q4_0.bin -4c4e899e3b12d9f57c9dcea5a1fb41bbc72023323535551f6273582ca7d7294b models/65B/ggml-model-q4_0.bin.1 -d7b4594bbbd192043b3db0e5acc2561c42e6944e1cb91cc6e61510eee89dbcd8 models/65B/ggml-model-q4_0.bin.2 -9a099d271648863d923d0d097391ea0bc75591f27a2ca3a327760f42e6b69af2 models/65B/ggml-model-q4_0.bin.3 -5ee474051e418c5732b7949190b084d9d679db447f83c1de0d2a82daaa1a0cfa models/65B/ggml-model-q4_0.bin.4 
-a45aa05e7212bd6782790722d68056c5419667ea6b564ccc94bbcb8111d79b8b models/65B/ggml-model-q4_0.bin.5 -a58fda714b759c28ad5e4c1d8bf8fda7b158fd5e4c4a49f851f36342fa97a105 models/65B/ggml-model-q4_0.bin.6 -a3540cfcbcda33c223c6b0d606034adbd78f17e0e5de1582b78795e78754f7a8 models/65B/ggml-model-q4_0.bin.7 -999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json -1f582babc2bd56bb63b33141898748657d369fd110c4358b2bc280907882bf13 models/alpaca-7B/ggml-model-q4_0.bin -e17730c6b62b565b098af023ca446dcb9e3535d4222ead6369c7aae67207eb3d models/alpaca-13B/ggml-model-q4_0.bin -9bcd1bb30e679c939f367be11b030fe20b3eb9a3606b9bc4106420f1827b6ae4 models/alpaca-30B/ggml-model-q4_0.bin -36079249f53c292a4c2302d7784005dcae94c865f0bedfdbfa51d9ddad402935 models/alpaca-30B/params.json From 8a3e5ef801339e57b9b0449220e9ffb11a6648e2 Mon Sep 17 00:00:00 2001 From: Gary Mulder Date: Thu, 23 Mar 2023 11:30:40 +0000 Subject: [PATCH 11/23] Move model section from issue template to README.md (#421) * Update custom.md * Removed Model section as it is better placed in README.md * Updates to README.md model section * Inserted text that was removed from issue template about obtaining models from FB and links to papers describing the various models * Removed IPF down links for the Alpaca 7B models as these look to be in the old data format and probably shouldn't be directly linked to, anyway * Updated the perplexity section to point at Perplexity scores #406 discussion --- .github/ISSUE_TEMPLATE/custom.md | 19 +++------------ README.md | 40 ++++++++++++++++---------------- 2 files changed, 23 insertions(+), 36 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/custom.md b/.github/ISSUE_TEMPLATE/custom.md index 72224624c..0d508802d 100644 --- a/.github/ISSUE_TEMPLATE/custom.md +++ b/.github/ISSUE_TEMPLATE/custom.md @@ -44,20 +44,6 @@ $ make --version $ g++ --version ``` -# Models - -* The LLaMA models are officially distributed by Facebook and will never be provided through this repository. See this [pull request in Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to obtain access to the model data. -* If your issue is with model conversion please verify the `sha256sum` of each of your `consolidated*.pth` and `ggml-model-XXX.bin` files to confirm that you have the correct model data files before logging an issue. [Latest sha256 sums for your reference](https://github.com/ggerganov/llama.cpp/issues/238). -* If your issue is with model generation quality then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: - * LLaMA: - * [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) - * [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) - * GPT-3 - * [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) - * GPT-3.5 / InstructGPT / ChatGPT: - * [Aligning language models to follow instructions](https://openai.com/research/instruction-following) - * [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) - # Failure Information (for bugs) Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template. 
@@ -75,8 +61,9 @@ Please provide detailed steps for reproducing the issue. We are not sitting in f Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes. -Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability. e.g. +Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability. +Example environment info: ``` llama.cpp$ git log | head -1 commit 2af23d30434a677c6416812eea52ccc0af65119c @@ -103,8 +90,8 @@ GNU Make 4.3 $ md5sum ./models/65B/ggml-model-q4_0.bin dbdd682cce80e2d6e93cefc7449df487 ./models/65B/ggml-model-q4_0.bin ``` -Here's a run with the Linux command [perf](https://www.brendangregg.com/perf.html) +Example run with the Linux command [perf](https://www.brendangregg.com/perf.html) ``` llama.cpp$ perf stat ./main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p "Please close your issue when it has been answered." main: seed = 1679149377 diff --git a/README.md b/README.md index f8743e280..e486454fe 100644 --- a/README.md +++ b/README.md @@ -191,17 +191,8 @@ Note the use of `--color` to distinguish between user input and generated text. ### Instruction mode with Alpaca -First, download the `ggml` Alpaca model into the `./models` folder: - -``` -# use one of these -# TODO: add a script to simplify the download -curl -o ./models/ggml-alpaca-7b-q4.bin -C - https://gateway.estuary.tech/gw/ipfs/QmUp1UGeQFDqJKvtjbSYPBiZZKRjLp8shVP9hT8ZB9Ynv1 -curl -o ./models/ggml-alpaca-7b-q4.bin -C - https://ipfs.io/ipfs/QmUp1UGeQFDqJKvtjbSYPBiZZKRjLp8shVP9hT8ZB9Ynv1 -curl -o ./models/ggml-alpaca-7b-q4.bin -C - https://cloudflare-ipfs.com/ipfs/QmUp1UGeQFDqJKvtjbSYPBiZZKRjLp8shVP9hT8ZB9Ynv1 -``` - -Now run the `main` tool like this: +1. First, download the `ggml` Alpaca model into the `./models` folder +2. Run the `main` tool like this: ``` ./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins @@ -228,26 +219,34 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data -* The LLaMA models are officially distributed by Facebook and will never be provided through this repository. See this [Pull Request in Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to obtain access to the model data. - +* The LLaMA models are officially distributed by Facebook and will never be provided through this repository. See this [pull request in Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to obtain access to the model data. * Please verify the sha256 checksums of all of your `consolidated*.pth` and corresponding converted `ggml-model-*.bin` model files to confirm that you have the correct model data files before creating an issue relating to your model files. 
+* The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory: -The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory: + `sha256sum --ignore-missing -c SHA256SUMS` on Linux -`sha256sum --ignore-missing -c SHA256SUMS` on Linux + or -or - -`shasum -a 256 --ignore-missing -c SHA256SUMS` on macOS + `shasum -a 256 --ignore-missing -c SHA256SUMS` on macOS +* If your issue is with model generation quality then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT: + * LLaMA: + * [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) + * [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) + * GPT-3 + * [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) + * GPT-3.5 / InstructGPT / ChatGPT: + * [Aligning language models to follow instructions](https://openai.com/research/instruction-following) + * [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155) + ### Perplexity (Measuring model quality) You can pass `--perplexity` as a command line option to measure perplexity over the given prompt. For more background, see https://huggingface.co/docs/transformers/perplexity. However, in general, lower perplexity is better for LLMs. -#### Measurements +#### Latest measurements -https://github.com/ggerganov/llama.cpp/pull/270 is the unofficial tracking page for now. llama.cpp is measuring very well +The latest perplexity scores for the various model sizes and quantizations are being tracked in [discussion #406](https://github.com/ggerganov/llama.cpp/discussions/406). `llama.cpp` is measuring very well compared to the baseline implementations. Quantization has a small negative impact to quality, but, as you can see, running 13B at q4_0 beats the 7B f16 model by a significant amount. @@ -347,3 +346,4 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit - Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a` - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions + From a140219e81cfb80356438112cd2290d701b282bb Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Thu, 23 Mar 2023 05:41:32 -0600 Subject: [PATCH 12/23] Fix Makefile echo escape codes (by removing them). (#418) --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index edb0c64c8..91eebaebd 100644 --- a/Makefile +++ b/Makefile @@ -231,7 +231,9 @@ clean: main: main.cpp ggml.o llama.o utils.o $(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o -o main $(LDFLAGS) - @echo "\x1b[36mrun ./main -h for help\x1b[0m" + @echo + @echo '==== Run ./main -h for help. 
====' + @echo quantize: quantize.cpp ggml.o llama.o utils.o $(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS) From a50e39c6fe36be3de0941b3c05aaf9c37912fd47 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Thu, 23 Mar 2023 14:15:48 +0000 Subject: [PATCH 13/23] Revert "Delete SHA256SUMS for now" (#429) * Revert "Delete SHA256SUMS for now (#416)" This reverts commit 8eea5ae0e5f31238a97c79ea9103c27647380e37. * Remove ggml files until they can be verified * Remove alpaca json * Add also model/tokenizer.model to SHA256SUMS + update README --------- Co-authored-by: Pavol Rusnak --- README.md | 2 +- SHA256SUMS | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 SHA256SUMS diff --git a/README.md b/README.md index e486454fe..ee8dc1dcb 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data * The LLaMA models are officially distributed by Facebook and will never be provided through this repository. See this [pull request in Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to obtain access to the model data. -* Please verify the sha256 checksums of all of your `consolidated*.pth` and corresponding converted `ggml-model-*.bin` model files to confirm that you have the correct model data files before creating an issue relating to your model files. +* Please verify the sha256 checksums of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files. * The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory: `sha256sum --ignore-missing -c SHA256SUMS` on Linux diff --git a/SHA256SUMS b/SHA256SUMS new file mode 100644 index 000000000..63fac21ae --- /dev/null +++ b/SHA256SUMS @@ -0,0 +1,20 @@ +700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth +7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json +745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth +d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth +4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json +e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth +4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth +24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth +1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth +2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json +135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth +9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth +e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770 models/65B/consolidated.02.pth +73176ffb426b40482f2aa67ae1217ef79fbbd1fff5482bae5060cdc5a24ab70e models/65B/consolidated.03.pth +882e6431d0b08a8bc66261a0d3607da21cbaeafa96a24e7e59777632dbdac225 models/65B/consolidated.04.pth +a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/consolidated.05.pth 
+72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth +d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth +999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json +9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model From a18c19259a3cb9dec332d613e8f15704f678a468 Mon Sep 17 00:00:00 2001 From: Ben Siraphob Date: Wed, 22 Mar 2023 00:37:02 -0500 Subject: [PATCH 14/23] Fix Nix build --- flake.nix | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flake.nix b/flake.nix index da4bd7ba3..4c2717e0d 100644 --- a/flake.nix +++ b/flake.nix @@ -28,8 +28,8 @@ ]; installPhase = '' mkdir -p $out/bin - mv llama $out/bin/llama - mv quantize $out/bin/quantize + mv bin/main $out/bin/llama + mv bin/quantize $out/bin/quantize echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml chmod +x $out/bin/convert-pth-to-ggml From ea10d3ded2994106596ddf8e4ed02741b3e053e6 Mon Sep 17 00:00:00 2001 From: anzz1 Date: Thu, 23 Mar 2023 19:54:28 +0200 Subject: [PATCH 15/23] Command line args bounds checking (#424) * command line args bounds checking * unknown and invalid param exit codes 0 -> 1 --- utils.cpp | 101 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 17 deletions(-) diff --git a/utils.cpp b/utils.cpp index 1d5309c3a..45c9cabb1 100644 --- a/utils.cpp +++ b/utils.cpp @@ -26,41 +26,95 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency()); } + bool invalid_param = false; + std::string arg; for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; + arg = argv[i]; if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.seed = std::stoi(argv[i]); } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); } else if (arg == "-p" || arg == "--prompt") { - params.prompt = argv[++i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.prompt = argv[i]; } else if (arg == "-f" || arg == "--file") { - std::ifstream file(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); if (params.prompt.back() == '\n') { params.prompt.pop_back(); } } else if (arg == "-n" || arg == "--n_predict") { - params.n_predict = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); } else if (arg == "--top_k") { - params.top_k = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_k = std::stoi(argv[i]); } else if (arg == "-c" || arg == "--ctx_size") { - params.n_ctx = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); } else if (arg == "--memory_f16") { params.memory_f16 = true; } else if (arg == "--top_p") { - params.top_p = std::stof(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.top_p = std::stof(argv[i]); } else if (arg == "--temp") { - params.temp = std::stof(argv[++i]); + if (++i >= 
argc) { + invalid_param = true; + break; + } + params.temp = std::stof(argv[i]); } else if (arg == "--repeat_last_n") { - params.repeat_last_n = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.repeat_last_n = std::stoi(argv[i]); } else if (arg == "--repeat_penalty") { - params.repeat_penalty = std::stof(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.repeat_penalty = std::stof(argv[i]); } else if (arg == "-b" || arg == "--batch_size") { - params.n_batch = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); } else if (arg == "-m" || arg == "--model") { - params.model = argv[++i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; } else if (arg == "-i" || arg == "--interactive") { params.interactive = true; } else if (arg == "--interactive-first") { @@ -70,13 +124,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else if (arg == "--color") { params.use_color = true; } else if (arg == "-r" || arg == "--reverse-prompt") { - params.antiprompt.push_back(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.antiprompt.push_back(argv[i]); } else if (arg == "--perplexity") { params.perplexity = true; } else if (arg == "--ignore-eos") { params.ignore_eos = true; } else if (arg == "--n_parts") { - params.n_parts = std::stoi(argv[++i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_parts = std::stoi(argv[i]); } else if (arg == "-h" || arg == "--help") { gpt_print_usage(argc, argv, params); exit(0); @@ -85,9 +147,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); gpt_print_usage(argc, argv, params); - exit(0); + exit(1); } } + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + gpt_print_usage(argc, argv, params); + exit(1); + } return true; } From ad072fc5ad6f6905a7224ff6ea07c0644aa075b1 Mon Sep 17 00:00:00 2001 From: nusu-github <29514220+nusu-github@users.noreply.github.com> Date: Fri, 24 Mar 2023 05:16:48 +0900 Subject: [PATCH 16/23] Generate library with CMake (#430) * Generate library with CMake BUILD_SHARED_LIBS to allow llama library to be generated. * Turn ON PIC when BUILD_SHARED_LIBS is ON --- CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index d952afb4f..51af97c4d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -218,6 +218,9 @@ add_library(utils OBJECT target_include_directories(utils PUBLIC .) target_compile_features(utils PUBLIC cxx_std_11) # don't bump target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS}) +if (BUILD_SHARED_LIBS) + set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() add_library(ggml OBJECT ggml.c @@ -226,6 +229,9 @@ add_library(ggml OBJECT target_include_directories(ggml PUBLIC .) target_compile_features(ggml PUBLIC c_std_11) # don't bump target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS}) +if (BUILD_SHARED_LIBS) + set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() add_library(llama llama.cpp @@ -234,6 +240,10 @@ add_library(llama target_include_directories(llama PUBLIC .) 
target_compile_features(llama PUBLIC cxx_std_11) # don't bump target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS}) +if (BUILD_SHARED_LIBS) + set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) +endif() # # Executables From 20a1a4e09c522a80e2a0db51643d25fa38326065 Mon Sep 17 00:00:00 2001 From: Timmy Knight Date: Thu, 23 Mar 2023 10:18:13 -1000 Subject: [PATCH 17/23] Fix GPTQ converter (#423) * Fix GPTQ converter * Fix comment --------- Co-authored-by: Georgi Gerganov --- convert-gptq-to-ggml.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/convert-gptq-to-ggml.py b/convert-gptq-to-ggml.py index 7fccb4d56..6c77808fc 100644 --- a/convert-gptq-to-ggml.py +++ b/convert-gptq-to-ggml.py @@ -36,7 +36,8 @@ fname_out = sys.argv[3] fout = open(fname_out, "wb") -fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex +fout.write(struct.pack("i", 1)) # file version fout.write(struct.pack("i", n_vocab)) fout.write(struct.pack("i", n_embd)) fout.write(struct.pack("i", n_mult)) @@ -49,27 +50,21 @@ fout.write(struct.pack("i", 4)) # This loop unchanged from convert-pth-to-ggml.py: for i in range(tokenizer.vocab_size()): if tokenizer.is_unknown(i): - # "" token (translated as ??) text = " \u2047 ".encode("utf-8") - fout.write(struct.pack("i", len(text))) - fout.write(text) elif tokenizer.is_control(i): - # ""/"" tokens - fout.write(struct.pack("i", 0)) + text = b"" elif tokenizer.is_byte(i): - # "" tokens (which may be invalid UTF-8) piece = tokenizer.id_to_piece(i) if len(piece) != 6: - print("Invalid token: " + piece) + print(f"Invalid token: {piece}") sys.exit(1) byte_value = int(piece[3:-1], 16) - fout.write(struct.pack("i", 1)) - fout.write(struct.pack("B", byte_value)) + text = struct.pack("B", byte_value) else: - # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces. text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") - fout.write(struct.pack("i", len(text))) - fout.write(text) + fout.write(struct.pack("i", len(text))) + fout.write(text) + fout.write(struct.pack("f", tokenizer.get_score(i))) def write_header(shape, dst_name, ftype_cur): sname = dst_name.encode('utf-8') From 2e17dfd80a473099dacc0f41c9146d233c6a5972 Mon Sep 17 00:00:00 2001 From: rabidcopy Date: Thu, 23 Mar 2023 15:22:47 -0500 Subject: [PATCH 18/23] Replace EOS with newline to prevent context/memory being flushed by EOS in interactive mode (#333) * Improve interactive mode's coherence after EOS Aims to improve coherence and ability to resume the interactive session when the user is given input back after an end of text token is reached. Not sure what token 13 is or why it seems to help. See conversation for examples. 
* Make newline token a constant * dynamically determine newline token * relocate previous newline token const * cleanup whitespace * print a new line on end of text in interactive this may need to be looked into further when not using a reverse prompt * only print manual newline with reverse prompt fix formatting of reverse prompts so they don't end up at the end of the current line while not introducing unnecessary new lines otherwise * alternate approach to replace end of text tokens * Inject the reverse prompt again after eos in interactive mode * tokenize reverse prompt when needed makes this PR compatible with https://github.com/ggerganov/llama.cpp/pull/330 * tokenize and inject only first reverse prompt thanks to tjohnman * tokenize first reverse prompt once * add newline token * add newline token * tokenize/inject reverse prompt for refactor this doesn't seem right though * tokenize nothing for antiprompt if no reverse * Update main.cpp * Update main.cpp * tokenize and inject reverse prompt as needed this doesn't seem to work if the reverse prompt is tokenized outside earlier on * not needed * remove newline token * remove newline token * tokenize newline token * add space to comment * Update main.cpp Co-authored-by: Georgi Gerganov --------- Co-authored-by: Slaren <2141330+slaren@users.noreply.github.com> Co-authored-by: Georgi Gerganov --- main.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/main.cpp b/main.cpp index 431c94b52..5ba6d5a75 100644 --- a/main.cpp +++ b/main.cpp @@ -258,6 +258,9 @@ int main(int argc, char ** argv) { params.interactive = true; } + // determine newline token + auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); + fprintf(stderr, "\n"); fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); @@ -359,6 +362,16 @@ int main(int argc, char ** argv) { last_n_tokens.push_back(id); } + // replace end of text token with newline token when in interactive mode + if (id == llama_token_eos() && params.interactive) { + id = llama_token_newline.front(); + if (params.antiprompt.size() != 0) { + // tokenize and inject first reverse prompt + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + } + } + // add it to the context embd.push_back(id); @@ -451,12 +464,8 @@ int main(int argc, char ** argv) { // end of text token if (embd.back() == llama_token_eos()) { - if (params.interactive) { - is_interacting = true; - } else { - fprintf(stderr, " [end of text]\n"); - break; - } + fprintf(stderr, " [end of text]\n"); + break; } // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. 
From 0ba5a3a9a5efedb1aeecbbc70a4e9825542472d5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Mar 2023 22:32:02 +0200 Subject: [PATCH 19/23] Obsolete --- download-pth.py | 66 ------------------------------------------------- 1 file changed, 66 deletions(-) delete mode 100644 download-pth.py diff --git a/download-pth.py b/download-pth.py deleted file mode 100644 index 129532c0c..000000000 --- a/download-pth.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -import sys -from tqdm import tqdm -import requests - -if len(sys.argv) < 3: - print("Usage: download-pth.py dir-model model-type\n") - print(" model-type: Available models 7B, 13B, 30B or 65B") - sys.exit(1) - -modelsDir = sys.argv[1] -model = sys.argv[2] - -num = { - "7B": 1, - "13B": 2, - "30B": 4, - "65B": 8, -} - -if model not in num: - print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B") - sys.exit(1) - -print(f"Downloading model {model}") - -files = ["checklist.chk", "params.json"] - -for i in range(num[model]): - files.append(f"consolidated.0{i}.pth") - -resolved_path = os.path.abspath(os.path.join(modelsDir, model)) -os.makedirs(resolved_path, exist_ok=True) - -for file in files: - dest_path = os.path.join(resolved_path, file) - - if os.path.exists(dest_path): - print(f"Skip file download, it already exists: {file}") - continue - - url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}" - response = requests.get(url, stream=True) - with open(dest_path, 'wb') as f: - with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t: - for chunk in response.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - t.update(len(chunk)) - -files2 = ["tokenizer_checklist.chk", "tokenizer.model"] -for file in files2: - dest_path = os.path.join(modelsDir, file) - - if os.path.exists(dest_path): - print(f"Skip file download, it already exists: {file}") - continue - - url = f"https://agi.gpt4.org/llama/LLaMA/{file}" - response = requests.get(url, stream=True) - with open(dest_path, 'wb') as f: - with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t: - for chunk in response.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - t.update(len(chunk)) \ No newline at end of file From 4cc053b6d5e9df7ac21fa06b7208a70c156d4d7a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Mar 2023 22:39:44 +0200 Subject: [PATCH 20/23] Remove oboslete command from Docker script --- .devops/tools.sh | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/.devops/tools.sh b/.devops/tools.sh index 352e04942..b0196b60d 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -16,11 +16,7 @@ elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then ./quantize $arg2 elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then ./main $arg2 -elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then - python3 ./download-pth.py $arg2 elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then - echo "Downloading model..." - python3 ./download-pth.py "$1" "$2" echo "Converting PTH to GGML..." 
for i in `ls $1/$2/ggml-model-f16.bin*`; do if [ -f "${i/f16/q4_0}" ]; then @@ -39,8 +35,6 @@ else echo " ex: \"/models/7B/\" 1" echo " --quantize (-q): Optimize with quantization process ggml" echo " ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2" - echo " --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/" - echo " ex: \"/models/\" 7B" - echo " --all-in-one (-a): Execute --download, --convert & --quantize" + echo " --all-in-one (-a): Execute --convert & --quantize" echo " ex: \"/models/\" 7B" fi From 404e1da38ec8025707031a8027da14dc1590f952 Mon Sep 17 00:00:00 2001 From: Jed Fox Date: Thu, 23 Mar 2023 16:42:52 -0400 Subject: [PATCH 21/23] Fix quantize script not finding models in parent directory (#428) --- quantize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/quantize.py b/quantize.py index 6320b0a26..16b5963d3 100644 --- a/quantize.py +++ b/quantize.py @@ -57,6 +57,7 @@ def main(): # ) args = parser.parse_args() + args.models_path = os.path.abspath(args.models_path) if not os.path.isfile(args.quantize_script_path): print( From 483bab2e3d4a868fe679d8bb32827d2a4df214dc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 23 Mar 2023 23:22:01 +0200 Subject: [PATCH 22/23] Avoid the transposed X branch in the Z = X * Y matrix multiplication (#439) Should make results reproducible for different number of threads and batch sizes --- llama.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index 7de3c19c8..d55219256 100644 --- a/llama.cpp +++ b/llama.cpp @@ -727,11 +727,13 @@ static bool llama_eval_internal( // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() struct ggml_tensor * V_trans = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - n_embd/n_head, n_head, n_past + N), - 1, 2, 0, 3); + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + n_embd/n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); // KQV = transpose(V) * KQ_soft_max struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); From 4870e455b3653f7d7769fa5772b2c90ffad088df Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 24 Mar 2023 00:11:53 +0200 Subject: [PATCH 23/23] Fix memory allocation issues and seg faults --- llama.cpp | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/llama.cpp b/llama.cpp index d55219256..cf796cce3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -102,6 +102,9 @@ struct llama_context { // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; bool logits_all = false; + + // work buffer for transformer evaluation + std::vector buf_eval; }; struct llama_context_params llama_context_default_params() { @@ -627,27 +630,19 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; auto & mem_per_token = lctx.mem_per_token; + auto & buf_eval = lctx.buf_eval; - // TODO: fix this hardcoded size - static size_t buf_size = 512u*1024*1024; - static void * buf = malloc(buf_size); + if (mem_per_token*(n_past + N + 16) > buf_eval.size()) { + const size_t buf_size_new = 1.618*buf_eval.size(); - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - 
const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead - //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_eval.size(), buf_size_new); - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } + buf_eval.resize(buf_size_new); } struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, + /*.mem_size =*/ buf_eval.size(), + /*.mem_buffer =*/ buf_eval.data(), }; struct ggml_context * ctx0 = ggml_init(params); @@ -832,10 +827,11 @@ static bool llama_eval_internal( memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); } - if (mem_per_token == 0) { - mem_per_token = ggml_used_mem(ctx0)/N; + if (N == 1) { + mem_per_token = ggml_used_mem(ctx0)/(n_past + N); } - //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0)); + + //fprintf(stderr, "\nused_mem = %zu, %zu MB\n", ggml_used_mem(ctx0), ggml_used_mem(ctx0)/1024/1024); ggml_free(ctx0); @@ -1416,6 +1412,8 @@ struct llama_context * llama_init_from_file( return nullptr; } + ctx->buf_eval.resize(512u*1024u*1024u); + return ctx; }
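The final patch replaces the fixed, statically allocated 512 MB scratch buffer with a per-context buf_eval vector that grows geometrically once the measured per-token footprint no longer fits. The C++ sketch below illustrates that growth strategy in isolation and is not the actual llama_context code: the eval_buffer struct, the ensure_capacity helper, the 14 MB per-token figure, and the loop in main are assumptions made for the example, while the 512 MB starting size, the (n_past + N + 16) headroom, and the 1.618 growth factor come from the patch (which performs a single bump per evaluation, whereas the sketch loops until the estimate fits).

// Stand-alone sketch of the geometric growth used for the evaluation work buffer.
// Compile with: g++ -std=c++11 -o buf_growth buf_growth.cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

struct eval_buffer {
    std::vector<uint8_t> data;   // backing storage that would be handed to ggml_init
    size_t mem_per_token = 0;    // measured after the first single-token evaluation

    // Grow by ~1.618x until the estimated working set for the next batch fits.
    void ensure_capacity(int n_past, int N) {
        if (mem_per_token == 0) {
            return; // no estimate yet; rely on the initial allocation
        }
        while (mem_per_token * (size_t)(n_past + N + 16) > data.size()) {
            data.resize((size_t)(1.618 * data.size()));
        }
    }
};

int main() {
    eval_buffer buf;
    buf.data.resize(512u * 1024u * 1024u);    // initial size, as in llama_init_from_file
    buf.mem_per_token = 14u * 1024u * 1024u;  // assumed measurement for this example

    // Simulate a context that fills up in batches of 8 tokens.
    int n_past = 0;
    for (int step = 0; step < 4; ++step) {
        const int N = 8;
        buf.ensure_capacity(n_past, N);
        printf("n_past = %3d  buffer = %zu MB\n", n_past, buf.data.size() / (1024 * 1024));
        n_past += N;
    }
    return 0;
}

Growing by a constant factor keeps reallocations rare as the context fills, and sizing against n_past + N rather than N alone accounts for intermediate tensors (such as the copied V_trans from the previous patch) whose size grows with the context length.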