From b6e7f9b09e9c340ec97a2fae61c1eb8db861f2f9 Mon Sep 17 00:00:00 2001
From: xaedes <xaedes@gmail.com>
Date: Sat, 22 Apr 2023 08:21:32 +0200
Subject: [PATCH 01/74] llama : add api for getting/setting the complete state:
 rng, logits, embedding and kv_cache (#1105)

* reserve correct size for logits

* add functions to get and set the whole llama state:

including rng, logits, embedding and kv_cache

* remove unused variables

* remove trailing whitespace

* fix comment
---
 llama.cpp | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 llama.h   |  12 ++++++
 2 files changed, 133 insertions(+), 1 deletion(-)
diff --git a/llama.cpp b/llama.cpp
index 0345b61c6..00cce6e2a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -27,6 +27,7 @@
 #include <thread>
 #include <atomic>
 #include <mutex>
+#include <sstream>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -1787,7 +1788,7 @@ struct llama_context * llama_init_from_file(
         if (params.logits_all) {
             ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
         } else {
-            ctx->logits.reserve(hparams.n_ctx);
+            ctx->logits.reserve(hparams.n_vocab);
         }
 
         if (params.embedding){
@@ -2252,3 +2253,122 @@ const char * llama_print_system_info(void) {
 std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
+
+// Returns the size of the state
+size_t llama_get_state_size(struct llama_context * ctx) {
+    const size_t s_bool = sizeof(int32_t);
+    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
+    // for reference, std::mt19937(1337) serializes to 6701 bytes.
+    const size_t s_rng_size = sizeof(size_t);
+    const size_t s_rng = 64*1024;
+    const size_t s_logits_capacity = sizeof(size_t);
+    const size_t s_logits_size = sizeof(size_t);
+    const size_t s_logits = ctx->logits.capacity() * sizeof(float);
+    const size_t s_embedding_size = sizeof(size_t);
+    const size_t s_embedding = ctx->embedding.size() * sizeof(float);
+    const size_t s_kv_size = sizeof(size_t);
+    const size_t s_kv_ntok = sizeof(int);
+    const size_t s_kv = llama_get_kv_cache_size(ctx);
+    const size_t s_total = (
+        + s_rng_size
+        + s_rng
+        + s_logits_capacity
+        + s_logits_size
+        + s_logits
+        + s_embedding_size
+        + s_embedding
+        + s_kv_size
+        + s_kv_ntok
+        + s_kv
+    );
+    return s_total;
+}
+
+// Copies the state to the specified destination address
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+    std::stringstream rng_ss;
+    rng_ss << ctx->rng;
+    const size_t rng_size = rng_ss.str().size();
+    char rng_buf[64*1024];
+    memset(&rng_buf[0], 0, 64*1024);
+    memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+    const size_t logits_capacity = ctx->logits.capacity();
+    const size_t logits_size = ctx->logits.size();
+    const size_t embedding_size = ctx->embedding.size();
+    const size_t kv_size = llama_get_kv_cache_size(ctx);
+    const int kv_ntok = llama_get_kv_cache_token_count(ctx);
+
+    uint8_t * out = dest;
+    memcpy(out, &rng_size, sizeof(size_t)); out += sizeof(size_t);
+    memcpy(out, &rng_buf[0], 64*1024); out += 64*1024;
+    memcpy(out, &logits_capacity, sizeof(size_t)); out += sizeof(size_t);
+    memcpy(out, &logits_size, sizeof(size_t)); out += sizeof(size_t);
+    if (logits_size) {
+        memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+    }
+    out += logits_capacity * sizeof(float);
+    memcpy(out, &embedding_size, sizeof(size_t)); out += sizeof(size_t);
+    if (embedding_size) {
+        memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float)); out += embedding_size * sizeof(float);
+    }
+    memcpy(out, &kv_size, sizeof(size_t)); out += sizeof(size_t);
+    memcpy(out, &kv_ntok, sizeof(int)); out += sizeof(int);
+    if (kv_size) {
+        memcpy(out, llama_get_kv_cache(ctx), kv_size); out += kv_size;
+    }
+    const size_t written = out - dest;
+    const size_t expected = llama_get_state_size(ctx);
+    LLAMA_ASSERT(written == expected);
+    return written;
+}
+
+// Sets the state reading from the specified source address
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    size_t rng_size;
+    char rng_buf[64*1024];
+    std::stringstream rng_ss;
+
+    const uint8_t * in = src;
+    memcpy(&rng_size, in, sizeof(size_t)); in += sizeof(size_t);
+    memcpy(&rng_buf[0], in, 64*1024); in += 64*1024;
+    rng_ss.str(std::string(&rng_buf[0], rng_size));
+    rng_ss >> ctx->rng;
+    LLAMA_ASSERT(rng_ss.fail() == false);
+
+    size_t logits_capacity;
+    size_t logits_size;
+    size_t embedding_size;
+    size_t kv_size;
+    int kv_ntok;
+
+    memcpy(&logits_capacity, in, sizeof(size_t)); in += sizeof(size_t);
+    memcpy(&logits_size, in, sizeof(size_t)); in += sizeof(size_t);
+    LLAMA_ASSERT(ctx->logits.capacity() == logits_capacity);
+    if (logits_size) {
+        ctx->logits.resize(logits_size);
+        memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+    }
+    in += logits_capacity * sizeof(float);
+    memcpy(&embedding_size, in, sizeof(size_t)); in += sizeof(size_t);
+    LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
+    if (embedding_size) {
+        memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
+        in += embedding_size * sizeof(float);
+    }
+    memcpy(&kv_size, in, sizeof(size_t)); in += sizeof(size_t);
+    memcpy(&kv_ntok, in, sizeof(int)); in += sizeof(int);
+    if (kv_size) {
+        LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+        void * k_data = ctx->model.kv_self.k->data; // remember data pointers
+        void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+        memcpy(ctx->model.kv_self.buf.addr, in, kv_size);
+        ctx->model.kv_self.k->data = k_data; // restore correct data pointers
+        ctx->model.kv_self.v->data = v_data;
+        in += kv_size;
+    }
+    ctx->model.kv_self.n = kv_ntok;
+    const size_t nread = in - src;
+    const size_t expected = llama_get_state_size(ctx);
+    LLAMA_ASSERT(nread == expected);
+    return nread;
+}
diff --git a/llama.h b/llama.h
index e95ff73b8..f68a0cb40 100644
--- a/llama.h
+++ b/llama.h
@@ -129,6 +129,18 @@ extern "C" {
                           size_t   n_size,
                              int   n_token_count);
 
+    // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
+    LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+
+    // Copies the state to the specified destination address.
+    // Destination needs to have allocated enough memory.
+    // Returns the number of bytes copied
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+    // Set the state reading from the specified address
+    // Returns the number of bytes read
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls

From e9a9cb0c54461ffbda75b7b2f99f3ea5562291c2 Mon Sep 17 00:00:00 2001
From: Clint Herron <hanclinto@gmail.com>
Date: Sat, 22 Apr 2023 02:54:33 -0400
Subject: [PATCH 02/74] examples : Improve Alpaca Default Repeat Penalty:
 Better Match Alpaca.cpp Experience (#1107)

* Moving parameters to separate lines for readability.

* Increasing repeat_penalty to 1.1 to make alpaca more usable by default.

* Adding trailing newline.
---
 examples/alpaca.sh | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/examples/alpaca.sh b/examples/alpaca.sh
index 8d6261730..aef207f36 100755
--- a/examples/alpaca.sh
+++ b/examples/alpaca.sh
@@ -7,4 +7,13 @@
 cd `dirname $0`
 cd ..
 
-./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt --ctx_size 2048 -n -1 -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
+./main -m ./models/ggml-alpaca-7b-q4.bin \
+       --color \
+       -f ./prompts/alpaca.txt \
+       --ctx_size 2048 \
+       -n -1 \
+       -ins -b 256 \
+       --top_k 10000 \
+       --temp 0.2 \
+       --repeat_penalty 1.1 \
+       -t 7

From c5aa5e577741d0359ad26ec50b9e21a74c65d911 Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Sat, 22 Apr 2023 07:37:05 +0000
Subject: [PATCH 03/74] ggml : AVX2 optimization for vec_dot_q4_3_q8_0 and
 refactoring (#1099)

* AVX2 optimization for vec_dot_q4_3_q8_0 and refactoring

* finish AVX vectorization of quantize_row_q8_0

* Rename hsum_int_8 to hsum_i32_8
---
 ggml.c | 213 +++++++++++++++++++++++++--------------------------------
 1 file changed, 92 insertions(+), 121 deletions(-)

diff --git a/ggml.c b/ggml.c
index 2ea4e68fd..814776381 100644
--- a/ggml.c
+++ b/ggml.c
@@ -450,6 +450,24 @@ static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi)
     return bytes;
 }
 
+// horizontally add 8 floats
+static inline float hsum_float_8(const __m256 x) {
+    __m128 res = _mm256_extractf128_ps(x, 1);
+    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
+    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
+    res = _mm_add_ss(res, _mm_movehdup_ps(res));
+    return _mm_cvtss_f32(res);
+}
+
+// horizontally add 8 int32_t
+static inline int hsum_i32_8(const __m256i a) {
+    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
+    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
+    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
+    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
+}
+
 #if __AVX2__ || __AVX512F__
 // Unpack 32 4-bit fields into 32 bytes
 // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
@@ -470,6 +488,24 @@ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
     return bytes;
 }
 
+// add int16_t pairwise and return as float vector
+static inline __m256 sum_i16_pairs_float(const __m256i x) {
+    const __m256i ones = _mm256_set1_epi16(1);
+    const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
+    return _mm256_cvtepi32_ps(summed_pairs);
+}
+
+// multiply int8_t, add results pairwise twice and return as float vector
+static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
+    // Get absolute values of x vectors
+    const __m256i ax = _mm256_sign_epi8(x, x);
+    // Sign the values of the y vectors
+    const __m256i sy = _mm256_sign_epi8(y, x);
+    // Perform multiplication and create 16-bit values
+    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
+    return sum_i16_pairs_float(dot);
+}
+
 static inline __m128i packNibbles( __m256i bytes )
 {
     // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
@@ -1273,29 +1309,6 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
     }
 }
 
-#ifdef __AVX2__
-// There is no better way of doing this?
-// I guess not, AVX is not very good at horizontal sums.
-// The commented solution for a hotrizontal sum was suggested by @pubby as being slightly
-// faster than the solution below. As I don't have an AVX2 system handt right now to test,
-// keeping the original.
-// TODO: Please try and if it does make a differece, uncomment and remove the implementation below.
-//static inline float horizontal_sum(__m256i a) {
-//    __m256i b = _mm256_castps_si256(_mm256_movehdup_ps(_mm256_castsi256_ps(a)));
-//    __m256i sum = _mm256_add_epi32(a, b);
-//    __m256i hi = _mm256_unpackhi_epi64(sum, sum);
-//    sum = _mm256_add_epi32(sum, hi);
-//    return _mm256_cvtsi256_si32(sum) + _mm256_extract_epi32(sum, 4);
-//}
-static inline float horizontal_sum(__m256i a) {
-    __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extracti128_si256(a, 1));
-    __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
-    __m128i sum64 = _mm_add_epi32(hi64, sum128);
-    __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
-    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
-}
-#endif
-
 static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
     assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;
@@ -1384,9 +1397,8 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
         __m256i i3 = _mm256_cvtps_epi32( v3 );
 
 #if defined(__AVX2__)
-
         // Compute the sum of the quants and set y[i].s
-        y[i].s = d * horizontal_sum(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
+        y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
 
         // Convert int32 to int16
         i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
@@ -1413,6 +1425,11 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
         __m128i ni6 = _mm256_castsi256_si128( i3 );
         __m128i ni7 = _mm256_extractf128_si256( i3, 1);
 
+        // Compute the sum of the quants and set y[i].s
+        const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
+        const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
+        y[i].s = d * hsum_i32_8(_mm256_set_m128i(s1, s0));
+
         // Convert int32 to int16
         ni0 = _mm_packs_epi32( ni0, ni1 );
         ni2 = _mm_packs_epi32( ni2, ni3 );
@@ -1430,14 +1447,6 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
     // scalar
     quantize_row_q8_0_reference(x, y, k);
 #endif
-#if defined __AVX__
-    // TODO: vectorize this
-    for (int i=0; i<nb; ++i) {
-        int sum = 0;
-        for (int l=0; l<QK8_0; ++l) sum += y[i].qs[l];
-        y[i].s = y[i].d * sum;
-    }
-#endif
 }
 
 static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) {
@@ -2374,8 +2383,6 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     const block_q4_0 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
 
-    float sumf = 0.0;
-
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
@@ -2441,7 +2448,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 #endif
     }
 
-    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) - 8 * sum8;
+    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) - 8 * sum8;
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -2459,32 +2466,13 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 
         __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        // Get absolute values of x vectors
-        const __m256i ax = _mm256_sign_epi8(bx, bx);
-
-        // Sign the values of the y vectors
-        const __m256i sy = _mm256_sign_epi8(by, bx);
-
-        // Perform multiplication and create 16-bit values
-        const __m256i dot = _mm256_maddubs_epi16(ax, sy);
-
-        const __m256i ones = _mm256_set1_epi16(1);
-        __m256i xy_q = _mm256_madd_epi16(ones, dot);
-
-        /* Convert to vectore of 8 int32_t to 8 floats */
-        __m256 q = _mm256_cvtepi32_ps( xy_q );
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
 
         /* Multiply q with scale and accumulate */
         acc = _mm256_fmadd_ps( d, q, acc );
     }
 
-    // Return horizontal sum of the acc vector
-    __m128 res = _mm256_extractf128_ps( acc, 1 );
-    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
-    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
-    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
-
-    sumf = _mm_cvtss_f32( res );
+    *s = hsum_float_8(acc);
 #elif defined(__AVX__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -2523,15 +2511,10 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
     }
 
-    // Return horizontal sum of the acc vector
-    __m128 res = _mm256_extractf128_ps( acc, 1 );
-    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
-    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
-    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
-
-    sumf = _mm_cvtss_f32( res );
+    *s = hsum_float_8(acc);
 #else
     // scalar
+    float sumf = 0.0;
     for (int i = 0; i < nb; i++) {
         const float d0 = x[i].d;
         const float d1 = y[i].d;
@@ -2553,9 +2536,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         }
         sumf += d0*d1*sumi;
     }
-#endif
-
     *s = sumf;
+#endif
 }
 
 static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
@@ -2567,8 +2549,6 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void *
     const block_q4_1 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
 
-    float sumf = 0.0;
-
     // TODO: add AVX / WASM SIMD / etc
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -2635,7 +2615,7 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void *
 #endif
     }
 
-    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
+    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -2646,7 +2626,6 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void *
     for (int i = 0; i < nb; ++i) {
         const float * d0 = &x[i].d;
         const float * d1 = &y[i].d;
-        //const float * m0 = &x[i].m;
 
         summs += x[i].m * y[i].s;
 
@@ -2660,33 +2639,16 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void *
         const __m256i bx = bytes_from_nibbles_32(x[i].qs);
         const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
 
-        // Get absolute values of x vectors
-        const __m256i ax = _mm256_sign_epi8( bx, bx );
-
-        // Sign the values of the y vectors
-        const __m256i sy = _mm256_sign_epi8( by, bx );
-
-        // Perform multiplication and create 16-bit values
-        const __m256i dot = _mm256_maddubs_epi16( ax, sy );
-        const __m256i ones = _mm256_set1_epi16( 1 );
-        const __m256i xy_q = _mm256_madd_epi16( ones, dot );
-
-        // Convert to vector of 8 int32_t to 8 floats
-        const __m256 xy = _mm256_cvtepi32_ps( xy_q );
+        const __m256 xy = mul_sum_i8_pairs_float(bx, by);
 
         // Accumulate d0*d1*x*y
         acc = _mm256_fmadd_ps( d0d1, xy, acc );
     }
 
-    // Return horizontal sum of the acc vector
-    __m128 res = _mm256_extractf128_ps( acc, 1 );
-    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
-    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
-    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
-
-    sumf = _mm_cvtss_f32( res ) + summs;
+    *s = hsum_float_8(acc) + summs;
 #else
     // scalar
+    float sumf = 0.0;
     for (int i = 0; i < nb; i++) {
         const float d0 = x[i].d;
         const float m0 = x[i].m;
@@ -2708,9 +2670,8 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void *
             sumf += f0*f2 + f1*f3;
         }
     }
-#endif
-
     *s = sumf;
+#endif
 }
 
 static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
@@ -2723,8 +2684,6 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
     const block_q4_2 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
 
-    float sumf = 0.0;
-
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
@@ -2802,7 +2761,7 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
 #endif
     }
 
-    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -2824,32 +2783,16 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
 
         __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        // Get absolute values of x vectors
-        const __m256i ax = _mm256_sign_epi8(bx, bx);
-        // Sign the values of the y vectors
-        const __m256i sy = _mm256_sign_epi8(by, bx);
-        // Perform multiplication and create 16-bit values
-        const __m256i dot = _mm256_maddubs_epi16(ax, sy);
-
-        const __m256i ones = _mm256_set1_epi16(1);
-        __m256i xy_q = _mm256_madd_epi16(ones, dot);
-
-        /* Convert to vectore of 8 int32_t to 8 floats */
-        __m256 q = _mm256_cvtepi32_ps(xy_q);
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
 
         /* Multiply q with scale and accumulate */
         acc = _mm256_fmadd_ps(d, q, acc);
     }
 
-    // Return horizontal sum of the acc vector
-    __m128 res = _mm256_extractf128_ps(acc, 1);
-    res = _mm_add_ps(res, _mm256_castps256_ps128(acc));
-    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
-    res = _mm_add_ss(res, _mm_movehdup_ps(res));
-
-    sumf = _mm_cvtss_f32(res);
+    *s = hsum_float_8(acc);
 #else
     // scalar
+    float sumf = 0.0;
     for (int i = 0; i < nb; i++) {
         const uint8_t * restrict x0 = x[2*i + 0].qs;
         const uint8_t * restrict x1 = x[2*i + 1].qs;
@@ -2884,9 +2827,8 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
         sumf += (d0 * y[i].d) * sumi_0;
         sumf += (d1 * y[i].d) * sumi_1;
     }
-#endif
-
     *s = sumf;
+#endif
 }
 
 static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
@@ -2899,8 +2841,6 @@ static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void *
     const block_q4_3 * restrict x = vx;
     const block_q8_0 * restrict y = vy;
 
-    float sumf = 0.0;
-
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
@@ -2986,9 +2926,41 @@ static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void *
 #endif
     }
 
-    sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+#elif defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (int i = 0; i < nb; i++) {
+        const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d));
+        const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d));
+        const __m256 dx = _mm256_set_m128(d1, d0);
+
+        const __m128 m0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].m));
+        const __m128 m1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].m));
+        const __m256 mx = _mm256_set_m128(m1, m0);
+
+        const __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
+        const __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
+        const __m256i bx = _mm256_set_m128i(bx1, bx0);
+
+        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
+        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+        const __m256i syi = _mm256_maddubs_epi16(_mm256_set1_epi8(1), by);
+        const __m256 syf = sum_i16_pairs_float(syi);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+        const __m256 sxy = _mm256_fmadd_ps(q, dx, _mm256_mul_ps(mx, syf));
+        acc = _mm256_fmadd_ps(sxy, dy, acc);
+    }
+
+    *s = hsum_float_8(acc);
 #else
     // scalar
+    float sumf = 0.0;
     for (int i = 0; i < nb; i++) {
         const uint8_t * restrict x0 = x[2*i + 0].qs;
         const uint8_t * restrict x1 = x[2*i + 1].qs;
@@ -3031,9 +3003,8 @@ static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void *
         sumf += (d0*sxy_0 + m0*sy_0)*y[i].d;
         sumf += (d1*sxy_1 + m1*sy_1)*y[i].d;
     }
-#endif
-
     *s = sumf;
+#endif
 }
 
 

From 955ef9a5d53d8f911fe00580ac9bd0caa56430af Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 22 Apr 2023 10:55:35 +0300
Subject: [PATCH 04/74] ggml : alternative Q4_3 implementation using modified
 Q8_0 (#1109)

* ggml : prefer vzip to vuzp

This way we always use the same type of instruction across all quantizations

* ggml : alternative Q4_3 implementation using modified Q8_0

* ggml : fix Q4_3 scalar implementation

* ggml : slight improvement of Q4_3 - no need for loop unrolling

* ggml : fix AVX paths for Q8_0 quantization
---
 ggml.c | 150 +++++++++++++++++++++++++++------------------------------
 1 file changed, 70 insertions(+), 80 deletions(-)

diff --git a/ggml.c b/ggml.c
index 814776381..72b392fdb 100644
--- a/ggml.c
+++ b/ggml.c
@@ -656,10 +656,11 @@ static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong
 #define QK8_0 32
 typedef struct {
     float   d;          // delta
-    float   s;          // d * sum(qs[i])
+    float   s0;         // d * sum(qs[i]) low
+    float   s1;         // d * sum(qs[i]) high
     int8_t  qs[QK8_0];  // quants
 } block_q8_0;
-static_assert(sizeof(block_q8_0) == 2*sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+static_assert(sizeof(block_q8_0) == 3*sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
 
 
 // reference implementation for deterministic creation of model files
@@ -1299,13 +1300,22 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
 
         y[i].d = d;
 
-        int sum = 0;
-        for (int l = 0; l < QK8_0; ++l) {
-            const float v = x[i*QK8_0 + l]*id;
-            y[i].qs[l] = roundf(v);
-            sum += y[i].qs[l];
+        int sum0 = 0;
+        int sum1 = 0;
+
+        for (int l = 0; l < QK8_0/2; ++l) {
+            const float v0 = x[i*QK8_0           + l]*id;
+            const float v1 = x[i*QK8_0 + QK8_0/2 + l]*id;
+
+            y[i].qs[          l] = roundf(v0);
+            y[i].qs[QK8_0/2 + l] = roundf(v1);
+
+            sum0 += y[i].qs[          l];
+            sum1 += y[i].qs[QK8_0/2 + l];
         }
-        y[i].s = d * sum;
+
+        y[i].s0 = d * sum0;
+        y[i].s1 = d * sum1;
     }
 }
 
@@ -1335,9 +1345,11 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
 
         y[i].d = d;
 
-        int32x4_t accv = vdupq_n_s32(0);
+        int32x4_t accv0 = vdupq_n_s32(0);
+        int32x4_t accv1 = vdupq_n_s32(0);
 
-        for (int l = 0; l < 8; l++) {
+        // low half
+        for (int l = 0; l < 4; l++) {
             const float32x4_t v  = vmulq_n_f32(srcv[l], id);
             const int32x4_t   vi = vcvtnq_s32_f32(v);
 
@@ -1346,12 +1358,30 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
             y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
             y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
 
-            accv = vaddq_s32(accv, vi);
+            accv0 = vaddq_s32(accv0, vi);
         }
-        int32_t sum = vaddvq_s32(accv);
-        y[i].s = d * sum;
+
+        // high half
+        for (int l = 4; l < 8; l++) {
+            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
+            const int32x4_t   vi = vcvtnq_s32_f32(v);
+
+            y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
+
+            accv1 = vaddq_s32(accv1, vi);
+        }
+
+        const int32_t sum0 = vaddvq_s32(accv0);
+        const int32_t sum1 = vaddvq_s32(accv1);
+
+        y[i].s0 = d * sum0;
+        y[i].s1 = d * sum1;
     }
 #elif defined(__AVX2__) || defined(__AVX__)
+    // TODO !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
@@ -1398,7 +1428,9 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
 
 #if defined(__AVX2__)
         // Compute the sum of the quants and set y[i].s
-        y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
+        //y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
+        y[i].s0 = d * hsum_i32_8(_mm256_add_epi32(i0, i1));
+        y[i].s1 = d * hsum_i32_8(_mm256_add_epi32(i2, i3));
 
         // Convert int32 to int16
         i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
@@ -2395,7 +2427,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const block_q8_0 * restrict y0 = &y[i + 0];
         const block_q8_0 * restrict y1 = &y[i + 1];
 
-        sum8 += x0->d * y0->s + x1->d * y1->s;
+        sum8 += x0->d * (y0->s0 + y0->s1) + x1->d * (y1->s0 + y1->s1);
 
         const uint8x16_t m4b   = vdupq_n_u8(0xf);
 
@@ -2562,7 +2594,7 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void *
         const block_q8_0 * restrict y0 = &y[i + 0];
         const block_q8_0 * restrict y1 = &y[i + 1];
 
-        summs += x0->m * y0->s + x1->m * y1->s;
+        summs += x0->m * (y0->s0 + y0->s1) + x1->m * (y1->s0 + y1->s1);
 
         const uint8x16_t m4b = vdupq_n_u8(0xf);
 
@@ -2575,22 +2607,22 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void *
         const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
         const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
 
+        // interleave
+        const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h);
+        const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h);
+        const int8x16_t v0_1lz = vzip1q_s8(v0_1l, v0_1h);
+        const int8x16_t v0_1hz = vzip2q_s8(v0_1l, v0_1h);
+
         // load y
         const int8x16_t v1_0l = vld1q_s8(y0->qs);
         const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
         const int8x16_t v1_1l = vld1q_s8(y1->qs);
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
 
-        // interleave
-        const int8x16_t v1_0ls = vuzp1q_s8(v1_0l, v1_0h);
-        const int8x16_t v1_0hs = vuzp2q_s8(v1_0l, v1_0h);
-        const int8x16_t v1_1ls = vuzp1q_s8(v1_1l, v1_1h);
-        const int8x16_t v1_1hs = vuzp2q_s8(v1_1l, v1_1h);
-
 #if defined(__ARM_FEATURE_DOTPROD)
         // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0ls), v0_0h, v1_0hs);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1ls), v0_1h, v1_1hs);
+        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h);
+        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h);
 
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
@@ -2627,7 +2659,7 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void *
         const float * d0 = &x[i].d;
         const float * d1 = &y[i].d;
 
-        summs += x[i].m * y[i].s;
+        summs += x[i].m * (y[i].s0 + y[i].s1);
 
         const __m256 d0v = _mm256_broadcast_ss( d0 );
         const __m256 d1v = _mm256_broadcast_ss( d1 );
@@ -2845,88 +2877,53 @@ static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void *
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
-    for (int i = 0; i < nb; i += 2) {
+    float summs0 = 0.0f;
+    float summs1 = 0.0f;
+
+    for (int i = 0; i < nb; ++i) {
         const block_q4_3 * restrict x0_0 = &x[2*(i + 0) + 0];
         const block_q4_3 * restrict x0_1 = &x[2*(i + 0) + 1];
-        const block_q4_3 * restrict x1_0 = &x[2*(i + 1) + 0];
-        const block_q4_3 * restrict x1_1 = &x[2*(i + 1) + 1];
 
         const block_q8_0 * restrict y0 = &y[i + 0];
-        const block_q8_0 * restrict y1 = &y[i + 1];
 
-        const uint8x16_t m4b = vdupq_n_u8(0xf);
-
-        const float x0_0d = GGML_FP16_TO_FP32(x0_0->d);
-        const float x0_1d = GGML_FP16_TO_FP32(x0_1->d);
-        const float x1_0d = GGML_FP16_TO_FP32(x1_0->d);
-        const float x1_1d = GGML_FP16_TO_FP32(x1_1->d);
-
-        const float x0_0m = GGML_FP16_TO_FP32(x0_0->m);
-        const float x0_1m = GGML_FP16_TO_FP32(x0_1->m);
-        const float x1_0m = GGML_FP16_TO_FP32(x1_0->m);
-        const float x1_1m = GGML_FP16_TO_FP32(x1_1->m);
+        summs0 += GGML_FP16_TO_FP32(x0_0->m) * y0->s0;
+        summs1 += GGML_FP16_TO_FP32(x0_1->m) * y0->s1;
 
         const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs));
-        const uint8x16_t v0_1 = vcombine_u8(vld1_u8(x1_0->qs), vld1_u8(x1_1->qs));
 
         // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
+        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, vdupq_n_u8(0xf)));
         const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
 
         // interleave
         const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h);
         const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h);
-        const int8x16_t v0_1lz = vzip1q_s8(v0_1l, v0_1h);
-        const int8x16_t v0_1hz = vzip2q_s8(v0_1l, v0_1h);
 
         // load y
         const int8x16_t v1_0l = vld1q_s8(y0->qs);
         const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
 
-        const int16x8_t sy0_0 = vaddq_s16(vmovl_s8(vget_low_s8(v1_0l)), vmovl_s8(vget_high_s8(v1_0l)));
-        const int16x8_t sy0_1 = vaddq_s16(vmovl_s8(vget_low_s8(v1_0h)), vmovl_s8(vget_high_s8(v1_0h)));
-
-        const int16x8_t sy1_0 = vaddq_s16(vmovl_s8(vget_low_s8(v1_1l)), vmovl_s8(vget_high_s8(v1_1l)));
-        const int16x8_t sy1_1 = vaddq_s16(vmovl_s8(vget_low_s8(v1_1h)), vmovl_s8(vget_high_s8(v1_1h)));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddl_s16(vget_low_s16(sy0_0), vget_high_s16(sy0_0))), x0_0m*y0->d);
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddl_s16(vget_low_s16(sy0_1), vget_high_s16(sy0_1))), x0_1m*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddl_s16(vget_low_s16(sy1_0), vget_high_s16(sy1_0))), x1_0m*y1->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddl_s16(vget_low_s16(sy1_1), vget_high_s16(sy1_1))), x1_1m*y1->d);
+        const float x0_0d = GGML_FP16_TO_FP32(x0_0->d);
+        const float x0_1d = GGML_FP16_TO_FP32(x0_1->d);
 
 #if defined(__ARM_FEATURE_DOTPROD)
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), x0_0d*y0->d);
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), x0_1d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l)), x1_0d*y1->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hz, v1_1h)), x1_1d*y1->d);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), x0_1d*y0->d);
 #else
         const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
         const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
         const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
         const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
 
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
-
         const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
         const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
 
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(pl0), x0_0d*y0->d);
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(ph0), x0_1d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(pl1), x1_0d*y1->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(ph1), x1_1d*y1->d);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(ph0), x0_1d*y0->d);
 #endif
     }
 
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+    *s = vaddvq_f32(vaddq_f32(sumv0, sumv1)) + summs0 + summs1;
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -2971,9 +2968,6 @@ static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void *
         const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d);
         const float m1 = GGML_FP16_TO_FP32(x[2*i + 1].m);
 
-        int sy_0 = 0;
-        int sy_1 = 0;
-
         int sxy_0 = 0;
         int sxy_1 = 0;
 
@@ -2993,15 +2987,11 @@ static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void *
             const int y0_1 = y0[2*(j + QK8_0/4) + 0];
             const int y1_1 = y0[2*(j + QK8_0/4) + 1];
 
-            sy_0 += y0_0 + y1_0;
-            sy_1 += y0_1 + y1_1;
-
             sxy_0 += x0_0*y0_0 + x1_0*y1_0;
             sxy_1 += x0_1*y0_1 + x1_1*y1_1;
         }
 
-        sumf += (d0*sxy_0 + m0*sy_0)*y[i].d;
-        sumf += (d1*sxy_1 + m1*sy_1)*y[i].d;
+        sumf += (d0*sxy_0 + d1*sxy_1)*y[i].d + m0*y[i].s0 + m1*y[i].s1;
     }
     *s = sumf;
 #endif

From 872c365a9176a011b13d31269bb3121fa89c37e1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 22 Apr 2023 11:08:12 +0300
Subject: [PATCH 05/74] ggml : fix AVX build + update to new Q8_0 format

---
 Makefile  | 10 +++++++---
 ggml.c    | 12 ++++++++++--
 llama.cpp |  6 +++---
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/Makefile b/Makefile
index 3b48eec99..b297959c9 100644
--- a/Makefile
+++ b/Makefile
@@ -74,13 +74,17 @@ endif
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	# Use all CPU extensions that are available:
-	CFLAGS += -march=native -mtune=native
+	CFLAGS   += -march=native -mtune=native
 	CXXFLAGS += -march=native -mtune=native
+
+	# Usage AVX-only
+	#CFLAGS   += -mfma -mf16c -mavx
+	#CXXFLAGS += -mfma -mf16c -mavx
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
-		CFLAGS += -mcpu=power9
+		CFLAGS   += -mcpu=power9
 		CXXFLAGS += -mcpu=power9
 	endif
 	# Require c++23's std::byteswap for big-endian support.
@@ -114,7 +118,7 @@ ifdef LLAMA_GPROF
 	CXXFLAGS += -pg
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
-	CFLAGS += -mcpu=native
+	CFLAGS   += -mcpu=native
 	CXXFLAGS += -mcpu=native
 endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
diff --git a/ggml.c b/ggml.c
index 72b392fdb..46c0292fe 100644
--- a/ggml.c
+++ b/ggml.c
@@ -468,6 +468,14 @@ static inline int hsum_i32_8(const __m256i a) {
     return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
 }
 
+// horizontally add 4 int32_t: returns the sum of the four 32-bit lanes of a
+static inline int hsum_i32_4(const __m128i a) {
+    const __m128i hi64 = _mm_unpackhi_epi64(a, a); // lanes 2,3 of a copied into the lower half
+    const __m128i sum64 = _mm_add_epi32(hi64, a); // lane 0 = a0+a2, lane 1 = a1+a3
+    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); // swap adjacent lanes so lane 0 holds a1+a3
+    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); // lane 0 now holds a0+a1+a2+a3
+}
+
 #if __AVX2__ || __AVX512F__
 // Unpack 32 4-bit fields into 32 bytes
 // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
@@ -1381,7 +1389,6 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
         y[i].s1 = d * sum1;
     }
 #elif defined(__AVX2__) || defined(__AVX__)
-    // TODO !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
@@ -1460,7 +1467,8 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
         // Compute the sum of the quants and set y[i].s
         const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
         const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
-        y[i].s = d * hsum_i32_8(_mm256_set_m128i(s1, s0));
+        y[i].s0 = d * hsum_i32_4(s0);
+        y[i].s1 = d * hsum_i32_4(s1);
 
         // Convert int32 to int16
         ni0 = _mm_packs_epi32( ni0, ni1 );
diff --git a/llama.cpp b/llama.cpp
index 00cce6e2a..4e92f5515 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -68,7 +68,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
         { MODEL_65B,   512ull * MB },
     };
     return _MEM_REQ_SCRATCH1;
-};
+}
 
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
@@ -80,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
         { MODEL_65B,  5120ull * MB },
     };
     return _MEM_REQ_KV_SELF;
-};
+}
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
@@ -93,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
         { MODEL_65B, 1536ull * MB },
     };
     return _MEM_REQ_EVAL;
-};
+}
 
 // default hparams (LLaMA 7B)
 struct llama_hparams {

From 7e312f165c5047d6e16680d1eebc83055e95c313 Mon Sep 17 00:00:00 2001
From: Howard Su <howard0su@gmail.com>
Date: Sat, 22 Apr 2023 16:18:20 +0800
Subject: [PATCH 06/74] cmake : fix build under Windows when enable
 BUILD_SHARED_LIBS (#1100)

* Fix build under Windows when BUILD_SHARED_LIBS is enabled

* Make the AVX512 test on Windows build the shared libs
---
 .github/workflows/build.yml | 2 +-
 CMakeLists.txt              | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7e8a29b1e..b2a35613e 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -169,7 +169,7 @@ jobs:
          - build: 'avx'
            defines: '-DLLAMA_AVX2=OFF'
          - build: 'avx512'
-           defines: '-DLLAMA_AVX512=ON'
+           defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
 
     steps:
       - name: Clone
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c3c60167..2d4e30e5a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -201,6 +201,10 @@ endif()
 
 if (MSVC)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+
+    if (BUILD_SHARED_LIBS)
+        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+    endif()
 endif()
 
 if (LLAMA_LTO)

From 10f19c1121068ce3dab9bece03a8b9caaea2db36 Mon Sep 17 00:00:00 2001
From: eiery <19350831+eiery@users.noreply.github.com>
Date: Sat, 22 Apr 2023 04:27:05 -0400
Subject: [PATCH 07/74] llama : have n_batch default to 512 (#1091)

* set default n_batch to 512 when using BLAS

* spacing

* alternate implementation of setting different n_batch for BLAS

* set n_batch to 512 for all cases
---
 examples/common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/common.h b/examples/common.h
index cbbc2dfab..0470368d5 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -20,7 +20,7 @@ struct gpt_params {
     int32_t repeat_last_n = 64;   // last n tokens to penalize
     int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx         = 512;  // context size
-    int32_t n_batch       = 8;    // batch size for prompt processing
+    int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
 
     // sampling parameters

From 36b4f7e06406eed8a605cc9f2921d9244ef6a8e5 Mon Sep 17 00:00:00 2001
From: wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
Date: Sat, 22 Apr 2023 16:56:35 +0800
Subject: [PATCH 08/74] llama : print timings on ctrl+c exit (#1021)

* print timings on ctrl+c exit

* remove redundant free memory call.

* add global pointer to ctx.
---
 examples/main/main.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 65db79263..6d79a7e6f 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -25,6 +25,7 @@
 #endif
 
 static console_state con_st;
+static llama_context ** g_ctx;
 
 static bool is_interacting = false;
 
@@ -36,6 +37,7 @@ void sigint_handler(int signo) {
         if (!is_interacting) {
             is_interacting=true;
         } else {
+            llama_print_timings(*g_ctx);
             _exit(130);
         }
     }
@@ -92,8 +94,9 @@ int main(int argc, char ** argv) {
 
 //    params.prompt = R"(// this function checks if the number n is prime
 //bool is_prime(int n) {)";
-
+    
     llama_context * ctx;
+    g_ctx = &ctx;
 
     // load the model
     {

From 5f939498d517b4dddbe904f202e895a3ecfb9dc4 Mon Sep 17 00:00:00 2001
From: unbounded <haakon@likedan.net>
Date: Sat, 22 Apr 2023 11:10:39 +0200
Subject: [PATCH 09/74] ggml : unit test for quantization functions (#953)

* Unit test for quantization functions

Use the ggml_internal_get_quantize_fn function to loop through all
quantization formats and run a sanity check on the result.

Also add a microbenchmark that times these functions directly without
running the rest of the GGML graph.

* test-quantize-fns: CI fixes

Fix issues uncovered in CI
 - need to use sizes divisible by 32*8 for loop unrolling
 - use intrinsic header that should work on Mac

* test-quantize: remove

Per PR comment, subsumed by test-quantize-fns

* test-quantize: fix for q8_0 intermediates
---
 tests/CMakeLists.txt         |   3 +-
 tests/test-quantize-fns.cpp  | 154 +++++++++++++++++
 tests/test-quantize-perf.cpp | 310 +++++++++++++++++++++++++++++++++++
 tests/test-quantize.c        |  42 -----
 4 files changed, 466 insertions(+), 43 deletions(-)
 create mode 100644 tests/test-quantize-fns.cpp
 create mode 100644 tests/test-quantize-perf.cpp
 delete mode 100644 tests/test-quantize.c

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 157d7336e..81eadbc4d 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -6,5 +6,6 @@ function(llama_add_test source)
 endfunction()
 
 # llama_add_test(test-double-float.c) # SLOW
-llama_add_test(test-quantize.c)
+llama_add_test(test-quantize-fns.cpp)
+llama_add_test(test-quantize-perf.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
new file mode 100644
index 000000000..5a5410152
--- /dev/null
+++ b/tests/test-quantize-fns.cpp
@@ -0,0 +1,154 @@
+// Unit tests for quantization specific functions - quantize, dequantize and dot product
+
+#include "ggml.h"
+
+#undef NDEBUG
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+
+const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001;
+const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002;
+const float MAX_DOT_PRODUCT_ERROR = 0.02;
+
+const char* RESULT_STR[] = {"ok", "FAILED"};
+
+
+// Fill dst[0..n) with deterministic synthetic data: dst[i] = 0.1 + 2*cos(i + offset)
+void generate_data(float offset, size_t n, float * dst) {
+    for (size_t i = 0; i < n; i++) {
+        dst[i] = 0.1 + 2*cosf(i + offset); // index is converted to float before the cosine
+    }
+}
+
+// Error metric between two float arrays of length n: sqrt(sum of squared differences) / n
+float array_rmse(const float * a1, const float * a2, size_t n) {
+    double sum = 0;
+    for (size_t i = 0; i < n; i++) {
+        double diff = a1[i] - a2[i];
+        sum += diff * diff;
+    }
+    return sqrtf(sum) / n; // NOTE(review): divides by n outside the root, so this is not the textbook RMSE sqrt(sum/n) - confirm intended
+}
+
+// Round-trip error on test data: quantize, dequantize again, and compare the result against the input
+float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
+    std::vector<uint8_t> tmp_q(test_size); // quantized scratch, one byte per input value; NOTE(review): assumes no format needs more - confirm
+    std::vector<float> tmp_out(test_size);
+
+    qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
+    qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size);
+    return array_rmse(test_data, tmp_out.data(), test_size);
+}
+
+// Difference between quantize_row_q and quantize_row_q_reference, measured on their dequantized outputs
+float reference_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
+    std::vector<uint8_t> tmp_q(test_size); // scratch reused for both quantization passes
+    std::vector<float> tmp_out(test_size);
+    std::vector<float> tmp_out_ref(test_size);
+
+    qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
+    qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size);
+
+    qfns.quantize_row_q_reference(test_data, tmp_q.data(), test_size);
+    qfns.dequantize_row_q(tmp_q.data(), tmp_out_ref.data(), test_size);
+
+    return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
+}
+
+float dot_product(const float * a1, const float * a2, size_t test_size) { // plain float reference dot product
+    double sum = 0; // accumulate in double; each product is still computed in float
+    for (size_t i = 0; i < test_size; i++) {
+        sum += a1[i] * a2[i];
+    }
+    return sum;
+}
+
+// Dot product error: |vec_dot_q on quantized inputs - float reference| / test_size
+float dot_product_error(quantize_fns_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
+    std::vector<uint8_t> tmp_q1(test_size);
+    std::vector<uint8_t> tmp_q2(test_size*2); // second operand goes through quantize_row_q_dot and gets twice the scratch space
+
+    qfns.quantize_row_q(test_data1, tmp_q1.data(), test_size);
+    qfns.quantize_row_q_dot(test_data2, tmp_q2.data(), test_size);
+
+    float result = INFINITY; // sentinel: stays non-finite if vec_dot_q does not write it
+    qfns.vec_dot_q(test_size, &result, tmp_q1.data(), tmp_q2.data());
+
+    const float dot_ref = dot_product(test_data1, test_data2, test_size);
+
+    return fabsf(result - dot_ref) / test_size;
+}
+
+int main(int argc, char * argv[]) { // accepts only -v (verbose); returns nonzero on an unknown argument or any failed check
+    bool verbose = false;
+    const size_t test_size = 32 * 128; // number of float values in each test vector
+
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-v") {
+            verbose = true;
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            return 1;
+        }
+    }
+
+    std::vector<float> test_data(test_size);
+    std::vector<float> test_data2(test_size);
+
+    generate_data(0.0, test_data.size(), test_data.data());
+    generate_data(1.0, test_data2.size(), test_data2.data()); // same generator, different phase offset
+
+    // Initialize GGML, ensures float conversion tables are initialized
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ 1*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ true,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    int num_failed = 0; // total number of failed checks across all types
+    bool failed = false;
+
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        ggml_type type = (ggml_type) i;
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+
+        if (qfns.quantize_row_q) {
+            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
+            failed = !(total_error < MAX_QUANTIZATION_TOTAL_ERROR);
+            num_failed += failed;
+            if (failed || verbose) {
+                printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
+            }
+
+            const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
+            failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR); // negated "<" also fails on NaN
+            num_failed += failed;
+            if (failed || verbose) {
+                printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
+            }
+
+            const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
+            failed = !(vec_dot_error < MAX_DOT_PRODUCT_ERROR);
+            num_failed += failed;
+            if (failed || verbose) {
+                printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
+            }
+        }
+    }
+
+    if (num_failed || verbose) {
+        printf("%d tests failed\n", num_failed);
+    }
+
+    ggml_free(ctx);
+
+    return num_failed > 0; // exit status 1 if any check failed, 0 otherwise
+}
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
new file mode 100644
index 000000000..883df05fe
--- /dev/null
+++ b/tests/test-quantize-perf.cpp
@@ -0,0 +1,310 @@
+// Benchmark quantization specific functions on synthetic data
+
+#include "ggml.h"
+
+#undef NDEBUG
+#include <algorithm>
+#include <assert.h>
+#include <functional>
+#include <inttypes.h>
+#include <math.h>
+#include <memory>
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+#define MAX_ALIGNMENT 64
+#define QK 32
+#define WARMUP 5
+#define ITERATIONS 10
+
+#define L1_SIZE      32*128
+#define L2_SIZE     32*2048
+#define L3_SIZE    32*20480
+#define MEM_SIZE 32*2048000
+
+struct quantize_perf_params { // command-line options of the quantization benchmark
+    std::vector<std::string> include_types; // type names to benchmark, matched against ggml_type_name; empty means all types
+    std::vector<size_t> test_sizes; // numbers of float values to benchmark with
+    size_t alignment_offset = 0; // byte offset added to each aligned test buffer
+    bool op_quantize_row_q_reference = false; // op_* flags select what to time; main enables all of them when none is set
+    bool op_quantize_row_q = false;
+    bool op_dequantize_row_q = false;
+    bool op_quantize_row_q_dot = false;
+    bool op_vec_dot_q = false;
+};
+
+
+#if defined(__x86_64__) || defined(__i386__)
+
+#include <x86intrin.h>
+inline int64_t cpu_cycles() { // read the x86 timestamp counter
+// Rough way to detect new-ish CPUs: __rdtscp is used only when __POPCNT__ is defined
+#ifdef __POPCNT__
+    unsigned int dummy; // receives the auxiliary value written by __rdtscp; not used
+    return __rdtscp(&dummy);
+#else
+    return __rdtsc();
+#endif
+}
+
+#else
+
+#define cpu_cycles() 0
+
+#endif
+
+
+// Fill dst[0..n) with deterministic synthetic data: dst[i] = 0.1 + 2*cos(i + offset)
+void generate_data(float offset, size_t n, float * dst) {
+    for (size_t i = 0; i < n; i++) {
+        dst[i] = 0.1 + 2*cosf(i + offset); // index is converted to float before the cosine
+    }
+}
+
+float gigabytes_per_second(size_t bytes, int64_t usecs) { // throughput of `bytes` moved in `usecs` microseconds, in units of 2^30 bytes per second
+    return bytes / (float) usecs * 1000000 / (1024*1024*1024); // NOTE(review): usecs == 0 is a float division by zero
+}
+
+void * align_with_offset(void * ptr, int offset) { // round ptr up to a MAX_ALIGNMENT boundary, then add offset bytes
+    size_t dummy_size = MAX_ALIGNMENT * 4; // space bound handed to std::align; not the real buffer size, so callers must allocate enough slack
+    return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
+}
+
+void benchmark_function(size_t size, size_t q_size, std::function<size_t(void)> function) { // run `function` WARMUP times untimed, then ITERATIONS timed, and print statistics
+    int64_t min_time_us = INT64_MAX; // NOTE(review): updated in the loop below but never printed
+    int64_t total_time_us = 0;
+    int64_t min_time_cycles = INT64_MAX;
+    int64_t total_time_cycles = 0;
+
+    for (int i = 0; i < WARMUP; i++) {
+        function();
+    }
+
+
+    for (int i = 0; i < ITERATIONS; i++) {
+        const int64_t start_time = ggml_time_us();
+        const int64_t start_cycles = cpu_cycles();
+
+        function();
+
+        const int64_t end_cycles = cpu_cycles(); // cycle counter is read innermost, wall clock outermost
+        const int64_t end_time = ggml_time_us();
+
+        total_time_cycles += end_cycles - start_cycles;
+        min_time_cycles = std::min(min_time_cycles, end_cycles - start_cycles);
+        total_time_us += end_time - start_time;
+        min_time_us = std::min(min_time_us, end_time - start_time);
+    }
+
+    printf("      min cycles/%d vals   : %9.2f\n",  QK, QK * min_time_cycles / (float) size);
+    printf("      avg cycles/%d vals   : %9.2f\n",  QK, QK * total_time_cycles / (float) (size * ITERATIONS));
+    printf("      float32 throughput   : %9.2f GB/s\n",  gigabytes_per_second(4 * size * ITERATIONS, total_time_us)); // 4 bytes per float32 value
+    printf("      quantized throughput : %9.2f GB/s\n",  gigabytes_per_second(q_size * ITERATIONS, total_time_us));
+}
+
+int main(int argc, char * argv[]) { // parse options, then time each selected operation for every quantization type and test size
+    quantize_perf_params params {};
+
+    // read command line
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "--size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            size_t size = std::stoi(argv[i]); // NOTE(review): std::stoi throws on non-numeric input, and a negative value wraps when stored in size_t
+            if (size % 32 != 0) {
+                fprintf(stderr, "error: size %zu not divisible by 32\n", size);
+                invalid_param = true;
+                break;
+            }
+            params.test_sizes.push_back(size);
+        } else if (arg == "-3") {
+            // quick select sizes that probably fit in CPU caches
+            params.test_sizes.push_back(L1_SIZE);
+            params.test_sizes.push_back(L2_SIZE);
+            params.test_sizes.push_back(L3_SIZE);
+        } else if (arg == "-4") {
+            // quick select cache sizes + memory
+            params.test_sizes.push_back(L1_SIZE);
+            params.test_sizes.push_back(L2_SIZE);
+            params.test_sizes.push_back(L3_SIZE);
+            params.test_sizes.push_back(MEM_SIZE);
+        } else if (arg == "--op") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string op {argv[i]};
+            if (op == "quantize_row_q_reference") {
+                params.op_quantize_row_q_reference = true;
+            } else if (op == "quantize_row_q") {
+                params.op_quantize_row_q = true;
+            } else if (op == "dequantize_row_q") {
+                params.op_dequantize_row_q = true;
+            } else if (op == "quantize_row_q_dot") {
+                params.op_quantize_row_q_dot = true;
+            } else if (op == "vec_dot_q") {
+                params.op_vec_dot_q = true;
+            } else {
+                invalid_param = true;
+                break;
+            }
+        } else if (arg == "--type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.include_types.push_back(argv[i]);
+        } else if (arg == "--alignment-offset") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int alignment = std::stoi(argv[i]);
+            if (alignment < 0 || alignment > MAX_ALIGNMENT) {
+            fprintf(stderr, "error: aligment-offset must be less than %d\n", MAX_ALIGNMENT); // NOTE(review): "aligment" is misspelled, and the check above also accepts alignment == MAX_ALIGNMENT
+                invalid_param = true;
+                break;
+            }
+            params.alignment_offset = alignment;
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            return 1;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); // arg holds the option name, not the rejected value
+        return 1;
+    }
+
+    if (params.test_sizes.empty()) {
+        params.test_sizes.push_back(L1_SIZE); // default to the smallest preset size
+    }
+    if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) {
+        params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true;
+    }
+
+    std::sort(params.test_sizes.begin(), params.test_sizes.end());
+    size_t largest = params.test_sizes.back(); // sizes are sorted ascending, so the last one is the largest
+
+    std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2); // 4 bytes per value plus slack for the alignment round-up and the requested offset
+    std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_q1_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_q2_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_out_v(largest*4 + MAX_ALIGNMENT*2);
+
+    float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset);
+    float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset);
+    float * test_q1 = (float *) align_with_offset(test_q1_v.data(), params.alignment_offset); // NOTE(review): typed float * but receives quantized output - presumably treated as opaque storage by the callbacks
+    float * test_q2 = (float *) align_with_offset(test_q2_v.data(), params.alignment_offset);
+    float * test_out = (float *) align_with_offset(test_out_v.data(), params.alignment_offset);
+
+    generate_data(0, largest, test_data1);
+    generate_data(1, largest, test_data2);
+
+
+    // Initialize GGML, ensures float conversion tables are initialized
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ 1*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ true,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        ggml_type type = (ggml_type) i;
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
+            continue;
+        }
+
+        if (qfns.quantize_row_q) {
+            printf("%s\n", ggml_type_name(type));
+
+            if (params.op_quantize_row_q_reference) {
+                printf("  quantize_row_q_reference\n");
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        qfns.quantize_row_q_reference(test_data1, test_q1, size);
+                        return test_q1[0]; // returns a value read from the output buffer
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); // number of blocks times bytes per block
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_quantize_row_q) {
+                printf("  quantize_row_q\n");
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        qfns.quantize_row_q(test_data1, test_q1, size);
+                        return test_q1[0];
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_dequantize_row_q) {
+                printf("  dequantize_row_q\n");
+                qfns.quantize_row_q(test_data1, test_q1, largest); // prepare the quantized input once, outside the timed region
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        qfns.dequantize_row_q(test_q1, test_out, size);
+                        return test_out[0];
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_quantize_row_q_dot) {
+                printf("  quantize_row_q_dot\n");
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        qfns.quantize_row_q_dot(test_data1, test_q1, size);
+                        return test_q1[0];
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type); // NOTE(review): uses the size of `type`, which may differ from the quantize_row_q_dot output format - confirm
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_vec_dot_q) {
+                printf("  vec_dot_q\n");
+                qfns.quantize_row_q(test_data1, test_q1, largest);
+                qfns.quantize_row_q(test_data2, test_q2, largest); // NOTE(review): the unit test prepares the second vec_dot_q operand with quantize_row_q_dot - confirm which format vec_dot_q expects
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        float result;
+                        qfns.vec_dot_q(size, &result, test_q1, test_q2);
+                        return result;
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+        }
+    }
+
+    ggml_free(ctx);
+
+    return 0;
+}
diff --git a/tests/test-quantize.c b/tests/test-quantize.c
deleted file mode 100644
index 993e9dcc3..000000000
--- a/tests/test-quantize.c
+++ /dev/null
@@ -1,42 +0,0 @@
-#include "ggml.h"
-#undef NDEBUG
-#include <assert.h>
-#include <math.h>
-
-int main(void) {
-    #define QK 32
-    float src[QK];
-    uint8_t dst[24];
-    int64_t hist[16];
-
-    for (int i = 0; i < QK; i++) {
-        src[i] = (float)(i + 1);
-    }
-
-    size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
-    assert(size == 20);
-    float max_result = ((float *)dst)[0];
-    float max_expected = src[31] / ((1 << 3) - 1);
-    assert(max_result == max_expected);
-    for (int i = 0; i < QK; i++) {
-        uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
-        uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
-        assert(q4_result == q4_expected);
-    }
-
-    size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
-    assert(size == 24);
-    float delta_result = ((float *)dst)[0];
-    float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
-    assert(delta_result == delta_expected);
-    float min_result = ((float *)dst)[1];
-    float min_expected = src[0];
-    assert(min_result == min_expected);
-    for (int i = 0; i < QK; i++) {
-        uint8_t q4_result = (i % 2) ? (dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF);
-        uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected);
-        assert(q4_result == q4_expected);
-    }
-
-    return 0;
-}

From c50b628810f36a3e6e0324371f6db579eacefa0e Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Sat, 22 Apr 2023 10:54:13 +0000
Subject: [PATCH 10/74] Fix CI: ARM NEON, quantization unit tests, editorconfig
 (#1122)

---
 examples/main/main.cpp       |  2 +-
 ggml.c                       | 16 ++++++++--------
 llama.cpp                    |  1 -
 tests/test-quantize-fns.cpp  |  2 +-
 tests/test-quantize-perf.cpp |  2 +-
 5 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 6d79a7e6f..decf41a9f 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -94,7 +94,7 @@ int main(int argc, char ** argv) {
 
 //    params.prompt = R"(// this function checks if the number n is prime
 //bool is_prime(int n) {)";
-    
+
     llama_context * ctx;
     g_ctx = &ctx;
 
diff --git a/ggml.c b/ggml.c
index 46c0292fe..d9a95af8d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2635,15 +2635,15 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void *
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
 #else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0ls));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0ls));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0hs));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0hs));
+        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
+        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
+        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
+        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
 
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1ls));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1ls));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1hs));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1hs));
+        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
+        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
+        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
+        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
 
         const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
         const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
diff --git a/llama.cpp b/llama.cpp
index 4e92f5515..34327ecfa 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2256,7 +2256,6 @@ std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_te
 
 // Returns the size of the state
 size_t llama_get_state_size(struct llama_context * ctx) {
-    const size_t s_bool = sizeof(int32_t);
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
     // for reference, std::mt19937(1337) serializes to 6701 bytes.
     const size_t s_rng_size = sizeof(size_t);
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
index 5a5410152..7e091e8c4 100644
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -120,7 +120,7 @@ int main(int argc, char * argv[]) {
         ggml_type type = (ggml_type) i;
         quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
 
-        if (qfns.quantize_row_q) {
+        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
             const float total_error = total_quantization_error(qfns, test_size, test_data.data());
             failed = !(total_error < MAX_QUANTIZATION_TOTAL_ERROR);
             num_failed += failed;
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index 883df05fe..d5514455d 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -225,7 +225,7 @@ int main(int argc, char * argv[]) {
             continue;
         }
 
-        if (qfns.quantize_row_q) {
+        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
             printf("%s\n", ggml_type_name(type));
 
             if (params.op_quantize_row_q_reference) {

From 857308d1e8fb6afe33edb481d48560eee8fe7d7c Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Sat, 22 Apr 2023 13:12:29 +0000
Subject: [PATCH 11/74] ci : trigger CI for drafts, but not most PR actions
 (#1125)

---
 .github/workflows/build.yml | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index b2a35613e..7c40b0c12 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -12,7 +12,7 @@ on:
       - master
     paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
   pull_request:
-    types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
+    types: [opened, synchronize, reopened]
     paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
 
 env:
@@ -20,8 +20,6 @@ env:
 
 jobs:
   ubuntu-latest-make:
-    if: github.event.pull_request.draft == false
-
     runs-on: ubuntu-latest
 
     steps:
@@ -41,8 +39,6 @@ jobs:
           make
 
   ubuntu-latest-cmake:
-    if: github.event.pull_request.draft == false
-
     runs-on: ubuntu-latest
 
     steps:
@@ -71,8 +67,6 @@ jobs:
           ctest --verbose
 
   ubuntu-latest-cmake-sanitizer:
-    if: github.event.pull_request.draft == false
-
     runs-on: ubuntu-latest
 
     continue-on-error: true
@@ -108,8 +102,6 @@ jobs:
           ctest --verbose
 
   macOS-latest-make:
-    if: github.event.pull_request.draft == false
-
     runs-on: macos-latest
 
     steps:
@@ -128,8 +120,6 @@ jobs:
           make
 
   macOS-latest-cmake:
-    if: github.event.pull_request.draft == false
-
     runs-on: macOS-latest
 
     steps:
@@ -157,8 +147,6 @@ jobs:
           ctest --verbose
 
   windows-latest-cmake:
-    if: github.event.pull_request.draft == false
-
     runs-on: windows-latest
 
     strategy:

From 0e018fe008eacebdbcfa2d61b6c988c245c961cd Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 22 Apr 2023 16:31:56 +0300
Subject: [PATCH 12/74] ggml : fix Q4_3 cuBLAS

---
 CMakeLists.txt | 2 ++
 ggml.c         | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d4e30e5a..11ebe9eb6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -312,6 +312,7 @@ add_library(ggml OBJECT
 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
+
 if (BUILD_SHARED_LIBS)
     set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
@@ -324,6 +325,7 @@ add_library(llama
 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
 target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
+
 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
diff --git a/ggml.c b/ggml.c
index d9a95af8d..281b20283 100644
--- a/ggml.c
+++ b/ggml.c
@@ -7992,6 +7992,9 @@ static void ggml_compute_forward_mul_mat_q_f32(
         else if (type == GGML_TYPE_Q4_2) {
             dequantize_row_q_cuda = dequantize_row_q4_2_cuda;
         }
+        else if (type == GGML_TYPE_Q4_3) {
+            dequantize_row_q_cuda = dequantize_row_q4_3_cuda;
+        }
         else {
             GGML_ASSERT(false);
         }

From c9e2c26f413377b352845f442cdab976ce85a05d Mon Sep 17 00:00:00 2001
From: Yishuo Wang <MeouSker77@outlook.com>
Date: Sun, 23 Apr 2023 15:57:05 +0800
Subject: [PATCH 13/74] A better `packNibbles` and `mul_sum_i8_pairs_float`
 implementation using AVX512 (#1119)

---
 ggml.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/ggml.c b/ggml.c
index 281b20283..3c45c5e9d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -509,14 +509,25 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
     const __m256i ax = _mm256_sign_epi8(x, x);
     // Sign the values of the y vectors
     const __m256i sy = _mm256_sign_epi8(y, x);
+#if __AVXVNNI__
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
+    return _mm256_cvtepi32_ps(summed_pairs);
+#else
     // Perform multiplication and create 16-bit values
     const __m256i dot = _mm256_maddubs_epi16(ax, sy);
     return sum_i16_pairs_float(dot);
+#endif
 }
 
 static inline __m128i packNibbles( __m256i bytes )
 {
     // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+#if __AVX512F__
+    const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4);   // 0000_0000_abcd_0000
+    bytes = _mm256_or_si256(bytes, bytes_srli_4);               // 0000_abcd_abcd_efgh
+    return _mm256_cvtepi16_epi8(bytes);                         // abcd_efgh
+#else
     const __m256i lowByte = _mm256_set1_epi16( 0xFF );
     __m256i high = _mm256_andnot_si256( lowByte, bytes );
     __m256i low = _mm256_and_si256( lowByte, bytes );
@@ -527,6 +538,7 @@ static inline __m128i packNibbles( __m256i bytes )
     __m128i r0 = _mm256_castsi256_si128( bytes );
     __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
     return _mm_packus_epi16( r0, r1 );
+#endif
 }
 #else
 static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )

From c6524f46eb93fdb949330293a8469fd70080bd5a Mon Sep 17 00:00:00 2001
From: Pavol Rusnak <pavol@rusnak.io>
Date: Sun, 23 Apr 2023 10:21:26 +0200
Subject: [PATCH 14/74] readme : update gpt4all instructions (#980)

---
 README.md | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 324d49f07..7bf2cc1ba 100644
--- a/README.md
+++ b/README.md
@@ -275,18 +275,19 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 
 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
 
-- Obtain the `gpt4all-lora-quantized.bin` model
-- It is distributed in the old `ggml` format, which is now obsoleted
-- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
-convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
+- Obtain the `tokenizer.model` file from the LLaMA model and put it in `models`
+- Obtain the `added_tokens.json` file from the Alpaca model and put it in `models`
+- Obtain the `gpt4all-lora-quantized.bin` file from the GPT4All model and put it in `models/gpt4all-7B`
+- It is distributed in the old `ggml` format which is now obsoleted
+- You have to convert it to the new format using `convert.py`:
 
-  ```bash
-  python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
-  python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
-  ```
+```bash
+python3 convert.py models/gpt4all-7B/gpt4all-lora-quantized.bin
+```
 
-- You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
-- The original model is saved in the same folder with a suffix `.orig`
+- You can now use the newly generated `models/gpt4all-7B/ggml-model-q4_0.bin` model in exactly the same way as all other models
+
+- The newer GPT4All-J model is not yet supported!
 
 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data
 

From 53c8434398b3cba7ac6298cdd44abd40f0e640b1 Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Sun, 23 Apr 2023 11:01:03 +0000
Subject: [PATCH 15/74] Improve AVX2 for vec_dot_q4_3_q8_0 (#1138)

---
 ggml.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/ggml.c b/ggml.c
index 3c45c5e9d..3ee2d0814 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2947,6 +2947,7 @@ static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void *
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
+    float summs = 0.0f;
 
     // Main loop
     for (int i = 0; i < nb; i++) {
@@ -2954,9 +2955,8 @@ static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void *
         const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d));
         const __m256 dx = _mm256_set_m128(d1, d0);
 
-        const __m128 m0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].m));
-        const __m128 m1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].m));
-        const __m256 mx = _mm256_set_m128(m1, m0);
+        summs += GGML_FP16_TO_FP32(x[2*i + 0].m) * y[i].s0
+               + GGML_FP16_TO_FP32(x[2*i + 1].m) * y[i].s1;
 
         const __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
         const __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
@@ -2965,16 +2965,12 @@ static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void *
         const __m256 dy = _mm256_broadcast_ss(&y[i].d);
         const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
-        const __m256i syi = _mm256_maddubs_epi16(_mm256_set1_epi8(1), by);
-        const __m256 syf = sum_i16_pairs_float(syi);
-
         const __m256 q = mul_sum_i8_pairs_float(bx, by);
 
-        const __m256 sxy = _mm256_fmadd_ps(q, dx, _mm256_mul_ps(mx, syf));
-        acc = _mm256_fmadd_ps(sxy, dy, acc);
+        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
     }
 
-    *s = hsum_float_8(acc);
+    *s = hsum_float_8(acc) + summs;
 #else
     // scalar
     float sumf = 0.0;

From e4422e299c10c7e84c8e987770ef40d31905a76b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 23 Apr 2023 18:15:39 +0300
Subject: [PATCH 16/74] ggml : better PERF prints + support "LLAMA_PERF=1 make"

---
 Makefile  | 4 ++++
 ggml.c    | 4 ++--
 llama.cpp | 4 +++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index b297959c9..0c7b6548d 100644
--- a/Makefile
+++ b/Makefile
@@ -117,6 +117,10 @@ ifdef LLAMA_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
 endif
+ifdef LLAMA_PERF
+	CFLAGS   += -DGGML_PERF
+	CXXFLAGS += -DGGML_PERF
+endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
 	CFLAGS   += -mcpu=native
 	CXXFLAGS += -mcpu=native
diff --git a/ggml.c b/ggml.c
index 3ee2d0814..23dae2d9b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -11239,7 +11239,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
 
         perf_total_per_op_us[node->op] += node->perf_time_us;
 
-        GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 ", %" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
+        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
                 node->ne[0], node->ne[1], node->ne[2],
                 GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
@@ -11253,7 +11253,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_leafs; i++) {
         struct ggml_tensor * node = cgraph->leafs[i];
 
-        GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 "] %8s\n",
+        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
                 i,
                 node->ne[0], node->ne[1],
                 GGML_OP_LABEL[node->op]);
diff --git a/llama.cpp b/llama.cpp
index 34327ecfa..8c1d65778 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1250,9 +1250,11 @@ static bool llama_eval_internal(
     ggml_build_forward_expand(&gf, inpL);
     ggml_graph_compute       (ctx0, &gf);
 
+#ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
-    //ggml_graph_print(&gf);
+    ggml_graph_print(&gf);
+#endif
 
     // plot the computation graph in dot format (for debugging purposes)
     //if (n_past%100 == 0) {

From ec9cdb6752dd96b3cc74d90ad1adeba5b4fa2b0e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 23 Apr 2023 18:32:52 +0300
Subject: [PATCH 17/74] ggml : do not print perf ops that have not been used at
 all

---
 ggml.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index 23dae2d9b..f8f73af3e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -11237,7 +11237,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
 
-        perf_total_per_op_us[node->op] += node->perf_time_us;
+        perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us);
 
         GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
                 i,
@@ -11260,6 +11260,10 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
     }
 
     for (int i = 0; i < GGML_OP_COUNT; i++) {
+        if (perf_total_per_op_us[i] == 0) {
+            continue;
+        }
+
         GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
     }
 

From edce63baa9dbd3963c3441bce07ee0acbb635697 Mon Sep 17 00:00:00 2001
From: DannyDaemonic <DannyDaemonic@gmail.com>
Date: Sun, 23 Apr 2023 08:37:02 -0700
Subject: [PATCH 18/74] Added README.md for main with examples and explanations
 (#1139)

---
 examples/main/README.md | 182 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 180 insertions(+), 2 deletions(-)

diff --git a/examples/main/README.md b/examples/main/README.md
index f09e7ba97..dcfbdfd99 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -1,3 +1,181 @@
-# main
+# llama.cpp/example/main
 
-TODO
+This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
+
+## Table of Contents
+
+1. [Quick Start](#quick-start)
+2. [Common Options](#common-options)
+3. [Input Prompts](#input-prompts)
+4. [Interaction](#interaction)
+5. [Context Management](#context-management)
+6. [Generation Flags](#generation-flags)
+7. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
+8. [Additional Options](#additional-options)
+
+## Quick Start
+
+To get started right away, run the following command, making sure to use the correct path for the model you have:
+
+```bash
+./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
+```
+
+For an interactive experience, try this command:
+
+```bash
+./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
+```
+
+## Common Options
+
+In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
+
+-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
+-   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
+-   `-t N, --threads N`: Set the number of threads to use during computation. It is recommended to set this to the number of physical cores your CPU has.
+-   `-n N, --n_predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
+-   `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+
+## Input Prompts
+
+The `main` program provides several ways to interact with the LLaMA models using input prompts:
+
+-   `--prompt PROMPT`: Provide a prompt directly as a command-line option.
+-   `--file FNAME`: Provide a file containing a prompt or multiple prompts.
+-   `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
+-   `--random-prompt`: Start with a randomized prompt.
+
+## Interaction
+
+The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`.
+
+In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
+
+### Interaction Options
+
+-   `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
+-   `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
+-   `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions.
+-   `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.
+
+By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
+
+### Reverse Prompts
+
+Reverse prompts are a powerful way to create a chat-like experience with a LLaMA model by pausing the text generation when specific text strings are encountered:
+
+-   `-r PROMPT, --reverse-prompt PROMPT`: Specify one or multiple reverse prompts to pause text generation and switch to interactive mode. For example, `-r "User:"` can be used to jump back into the conversation whenever it's the user's turn to speak. This helps create a more interactive and conversational experience. However, the reverse prompt doesn't work when it ends with a space.
+
+To overcome this limitation, you can use the `--in-prefix` flag to add a space or any other characters after the reverse prompt.
+
+### In-Prefix
+
+The `--in-prefix` flag is used to add a prefix to your input. Primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
+
+```sh
+./main -r "User:" --in-prefix " "
+```
+
+### Instruction Mode
+
+Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:
+
+-   `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
+
+By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
+
+## Context Management
+
+During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.
+
+### Context Size
+
+The `--ctx_size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
+
+-   `-c N, --ctx_size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
+
+### Keep Prompt
+
+The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
+
+-   `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
+
+By utilizing context management options like `--ctx_size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
+
+## Generation Flags
+
+The following options are related to controlling the text generation process, influencing the diversity, creativity, and quality of the generated text. Understanding these options will help you fine-tune the output according to your needs:
+
+### Number of Tokens to Predict
+
+-   `-n N, --n_predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+
+The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
+
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value.
+
+### RNG Seed
+
+-   `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1).
+
+The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than or equal to 0, a random seed will be used, which will result in different outputs on each run.
+
+### Temperature
+
+-   `--temp N`: Adjust the randomness of the generated text (default: 0.8).
+
+Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism.
+
+Example usage: `--temp 0.8`
+
+### Repeat Penalty
+
+-   `--repeat_penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
+
+Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
+
+Example usage: `--repeat_penalty 1.1`
+
+### Top-K Sampling
+
+-   `--top_k N`: Limit the next token selection to the K most probable tokens (default: 40).
+
+Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
+
+Example usage: `--top_k 40`
+
+### Top-P Sampling
+
+-   `--top_p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
+
+Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
+
+Example usage: `--top_p 0.9`
+
+By adjusting these options, you can control the diversity, quality, and creativity of the generated text to better suit your needs. You can experiment with different combinations of values to find the best settings for your specific use case.
+
+## Performance Tuning and Memory Options
+
+These options help improve the performance and memory usage of the LLaMA models:
+
+-   `-t N, --threads N`: Set the number of threads to use during computation. Using the correct number of threads can greatly improve performance. It is recommended to set this value to the number of physical CPU cores.
+-   `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. This can improve performance.
+-   `--no-mmap`: Do not memory-map the model. This results in a slower load time but may reduce pageouts if you're not using `mlock`.
+-   `--memory_f32`: Use 32 bit floats instead of 16 bit floats for memory key+value, allowing higher quality inference at the cost of memory.
+-   `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
+
+For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
+
+By understanding and using these performance tuning settings, you can optimize the LLaMA model's behavior to achieve the best performance for your specific needs.
+
+## Additional Options
+
+These options provide extra functionality and customization when running the LLaMA models:
+
+-   `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
+-   `--verbose-prompt`: Print the prompt before generating text.
+-   `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
+-   `--lora FNAME`: Apply a LoRA (Layer-wise Relevance Approximation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+-   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

From 284685f1692258c2bcf08b86b723b80ba2e66c7a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 23 Apr 2023 19:57:09 +0300
Subject: [PATCH 19/74] scripts : add helper scripts to synch ggml repo

---
 scripts/sync-ggml.sh | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100755 scripts/sync-ggml.sh

diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh
new file mode 100755
index 000000000..e6e39ff8f
--- /dev/null
+++ b/scripts/sync-ggml.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+cp -rpv ../ggml/src/ggml.c          ./ggml.c
+cp -rpv ../ggml/src/ggml-cuda.cu    ./ggml-cuda.cu
+cp -rpv ../ggml/src/ggml-cuda.h     ./ggml-cuda.h
+cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h

From 1d78fecdab4087028a38517e86ed129f077174d8 Mon Sep 17 00:00:00 2001
From: slaren <2141330+slaren@users.noreply.github.com>
Date: Sun, 23 Apr 2023 23:03:44 +0200
Subject: [PATCH 20/74] Fix LoRA acronym (#1145)

---
 examples/main/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/main/README.md b/examples/main/README.md
index dcfbdfd99..5cbc5033b 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -177,5 +177,5 @@ These options provide extra functionality and customization when running the LLa
 -   `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 -   `--verbose-prompt`: Print the prompt before generating text.
 -   `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
--   `--lora FNAME`: Apply a LoRA (Layer-wise Relevance Approximation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+-   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.

From c4fe84fb0d28851a5c10e5a633f82ae2ba3b7fae Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 24 Apr 2023 07:40:02 +0300
Subject: [PATCH 21/74] llama : refactor get / set state + remove redundant kv
 cache API (#1143)

---
 llama.cpp | 323 ++++++++++++++++++++++++++++++------------------------
 llama.h   |  14 ---
 2 files changed, 181 insertions(+), 156 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 8c1d65778..bc0ef1281 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2072,35 +2072,191 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }
 
-// Returns the KV cache that will contain the context for the
-// ongoing prediction with the model.
-const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
-    return ctx->model.kv_self.buf.addr;
-}
-
-// Returns the size of the KV cache
-size_t llama_get_kv_cache_size(struct llama_context * ctx) {
-    return ctx->model.kv_self.buf.size;
-}
-
 int llama_get_kv_cache_token_count(struct llama_context * ctx) {
     return ctx->model.kv_self.n;
 }
 
-// Sets the KV cache containing the current context for the model
-void llama_set_kv_cache(
-        struct llama_context * ctx,
-               const uint8_t * kv_cache,
-                      size_t   n_size,
-                         int   n_token_count) {
-    // Make sure we have the same kv cache setup
-    LLAMA_ASSERT(ctx->model.kv_self.buf.size == n_size);
-    void * k_data = ctx->model.kv_self.k->data; // remember data pointers
-    void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
-    memcpy(ctx->model.kv_self.buf.addr, kv_cache, n_size);
-    ctx->model.kv_self.k->data = k_data; // restore correct data pointers
-    ctx->model.kv_self.v->data = v_data;
-    ctx->model.kv_self.n = n_token_count;
+#define LLAMA_MAX_RNG_STATE (64*1024) // bytes reserved for the serialized rng; parenthesized so the macro expands as a single value
+
+// Returns the size in bytes of the serialized state (rng, logits, embedding and kv cache, each preceded by its size/count fields)
+size_t llama_get_state_size(struct llama_context * ctx) {
+    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
+    // for reference, std::mt19937(1337) serializes to 6701 bytes.
+    const size_t s_rng_size        = sizeof(size_t);
+    const size_t s_rng             = LLAMA_MAX_RNG_STATE;
+    const size_t s_logits_capacity = sizeof(size_t);
+    const size_t s_logits_size     = sizeof(size_t);
+    const size_t s_logits          = ctx->logits.capacity() * sizeof(float); // the full capacity is reserved, not just size()
+    const size_t s_embedding_size  = sizeof(size_t);
+    const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
+    const size_t s_kv_size         = sizeof(size_t);
+    const size_t s_kv_ntok         = sizeof(int);
+    const size_t s_kv              = ctx->model.kv_self.buf.size;
+
+    const size_t s_total = (
+        + s_rng_size
+        + s_rng
+        + s_logits_capacity
+        + s_logits_size
+        + s_logits
+        + s_embedding_size
+        + s_embedding
+        + s_kv_size
+        + s_kv_ntok
+        + s_kv
+    );
+
+    return s_total;
+}
+
+// Serializes the state (rng, logits, embedding, kv cache) to dest, which must hold llama_get_state_size(ctx) bytes; returns the number of bytes written
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
+    uint8_t * out = dest;
+
+    // copy rng
+    {
+        std::stringstream rng_ss;
+        rng_ss << ctx->rng;
+
+        const size_t rng_size = rng_ss.str().size();
+        char rng_buf[LLAMA_MAX_RNG_STATE];
+        LLAMA_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); // the serialized rng must fit the fixed-size slot, otherwise the memcpy below overflows rng_buf
+        memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
+        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+
+        memcpy(out, &rng_size,   sizeof(rng_size));    out += sizeof(rng_size);
+        memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
+    }
+
+    // copy logits
+    {
+        const size_t logits_cap  = ctx->logits.capacity();
+        const size_t logits_size = ctx->logits.size();
+
+        memcpy(out, &logits_cap,  sizeof(logits_cap));  out += sizeof(logits_cap);
+        memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
+
+        if (logits_size) {
+            memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
+        }
+
+        out += logits_cap * sizeof(float); // skip the whole capacity; bytes past logits_size are left untouched in dest
+    }
+
+    // copy embeddings
+    {
+        const size_t embedding_size = ctx->embedding.size();
+
+        memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
+
+        if (embedding_size) {
+            memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
+            out += embedding_size * sizeof(float);
+        }
+    }
+
+    // copy kv cache
+    {
+        const size_t kv_size = ctx->model.kv_self.buf.size;
+        const int    kv_ntok = llama_get_kv_cache_token_count(ctx);
+
+        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
+        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+
+        if (kv_size) {
+            memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
+        }
+    }
+
+    const size_t written  = out - dest;
+    const size_t expected = llama_get_state_size(ctx);
+
+    LLAMA_ASSERT(written == expected); // the layout written here must match what llama_get_state_size reports
+
+    return written;
+}
+
+// Restores the state from a buffer written by llama_copy_state_data for a context with matching logits capacity, embedding size and kv cache size; returns the number of bytes read
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    const uint8_t * in = src;
+
+    // set rng
+    {
+        size_t rng_size;
+        char   rng_buf[LLAMA_MAX_RNG_STATE];
+
+        memcpy(&rng_size,   in, sizeof(rng_size));    in += sizeof(rng_size);
+        memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+        LLAMA_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); // reject a corrupt size before it is used to read rng_buf
+        std::stringstream rng_ss;
+        rng_ss.str(std::string(&rng_buf[0], rng_size));
+        rng_ss >> ctx->rng;
+
+        LLAMA_ASSERT(rng_ss.fail() == false);
+    }
+
+    // set logits
+    {
+        size_t logits_cap;
+        size_t logits_size;
+
+        memcpy(&logits_cap,  in, sizeof(logits_cap));  in += sizeof(logits_cap);
+        memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+
+        LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
+        LLAMA_ASSERT(logits_size <= logits_cap); // the writer stores size() <= capacity(); anything else would read past the logits slot
+        if (logits_size) {
+            ctx->logits.resize(logits_size);
+            memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+        }
+
+        in += logits_cap * sizeof(float); // the slot always spans the full capacity
+    }
+
+    // set embeddings
+    {
+        size_t embedding_size;
+
+        memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+
+        LLAMA_ASSERT(ctx->embedding.size() == embedding_size); // the writer and llama_get_state_size both use size(), not capacity()
+
+        if (embedding_size) {
+            memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
+            in += embedding_size * sizeof(float);
+        }
+    }
+
+    // set kv cache
+    {
+        size_t kv_size;
+        int kv_ntok;
+
+        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
+        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+
+        if (kv_size) {
+            LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
+
+            void * k_data = ctx->model.kv_self.k->data; // remember data pointers
+            void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+
+            memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
+
+            ctx->model.kv_self.k->data = k_data; // restore correct data pointers
+            ctx->model.kv_self.v->data = v_data;
+
+        }
+
+        ctx->model.kv_self.n = kv_ntok;
+    }
+
+    const size_t nread    = in - src;
+    const size_t expected = llama_get_state_size(ctx);
+
+    LLAMA_ASSERT(nread == expected); // the bytes consumed must match the size this context would serialize to
+
+    return nread;
 }
 
 int llama_eval(
@@ -2256,120 +2412,3 @@ std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_te
     return ctx->model.tensors_by_name;
 }
 
-// Returns the size of the state
-size_t llama_get_state_size(struct llama_context * ctx) {
-    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
-    // for reference, std::mt19937(1337) serializes to 6701 bytes.
-    const size_t s_rng_size = sizeof(size_t);
-    const size_t s_rng = 64*1024;
-    const size_t s_logits_capacity = sizeof(size_t);
-    const size_t s_logits_size = sizeof(size_t);
-    const size_t s_logits = ctx->logits.capacity() * sizeof(float);
-    const size_t s_embedding_size = sizeof(size_t);
-    const size_t s_embedding = ctx->embedding.size() * sizeof(float);
-    const size_t s_kv_size = sizeof(size_t);
-    const size_t s_kv_ntok = sizeof(int);
-    const size_t s_kv = llama_get_kv_cache_size(ctx);
-    const size_t s_total = (
-        + s_rng_size
-        + s_rng
-        + s_logits_capacity
-        + s_logits_size
-        + s_logits
-        + s_embedding_size
-        + s_embedding
-        + s_kv_size
-        + s_kv_ntok
-        + s_kv
-    );
-    return s_total;
-}
-
-// Copies the state to the specified destination address
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
-    std::stringstream rng_ss;
-    rng_ss << ctx->rng;
-    const size_t rng_size = rng_ss.str().size();
-    char rng_buf[64*1024];
-    memset(&rng_buf[0], 0, 64*1024);
-    memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
-    const size_t logits_capacity = ctx->logits.capacity();
-    const size_t logits_size = ctx->logits.size();
-    const size_t embedding_size = ctx->embedding.size();
-    const size_t kv_size = llama_get_kv_cache_size(ctx);
-    const int kv_ntok = llama_get_kv_cache_token_count(ctx);
-
-    uint8_t * out = dest;
-    memcpy(out, &rng_size, sizeof(size_t)); out += sizeof(size_t);
-    memcpy(out, &rng_buf[0], 64*1024); out += 64*1024;
-    memcpy(out, &logits_capacity, sizeof(size_t)); out += sizeof(size_t);
-    memcpy(out, &logits_size, sizeof(size_t)); out += sizeof(size_t);
-    if (logits_size) {
-        memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
-    }
-    out += logits_capacity * sizeof(float);
-    memcpy(out, &embedding_size, sizeof(size_t)); out += sizeof(size_t);
-    if (embedding_size) {
-        memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float)); out += embedding_size * sizeof(float);
-    }
-    memcpy(out, &kv_size, sizeof(size_t)); out += sizeof(size_t);
-    memcpy(out, &kv_ntok, sizeof(int)); out += sizeof(int);
-    if (kv_size) {
-        memcpy(out, llama_get_kv_cache(ctx), kv_size); out += kv_size;
-    }
-    const size_t written = out - dest;
-    const size_t expected = llama_get_state_size(ctx);
-    LLAMA_ASSERT(written == expected);
-    return written;
-}
-
-// Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    size_t rng_size;
-    char rng_buf[64*1024];
-    std::stringstream rng_ss;
-
-    const uint8_t * in = src;
-    memcpy(&rng_size, in, sizeof(size_t)); in += sizeof(size_t);
-    memcpy(&rng_buf[0], in, 64*1024); in += 64*1024;
-    rng_ss.str(std::string(&rng_buf[0], rng_size));
-    rng_ss >> ctx->rng;
-    LLAMA_ASSERT(rng_ss.fail() == false);
-
-    size_t logits_capacity;
-    size_t logits_size;
-    size_t embedding_size;
-    size_t kv_size;
-    int kv_ntok;
-
-    memcpy(&logits_capacity, in, sizeof(size_t)); in += sizeof(size_t);
-    memcpy(&logits_size, in, sizeof(size_t)); in += sizeof(size_t);
-    LLAMA_ASSERT(ctx->logits.capacity() == logits_capacity);
-    if (logits_size) {
-        ctx->logits.resize(logits_size);
-        memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
-    }
-    in += logits_capacity * sizeof(float);
-    memcpy(&embedding_size, in, sizeof(size_t)); in += sizeof(size_t);
-    LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
-    if (embedding_size) {
-        memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
-        in += embedding_size * sizeof(float);
-    }
-    memcpy(&kv_size, in, sizeof(size_t)); in += sizeof(size_t);
-    memcpy(&kv_ntok, in, sizeof(int)); in += sizeof(int);
-    if (kv_size) {
-        LLAMA_ASSERT(ctx->model.kv_self.buf.size == kv_size);
-        void * k_data = ctx->model.kv_self.k->data; // remember data pointers
-        void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
-        memcpy(ctx->model.kv_self.buf.addr, in, kv_size);
-        ctx->model.kv_self.k->data = k_data; // restore correct data pointers
-        ctx->model.kv_self.v->data = v_data;
-        in += kv_size;
-    }
-    ctx->model.kv_self.n = kv_ntok;
-    const size_t nread = in - src;
-    const size_t expected = llama_get_state_size(ctx);
-    LLAMA_ASSERT(nread == expected);
-    return nread;
-}
diff --git a/llama.h b/llama.h
index f68a0cb40..e9e3abea5 100644
--- a/llama.h
+++ b/llama.h
@@ -112,23 +112,9 @@ extern "C" {
                       const char * path_base_model,
                              int   n_threads);
 
-    // Returns the KV cache that will contain the context for the
-    // ongoing prediction with the model.
-    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
-
-    // Returns the size of the KV cache
-    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
-
     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
 
-    // Sets the KV cache containing the current context for the model
-    LLAMA_API void llama_set_kv_cache(
-            struct llama_context * ctx,
-                   const uint8_t * kv_cache,
-                          size_t   n_size,
-                             int   n_token_count);
-
     // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
     LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
 

From e4cf982e0d4fcfbb4b977a52dbeacd115da10c3b Mon Sep 17 00:00:00 2001
From: slaren <2141330+slaren@users.noreply.github.com>
Date: Mon, 24 Apr 2023 17:29:58 +0200
Subject: [PATCH 22/74] Fix cuda compilation (#1128)

* Fix: Issue with CUBLAS compilation error due to missing -fPIC flag

---------

Co-authored-by: B1gM8c <89020353+B1gM8c@users.noreply.github.com>
---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 0c7b6548d..8fbb19c46 100644
--- a/Makefile
+++ b/Makefile
@@ -109,9 +109,9 @@ ifdef LLAMA_CUBLAS
 	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
 	OBJS      += ggml-cuda.o
 	NVCC      = nvcc
-	NVCCFLAGS = --forward-unknown-to-host-linker -arch=native
+	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif
 ifdef LLAMA_GPROF
 	CFLAGS   += -pg

From 2ec83428de7a876ecbbe484e1de42b73b5a40e25 Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Mon, 24 Apr 2023 15:38:26 +0000
Subject: [PATCH 23/74] Fix build for gcc 8 and test in CI (#1154)

---
 .github/workflows/build.yml | 10 +++++-----
 ggml.c                      |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7c40b0c12..179080576 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -19,8 +19,8 @@ env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 
 jobs:
-  ubuntu-latest-make:
-    runs-on: ubuntu-latest
+  ubuntu-focal-make:
+    runs-on: ubuntu-20.04
 
     steps:
       - name: Clone
@@ -31,12 +31,12 @@ jobs:
         id: depends
         run: |
           sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential gcc-8
 
       - name: Build
         id: make_build
         run: |
-          make
+          CC=gcc-8 make
 
   ubuntu-latest-cmake:
     runs-on: ubuntu-latest
@@ -216,7 +216,7 @@ jobs:
     runs-on: ubuntu-latest
 
     needs:
-      - ubuntu-latest-make
+      - ubuntu-focal-make
       - ubuntu-latest-cmake
       - macOS-latest-make
       - macOS-latest-cmake
diff --git a/ggml.c b/ggml.c
index f8f73af3e..6e46c0e5a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -436,7 +436,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi)
 {
     // Load 8 bytes from memory
-    __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
+    __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi );
 
     // Expand bytes into uint16_t values
     __m128i bytes = _mm_cvtepu8_epi16( tmp );

From 9b0a4d421459f4e5e1af735c9784c3247b379025 Mon Sep 17 00:00:00 2001
From: mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
Date: Mon, 24 Apr 2023 17:45:32 +0200
Subject: [PATCH 24/74] examples/main README improvements and some light
 refactoring (#1131)

---
 README.md               |  2 +-
 examples/common.cpp     |  4 +---
 examples/common.h       |  2 +-
 examples/main/README.md | 14 ++++++++++++--
 examples/main/main.cpp  |  6 +++---
 5 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 7bf2cc1ba..44cf72124 100644
--- a/README.md
+++ b/README.md
@@ -241,7 +241,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```
 
-Note the use of `--color` to distinguish between user input and generated text.
+Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
 
 ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)
 
diff --git a/examples/common.cpp b/examples/common.cpp
index a0b6f10ad..c0e87eb9f 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -156,10 +156,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.interactive = true;
         } else if (arg == "--embedding") {
             params.embedding = true;
-        } else if (arg == "--interactive-start") {
-            params.interactive = true;
         } else if (arg == "--interactive-first") {
-            params.interactive_start = true;
+            params.interactive_first = true;
         } else if (arg == "-ins" || arg == "--instruct") {
             params.instruct = true;
         } else if (arg == "--color") {
diff --git a/examples/common.h b/examples/common.h
index 0470368d5..6f26b514d 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -43,7 +43,7 @@ struct gpt_params {
     bool interactive       = false; // interactive mode
 
     bool embedding         = false; // get only sentence embedding
-    bool interactive_start = false; // wait for user input immediately
+    bool interactive_first = false; // wait for user input immediately
 
     bool instruct          = false; // instruction mode (used for Alpaca models)
     bool ignore_eos        = false; // do not stop generating after eos
diff --git a/examples/main/README.md b/examples/main/README.md
index 5cbc5033b..234bf2eb5 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -21,12 +21,20 @@ To get started right away, run the following command, making sure to use the cor
 ./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
 ```
 
+The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
+
+```bash
+./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time"
+```
+
 For an interactive experience, try this command:
 
 ```bash
 ./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
 ```
 
+Note that the newline characters in the prompt string above only work on Linux. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from file instead.
+
 ## Common Options
 
 In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
@@ -84,6 +92,8 @@ Instruction mode is particularly useful when working with Alpaca models, which a
 
 -   `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
 
+Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
+
 By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
 
 ## Context Management
@@ -114,7 +124,7 @@ The following options are related to controlling the text generation process, in
 
 The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
 
-It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value.
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter.
 
 ### RNG Seed
 
@@ -126,7 +136,7 @@ The RNG seed is used to initialize the random number generator that influences t
 
 -   `--temp N`: Adjust the randomness of the generated text (default: 0.8).
 
-Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism.
+Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
 
 Example usage: `--temp 0.8`
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index decf41a9f..f9c9e9d98 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -178,12 +178,12 @@ int main(int argc, char ** argv) {
 
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
-        params.interactive_start = true;
+        params.interactive_first = true;
         params.antiprompt.push_back("### Instruction:\n\n");
     }
 
     // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_start) {
+    if (params.antiprompt.size() != 0 || params.interactive_first) {
         params.interactive = true;
     }
 
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
 #endif
                " - Press Return to return control to LLaMa.\n"
                " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = params.interactive_start;
+        is_interacting = params.interactive_first;
     }
 
     bool is_antiprompt = false;

From 957c8ae21d1e7052ea45a40ee8c0407b909e90cc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 24 Apr 2023 18:47:03 +0300
Subject: [PATCH 25/74] llama : increase scratch buffer size for 65B (ref
 #1152)

Temporary solution
---
 llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index bc0ef1281..28d27916a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -54,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
-        { MODEL_65B,   512ull * MB },
+        { MODEL_65B,  1024ull * MB },
     };
     return _MEM_REQ_SCRATCH0;
 }
@@ -65,7 +65,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
         { MODEL_7B,    512ull * MB },
         { MODEL_13B,   512ull * MB },
         { MODEL_30B,   512ull * MB },
-        { MODEL_65B,   512ull * MB },
+        { MODEL_65B,  1024ull * MB },
     };
     return _MEM_REQ_SCRATCH1;
 }

From 0c5692345d5c046dbc6a7d311a00ae5842ac39c3 Mon Sep 17 00:00:00 2001
From: xaedes <xaedes@gmail.com>
Date: Mon, 24 Apr 2023 18:23:31 +0200
Subject: [PATCH 26/74] examples : add save_load_state example (#1150)

* add save_load_state example

* use <cstdio> instead of <iostream> and fprintf / printf instead of cout

* renamed save-load-state example files replacing underscores by dashes
---
 examples/CMakeLists.txt                      |   1 +
 examples/save-load-state/CMakeLists.txt      |   4 +
 examples/save-load-state/save-load-state.cpp | 128 +++++++++++++++++++
 3 files changed, 133 insertions(+)
 create mode 100644 examples/save-load-state/CMakeLists.txt
 create mode 100644 examples/save-load-state/save-load-state.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 67a7cea54..be35363f5 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -34,4 +34,5 @@ else()
     add_subdirectory(quantize-stats)
     add_subdirectory(perplexity)
     add_subdirectory(embedding)
+    add_subdirectory(save-load-state)
 endif()
diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt
new file mode 100644
index 000000000..cff79fa1f
--- /dev/null
+++ b/examples/save-load-state/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET save-load-state) # name of the example executable
+add_executable(${TARGET} save-load-state.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) # common and llama are targets defined elsewhere in the project
+target_compile_features(${TARGET} PRIVATE cxx_std_11) # require at least C++11
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
new file mode 100644
index 000000000..39aa7f82c
--- /dev/null
+++ b/examples/save-load-state/save-load-state.cpp
@@ -0,0 +1,128 @@
+#include <vector>
+#include <cstdio>
+#include <chrono>
+
+#include "common.h"
+#include "llama.h"
+#include "llama.cpp"
+
+using namespace std;
+
+int main(int argc, char ** argv) {
+    // Example: evaluate a prompt, dump the state to dump_state.bin, sample n_predict tokens, then restore the dump into a fresh context and sample again so both runs can be compared.
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+    params.seed = 42;
+    params.n_threads = 4;
+    params.repeat_last_n = 64;
+    params.prompt = "The quick brown fox";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    auto lparams = llama_context_default_params();
+    lparams.n_ctx      = params.n_ctx;
+    lparams.n_parts    = params.n_parts;
+    lparams.seed       = params.seed;
+    lparams.f16_kv     = params.memory_f16;
+    lparams.use_mmap   = params.use_mmap;
+    lparams.use_mlock  = params.use_mlock;
+
+    auto n_past = 0;
+    auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
+
+    // init
+    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+    auto tokens = vector<llama_token>(params.n_ctx);
+    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
+    if (n_prompt_tokens < 1) {
+        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
+        return 1;
+    }
+
+    // evaluate prompt (NOTE(review): the return value is ignored here, unlike the per-token llama_eval calls below)
+    llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
+    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
+    n_past += n_prompt_tokens;
+
+    // Save state (rng, logits, embedding and kv_cache) to file
+    FILE *fp_write = fopen("dump_state.bin", "wb");
+    if (fp_write == NULL) { fprintf(stderr, "\n%s : failed to open dump_state.bin for writing\n", __func__); return 1; }
+    auto state_size = llama_get_state_size(ctx);
+    vector<uint8_t> state_mem(state_size); // owned buffer, released automatically (was a leaked new[])
+    llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file
+    fwrite(state_mem.data(), 1, state_size, fp_write);
+    fclose(fp_write);
+
+    // save state (last tokens)
+    auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
+    auto n_past_saved = n_past;
+
+    // first run
+    printf("\n%s", params.prompt.c_str());
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto next_token = llama_sample_top_p_top_k(
+            ctx,
+            &last_n_tokens_data.back() - params.repeat_last_n, // NOTE(review): this window ends one element before the newest token - confirm intended
+            params.repeat_last_n,
+            40,
+            1.0,
+            1.0,
+            1.1);
+        auto next_token_str = llama_token_to_str(ctx, next_token);
+        last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
+        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            return 1;
+        }
+        n_past += 1;
+    }
+    printf("\n\n");
+
+    // free old model
+    llama_free(ctx);
+
+    // load new model
+    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+
+    // Load state (rng, logits, embedding and kv_cache) from file
+    auto state_size2 = llama_get_state_size(ctx2);
+    if (state_size != state_size2) {
+        fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+        return 1; // the saved buffer cannot be restored into a context that reports a different state size
+    }
+    FILE *fp_read = fopen("dump_state.bin", "rb");
+    if (fp_read == NULL) { fprintf(stderr, "\n%s : failed to open dump_state.bin for reading\n", __func__); return 1; }
+    if (fread(state_mem.data(), 1, state_size, fp_read) != state_size) { fclose(fp_read); fprintf(stderr, "\n%s : failed to read state\n", __func__); return 1; }
+    llama_set_state_data(ctx2, state_mem.data());  // could also read directly from memory mapped file
+    fclose(fp_read);
+
+    // restore state (last tokens)
+    last_n_tokens_data = last_n_tokens_data_saved;
+    n_past = n_past_saved;
+
+    // second run
+    for (auto i = 0; i < params.n_predict; i++) {
+        auto next_token = llama_sample_top_p_top_k(
+            ctx2,
+            &last_n_tokens_data.back() - params.repeat_last_n, // same window as in the first run
+            params.repeat_last_n,
+            40,
+            1.0,
+            1.0,
+            1.1);
+        auto next_token_str = llama_token_to_str(ctx2, next_token);
+        last_n_tokens_data.push_back(next_token);
+        printf("%s", next_token_str);
+        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
+            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+            return 1;
+        }
+        n_past += 1;
+    }
+    printf("\n\n");
+    llama_free(ctx2);
+    return 0;
+}

From 8a0f8673ba1cdc6aa6df27a9fbc698431ca70e8d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 24 Apr 2023 22:18:25 +0300
Subject: [PATCH 27/74] ggml : export symbols (#1155)

---
 ggml.h | 1301 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 660 insertions(+), 641 deletions(-)

diff --git a/ggml.h b/ggml.h
index 460d4ffe0..275890781 100644
--- a/ggml.h
+++ b/ggml.h
@@ -169,14 +169,27 @@
 //
 //
 
-#ifdef  __cplusplus
-extern "C" {
+#ifdef GGML_SHARED // GGML_API is the export/visibility decoration prefixed to the function declarations below
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BUILD
+#            define GGML_API __declspec(dllexport) // GGML_BUILD defined: export the symbols
+#        else
+#            define GGML_API __declspec(dllimport) // GGML_BUILD not defined: import the symbols
+#        endif
+#    else
+#        define GGML_API __attribute__ ((visibility ("default"))) // not _WIN32, or MinGW: default symbol visibility
+#    endif
+#else
+#    define GGML_API // GGML_SHARED not defined: expands to nothing
 #endif
 
 #include <stdint.h>
 #include <stddef.h>
 #include <stdbool.h>
 
+#define GGML_FILE_MAGIC   0x67676d6c // "ggml"
+#define GGML_FILE_VERSION 1
+
 #define GGML_MAX_DIMS          4
 #define GGML_MAX_NODES         4096
 #define GGML_MAX_PARAMS        16
@@ -184,682 +197,688 @@ extern "C" {
 #define GGML_MAX_OPT           4
 #define GGML_DEFAULT_N_THREADS 4
 
-#ifdef __ARM_NEON
-// we use the built-in 16-bit float type
-typedef __fp16 ggml_fp16_t;
-#else
-typedef uint16_t ggml_fp16_t;
+#ifdef  __cplusplus
+extern "C" {
 #endif
 
-// convert FP16 <-> FP32
-float       ggml_fp16_to_fp32(ggml_fp16_t x);
-ggml_fp16_t ggml_fp32_to_fp16(float x);
-
-struct ggml_object;
-struct ggml_context;
-
-enum ggml_type {
-    // explicitly numbered values are used in llama.cpp files
-    GGML_TYPE_F32  = 0,
-    GGML_TYPE_F16  = 1,
-    GGML_TYPE_Q4_0 = 2,
-    GGML_TYPE_Q4_1 = 3,
-    GGML_TYPE_Q4_2 = 4,
-    GGML_TYPE_Q4_3 = 5,
-    GGML_TYPE_Q8_0 = 6,
-    GGML_TYPE_I8,
-    GGML_TYPE_I16,
-    GGML_TYPE_I32,
-    GGML_TYPE_COUNT,
-};
-
-// available tensor operations:
-enum ggml_op {
-    GGML_OP_NONE = 0,
-
-    GGML_OP_DUP,
-    GGML_OP_ADD,
-    GGML_OP_SUB,
-    GGML_OP_MUL,
-    GGML_OP_DIV,
-    GGML_OP_SQR,
-    GGML_OP_SQRT,
-    GGML_OP_SUM,
-    GGML_OP_MEAN,
-    GGML_OP_REPEAT,
-    GGML_OP_ABS,
-    GGML_OP_SGN,
-    GGML_OP_NEG,
-    GGML_OP_STEP,
-    GGML_OP_RELU,
-    GGML_OP_GELU,
-    GGML_OP_SILU,
-    GGML_OP_NORM, // normalize
-    GGML_OP_RMS_NORM,
-
-    GGML_OP_MUL_MAT,
-
-    GGML_OP_SCALE,
-    GGML_OP_CPY,
-    GGML_OP_CONT,
-    GGML_OP_RESHAPE,
-    GGML_OP_VIEW,
-    GGML_OP_PERMUTE,
-    GGML_OP_TRANSPOSE,
-    GGML_OP_GET_ROWS,
-    GGML_OP_DIAG_MASK_INF,
-    GGML_OP_SOFT_MAX,
-    GGML_OP_ROPE,
-    GGML_OP_CONV_1D_1S,
-    GGML_OP_CONV_1D_2S,
-
-    GGML_OP_FLASH_ATTN,
-    GGML_OP_FLASH_FF,
-
-    GGML_OP_MAP_UNARY,
-    GGML_OP_MAP_BINARY,
-
-    GGML_OP_COUNT,
-};
-
-
-// ggml object
-struct ggml_object {
-    size_t offs;
-    size_t size;
-
-    struct ggml_object * next;
-
-    char padding[8];
-};
-
-static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
-// n-dimensional tensor
-struct ggml_tensor {
-    enum ggml_type type;
-
-    int    n_dims;
-    int64_t ne[GGML_MAX_DIMS]; // number of elements
-    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
-                               // nb[0] = sizeof(type)
-                               // nb[1] = nb[0]   * ne[0] + padding
-                               // nb[i] = nb[i-1] * ne[i-1]
-
-    // compute data
-    enum ggml_op op;
-
-    bool is_param;
-
-    struct ggml_tensor * grad;
-    struct ggml_tensor * src0;
-    struct ggml_tensor * src1;
-    struct ggml_tensor * opt[GGML_MAX_OPT];
-
-    // thread scheduling
-    int n_tasks;
-
-    // performance
-    int     perf_runs;
-    int64_t perf_cycles;
-    int64_t perf_time_us;
-
-    void * data;
-    char padding[8];
-};
-
-// computation graph
-struct ggml_cgraph {
-    int n_nodes;
-    int n_leafs;
-    int n_threads;
-
-    size_t work_size;
-    struct ggml_tensor * work;
-
-    struct ggml_tensor * nodes[GGML_MAX_NODES];
-    struct ggml_tensor * grads[GGML_MAX_NODES];
-    struct ggml_tensor * leafs[GGML_MAX_NODES];
-
-    // performance
-    int     perf_runs;
-    int64_t perf_cycles;
-    int64_t perf_time_us;
-};
-
-// scratch buffer
-struct ggml_scratch {
-    size_t offs;
-    size_t size;
-    void * data;
-};
-
-struct ggml_init_params {
-    // memory pool
-    size_t mem_size;   // bytes
-    void * mem_buffer; // if NULL, memory will be allocated internally
-    bool   no_alloc;   // don't allocate memory for the tensor data
-};
-
-void    ggml_time_init(void); // call this once at the beginning of the program
-int64_t ggml_time_ms(void);
-int64_t ggml_time_us(void);
-int64_t ggml_cycles(void);
-int64_t ggml_cycles_per_ms(void);
+#ifdef __ARM_NEON
+    // we use the built-in 16-bit float type
+    typedef __fp16 ggml_fp16_t;
+#else
+    typedef uint16_t ggml_fp16_t;
+#endif
+
+    // convert FP16 <-> FP32
+    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
+    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
+
+    struct ggml_object;
+    struct ggml_context;
+
+    enum ggml_type { // explicitly numbered values are used in llama.cpp files
+        GGML_TYPE_F32  = 0,
+        GGML_TYPE_F16  = 1,
+        GGML_TYPE_Q4_0 = 2,
+        GGML_TYPE_Q4_1 = 3,
+        GGML_TYPE_Q4_2 = 4,
+        GGML_TYPE_Q4_3 = 5,
+        GGML_TYPE_Q8_0 = 6,
+        GGML_TYPE_I8, // unnumbered entries continue counting from the previous value
+        GGML_TYPE_I16,
+        GGML_TYPE_I32,
+        GGML_TYPE_COUNT, // last entry; presumably the number of types - confirm against ggml.c
+    };
+
+    // available tensor operations:
+    enum ggml_op {
+        GGML_OP_NONE = 0,
+
+        GGML_OP_DUP,
+        GGML_OP_ADD,
+        GGML_OP_SUB,
+        GGML_OP_MUL,
+        GGML_OP_DIV,
+        GGML_OP_SQR,
+        GGML_OP_SQRT,
+        GGML_OP_SUM,
+        GGML_OP_MEAN,
+        GGML_OP_REPEAT,
+        GGML_OP_ABS,
+        GGML_OP_SGN,
+        GGML_OP_NEG,
+        GGML_OP_STEP,
+        GGML_OP_RELU,
+        GGML_OP_GELU,
+        GGML_OP_SILU,
+        GGML_OP_NORM, // normalize
+        GGML_OP_RMS_NORM,
+
+        GGML_OP_MUL_MAT,
+
+        GGML_OP_SCALE,
+        GGML_OP_CPY,
+        GGML_OP_CONT,
+        GGML_OP_RESHAPE,
+        GGML_OP_VIEW,
+        GGML_OP_PERMUTE,
+        GGML_OP_TRANSPOSE,
+        GGML_OP_GET_ROWS,
+        GGML_OP_DIAG_MASK_INF,
+        GGML_OP_SOFT_MAX,
+        GGML_OP_ROPE,
+        GGML_OP_CONV_1D_1S,
+        GGML_OP_CONV_1D_2S,
+
+        GGML_OP_FLASH_ATTN,
+        GGML_OP_FLASH_FF,
+
+        GGML_OP_MAP_UNARY,
+        GGML_OP_MAP_BINARY,
+
+        GGML_OP_COUNT,
+    };
+
+
+    // ggml object
+    struct ggml_object {
+        size_t offs;
+        size_t size;
+
+        struct ggml_object * next;
+
+        char padding[8];
+    };
+
+    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
+    // n-dimensional tensor
+    struct ggml_tensor {
+        enum ggml_type type;
+
+        int     n_dims;
+        int64_t ne[GGML_MAX_DIMS]; // number of elements
+        size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                                   // nb[0] = sizeof(type)
+                                   // nb[1] = nb[0]   * ne[0] + padding
+                                   // nb[i] = nb[i-1] * ne[i-1]
+
+        // compute data
+        enum ggml_op op;
+
+        bool is_param;
+
+        struct ggml_tensor * grad;
+        struct ggml_tensor * src0;
+        struct ggml_tensor * src1;
+        struct ggml_tensor * opt[GGML_MAX_OPT];
+
+        // thread scheduling
+        int n_tasks;
+
+        // performance
+        int     perf_runs;
+        int64_t perf_cycles;
+        int64_t perf_time_us;
+
+        void * data;
+        char padding[8];
+    };
+
+    // computation graph
+    struct ggml_cgraph {
+        int n_nodes;
+        int n_leafs;
+        int n_threads;
+
+        size_t work_size;
+        struct ggml_tensor * work;
+
+        struct ggml_tensor * nodes[GGML_MAX_NODES];
+        struct ggml_tensor * grads[GGML_MAX_NODES];
+        struct ggml_tensor * leafs[GGML_MAX_NODES];
+
+        // performance
+        int     perf_runs;
+        int64_t perf_cycles;
+        int64_t perf_time_us;
+    };
+
+    // scratch buffer
+    struct ggml_scratch {
+        size_t offs;
+        size_t size;
+        void * data;
+    };
 
-void ggml_print_object (const struct ggml_object * obj);
-void ggml_print_objects(const struct ggml_context * ctx);
+    struct ggml_init_params {
+        // memory pool
+        size_t mem_size;   // bytes
+        void * mem_buffer; // if NULL, memory will be allocated internally
+        bool   no_alloc;   // don't allocate memory for the tensor data
+    };
 
-int64_t ggml_nelements(const struct ggml_tensor * tensor);
-size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+    // misc
 
-int    ggml_blck_size (enum ggml_type type);
-size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+    GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
+    GGML_API int64_t ggml_time_ms(void);
+    GGML_API int64_t ggml_time_us(void);
+    GGML_API int64_t ggml_cycles(void);
+    GGML_API int64_t ggml_cycles_per_ms(void);
 
-const char * ggml_type_name(enum ggml_type type);
+    GGML_API void    ggml_print_object (const struct ggml_object * obj);
+    GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
 
-size_t ggml_element_size(const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
 
-bool ggml_is_quantized(enum ggml_type type);
+    GGML_API int     ggml_blck_size (enum ggml_type type);
+    GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
+    GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+
+    GGML_API const char * ggml_type_name(enum ggml_type type);
+
+    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
+
+    GGML_API bool    ggml_is_quantized(enum ggml_type type);
+
+    // main
+
+    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
+    GGML_API void    ggml_free(struct ggml_context * ctx);
+
+    GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
+
+    GGML_API size_t  ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+
+    GGML_API struct ggml_tensor * ggml_new_tensor(
+            struct ggml_context * ctx,
+            enum   ggml_type type,
+            int    n_dims,
+            const int64_t *ne);
+
+    GGML_API struct ggml_tensor * ggml_new_tensor_1d(
+            struct ggml_context * ctx,
+            enum   ggml_type type,
+            int64_t ne0);
+
+    GGML_API struct ggml_tensor * ggml_new_tensor_2d(
+            struct ggml_context * ctx,
+            enum   ggml_type type,
+            int64_t ne0,
+            int64_t ne1);
+
+    GGML_API struct ggml_tensor * ggml_new_tensor_3d(
+            struct ggml_context * ctx,
+            enum   ggml_type type,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2);
+
+    GGML_API struct ggml_tensor * ggml_new_tensor_4d(
+            struct ggml_context * ctx,
+            enum   ggml_type type,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3);
+
+    GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+
+    GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
+    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
+
+    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
+
+    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
+
+    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+
+    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
+    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
-struct ggml_context * ggml_init(struct ggml_init_params params);
-void ggml_free(struct ggml_context * ctx);
-
-size_t ggml_used_mem(const struct ggml_context * ctx);
-
-size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
-
-struct ggml_tensor * ggml_new_tensor(
-        struct ggml_context * ctx,
-        enum   ggml_type type,
-        int    n_dims,
-        const int64_t *ne);
-
-struct ggml_tensor * ggml_new_tensor_1d(
-        struct ggml_context * ctx,
-        enum   ggml_type type,
-        int64_t ne0);
-
-struct ggml_tensor * ggml_new_tensor_2d(
-        struct ggml_context * ctx,
-        enum   ggml_type type,
-        int64_t ne0,
-        int64_t ne1);
-
-struct ggml_tensor * ggml_new_tensor_3d(
-        struct ggml_context * ctx,
-        enum   ggml_type type,
-        int64_t ne0,
-        int64_t ne1,
-        int64_t ne2);
-
-struct ggml_tensor * ggml_new_tensor_4d(
-        struct ggml_context * ctx,
-        enum   ggml_type type,
-        int64_t ne0,
-        int64_t ne1,
-        int64_t ne2,
-        int64_t ne3);
-
-struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
-
-struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
-struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
-
-struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
-struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-
-int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-
-float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-void  ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
-
- void * ggml_get_data    (const struct ggml_tensor * tensor);
-float * ggml_get_data_f32(const struct ggml_tensor * tensor);
-
-//
-// operations on tensors with backpropagation
-//
-
-struct ggml_tensor * ggml_dup(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-struct ggml_tensor * ggml_add(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
-
-
-struct ggml_tensor * ggml_add_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
-
-struct ggml_tensor * ggml_sub(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
-
-struct ggml_tensor * ggml_mul(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
-
-struct ggml_tensor * ggml_div(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
-
-struct ggml_tensor * ggml_sqr(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-struct ggml_tensor * ggml_sqrt(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-// return scalar
-// TODO: compute sum along rows
-struct ggml_tensor * ggml_sum(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-// mean along rows
-struct ggml_tensor * ggml_mean(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-// if a is the same shape as b, and a is not parameter, return a
-// otherwise, return a new tensor: repeat(a) to fit in b
-struct ggml_tensor * ggml_repeat(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
-
-struct ggml_tensor * ggml_abs(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-struct ggml_tensor * ggml_sgn(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-struct ggml_tensor * ggml_neg(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-struct ggml_tensor * ggml_step(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-struct ggml_tensor * ggml_relu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-// TODO: double-check this computation is correct
-struct ggml_tensor * ggml_gelu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-struct ggml_tensor * ggml_silu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-// normalize along rows
-// TODO: eps is hardcoded to 1e-5 for now
-struct ggml_tensor * ggml_norm(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-struct ggml_tensor * ggml_rms_norm(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-// A: m rows, n columns
-// B: p rows, n columns (i.e. we transpose it internally)
-// result is m columns, p rows
-struct ggml_tensor * ggml_mul_mat(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
-
-//
-// operations on tensors without backpropagation
-//
-
-// in-place, returns view(a)
-struct ggml_tensor * ggml_scale(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
-
-// a -> b, return view(b)
-struct ggml_tensor * ggml_cpy(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
-
-// make contiguous
-struct ggml_tensor * ggml_cont(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-// return view(a), b specifies the new shape
-// TODO: when we start computing gradient, make a copy instead of view
-struct ggml_tensor * ggml_reshape(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
-
-// return view(a)
-// TODO: when we start computing gradient, make a copy instead of view
-struct ggml_tensor * ggml_reshape_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1);
-
-// return view(a)
-// TODO: when we start computing gradient, make a copy instead of view
-struct ggml_tensor * ggml_reshape_3d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2);
-
-// offset in bytes
-struct ggml_tensor * ggml_view_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        size_t                offset);
-
-struct ggml_tensor * ggml_view_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        size_t                nb1, // row stride in bytes
-        size_t                offset);
-
-struct ggml_tensor * ggml_view_3d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2,
-        size_t                nb1, // row   stride in bytes
-        size_t                nb2, // slice stride in bytes
-        size_t                offset);
-
-struct ggml_tensor * ggml_permute(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   axis0,
-        int                   axis1,
-        int                   axis2,
-        int                   axis3);
-
-// alias for ggml_permute(ctx, a, 1, 0, 2, 3)
-struct ggml_tensor * ggml_transpose(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-struct ggml_tensor * ggml_get_rows(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
-
-// set elements above the diagonal to -INF
-// in-place, returns view(a)
-struct ggml_tensor * ggml_diag_mask_inf(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past);
-
-// in-place, returns view(a)
-struct ggml_tensor * ggml_soft_max(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
-// rotary position embedding
-// in-place, returns view(a)
-// if mode & 1 == 1, skip n_past elements
-// if mode & 2 == 1, GPT-NeoX style
-// TODO: avoid creating a new tensor every time
-struct ggml_tensor * ggml_rope(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past,
-        int                   n_dims,
-        int                   mode);
-
-// padding = 1
-// TODO: we don't support extra parameters for now
-//       that's why we are hard-coding the stride, padding, and dilation
-//       not great ..
-struct ggml_tensor * ggml_conv_1d_1s(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
-
-struct ggml_tensor * ggml_conv_1d_2s(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
-
-struct ggml_tensor * ggml_flash_attn(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * q,
-        struct ggml_tensor  * k,
-        struct ggml_tensor  * v,
-        bool                  masked);
-
-struct ggml_tensor * ggml_flash_ff(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b0,
-        struct ggml_tensor  * b1,
-        struct ggml_tensor  * c0,
-        struct ggml_tensor  * c1);
-
-// Mapping operations
-typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
-typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-struct ggml_tensor * ggml_map_unary_f32(
-        struct ggml_context        * ctx,
-        struct ggml_tensor         * a,
-        const  ggml_unary_op_f32_t fun);
-
-struct ggml_tensor * ggml_map_binary_f32(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b,
-        const  ggml_binary_op_f32_t fun);
-
-//
-// automatic differentiation
-//
-
-void ggml_set_param(
-        struct ggml_context * ctx,
-        struct ggml_tensor * tensor);
-
-void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-
-struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
-struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
-
-void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-void ggml_graph_reset  (struct ggml_cgraph * cgraph);
-
-// print info and performance information for the graph
-void ggml_graph_print(const struct ggml_cgraph * cgraph);
-
-// dump the graph into a file using the dot format
-void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
-
-//
-// optimization
-//
-
-// optimization methods
-enum ggml_opt_type {
-    GGML_OPT_ADAM,
-    GGML_OPT_LBFGS,
-};
-
-// linesearch methods
-enum ggml_linesearch {
-    GGML_LINESEARCH_DEFAULT = 1,
-
-    GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
-    GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
-    GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
-};
-
-// optimization return values
-enum ggml_opt_result {
-    GGML_OPT_OK = 0,
-    GGML_OPT_DID_NOT_CONVERGE,
-    GGML_OPT_NO_CONTEXT,
-    GGML_OPT_INVALID_WOLFE,
-    GGML_OPT_FAIL,
-
-    GGML_LINESEARCH_FAIL = -128,
-    GGML_LINESEARCH_MINIMUM_STEP,
-    GGML_LINESEARCH_MAXIMUM_STEP,
-    GGML_LINESEARCH_MAXIMUM_ITERATIONS,
-    GGML_LINESEARCH_INVALID_PARAMETERS,
-};
-
-// optimization parameters
-//
-//   see ggml.c (ggml_opt_default_params) for default values
-//
-struct ggml_opt_params {
-    enum ggml_opt_type type;
-
-    int n_threads;
-
-    // delta-based convergence test
     //
-    //   if past == 0 - disabled
-    //   if past > 0:
-    //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
+    // operations on tensors with backpropagation
     //
-    int past;
-    float delta;
 
-    // maximum number of iterations without improvement
+    GGML_API struct ggml_tensor * ggml_dup(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_add(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_add_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_sub(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_mul(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_div(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_sqr(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sqrt(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // return scalar
+    // TODO: compute sum along rows
+    GGML_API struct ggml_tensor * ggml_sum(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // mean along rows
+    GGML_API struct ggml_tensor * ggml_mean(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // if a is the same shape as b, and a is not parameter, return a
+    // otherwise, return a new tensor: repeat(a) to fit in b
+    GGML_API struct ggml_tensor * ggml_repeat(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_abs(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sgn(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_neg(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_step(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_relu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // TODO: double-check this computation is correct
+    GGML_API struct ggml_tensor * ggml_gelu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_silu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // normalize along rows
+    // TODO: eps is hardcoded to 1e-5 for now
+    GGML_API struct ggml_tensor * ggml_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_rms_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // A: m rows, n columns
+    // B: p rows, n columns (i.e. we transpose it internally)
+    // result is m columns, p rows
+    GGML_API struct ggml_tensor * ggml_mul_mat(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
     //
-    //   if 0 - disabled
-    //   if > 0:
-    //     assume convergence if no cost improvement in this number of iterations
+    // operations on tensors without backpropagation
     //
-    int max_no_improvement;
 
-    bool print_forward_graph;
-    bool print_backward_graph;
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_scale(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
 
-    // ADAM parameters
-    struct {
-        int n_iter;
+    // a -> b, return view(b)
+    GGML_API struct ggml_tensor * ggml_cpy(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
 
-        float alpha; // learning rate
-        float beta1;
-        float beta2;
-        float eps;   // epsilon for numerical stability
-        float eps_f; // epsilon for convergence test
-        float eps_g; // epsilon for convergence test
-    } adam;
+    // make contiguous
+    GGML_API struct ggml_tensor * ggml_cont(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
 
-    // LBFGS parameters
-    struct {
-        int m; // number of corrections to approximate the inv. Hessian
-        int n_iter;
-        int max_linesearch;
+    // return view(a), b specifies the new shape
+    // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
 
-        float eps;      // convergence tolerance
-        float ftol;     // line search tolerance
-        float wolfe;
-        float min_step;
-        float max_step;
+    // return view(a)
+    // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1);
 
-        enum ggml_linesearch linesearch;
-    } lbfgs;
-};
+    // return view(a)
+    // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2);
 
-struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
+    // offset in bytes
+    GGML_API struct ggml_tensor * ggml_view_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            size_t                offset);
 
-// optimize the function defined by the tensor f
-enum ggml_opt_result ggml_opt(
-        struct ggml_context * ctx,
-        struct ggml_opt_params params,
-        struct ggml_tensor * f);
+    GGML_API struct ggml_tensor * ggml_view_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            size_t                nb1, // row stride in bytes
+            size_t                offset);
 
-//
-// quantization
-//
+    GGML_API struct ggml_tensor * ggml_view_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            size_t                nb1, // row   stride in bytes
+            size_t                nb2, // slice stride in bytes
+            size_t                offset);
 
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API struct ggml_tensor * ggml_permute(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   axis0,
+            int                   axis1,
+            int                   axis2,
+            int                   axis3);
 
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+    // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
+    GGML_API struct ggml_tensor * ggml_transpose(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
 
-//
-// system info
-//
+    GGML_API struct ggml_tensor * ggml_get_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
 
-int ggml_cpu_has_avx(void);
-int ggml_cpu_has_avx2(void);
-int ggml_cpu_has_avx512(void);
-int ggml_cpu_has_avx512_vbmi(void);
-int ggml_cpu_has_avx512_vnni(void);
-int ggml_cpu_has_fma(void);
-int ggml_cpu_has_neon(void);
-int ggml_cpu_has_arm_fma(void);
-int ggml_cpu_has_f16c(void);
-int ggml_cpu_has_fp16_va(void);
-int ggml_cpu_has_wasm_simd(void);
-int ggml_cpu_has_blas(void);
-int ggml_cpu_has_cublas(void);
-int ggml_cpu_has_sse3(void);
-int ggml_cpu_has_vsx(void);
+    // set elements above the diagonal to -INF
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // rotary position embedding
+    // in-place, returns view(a)
+    // if (mode & 1) != 0, skip n_past elements
+    // if (mode & 2) != 0, GPT-NeoX style
+    // TODO: avoid creating a new tensor every time
+    GGML_API struct ggml_tensor * ggml_rope(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode);
+
+    // padding = 1
+    // TODO: we don't support extra parameters for now
+    //       that's why we are hard-coding the stride, padding, and dilation
+    //       not great ..
+    GGML_API struct ggml_tensor * ggml_conv_1d_1s(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_conv_1d_2s(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_flash_attn(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            bool                  masked);
+
+    GGML_API struct ggml_tensor * ggml_flash_ff(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b0,
+            struct ggml_tensor  * b1,
+            struct ggml_tensor  * c0,
+            struct ggml_tensor  * c1);
+
+    // Mapping operations
+    GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+    GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+    GGML_API struct ggml_tensor * ggml_map_unary_f32(
+            struct ggml_context        * ctx,
+            struct ggml_tensor         * a,
+            const  ggml_unary_op_f32_t fun);
+
+    GGML_API struct ggml_tensor * ggml_map_binary_f32(
+            struct ggml_context         * ctx,
+            struct ggml_tensor          * a,
+            struct ggml_tensor          * b,
+            const  ggml_binary_op_f32_t fun);
+
+    //
+    // automatic differentiation
+    //
+
+    GGML_API void ggml_set_param(
+            struct ggml_context * ctx,
+            struct ggml_tensor * tensor);
+
+    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+
+    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
+    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
+
+    GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
+
+    // print info and performance information for the graph
+    GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
+
+    // dump the graph into a file using the dot format
+    GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
+
+    //
+    // optimization
+    //
+
+    // optimization methods
+    enum ggml_opt_type {
+        GGML_OPT_ADAM,
+        GGML_OPT_LBFGS,
+    };
+
+    // linesearch methods
+    enum ggml_linesearch {
+        GGML_LINESEARCH_DEFAULT = 1, // same value as GGML_LINESEARCH_BACKTRACKING_WOLFE
+
+        GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
+        GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
+        GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
+    };
+
+    // optimization return values
+    enum ggml_opt_result {
+        GGML_OPT_OK = 0,
+        GGML_OPT_DID_NOT_CONVERGE,
+        GGML_OPT_NO_CONTEXT,
+        GGML_OPT_INVALID_WOLFE,
+        GGML_OPT_FAIL,
+
+        GGML_LINESEARCH_FAIL = -128,
+        GGML_LINESEARCH_MINIMUM_STEP,
+        GGML_LINESEARCH_MAXIMUM_STEP,
+        GGML_LINESEARCH_MAXIMUM_ITERATIONS,
+        GGML_LINESEARCH_INVALID_PARAMETERS,
+    };
+
+    // optimization parameters
+    //
+    //   see ggml.c (ggml_opt_default_params) for default values
+    //
+    struct ggml_opt_params {
+        enum ggml_opt_type type;
+
+        int n_threads;
+
+        // delta-based convergence test
+        //
+        //   if past == 0 - disabled
+        //   if past > 0:
+        //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
+        //
+        int past;
+        float delta;
+
+        // maximum number of iterations without improvement
+        //
+        //   if 0 - disabled
+        //   if > 0:
+        //     assume convergence if no cost improvement in this number of iterations
+        //
+        int max_no_improvement;
+
+        bool print_forward_graph;
+        bool print_backward_graph;
+
+        // ADAM parameters
+        struct {
+            int n_iter;
+
+            float alpha; // learning rate
+            float beta1;
+            float beta2;
+            float eps;   // epsilon for numerical stability
+            float eps_f; // epsilon for convergence test
+            float eps_g; // epsilon for convergence test
+        } adam;
+
+        // LBFGS parameters
+        struct {
+            int m; // number of corrections to approximate the inv. Hessian
+            int n_iter;
+            int max_linesearch;
+
+            float eps;      // convergence tolerance
+            float ftol;     // line search tolerance
+            float wolfe;
+            float min_step;
+            float max_step;
+
+            enum ggml_linesearch linesearch;
+        } lbfgs;
+    };
+
+    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
+
+    // optimize the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt(
+            struct ggml_context * ctx,
+            struct ggml_opt_params params,
+            struct ggml_tensor * f);
+
+    //
+    // quantization
+    //
+
+    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+
+    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+
+    //
+    // system info
+    //
+
+    GGML_API int ggml_cpu_has_avx        (void);
+    GGML_API int ggml_cpu_has_avx2       (void);
+    GGML_API int ggml_cpu_has_avx512     (void);
+    GGML_API int ggml_cpu_has_avx512_vbmi(void);
+    GGML_API int ggml_cpu_has_avx512_vnni(void);
+    GGML_API int ggml_cpu_has_fma        (void);
+    GGML_API int ggml_cpu_has_neon       (void);
+    GGML_API int ggml_cpu_has_arm_fma    (void);
+    GGML_API int ggml_cpu_has_f16c       (void);
+    GGML_API int ggml_cpu_has_fp16_va    (void);
+    GGML_API int ggml_cpu_has_wasm_simd  (void);
+    GGML_API int ggml_cpu_has_blas       (void);
+    GGML_API int ggml_cpu_has_cublas     (void);
+    GGML_API int ggml_cpu_has_sse3       (void);
+    GGML_API int ggml_cpu_has_vsx        (void);
 
 
-//
-// Internal types and functions exposed for tests and benchmarks
-//
+    //
+    // Internal types and functions exposed for tests and benchmarks
+    //
 
 #ifdef  __cplusplus
-// restrict not standard in C++
+    // restrict not standard in C++
 #define GGML_RESTRICT
 #else
 #define GGML_RESTRICT restrict
 #endif
-typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+    typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+    typedef void (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+    typedef void (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
-typedef struct {
-    dequantize_row_q_t dequantize_row_q;
-    quantize_row_q_t   quantize_row_q;
-    quantize_row_q_t   quantize_row_q_reference;
-    quantize_row_q_t   quantize_row_q_dot;
-    vec_dot_q_t        vec_dot_q;
-} quantize_fns_t;
+    typedef struct {
+        dequantize_row_q_t dequantize_row_q;
+        quantize_row_q_t   quantize_row_q;
+        quantize_row_q_t   quantize_row_q_reference;
+        quantize_row_q_t   quantize_row_q_dot;
+        vec_dot_q_t        vec_dot_q;
+    } quantize_fns_t;
 
-quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+    quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
 
 #ifdef  __cplusplus
 }

From 54bb60e26858be251a0eb3cb70f80322aff804a0 Mon Sep 17 00:00:00 2001
From: xaedes <xaedes@gmail.com>
Date: Mon, 24 Apr 2023 23:02:02 +0200
Subject: [PATCH 28/74] ggml : fix bug in ggml_compute_forward_sum_f32 (#1162)

The sum over all rows is now computed instead of just the last row
---
 ggml.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index 6e46c0e5a..850588995 100644
--- a/ggml.c
+++ b/ggml.c
@@ -6779,15 +6779,20 @@ static void ggml_compute_forward_sum_f32(
     const size_t nb02 = src0->nb[2];
     const size_t nb03 = src0->nb[3];
 
+    ggml_float sum     = 0;
+    float      row_sum = 0;
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             for (int64_t i01 = 0; i01 < ne01; i01++) {
                 ggml_vec_sum_f32(ne00,
-                        (float *) (dst->data),
+                        &row_sum,
                         (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
+                sum += row_sum;
             }
         }
     }
+    ((float *) dst->data)[0] = sum;
 }
 
 static void ggml_compute_forward_sum(

From dd0eabc049fb1efc631cab8eb0a646808d704e18 Mon Sep 17 00:00:00 2001
From: unbounded <haakon@likedan.net>
Date: Tue, 25 Apr 2023 19:20:46 +0200
Subject: [PATCH 29/74] ggml : use full range for Q4_0 and Q4_2 quantization
 (#729)

* Use full range for q4_0 quantization

By keeping the sign of the highest magnitude, we can make sure the
highest value maps to -8, which is currently unused.
This is a bit of a freebie since it is fully backwards compatible with
the current format.

* Update quantize_row_q4_0 for AVX/AVX2

* Update quantize_row_q4_0 for WASM

Untested

* Update quantize_row_q4_0 for Arm NEON

* Update quantize_row_q4_0 for PowerPC

Untested

* Use full range for q4_2 quantization
---
 ggml.c | 206 ++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 132 insertions(+), 74 deletions(-)

diff --git a/ggml.c b/ggml.c
index 850588995..b4dd88db5 100644
--- a/ggml.c
+++ b/ggml.c
@@ -692,13 +692,17 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
 
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
+        float max = 0.0f;
 
         for (int l = 0; l < QK4_0; l++) {
             const float v = x[i*QK4_0 + l];
-            amax = MAX(amax, fabsf(v));
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max = v;
+            }
         }
 
-        const float d = amax / ((1 << 3) - 1);
+        const float d = max / -8;
         const float id = d ? 1.0f/d : 0.0f;
 
         y[i].d = d;
@@ -707,8 +711,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
             const float v0 = x[i*QK4_0 + l + 0]*id;
             const float v1 = x[i*QK4_0 + l + 1]*id;
 
-            const uint8_t vi0 = (int8_t)roundf(v0) + 8;
-            const uint8_t vi1 = (int8_t)roundf(v1) + 8;
+            const uint8_t vi0 = MIN(15, (int8_t)roundf(v0) + 8);
+            const uint8_t vi1 = MIN(15, (int8_t)roundf(v1) + 8);
 
             assert(vi0 < 16);
             assert(vi1 < 16);
@@ -728,28 +732,42 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
 
 #if defined(__POWER9_VECTOR__)
     const vector float v85 = vec_splats(8.5f);
+    const vector signed int v15 = vec_splats(15);
     for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
+        float max = 0.0f;
+        float min = 0.0f;
 
         vector float srcv [8];
-        vector float asrcv[8];
-        vector float amaxv[8];
+        vector float maxv[8];
+        vector float minv[8];
 
         for (int l = 0; l < 8; l++) srcv[l]  = *(vector float *)(x + i*32 + 4*l);
-        for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]);
+        //for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]);
 
-        for (int l = 0; l < 4; l++) amaxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]);
-        //for (int l = 0; l < 2; l++) amaxv[4*l] = vec_max(amaxv[4*l], amaxv[4*l+2]);
-        amaxv[0] = vec_max(amaxv[0], amaxv[2]);
-        amaxv[4] = vec_max(amaxv[4], amaxv[6]);
-        //for (int l = 0; l < 1; l++) amaxv[8*l] = vec_max(amaxv[8*l], amaxv[8*l+4]);
-        amaxv[0] = vec_max(amaxv[0], amaxv[4]);
+        for (int l = 0; l < 4; l++) maxv[2*l] = vec_max(srcv[2*l], srcv[2*l+1]);
+        //for (int l = 0; l < 2; l++) maxv[4*l] = vec_max(maxv[4*l], maxv[4*l+2]);
+        maxv[0] = vec_max(maxv[0], maxv[2]);
+        maxv[4] = vec_max(maxv[4], maxv[6]);
+        //for (int l = 0; l < 1; l++) maxv[8*l] = vec_max(maxv[8*l], maxv[8*l+4]);
+        maxv[0] = vec_max(maxv[0], maxv[4]);
 
-        amax = MAX(
-                MAX(vec_extract(amaxv[0], 0), vec_extract(amaxv[0], 1)),
-                MAX(vec_extract(amaxv[0], 2), vec_extract(amaxv[0], 3)));
+        for (int l = 0; l < 4; l++) minv[2*l] = vec_min(srcv[2*l], srcv[2*l+1]);
+        //for (int l = 0; l < 2; l++) minv[4*l] = vec_min(minv[4*l], minv[4*l+2]);
+        minv[0] = vec_min(minv[0], minv[2]);
+        minv[4] = vec_min(minv[4], minv[6]);
+        //for (int l = 0; l < 1; l++) minv[8*l] = vec_min(minv[8*l], minv[8*l+4]);
+        minv[0] = vec_min(minv[0], minv[4]);
 
-        const float d = amax / ((1 << 3) - 1);
+
+        max = MAX(
+                MAX(vec_extract(maxv[0], 0), vec_extract(maxv[0], 1)),
+                MAX(vec_extract(maxv[0], 2), vec_extract(maxv[0], 3)));
+        min = MIN(
+                MIN(vec_extract(minv[0], 0), vec_extract(minv[0], 1)),
+                MIN(vec_extract(minv[0], 2), vec_extract(minv[0], 3)));
+
+        const float magnitude = max >= fabsf(min) ? max : min;
+        const float d = magnitude / -8;
         const float id = d ? 1.0/d : 0.0;
 
         y[i].d = d;
@@ -759,27 +777,33 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         for (int l = 0; l < 8; l++) {
             const vector float vf  = vec_madd(srcv[l], vid, v85);
             const vector signed int vi = vec_signed(vf);
+            const vector signed int vc = vec_min(vi, v15);
 
-            pb[2*l + 0] = vec_extract(vi, 0) | (vec_extract(vi, 1) << 4);
-            pb[2*l + 1] = vec_extract(vi, 2) | (vec_extract(vi, 3) << 4);
+            pb[2*l + 0] = vec_extract(vc, 0) | (vec_extract(vc, 1) << 4);
+            pb[2*l + 1] = vec_extract(vc, 2) | (vec_extract(vc, 3) << 4);
         }
     }
 #elif __ARM_NEON
     for (int i = 0; i < nb; i++) {
         float32x4_t srcv [8];
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
+        float32x4_t maxv[8];
+        float32x4_t minv[8];
 
         for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);
-        for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
 
-        for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
-        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
-        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
+        for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l+1]);
+        for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l+2]);
+        for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l+4]);
 
-        const float amax = vmaxvq_f32(amaxv[0]);
+        for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l+1]);
+        for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l+2]);
+        for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l+4]);
 
-        const float d = amax / ((1 << 3) - 1);
+        const float max = vmaxvq_f32(maxv[0]);
+        const float min = vminvq_f32(minv[0]);
+
+        const float magnitude = max >= fabsf(min) ? max : min;
+        const float d = magnitude / -8;
         const float id = d ? 1.0f/d : 0.0f;
 
         y[i].d = d;
@@ -788,9 +812,10 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const float32x4_t v  = vmulq_n_f32(srcv[l], id);
             const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f));
             const int32x4_t   vi = vcvtq_s32_f32(vf);
+            const int32x4_t   vc = vminq_s32(vi, vdupq_n_s32(15));
 
-            y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
-            y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
+            y[i].qs[2*l + 0] = vgetq_lane_s32(vc, 0) | (vgetq_lane_s32(vc, 1) << 4);
+            y[i].qs[2*l + 1] = vgetq_lane_s32(vc, 2) | (vgetq_lane_s32(vc, 3) << 4);
         }
     }
 #elif defined(__AVX2__)
@@ -802,22 +827,31 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         __m256 v3 = _mm256_loadu_ps( x + 24 );
         x += 32;
 
-        // Compute max(abs(e)) for the block
-        const __m256 signBit = _mm256_set1_ps( -0.0f );
-        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+        // Compute max for the block
+        __m256 max  = _mm256_max_ps( v0, v1 );
+        __m256 maxTmp = _mm256_max_ps( v2, v3 );
+        max = _mm256_max_ps( max, maxTmp );
 
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max, 1 ), _mm256_castps256_ps128( max ) );
         max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
         max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
         const float maxScalar = _mm_cvtss_f32( max4 );
 
+        // Compute min for the block
+        __m256 min  = _mm256_min_ps( v0, v1 );
+        __m256 minTmp = _mm256_min_ps( v2, v3 );
+        min = _mm256_min_ps( min, minTmp );
+
+        __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min, 1 ), _mm256_castps256_ps128( min ) );
+        min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) );
+        min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) );
+        const float minScalar = _mm_cvtss_f32( min4 );
+
         // Quantize these floats
-        const float d = maxScalar / 7.0f;
+        const float magnitude = maxScalar >= fabsf(minScalar) ? maxScalar : minScalar;
+        const float d = magnitude / -8.0f;
         y[i].d = d;
-        const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f;
+        const float id = ( magnitude != 0.0f ) ? -8.0f / magnitude : 0.0f;
         const __m256 mul = _mm256_set1_ps( id );
 
         // Apply the multiplier
@@ -850,9 +884,11 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
         i0 = _mm256_permutevar8x32_epi32( i0, perm );
 
-        // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ]
+        // Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ]
         const __m256i off = _mm256_set1_epi8( 8 );
         i0 = _mm256_add_epi8( i0, off );
+        const __m256i maxNibble = _mm256_set1_epi8( 15 );
+        i0 = _mm256_min_epi8( i0, maxNibble );
 
         // Compress the vector into 4 bit/value, and store
         __m128i res = packNibbles( i0 );
@@ -867,22 +903,31 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         __m256 v3 = _mm256_loadu_ps( x + 24 );
         x += 32;
 
-        // Compute max(abs(e)) for the block
-        const __m256 signBit = _mm256_set1_ps( -0.0f );
-        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+        // Compute max for the block
+        __m256 max  = _mm256_max_ps( v0, v1 );
+        __m256 maxTmp = _mm256_max_ps( v2, v3 );
+        max = _mm256_max_ps( max, maxTmp );
 
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max, 1 ), _mm256_castps256_ps128( max ) );
         max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
         max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
         const float maxScalar = _mm_cvtss_f32( max4 );
 
+        // Compute min for the block
+        __m256 min  = _mm256_min_ps( v0, v1 );
+        __m256 minTmp = _mm256_min_ps( v2, v3 );
+        min = _mm256_min_ps( min, minTmp );
+
+        __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min, 1 ), _mm256_castps256_ps128( min ) );
+        min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) );
+        min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) );
+        const float minScalar = _mm_cvtss_f32( min4 );
+
         // Quantize these floats
-        const float d = maxScalar / 7.0f;
+        const float magnitude = maxScalar >= fabsf(minScalar) ? maxScalar : minScalar;
+        const float d = magnitude / -8.0f;
         y[i].d = d;
-        const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f;
+        const float id = ( magnitude != 0.0f ) ? -8.0f / magnitude : 0.0f;
         const __m256 mul = _mm256_set1_ps( id );
 
         // Apply the multiplier
@@ -923,10 +968,13 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         ni0 = _mm_packs_epi16( ni0, ni2 );
         ni4 = _mm_packs_epi16( ni4, ni6 );
 
-        // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ]
-        const __m128i off = _mm_set1_epi8( 8);
+        // Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ]
+        const __m128i off = _mm_set1_epi8( 8 );
         ni0 = _mm_add_epi8( ni0, off );
         ni4 = _mm_add_epi8( ni4, off );
+        const __m128i maxNibble = _mm_set1_epi8( 15 );
+        ni0 = _mm_min_epi8( ni0, maxNibble );
+        ni4 = _mm_min_epi8( ni4, maxNibble );
 
         // Compress the vector into 4 bit/value, and store
         __m128i res = packNibbles( ni0, ni4 );
@@ -934,24 +982,32 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
     }
 #elif defined(__wasm_simd128__)
     for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
+        float max = 0.0f;
+        float min = 0.0f;
 
         v128_t srcv [8];
-        v128_t asrcv[8];
-        v128_t amaxv[8];
+        v128_t maxv[8];
+        v128_t minv[8];
 
         for (int l = 0; l < 8; l++) srcv[l]  = wasm_v128_load(x + i*32 + 4*l);
-        for (int l = 0; l < 8; l++) asrcv[l] = wasm_f32x4_abs(srcv[l]);
 
-        for (int l = 0; l < 4; l++) amaxv[2*l] = wasm_f32x4_max(asrcv[2*l], asrcv[2*l+1]);
-        for (int l = 0; l < 2; l++) amaxv[4*l] = wasm_f32x4_max(amaxv[4*l], amaxv[4*l+2]);
-        for (int l = 0; l < 1; l++) amaxv[8*l] = wasm_f32x4_max(amaxv[8*l], amaxv[8*l+4]);
+        for (int l = 0; l < 4; l++) maxv[2*l] = wasm_f32x4_max(srcv[2*l], srcv[2*l+1]);
+        for (int l = 0; l < 2; l++) maxv[4*l] = wasm_f32x4_max(maxv[4*l], maxv[4*l+2]);
+        for (int l = 0; l < 1; l++) maxv[8*l] = wasm_f32x4_max(maxv[8*l], maxv[8*l+4]);
 
-        amax = MAX(
-                MAX(wasm_f32x4_extract_lane(amaxv[0], 0), wasm_f32x4_extract_lane(amaxv[0], 1)),
-                MAX(wasm_f32x4_extract_lane(amaxv[0], 2), wasm_f32x4_extract_lane(amaxv[0], 3)));
+        for (int l = 0; l < 4; l++) minv[2*l] = wasm_f32x4_min(srcv[2*l], srcv[2*l+1]);
+        for (int l = 0; l < 2; l++) minv[4*l] = wasm_f32x4_min(minv[4*l], minv[4*l+2]);
+        for (int l = 0; l < 1; l++) minv[8*l] = wasm_f32x4_min(minv[8*l], minv[8*l+4]);
 
-        const float d = amax / ((1 << 3) - 1);
+        max = MAX(
+                MAX(wasm_f32x4_extract_lane(maxv[0], 0), wasm_f32x4_extract_lane(maxv[0], 1)),
+                MAX(wasm_f32x4_extract_lane(maxv[0], 2), wasm_f32x4_extract_lane(maxv[0], 3)));
+        min = MIN(
+                MIN(wasm_f32x4_extract_lane(minv[0], 0), wasm_f32x4_extract_lane(minv[0], 1)),
+                MIN(wasm_f32x4_extract_lane(minv[0], 2), wasm_f32x4_extract_lane(minv[0], 3)));
+
+        const float magnitude = max >= fabsf(min) ? max : min;
+        const float d = magnitude / -8;
         const float id = d ? 1.0/d : 0.0;
 
         y[i].d = d;
@@ -960,9 +1016,10 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const v128_t v  = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
             const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
             const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
+            const v128_t vc = wasm_i32x4_min_u(vi, wasm_i32x4_splat(15));
 
-            y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4);
-            y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);
+            y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4);
+            y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4);
         }
     }
 #else
@@ -1143,13 +1200,17 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
 
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
+        float max = 0.0f;
 
         for (int l = 0; l < QK4_2; l++) {
             const float v = x[i*QK4_2 + l];
-            amax = MAX(amax, fabsf(v));
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max = v;
+            }
         }
 
-        const float d = amax / ((1 << 3) - 1);
+        const float d = max / -8;
 
         const float id = d ? 1.0f/d : 0.0f;
 
@@ -1159,8 +1220,8 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
             const float v0 = x[i*QK4_2 + l + 0]*id;
             const float v1 = x[i*QK4_2 + l + 1]*id;
 
-            const uint8_t vi0 = (uint8_t)(v0 + 8.5f);
-            const uint8_t vi1 = (uint8_t)(v1 + 8.5f);
+            const uint8_t vi0 = MIN(15, (uint8_t)(v0 + 8.5f));
+            const uint8_t vi1 = MIN(15, (uint8_t)(v1 + 8.5f));
 
             assert(vi0 < 16);
             assert(vi1 < 16);
@@ -1254,9 +1315,7 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int
 
     block_q4_2 * restrict y = vy;
 
-    //quantize_row_q4_2_reference(x, y, k);
-    // This produces the exact same format, just better match to the input floats ("better" as measured by RMSE)
-    quantize_row_q4_2_rmse(x, y, k);
+    quantize_row_q4_2_reference(x, y, k);
 }
 
 static void quantize_row_q4_3_reference(const float * restrict x, block_q4_3 * restrict y, int k) {
@@ -1807,7 +1866,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_2] = {
         .dequantize_row_q         = dequantize_row_q4_2,
         .quantize_row_q           = quantize_row_q4_2,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_rmse, //quantize_row_q4_2_reference,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference,
         .quantize_row_q_dot       = quantize_row_q8_0,
         .vec_dot_q                = ggml_vec_dot_q4_2_q8_0,
     },
@@ -12144,8 +12203,7 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
     for (int j = 0; j < n; j += k) {
         block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2;
 
-        //quantize_row_q4_2_reference(src + j, y, k);
-        quantize_row_q4_2_rmse(src + j, y, k);
+        quantize_row_q4_2_reference(src + j, y, k);
 
         for (int i = 0; i < nb; i++) {
             for (int l = 0; l < QK4_2; l += 2) {

From 7a32fcb3b29f4db8aed8a85dc58eb958fb118153 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 25 Apr 2023 23:40:51 +0300
Subject: [PATCH 30/74] ggml : add Q8_0 quantization format (rename the old one
 to Q8_1) (ARM NEON) (#1179)

* ggml : add Q8_0 quantization format (rename the old one to Q8_1)

* tests : fix test-quantize-fns

* ggml : finalize Q8_0 implementation

* ggml : use q4_0_q8_0 and q4_2_q8_0

* ggml : fix Q8_0 dot product bug (ARM)

* ggml : Q8_0 unroll x2

* ggml : fix bug - using wrong block type

* ggml : extend quantize_fns_t with "vec_dot_type"

* ggml : fix Q8_0 to use 255 values out of 256

* ggml : fix assert using wrong QK4_2 instead of QK4_3
---
 examples/quantize/quantize.cpp |   1 +
 ggml-cuda.cu                   |  28 +++
 ggml-cuda.h                    |   1 +
 ggml.c                         | 407 +++++++++++++++++++++------------
 ggml.h                         |   3 +
 llama.cpp                      |   4 +
 llama.h                        |   1 +
 tests/test-quantize-fns.cpp    |  14 +-
 8 files changed, 312 insertions(+), 147 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 5b4812c62..ad39a805d 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -16,6 +16,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
         fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
         fprintf(stderr, "  type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
+        fprintf(stderr, "  type = %d - q8_0\n", LLAMA_FTYPE_MOSTLY_Q8_0);
         return 1;
     }
 
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index fa511c1dc..f104ed5ac 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -37,6 +37,13 @@ typedef struct {
 } block_q4_3;
 static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
 
+#define QK8_0 32
+typedef struct {
+    float   d;              // delta
+    int8_t  qs[QK8_0];      // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+
 static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
     const block_q4_0 * x = (const block_q4_0 *) vx;
 
@@ -131,6 +138,22 @@ static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
     }
 }
 
+static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const int i = blockIdx.x;
+
+    const float d = x[i].d;
+
+    const int8_t * pp = x[i].qs;
+
+    for (int l = 0; l < QK8_0; l++) {
+        const int8_t vi = pp[l];
+
+        y[i*QK8_0 + l] = vi*d;
+    }
+}
+
 void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK4_0;
     dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
@@ -151,6 +174,11 @@ void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t st
     dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
 }
 
+void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    const int nb = k / QK8_0;
+    dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 16
 
diff --git a/ggml-cuda.h b/ggml-cuda.h
index 370bbc75f..4048ea491 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -35,6 +35,7 @@ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t st
 void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 
 #ifdef  __cplusplus
 }
diff --git a/ggml.c b/ggml.c
index b4dd88db5..064510eda 100644
--- a/ggml.c
+++ b/ggml.c
@@ -676,12 +676,18 @@ static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong
 #define QK8_0 32
 typedef struct {
     float   d;          // delta
-    float   s0;         // d * sum(qs[i]) low
-    float   s1;         // d * sum(qs[i]) high
     int8_t  qs[QK8_0];  // quants
 } block_q8_0;
-static_assert(sizeof(block_q8_0) == 3*sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
 
+#define QK8_1 32
+typedef struct {
+    float   d;          // delta
+    float   s0;         // d * sum(qs[i]) low
+    float   s1;         // d * sum(qs[i]) high
+    int8_t  qs[QK8_1];  // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
 
 // reference implementation for deterministic creation of model files
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
@@ -1231,85 +1237,6 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
     }
 }
 
-static inline int nearest_int(float fval) {
-    assert(fval <= 4194303.f);
-    float val = fval + 12582912.f;
-    int i; memcpy(&i, &val, sizeof(int));
-    return (i & 0x007fffff) - 0x00400000;
-}
-
-static float kquantize_q4_with_bounds(int n, int nmin, int nmax, const float * restrict X, int nCandidates,
-        const float * restrict candidates, int8_t * restrict L) {
-    assert (nmin >= INT8_MIN);
-    assert (nmax <= INT8_MAX);
-    float amax = 0;
-    for (int i=0; i<n; ++i) amax = MAX(amax, fabsf(X[i]));
-    if (!amax) { // all zero
-        for (int i=0; i<n; ++i) L[i] = 0;
-        return 1.f;
-    }
-    float best = 0, bestScale = 0;
-    for (int si=0; si<nCandidates; ++si) {
-        float iscale = candidates[si]/amax;
-        float sumlxP = 0; int suml2P = 0;
-        float sumlxM = 0; int suml2M = 0;
-        for (int i=0; i<n; ++i) {
-            int l = nearest_int(iscale*X[i]);
-            int lp = MAX(nmin, MIN(nmax, +l));
-            int lm = MAX(nmin, MIN(nmax, -l));
-            sumlxP += X[i]*lp; suml2P += lp*lp;
-            sumlxM += X[i]*lm; suml2M += lm*lm;
-        }
-        float sumlxP2 = sumlxP*sumlxP;
-        float sumlxM2 = sumlxM*sumlxM;
-        if (sumlxP2*suml2M > sumlxM2*suml2P) {
-            if (sumlxP2 > best*suml2P) {
-                best = sumlxP2/suml2P; bestScale = iscale;
-            }
-        } else {
-            if (sumlxM2 > best*suml2M) {
-                best = sumlxM2/suml2M; bestScale = -iscale;
-            }
-        }
-    }
-    float sumlx = 0; int suml2 = 0;
-    for (int i=0; i<n; ++i) {
-        int l = nearest_int(bestScale*X[i]);
-        l = MAX(nmin, MIN(nmax, l));
-        sumlx += X[i]*l; suml2 += l*l;
-        L[i] = l;
-    }
-    float scale = sumlx/suml2;
-    return scale;
-}
-
-static void quantize_row_q4_2_rmse(const float * restrict x, block_q4_2 * restrict y, int k) {
-#define CANDIDATE_COUNT 8
-    static const float candidates[CANDIDATE_COUNT] = { +8.7f, +8.3f, +8.1f, +7.8f, +7.3f, +7.0f, +6.3f, +5.7f };
-    assert(k % QK4_2 == 0);
-
-    int8_t L[QK4_2];
-
-    const int nb = k / QK4_2;
-
-    for (int i = 0; i < nb; i++) {
-        float scale = kquantize_q4_with_bounds(QK4_2, -8, 7, x, CANDIDATE_COUNT, candidates, L);
-        y[i].d = GGML_FP32_TO_FP16(scale);
-
-        for (int l = 0; l < QK4_2; l += 2) {
-            const uint8_t vi0 = (uint8_t)(L[l+0] + 8);
-            const uint8_t vi1 = (uint8_t)(L[l+1] + 8);
-
-            assert(vi0 < 16);
-            assert(vi1 < 16);
-
-            y[i].qs[l/2] = vi0 | (vi1 << 4);
-        }
-
-        x += QK4_2;
-    }
-}
-
 static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int k) {
     assert(k % QK4_2 == 0);
 
@@ -1379,18 +1306,52 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
 
         y[i].d = d;
 
+        for (int l = 0; l < QK8_0; ++l) {
+            const float v0 = x[i*QK8_0 + l]*id;
+
+            y[i].qs[l] = roundf(v0);
+        }
+    }
+}
+
+static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
+    assert(k % QK8_0 == 0);
+
+    block_q8_0 * restrict y = vy;
+
+    quantize_row_q8_0_reference(x, y, k);
+}
+
+// reference implementation for deterministic creation of model files
+static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
+    assert(k % QK8_1 == 0);
+    const int nb = k / QK8_1;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+
+        for (int l = 0; l < QK8_1; l++) {
+            const float v = x[i*QK8_1 + l];
+            amax = MAX(amax, fabsf(v));
+        }
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+
         int sum0 = 0;
         int sum1 = 0;
 
-        for (int l = 0; l < QK8_0/2; ++l) {
-            const float v0 = x[i*QK8_0           + l]*id;
-            const float v1 = x[i*QK8_0 + QK8_0/2 + l]*id;
+        for (int l = 0; l < QK8_1/2; ++l) {
+            const float v0 = x[i*QK8_1           + l]*id;
+            const float v1 = x[i*QK8_1 + QK8_1/2 + l]*id;
 
             y[i].qs[          l] = roundf(v0);
-            y[i].qs[QK8_0/2 + l] = roundf(v1);
+            y[i].qs[QK8_1/2 + l] = roundf(v1);
 
             sum0 += y[i].qs[          l];
-            sum1 += y[i].qs[QK8_0/2 + l];
+            sum1 += y[i].qs[QK8_1/2 + l];
         }
 
         y[i].s0 = d * sum0;
@@ -1398,11 +1359,11 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
     }
 }
 
-static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
+static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
+    assert(k % QK8_1 == 0);
+    const int nb = k / QK8_1;
 
-    block_q8_0 * restrict y = vy;
+    block_q8_1 * restrict y = vy;
 
 #if defined(__ARM_NEON)
     for (int i = 0; i < nb; i++) {
@@ -1556,7 +1517,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
     }
 #else
     // scalar
-    quantize_row_q8_0_reference(x, y, k);
+    quantize_row_q8_1_reference(x, y, k);
 #endif
 }
 
@@ -1843,10 +1804,28 @@ static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, in
     }
 }
 
+static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, int k) {
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    const block_q8_0 * restrict x = vx;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = x[i].d;
+
+        const int8_t * restrict pp = x[i].qs;
+
+        for (int l = 0; l < QK8_0; ++l) {
+            y[i*QK8_0 + l] = pp[l]*d;
+        }
+    }
+}
+
 static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 
 static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = {
@@ -1855,13 +1834,15 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
         .quantize_row_q_dot       = quantize_row_q8_0,
         .vec_dot_q                = ggml_vec_dot_q4_0_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q4_1] = {
         .dequantize_row_q         = dequantize_row_q4_1,
         .quantize_row_q           = quantize_row_q4_1,
         .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
-        .quantize_row_q_dot       = quantize_row_q8_0,
-        .vec_dot_q                = ggml_vec_dot_q4_1_q8_0,
+        .quantize_row_q_dot       = quantize_row_q8_1,
+        .vec_dot_q                = ggml_vec_dot_q4_1_q8_1,
+        .vec_dot_type             = GGML_TYPE_Q8_1,
     },
     [GGML_TYPE_Q4_2] = {
         .dequantize_row_q         = dequantize_row_q4_2,
@@ -1869,20 +1850,31 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference,
         .quantize_row_q_dot       = quantize_row_q8_0,
         .vec_dot_q                = ggml_vec_dot_q4_2_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q4_3] = {
         .dequantize_row_q         = dequantize_row_q4_3,
         .quantize_row_q           = quantize_row_q4_3,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference, // TODO: RMSE optimization
-        .quantize_row_q_dot       = quantize_row_q8_0,
-        .vec_dot_q                = ggml_vec_dot_q4_3_q8_0,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference,
+        .quantize_row_q_dot       = quantize_row_q8_1,
+        .vec_dot_q                = ggml_vec_dot_q4_3_q8_1,
+        .vec_dot_type             = GGML_TYPE_Q8_1,
     },
     [GGML_TYPE_Q8_0] = {
-        .dequantize_row_q         = NULL,   // TODO
+        .dequantize_row_q         = dequantize_row_q8_0,
         .quantize_row_q           = quantize_row_q8_0,
         .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference,
         .quantize_row_q_dot       = quantize_row_q8_0,
+        .vec_dot_q                = ggml_vec_dot_q8_0_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+    },
+    [GGML_TYPE_Q8_1] = {
+        .dequantize_row_q         = NULL,   // TODO
+        .quantize_row_q           = quantize_row_q8_1,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference,
+        .quantize_row_q_dot       = quantize_row_q8_1,
         .vec_dot_q                = NULL,   // TODO
+        .vec_dot_type             = GGML_TYPE_Q8_1,
     },
 };
 
@@ -2498,17 +2490,14 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
     float32x4_t sumv1 = vdupq_n_f32(0.0f);
 
-    float sum8 = 0;
-
     for (int i = 0; i < nb; i += 2) {
         const block_q4_0 * restrict x0 = &x[i + 0];
         const block_q4_0 * restrict x1 = &x[i + 1];
         const block_q8_0 * restrict y0 = &y[i + 0];
         const block_q8_0 * restrict y1 = &y[i + 1];
 
-        sum8 += x0->d * (y0->s0 + y0->s1) + x1->d * (y1->s0 + y1->s1);
-
         const uint8x16_t m4b   = vdupq_n_u8(0xf);
+        const int8x16_t  s8b   = vdupq_n_s8(0x8);
 
         const uint8x16_t v0_0 = vld1q_u8(x0->qs);
         const uint8x16_t v0_1 = vld1q_u8(x1->qs);
@@ -2519,6 +2508,12 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
         const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
 
+        // sub 8
+        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
+        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
+        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
+        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
+
         // load y
         const int8x16_t v1_0l = vld1q_s8(y0->qs);
         const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
@@ -2533,21 +2528,21 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 
 #if defined(__ARM_FEATURE_DOTPROD)
         // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0ls), v0_0h, v1_0hs);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1ls), v0_1h, v1_1hs);
+        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls), v0_0hs, v1_0hs);
+        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls), v0_1hs, v1_1hs);
 
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
 #else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0ls));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0ls));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0hs));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0hs));
+        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
+        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
+        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
+        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
 
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1ls));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1ls));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1hs));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1hs));
+        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
+        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
+        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
+        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
 
         const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
         const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
@@ -2559,7 +2554,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 #endif
     }
 
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) - 8 * sum8;
+    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -2651,14 +2646,14 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 #endif
 }
 
-static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK8_0;
+static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    const int nb = n / QK8_1;
 
-    assert(n % QK8_0 == 0);
+    assert(n % QK8_1 == 0);
     assert(nb % 2 == 0);
 
     const block_q4_1 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
+    const block_q8_1 * restrict y = vy;
 
     // TODO: add AVX / WASM SIMD / etc
 #if defined(__ARM_NEON)
@@ -2670,8 +2665,8 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void *
     for (int i = 0; i < nb; i += 2) {
         const block_q4_1 * restrict x0 = &x[i + 0];
         const block_q4_1 * restrict x1 = &x[i + 1];
-        const block_q8_0 * restrict y0 = &y[i + 0];
-        const block_q8_0 * restrict y1 = &y[i + 1];
+        const block_q8_1 * restrict y0 = &y[i + 0];
+        const block_q8_1 * restrict y1 = &y[i + 1];
 
         summs += x0->m * (y0->s0 + y0->s1) + x1->m * (y1->s0 + y1->s1);
 
@@ -2769,7 +2764,7 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void *
         const  int8_t * restrict p1 = y[i].qs;
 
         // TODO: this is very slow ..
-        for (int j = 0; j < QK8_0/2; j++) {
+        for (int j = 0; j < QK8_1/2; j++) {
             const uint8_t v0 = p0[j];
 
             const float f0 = d0*(v0 & 0xf) + m0;
@@ -2942,15 +2937,15 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
 #endif
 }
 
-static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK8_0;
+static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    const int nb = n / QK8_1;
 
-    assert(n % QK8_0 == 0);
+    assert(n % QK8_1 == 0);
     assert(nb % 2 == 0);
-    assert(QK8_0 == 2*QK4_2);
+    assert(QK8_1 == 2*QK4_3);
 
     const block_q4_3 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
+    const block_q8_1 * restrict y = vy;
 
 #if defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
@@ -2963,7 +2958,7 @@ static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void *
         const block_q4_3 * restrict x0_0 = &x[2*(i + 0) + 0];
         const block_q4_3 * restrict x0_1 = &x[2*(i + 0) + 1];
 
-        const block_q8_0 * restrict y0 = &y[i + 0];
+        const block_q8_1 * restrict y0 = &y[i + 0];
 
         summs0 += GGML_FP16_TO_FP32(x0_0->m) * y0->s0;
         summs1 += GGML_FP16_TO_FP32(x0_1->m) * y0->s1;
@@ -3046,7 +3041,7 @@ static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void *
         int sxy_0 = 0;
         int sxy_1 = 0;
 
-        for (int j = 0; j < QK8_0/4; j++) {
+        for (int j = 0; j < QK8_1/4; j++) {
             const uint8_t v0 = x0[j];
             const uint8_t v1 = x1[j];
 
@@ -3059,8 +3054,8 @@ static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void *
             const int y0_0 = y0[2*j + 0];
             const int y1_0 = y0[2*j + 1];
 
-            const int y0_1 = y0[2*(j + QK8_0/4) + 0];
-            const int y1_1 = y0[2*(j + QK8_0/4) + 1];
+            const int y0_1 = y0[2*(j + QK8_1/4) + 0];
+            const int y1_1 = y0[2*(j + QK8_1/4) + 1];
 
             sxy_0 += x0_0*y0_0 + x1_0*y1_0;
             sxy_1 += x0_1*y0_1 + x1_1*y1_1;
@@ -3072,6 +3067,91 @@ static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void *
 #endif
 }
 
+static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    const int nb = n / QK8_0;
+
+    assert(n % QK8_0 == 0);
+    assert(nb % 2 == 0);
+    assert(QK8_0 == QK8_0);
+
+    const block_q8_0 * restrict x = vx;
+    const block_q8_0 * restrict y = vy;
+
+#if defined(__ARM_NEON)
+    float32x4_t sumv0 = vdupq_n_f32(0.0f);
+    float32x4_t sumv1 = vdupq_n_f32(0.0f);
+
+    for (int i = 0; i < nb; i += 2) {
+        const block_q8_0 * restrict x0 = &x[i + 0];
+        const block_q8_0 * restrict x1 = &x[i + 1];
+        const block_q8_0 * restrict y0 = &y[i + 0];
+        const block_q8_0 * restrict y1 = &y[i + 1];
+
+        const int8x16_t x0_0 = vld1q_s8(x0->qs);
+        const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
+        const int8x16_t x1_0 = vld1q_s8(x1->qs);
+        const int8x16_t x1_1 = vld1q_s8(x1->qs + 16);
+
+        // load y
+        const int8x16_t y0_0 = vld1q_s8(y0->qs);
+        const int8x16_t y0_1 = vld1q_s8(y0->qs + 16);
+        const int8x16_t y1_0 = vld1q_s8(y1->qs);
+        const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
+
+#if defined(__ARM_FEATURE_DOTPROD)
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
+                        vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
+                        vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d);
+
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
+                        vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
+                        vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d);
+
+#else
+        const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
+        const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
+        const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1));
+        const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
+
+        const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0));
+        const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
+        const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1));
+        const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
+
+        const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
+        const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
+        const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
+        const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
+
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d);
+#endif
+    }
+
+    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+#else
+    // scalar
+    float sumf = 0.0;
+
+    for (int i = 0; i < nb; i++) {
+        const int8_t * restrict x0 = x[i].qs;
+        const int8_t * restrict y0 = y[i].qs;
+
+        int sumi = 0;
+
+        for (int j = 0; j < QK8_0; j++) {
+            const int v0 = x0[j];
+            const int v1 = y0[j];
+
+            sumi += v0*v1;
+        }
+
+        sumf += (x[i].d*y[i].d)*sumi;
+    }
+
+    *s = sumf;
+#endif
+}
 
 // compute GGML_VEC_DOT_UNROLL dot products at once
 // xs - x row stride in bytes
@@ -3269,6 +3349,14 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #endif
 }
 
+inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x) {
+    ggml_float sum = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sum += (ggml_float)x[i];
+    }
+    *s = sum;
+}
+
 inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
     float max = -INFINITY;
@@ -3322,11 +3410,12 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_2] = QK4_2,
     [GGML_TYPE_Q4_3] = QK4_3,
     [GGML_TYPE_Q8_0] = QK8_0,
+    [GGML_TYPE_Q8_1] = QK8_1,
     [GGML_TYPE_I8]   = 1,
     [GGML_TYPE_I16]  = 1,
     [GGML_TYPE_I32]  = 1,
 };
-static_assert(GGML_TYPE_COUNT == 10, "GGML_BLCK_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 11, "GGML_BLCK_SIZE is outdated");
 
 static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = sizeof(float),
@@ -3336,11 +3425,12 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_2] = sizeof(block_q4_2),
     [GGML_TYPE_Q4_3] = sizeof(block_q4_3),
     [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
+    [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
     [GGML_TYPE_I8]   = sizeof(int8_t),
     [GGML_TYPE_I16]  = sizeof(int16_t),
     [GGML_TYPE_I32]  = sizeof(int32_t),
 };
-static_assert(GGML_TYPE_COUNT == 10, "GGML_TYPE_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 11, "GGML_TYPE_SIZE is outdated");
 
 
 static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
@@ -3351,11 +3441,12 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_2] = "q4_2",
     [GGML_TYPE_Q4_3] = "q4_3",
     [GGML_TYPE_Q8_0] = "q8_0",
+    [GGML_TYPE_Q8_1] = "q8_1",
     [GGML_TYPE_I8]   = "i8",
     [GGML_TYPE_I16]  = "i16",
     [GGML_TYPE_I32]  = "i32",
 };
-static_assert(GGML_TYPE_COUNT == 10, "GGML_TYPE_NAME is outdated");
+static_assert(GGML_TYPE_COUNT == 11, "GGML_TYPE_NAME is outdated");
 
 static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = false,
@@ -3365,11 +3456,12 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_2] = true,
     [GGML_TYPE_Q4_3] = true,
     [GGML_TYPE_Q8_0] = true,
+    [GGML_TYPE_Q8_1] = true,
     [GGML_TYPE_I8]   = false,
     [GGML_TYPE_I16]  = false,
     [GGML_TYPE_I32]  = false,
 };
-static_assert(GGML_TYPE_COUNT == 10, "GGML_IS_QUANTIZED is outdated");
+static_assert(GGML_TYPE_COUNT == 11, "GGML_IS_QUANTIZED is outdated");
 
 static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "NONE",
@@ -6581,6 +6673,7 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
         case GGML_TYPE_Q4_3:
+        case GGML_TYPE_Q8_0:
             {
                 ggml_compute_forward_add_q_f32(params, src0, src1, dst);
             } break;
@@ -6839,12 +6932,12 @@ static void ggml_compute_forward_sum_f32(
     const size_t nb03 = src0->nb[3];
 
     ggml_float sum     = 0;
-    float      row_sum = 0;
+    ggml_float row_sum = 0;
 
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_f32(ne00,
+                ggml_vec_sum_ggf(ne00,
                         &row_sum,
                         (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
                 sum += row_sum;
@@ -8008,6 +8101,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     const enum ggml_type type = src0->type;
     quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot;
     vec_dot_q_t      const vec_dot_q          = quantize_fns[type].vec_dot_q;
+    enum ggml_type   const vec_dot_type       = quantize_fns[type].vec_dot_type;
 
     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
@@ -8067,6 +8161,9 @@ static void ggml_compute_forward_mul_mat_q_f32(
         else if (type == GGML_TYPE_Q4_3) {
             dequantize_row_q_cuda = dequantize_row_q4_3_cuda;
         }
+        else if (type == GGML_TYPE_Q8_0) {
+            dequantize_row_q_cuda = dequantize_row_q8_0_cuda;
+        }
         else {
             GGML_ASSERT(false);
         }
@@ -8141,7 +8238,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
     if (params->type == GGML_TASK_INIT) {
         char * wdata = params->wdata;
-        const size_t row_size = ne10*GGML_TYPE_SIZE[GGML_TYPE_Q8_0]/GGML_BLCK_SIZE[GGML_TYPE_Q8_0];
+        const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
 
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
             for (int64_t i12 = 0; i12 < ne12; ++i12) {
@@ -8172,7 +8269,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     const int ir1 = MIN(ir0 + dr, nr);
 
     void * wdata = params->wdata;
-    const size_t row_size = ne00*GGML_TYPE_SIZE[GGML_TYPE_Q8_0]/GGML_BLCK_SIZE[GGML_TYPE_Q8_0];
+    const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
 
     for (int ir = ir0; ir < ir1; ++ir) {
         // src0 indices
@@ -8223,6 +8320,7 @@ static void ggml_compute_forward_mul_mat(
         case GGML_TYPE_Q4_2:
         case GGML_TYPE_Q4_3:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
             {
                 ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
             } break;
@@ -8452,6 +8550,7 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q4_2:
         case GGML_TYPE_Q4_3:
         case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
             {
                 ggml_compute_forward_get_rows_q(params, src0, src1, dst);
             } break;
@@ -10973,7 +11072,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                             } else
 #endif
                             {
-                                cur = GGML_TYPE_SIZE[GGML_TYPE_Q8_0]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[GGML_TYPE_Q8_0];
+                                const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type;
+                                cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q];
                             }
                         } else {
                             GGML_ASSERT(false);
@@ -12242,6 +12342,27 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_3*sizeof(block_q4_3));
 }
 
+size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
+    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
+
+    for (int j = 0; j < n; j += k) {
+        block_q8_0 * restrict y = (block_q8_0 *)dst + j/QK8_0;
+
+        quantize_row_q8_0_reference(src + j, y, k);
+
+        for (int i = 0; i < nb; i++) {
+            for (int l = 0; l < QK8_0; ++l) {
+                const int8_t vi = y[i].qs[l];
+
+                hist[vi/16 + 8]++;
+            }
+        }
+    }
+
+    return (n/QK8_0*sizeof(block_q8_0));
+}
+
 size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
     size_t result = 0;
     switch (type) {
@@ -12269,6 +12390,12 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 block_q4_3 * block = (block_q4_3*)dst + start / QK4_3;
                 result = ggml_quantize_q4_3(src + start, block, n, n, hist);
             } break;
+        case GGML_TYPE_Q8_0:
+            {
+                GGML_ASSERT(start % QK8_0 == 0);
+                block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
+                result = ggml_quantize_q8_0(src + start, block, n, n, hist);
+            } break;
         default:
             assert(false);
     }
diff --git a/ggml.h b/ggml.h
index 275890781..8300a0c62 100644
--- a/ggml.h
+++ b/ggml.h
@@ -223,6 +223,7 @@ extern "C" {
         GGML_TYPE_Q4_2 = 4,
         GGML_TYPE_Q4_3 = 5,
         GGML_TYPE_Q8_0 = 6,
+        GGML_TYPE_Q8_1 = 7,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
@@ -832,6 +833,7 @@ extern "C" {
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
 
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
@@ -876,6 +878,7 @@ extern "C" {
         quantize_row_q_t   quantize_row_q_reference;
         quantize_row_q_t   quantize_row_q_dot;
         vec_dot_q_t        vec_dot_q;
+        enum ggml_type     vec_dot_type;
     } quantize_fns_t;
 
     quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
diff --git a/llama.cpp b/llama.cpp
index 28d27916a..25203c9e9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -484,6 +484,7 @@ struct llama_file_loader {
                 case GGML_TYPE_Q4_1:
                 case GGML_TYPE_Q4_2:
                 case GGML_TYPE_Q4_3:
+                case GGML_TYPE_Q8_0:
                     break;
                 default: {
                     throw format("unrecognized tensor type %u\n", shard.type);
@@ -558,6 +559,7 @@ struct llama_file_saver {
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q4_3:
+            case GGML_TYPE_Q8_0:
                 break;
             default: LLAMA_ASSERT(false);
         }
@@ -848,6 +850,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
                                       return "mostly Q4_1, some F16";
         case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
         case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
+        case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
         default:                      return "unknown, may not work";
     }
 }
@@ -1585,6 +1588,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
+        case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
         default: throw format("invalid output file type %d\n", ftype);
     };
 
diff --git a/llama.h b/llama.h
index e9e3abea5..ab41798d8 100644
--- a/llama.h
+++ b/llama.h
@@ -74,6 +74,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
         LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_3 = 6,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
index 7e091e8c4..a31a18827 100644
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -36,7 +36,7 @@ float array_rmse(const float * a1, const float * a2, size_t n) {
 
 // Total quantization error on test data
 float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
-    std::vector<uint8_t> tmp_q(test_size);
+    std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
 
     qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
@@ -46,7 +46,7 @@ float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const fl
 
 // Total quantization error on test data
 float reference_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
-    std::vector<uint8_t> tmp_q(test_size);
+    std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
     std::vector<float> tmp_out_ref(test_size);
 
@@ -69,10 +69,10 @@ float dot_product(const float * a1, const float * a2, size_t test_size) {
 
 // Total dot product error
 float dot_product_error(quantize_fns_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
-    std::vector<uint8_t> tmp_q1(test_size);
-    std::vector<uint8_t> tmp_q2(test_size*2);
+    std::vector<uint8_t> tmp_q1(2*test_size);
+    std::vector<uint8_t> tmp_q2(2*test_size);
 
-    qfns.quantize_row_q(test_data1, tmp_q1.data(), test_size);
+    qfns.quantize_row_q    (test_data1, tmp_q1.data(), test_size);
     qfns.quantize_row_q_dot(test_data2, tmp_q2.data(), test_size);
 
     float result = INFINITY;
@@ -125,7 +125,7 @@ int main(int argc, char * argv[]) {
             failed = !(total_error < MAX_QUANTIZATION_TOTAL_ERROR);
             num_failed += failed;
             if (failed || verbose) {
-                printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
+                printf("%5s absolute quantization error:    %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
             }
 
             const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
@@ -139,7 +139,7 @@ int main(int argc, char * argv[]) {
             failed = !(vec_dot_error < MAX_DOT_PRODUCT_ERROR);
             num_failed += failed;
             if (failed || verbose) {
-                printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
+                printf("%5s dot product error:              %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
             }
         }
     }

From bb98e77be704584fb40b0400394b4c16ae75f8e2 Mon Sep 17 00:00:00 2001
From: Pavol Rusnak <pavol@rusnak.io>
Date: Tue, 25 Apr 2023 23:19:57 +0200
Subject: [PATCH 31/74] nix: use convert.py instead of legacy wrapper
 convert-pth-to-ggml.py (#981)

---
 flake.nix | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flake.nix b/flake.nix
index 5363052b1..2c9edbb6a 100644
--- a/flake.nix
+++ b/flake.nix
@@ -30,9 +30,9 @@
             mv bin/* $out/bin/
             mv $out/bin/main $out/bin/llama
 
-            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
-            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
-            chmod +x $out/bin/convert-pth-to-ggml
+            echo "#!${llama-python}/bin/python" > $out/bin/convert.py
+            cat ${./convert.py} >> $out/bin/convert.py
+            chmod +x $out/bin/convert.py
           '';
           meta.mainProgram = "llama";
         };

From 667c501334ace706e3abc3f7a37cf1d6b4228745 Mon Sep 17 00:00:00 2001
From: ostix360 <55257054+ostix360@users.noreply.github.com>
Date: Tue, 25 Apr 2023 23:33:08 +0200
Subject: [PATCH 32/74] py : cast lora_alpha to int in convert-lora-to-ggml
 (#1170)

Co-authored-by: Pavol Rusnak <pavol@rusnak.io>
---
 convert-lora-to-ggml.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py
index 8a2085c25..9090e8d6d 100644
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -49,7 +49,12 @@ def translate_tensor_name(t: str) -> str:
 def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
     fout.write(b"ggla"[::-1])  # magic (ggml lora)
     fout.write(struct.pack("i", 1))  # file version
-    fout.write(struct.pack("ii", params["r"], params["lora_alpha"]))
+    fout.write(struct.pack("i", params["r"]))
+    # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
+    # but some models ship a float value instead
+    # let's convert to int, but fail if lossless conversion is not possible
+    assert int(params["lora_alpha"]) == params["lora_alpha"], "cannot convert float to int losslessly"
+    fout.write(struct.pack("i", int(params["lora_alpha"])))
 
 
 def write_tensor_header(
@@ -89,7 +94,7 @@ if params["peft_type"] != "LORA":
     print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
     sys.exit(1)
 
-if params["fan_in_fan_out"] == True:
+if params["fan_in_fan_out"] is True:
     print("Error: param fan_in_fan_out is not supported")
     sys.exit(1)
 

From 4afcc378698e057fcde64e23eb664e5af8dd6956 Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Tue, 25 Apr 2023 21:41:56 +0000
Subject: [PATCH 33/74] Update SHA256SUMS after quantization change (#1181)

Co-authored-by: Pavol Rusnak <pavol@rusnak.io>
---
 SHA256SUMS | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/SHA256SUMS b/SHA256SUMS
index 1d034b371..87faa7f1b 100644
--- a/SHA256SUMS
+++ b/SHA256SUMS
@@ -1,16 +1,16 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d  models/7B/consolidated.00.pth
 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847  models/7B/ggml-model-f16.bin
-fcb7664c2e69776920b526362a243e912f73c36b1ec892eb354bab940f5edb5a  models/7B/ggml-model-q4_0.bin
+99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6  models/7B/ggml-model-q4_0.bin
 cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe  models/7B/ggml-model-q4_1.bin
-1bc7484c24a87612726d756f1761890e7acf5f412e23378577ce50fbe789b5b8  models/7B/ggml-model-q4_2.bin
+25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496  models/7B/ggml-model-q4_2.bin
 3429bf198ec771886cf81a574df45245f3ebf04f0ce0956b73ef5d0ab01ff48b  models/7B/ggml-model-q4_3.bin
 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265  models/7B/params.json
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08  models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/consolidated.01.pth
 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808  models/13B/ggml-model-f16.bin
-4b69e4d6b6e3275230955997b90407fceca7e5ab3daf2e63a2c9e7270a8e1e3e  models/13B/ggml-model-q4_0.bin
+eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab  models/13B/ggml-model-q4_0.bin
 d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb  models/13B/ggml-model-q4_1.bin
-8d55a2077317ec9a928c7851d6a43e08e51f7e9e08360f2a7a7e1deefea3134f  models/13B/ggml-model-q4_2.bin
+75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa  models/13B/ggml-model-q4_2.bin
 4208cdec9788ffa48dc1a17af2c36a0299f5bf3eb0e2b87889dda7fad591fca3  models/13B/ggml-model-q4_3.bin
 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f  models/13B/params.json
 e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/consolidated.00.pth
@@ -18,9 +18,9 @@ e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/con
 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378  models/30B/consolidated.02.pth
 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b  models/30B/consolidated.03.pth
 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37  models/30B/ggml-model-f16.bin
-7a679908ce31c9d6ae2e38d6059bcd4d0ad3a870cd58cc1c8f7b36f2b2f51c73  models/30B/ggml-model-q4_0.bin
+517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d  models/30B/ggml-model-q4_0.bin
 7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd  models/30B/ggml-model-q4_1.bin
-2c82b4954a94a6a284f452f6011c1e4f0d20362c194a0b1eb5737f5fd8a20fb3  models/30B/ggml-model-q4_2.bin
+aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204  models/30B/ggml-model-q4_2.bin
 a6188660199dbcb8d5658abe7d89169869e50423494385830d9e6b330ea7fc33  models/30B/ggml-model-q4_3.bin
 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb  models/30B/params.json
 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe  models/65B/consolidated.00.pth
@@ -32,9 +32,9 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78  models/65B/con
 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b  models/65B/consolidated.06.pth
 d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638  models/65B/consolidated.07.pth
 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0  models/65B/ggml-model-f16.bin
-c671fe1bce71499ac732ec999770ebe53ac486623a7891e42c9dfdb6962d2c64  models/65B/ggml-model-q4_0.bin
+01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2  models/65B/ggml-model-q4_0.bin
 4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f  models/65B/ggml-model-q4_1.bin
-4a145a210c56982389b1ed34387e0590c3e0d7325fa9be4f2284fe4d244a3633  models/65B/ggml-model-q4_2.bin
+1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9  models/65B/ggml-model-q4_2.bin
 305e91a4608b4f627b9b8ad5b4af75187d2684254bfd76dcb9db571618ef293c  models/65B/ggml-model-q4_3.bin
 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b  models/65B/params.json
 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347  models/tokenizer.model

From 859fee6dfb00fab7ce6bc215b4adae78d82f4759 Mon Sep 17 00:00:00 2001
From: Pavol Rusnak <pavol@rusnak.io>
Date: Wed, 26 Apr 2023 18:43:27 +0200
Subject: [PATCH 34/74] quantize : use `map` to assign quantization type from
 `string` (#1191)

instead of `int` (while `int` option still being supported)

This allows the following usage:

`./quantize ggml-model-f16.bin ggml-model-q4_0.bin q4_0`

instead of:

`./quantize ggml-model-f16.bin ggml-model-q4_0.bin 2`
---
 .devops/tools.sh               |  2 +-
 README.md                      |  4 ++--
 examples/quantize/quantize.cpp | 30 ++++++++++++++++++++++++------
 3 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/.devops/tools.sh b/.devops/tools.sh
index b0196b60d..ece9e4efa 100755
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -23,7 +23,7 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
             echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./quantize "$i" "${i/f16/q4_0}" 2
+            ./quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
 else
diff --git a/README.md b/README.md
index 44cf72124..509df61a1 100644
--- a/README.md
+++ b/README.md
@@ -203,8 +203,8 @@ python3 -m pip install -r requirements.txt
 # convert the 7B model to ggml FP16 format
 python3 convert.py models/7B/
 
-# quantize the model to 4-bits (using method 2 = q4_0)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
+# quantize the model to 4-bits (using q4_0 method)
+./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
 
 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index ad39a805d..ec7f91aae 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -2,8 +2,17 @@
 #include "llama.h"
 
 #include <cstdio>
+#include <map>
 #include <string>
 
+static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
+  {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
+  {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
+  {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
+  {"q4_3", LLAMA_FTYPE_MOSTLY_Q4_3},
+  {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
+};
+
 // usage:
 //  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
 //
@@ -12,11 +21,9 @@ int main(int argc, char ** argv) {
 
     if (argc < 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
-        fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
-        fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
-        fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
-        fprintf(stderr, "  type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
-        fprintf(stderr, "  type = %d - q8_0\n", LLAMA_FTYPE_MOSTLY_Q8_0);
+        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
+            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
+        }
         return 1;
     }
 
@@ -30,7 +37,18 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];
 
-    const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
+    enum llama_ftype ftype;
+    if (argv[3][0] == 'q') {
+        auto it = LLAMA_FTYPE_MAP.find(argv[3]);
+        if (it == LLAMA_FTYPE_MAP.end()) {
+            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
+            return 1;
+        }
+        ftype = it->second;
+    } else {
+        ftype = (enum llama_ftype)atoi(argv[3]);
+    }
+
     int nthread = argc > 4 ? atoi(argv[4]) : 0;
 
     const int64_t t_main_start_us = ggml_time_us();

From ea3ad7eb60cfb44526a58122e8019850f437cd1b Mon Sep 17 00:00:00 2001
From: DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com>
Date: Wed, 26 Apr 2023 22:03:03 +0200
Subject: [PATCH 35/74] Updating build instructions to include BLAS support
 (#1183)

* Updated build information

First update to the build instructions to include BLAS.

* Update README.md

* Update information about BLAS

* Better BLAS explanation

Adding a clearer BLAS explanation and adding a link to download the CUDA toolkit.

* Better BLAS explanation

* BLAS for Mac

Specifying that BLAS is already supported on Macs using the Accelerate Framework.

* Clarify the effect of BLAS

* Windows Make instructions

Added the instructions to build with Make on Windows

* Fixing typo

* Fix trailing whitespace
---
 README.md | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 78 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 509df61a1..ddbd4c8b1 100644
--- a/README.md
+++ b/README.md
@@ -167,15 +167,27 @@ cd llama.cpp
 
 ### Build
 
-Note: For Windows, CMake or Zig can be used.
+In order to build llama.cpp you have three different options.
 
-1. Use `make`
+- Using `make`:
+  - On Linux or MacOS:
 
-    ```bash
-    make
-    ```
+      ```bash
+      make
+      ```
 
-1. Use CMake
+  - On Windows:
+
+    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
+    2. Extract `w64devkit` on your pc.
+    3. Run `w64devkit.exe`.
+    4. Use the `cd` command to reach the `llama.cpp` folder.
+    5. From here you can run:
+        ```bash
+        make
+        ```
+
+- Using `CMake`:
 
     ```bash
     mkdir build
@@ -184,12 +196,71 @@ Note: For Windows, CMake or Zig can be used.
     cmake --build . --config Release
     ```
 
-1. Use Zig
+- Using `Zig`:
 
     ```bash
     zig build -Drelease-fast
     ```
 
+### BLAS Build
+
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:
+
+- Accelerate Framework:
+
+  This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.
+
+- OpenBLAS:
+
+  This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.
+
+  - Using `make`:
+    - On Linux:
+      ```bash
+      make LLAMA_OPENBLAS=1
+      ```
+      Note: In order to build on Arch Linux with OpenBLAS support enabled, you must edit the Makefile and add `-lcblas` at the end of line 105.
+
+    - On Windows:
+
+      1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
+      2. Download the latest version of [OpenBLAS for Windows](https://github.com/xianyi/OpenBLAS/releases).
+      3. Extract `w64devkit` on your pc.
+      4. From the OpenBLAS zip that you just downloaded copy `libopenblas.a`, located inside the `lib` folder, inside `w64devkit\x86_64-w64-mingw32\lib`.
+      5. From the same OpenBLAS zip copy the content of the `include` folder inside `w64devkit\x86_64-w64-mingw32\include`.
+      6. Run `w64devkit.exe`.
+      7. Use the `cd` command to reach the `llama.cpp` folder.
+      8. From here you can run:
+
+          ```bash
+          make LLAMA_OPENBLAS=1
+          ```
+
+  - Using `CMake` on Linux:
+
+      ```bash
+      mkdir build
+      cd build
+      cmake .. -DLLAMA_OPENBLAS=ON
+      cmake --build . --config Release
+      ```
+
+- cuBLAS:
+
+  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+  - Using `make`:
+    ```bash
+    make LLAMA_CUBLAS=1
+    ```
+  - Using `CMake`:
+
+    ```bash
+    mkdir build
+    cd build
+    cmake .. -DLLAMA_CUBLAS=ON
+    cmake --build . --config Release
+    ```
+
 ### Prepare Data & Run
 
 ```bash

From 87a6f846d3e929632c45916dd08f1e2a9c72d2a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81sgeir=20Bjarni=20Ingvarsson?= <asgeir@fundinn.org>
Date: Wed, 26 Apr 2023 20:08:43 +0000
Subject: [PATCH 36/74] Allow setting the rng seed after initialization.
 (#1184)

The llama_set_state_data function restores the rng state to what it
was at the time llama_copy_state_data was called. But users may want
to restore the state and proceed with a different seed.
---
 llama.cpp | 7 +++++++
 llama.h   | 3 +++
 2 files changed, 10 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 25203c9e9..8334553a5 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2082,6 +2082,13 @@ int llama_get_kv_cache_token_count(struct llama_context * ctx) {
 
 #define LLAMA_MAX_RNG_STATE 64*1024
 
+void llama_set_rng_seed(struct llama_context * ctx, int seed) { // re-seeds ctx->rng in place
+    if (seed <= 0) { // zero or negative: use the current time as the seed instead
+        seed = time(NULL);
+    }
+    ctx->rng.seed(seed);
+}
+
 // Returns the size of the state
 size_t llama_get_state_size(struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
diff --git a/llama.h b/llama.h
index ab41798d8..24c48cce6 100644
--- a/llama.h
+++ b/llama.h
@@ -116,6 +116,9 @@ extern "C" {
     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
 
+    // Sets the current rng seed.
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+
     // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
     LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
 

From 574406dc7e350ddbffaeca33bf0392b7bfeb1436 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 26 Apr 2023 23:14:13 +0300
Subject: [PATCH 37/74] ggml : add Q5_0 and Q5_1 quantization (#1187)

* ggml : add Q5_0 quantization (cuBLAS only)

* ggml : fix Q5_0 qh -> uint32_t

* ggml : fix q5_0 histogram stats

* ggml : q5_0 scalar dot product

* ggml : q5_0 ARM NEON dot

* ggml : q5_0 more efficient ARM NEON using uint64_t masks

* ggml : rename Q5_0 -> Q5_1

* ggml : adding Q5_0 mode

* quantize : add Q5_0 and Q5_1 to map

* ggml : AVX2 optimizations for Q5_0, Q5_1 (#1195)

---------

Co-authored-by: Stephan Walter <stephan@walter.name>
---
 .gitignore                     |   1 +
 examples/quantize/quantize.cpp |   2 +
 ggml-cuda.cu                   |  85 +++++
 ggml-cuda.h                    |   2 +
 ggml.c                         | 633 +++++++++++++++++++++++++++++++--
 ggml.h                         |   8 +-
 llama.cpp                      |   8 +
 llama.h                        |   2 +
 8 files changed, 711 insertions(+), 30 deletions(-)

diff --git a/.gitignore b/.gitignore
index e52d479ee..c7573bb3b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ build-em/
 build-debug/
 build-release/
 build-static/
+build-cublas/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index ec7f91aae..60966595e 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -10,6 +10,8 @@ static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
   {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
   {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
   {"q4_3", LLAMA_FTYPE_MOSTLY_Q4_3},
+  {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
+  {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
   {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
 };
 
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index f104ed5ac..b1bd29b10 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -37,6 +37,23 @@ typedef struct {
 } block_q4_3;
 static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
 
+#define QK5_0 32
+typedef struct {
+    __half d;               // delta
+    uint8_t qh[4];          // 5-th bit of quants
+    uint8_t qs[QK5_0 / 2];  // nibbles / quants
+} block_q5_0;
+static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
+
+#define QK5_1 32
+typedef struct {
+    __half d;               // delta
+    __half m;               // min
+    uint32_t qh;            // 5-th bit of quants
+    uint8_t qs[QK5_1 / 2];  // nibbles / quants
+} block_q5_1;
+static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+
 #define QK8_0 32
 typedef struct {
     float   d;              // delta
@@ -138,6 +155,64 @@ static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
     }
 }
 
+static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
+    const block_q5_0 * x = (const block_q5_0 *) vx;
+    // each CUDA block (blockIdx.x) expands one block_q5_0 into QK5_0 floats of y
+    const int i = blockIdx.x;
+
+    const float d = x[i].d;
+
+    const uint8_t * pp = x[i].qs;
+    // qh is stored as 4 bytes; assemble them into one 32-bit mask of high bits
+    uint32_t qh;
+    memcpy(&qh, x[i].qh, sizeof(qh));
+
+    for (int l = 0; l < QK5_0; l += 2) {
+        const uint8_t vi = pp[l/2];
+        // bit l of qh is the 5th bit of quant l; 1u keeps the mask unsigned when l + 1 == 31
+        const int8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
+        const int8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
+        // low nibble holds quant l, high nibble holds quant l + 1
+        const int8_t vi0 = ((vi & 0xf) | vh0);
+        const int8_t vi1 = ((vi >>  4) | vh1);
+        // recentre the 5-bit value from [0, 31] to [-16, 15] and scale by the block delta
+        const float v0 = (vi0 - 16)*d;
+        const float v1 = (vi1 - 16)*d;
+
+        y[i*QK5_0 + l + 0] = v0;
+        y[i*QK5_0 + l + 1] = v1;
+    }
+}
+
+static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
+    const block_q5_1 * x = (const block_q5_1 *) vx;
+    // each CUDA block (blockIdx.x) expands one block_q5_1 into QK5_1 floats of y
+    const int i = blockIdx.x;
+
+    const float d = x[i].d;
+    const float m = x[i].m;
+
+    const uint8_t * pp = x[i].qs;
+    // one bit per quant: bit l is the 5th (high) bit of quant l
+    const uint32_t qh = x[i].qh;
+
+    for (int l = 0; l < QK5_1; l += 2) {
+        const uint8_t vi = pp[l/2];
+        // 1u keeps the mask unsigned when l + 1 == 31 (a signed 1 << 31 would hit the sign bit)
+        const int8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
+        const int8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
+        // low nibble holds quant l, high nibble holds quant l + 1
+        const int8_t vi0 = (vi & 0xf) | vh0;
+        const int8_t vi1 = (vi >>  4) | vh1;
+        // scale the 5-bit value by the block delta and add the block min
+        const float v0 = vi0*d + m;
+        const float v1 = vi1*d + m;
+
+        y[i*QK5_1 + l + 0] = v0;
+        y[i*QK5_1 + l + 1] = v1;
+    }
+}
+
 static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
     const block_q8_0 * x = (const block_q8_0 *) vx;
 
@@ -174,6 +249,16 @@ void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t st
     dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
 }
 
+void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    const int nb = k / QK5_0; // number of q5_0 blocks covering k values
+    dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y); // nb CUDA blocks of 1 thread each, no dynamic shared memory, on the given stream
+}
+
+void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    const int nb = k / QK5_1; // number of q5_1 blocks covering k values
+    dequantize_block_q5_1<<<nb, 1, 0, stream>>>(vx, y); // nb CUDA blocks of 1 thread each, no dynamic shared memory, on the given stream
+}
+
 void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK8_0;
     dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
diff --git a/ggml-cuda.h b/ggml-cuda.h
index 4048ea491..ed9b44184 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -35,6 +35,8 @@ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t st
 void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 
 #ifdef  __cplusplus
diff --git a/ggml.c b/ggml.c
index 064510eda..03b4bd439 100644
--- a/ggml.c
+++ b/ggml.c
@@ -328,6 +328,19 @@ static ggml_fp16_t table_exp_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB)
 static float table_f32_f16[1 << 16];
 
+#define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
+#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
+#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
+#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
+#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
+#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
+#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
+#define B8(c,s  ) B7(c,s,     c), B7(c,s,     s)
+
+// precomputed tables for expanding 8bits to 8 bytes (shl 4)
+static const uint64_t table_b2b_u[1 << 8] = { B8(00, 10) };
+static const uint64_t table_b2b_i[1 << 8] = { B8(F0, 00) };
+
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
 // This is also true for POWER9.
@@ -673,6 +686,23 @@ typedef struct {
 } block_q4_3;
 static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
 
+#define QK5_0 32
+typedef struct {
+    ggml_fp16_t d;         // delta
+    uint8_t qh[4];         // 5-th bit of quants
+    uint8_t qs[QK5_0 / 2]; // nibbles / quants
+} block_q5_0;
+static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
+
+#define QK5_1 32
+typedef struct {
+    ggml_fp16_t d;         // delta
+    ggml_fp16_t m;         // min
+    uint8_t qh[4];         // 5-th bit of quants
+    uint8_t qs[QK5_1 / 2]; // nibbles / quants
+} block_q5_1;
+static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+
 #define QK8_0 32
 typedef struct {
     float   d;          // delta
@@ -1288,6 +1318,103 @@ static void quantize_row_q4_3(const float * restrict x, void * restrict vy, int
     quantize_row_q4_3_reference(x, y, k);
 }
 
+static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { // quantizes k floats from x into k/QK5_0 blocks in y
+    assert(k % QK5_0 == 0);
+    const int nb = k / QK5_0;
+
+    for (int i = 0; i < nb; i++) {
+        float amax = 0.0f; // absolute max
+        float max = 0.0f; // signed value of the element that has the largest magnitude
+
+        for (int l = 0; l < QK5_0; l++) {
+            const float v = x[i*QK5_0 + l];
+            if (amax < fabsf(v)) {
+                amax = fabsf(v);
+                max = v;
+            }
+        }
+
+        const float d = max / -16; // scale chosen so the largest-magnitude element lands on -16 before the +16 offset
+        const float id = d ? 1.0f/d : 0.0f; // inverse scale; 0 when d is 0 to avoid dividing by zero
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+
+        uint32_t qh = 0; // collects the 5-th bit of all quants in this block
+
+        for (int l = 0; l < QK5_0; l += 2) {
+            const float v0 = x[i*QK5_0 + l + 0]*id;
+            const float v1 = x[i*QK5_0 + l + 1]*id;
+
+            const uint32_t vi0 = MIN(31, (int) (v0 + 16.5f)); // offset by 16, round via +0.5 and truncation, clamp the top end to 31
+            const uint32_t vi1 = MIN(31, (int) (v1 + 16.5f));
+
+            y[i].qs[l/2] = (vi0 & 0x0F) | ((vi1 & 0x0F) << 4); // low 4 bits of both quants packed into one byte
+
+            // get the 5-th bit and store it in qh at the right position
+            qh |= ((vi0 & 0x10) >> 4) << (l + 0);
+            qh |= ((vi1 & 0x10) >> 4) << (l + 1);
+        }
+
+        memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); // store the 32-bit mask as raw bytes
+    }
+}
+
+static void quantize_row_q5_0(const float * restrict x, void * restrict vy, int k) { // untyped-output wrapper around the q5_0 reference quantizer
+    assert(k % QK5_0 == 0);
+
+    block_q5_0 * restrict y = vy; // vy is treated as an array of block_q5_0
+
+    quantize_row_q5_0_reference(x, y, k); // no specialized path here: forwards to the reference implementation
+}
+
+static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { // quantizes k floats from x into k/QK5_1 blocks in y
+    assert(k % QK5_1 == 0);
+    const int nb = k / QK5_1;
+
+    for (int i = 0; i < nb; i++) {
+        float min = FLT_MAX; // smallest value seen in this block
+        float max = -FLT_MAX; // largest value seen in this block
+
+        for (int l = 0; l < QK5_1; l++) {
+            const float v = x[i*QK5_1 + l];
+            if (v < min) min = v;
+            if (v > max) max = v;
+        }
+
+        const float d = (max - min) / ((1 << 5) - 1); // step size: the block's value range divided into 31 intervals
+        const float id = d ? 1.0f/d : 0.0f; // inverse step; 0 when d is 0 to avoid dividing by zero
+
+        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].m = GGML_FP32_TO_FP16(min);
+
+        uint32_t qh = 0; // collects the 5-th bit of all quants in this block
+
+        for (int l = 0; l < QK5_1; l += 2) {
+            const float v0 = (x[i*QK5_1 + l + 0] - min)*id;
+            const float v1 = (x[i*QK5_1 + l + 1] - min)*id;
+
+            const uint32_t vi0 = (int) (v0 + 0.5f); // round to the nearest level via +0.5 and truncation (no clamp is applied)
+            const uint32_t vi1 = (int) (v1 + 0.5f);
+
+            y[i].qs[l/2] = (vi0 & 0x0F) | ((vi1 & 0x0F) << 4); // low 4 bits of both quants packed into one byte
+
+            // get the 5-th bit and store it in qh at the right position
+            qh |= ((vi0 & 0x10) >> 4) << (l + 0);
+            qh |= ((vi1 & 0x10) >> 4) << (l + 1);
+        }
+
+        memcpy(&y[i].qh, &qh, sizeof(y[i].qh)); // store the 32-bit mask as raw bytes
+    }
+}
+
+static void quantize_row_q5_1(const float * restrict x, void * restrict vy, int k) { // untyped-output wrapper around the q5_1 reference quantizer
+    assert(k % QK5_1 == 0);
+
+    block_q5_1 * restrict y = vy; // vy is treated as an array of block_q5_1
+
+    quantize_row_q5_1_reference(x, y, k); // no specialized path here: forwards to the reference implementation
+}
+
 // reference implementation for deterministic creation of model files
 static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) {
     assert(k % QK8_0 == 0);
@@ -1571,7 +1698,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
             const uint8x8_t v8 = vld1_u8(pp + l/2);
 
             // Expand 4-bit qs to 8-bit bytes
-            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f));
+            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0F));
             const uint8x8_t v1 = vshr_n_u8(v8, 4);
 
             // Convert to signed 8-bit integers
@@ -1621,7 +1748,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
         for (int l = 0; l < QK4_0; l += 2) {
             const uint8_t vi = pp[l/2];
 
-            const int8_t vi0 = vi & 0xf;
+            const int8_t vi0 = vi & 0x0F;
             const int8_t vi1 = vi >> 4;
 
             const float v0 = (vi0 - 8)*d;
@@ -1687,7 +1814,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
             const uint8x8_t v8 = vld1_u8(pp + l/2);
 
             // Expand 4-bit qs to 8-bit bytes
-            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f));
+            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0F));
             const uint8x8_t v1 = vshr_n_u8(v8, 4);
 
             // Interleave and combine
@@ -1729,7 +1856,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
         for (int l = 0; l < QK4_1; l += 2) {
             const uint8_t vi = pp[l/2];
 
-            const int8_t vi0 = vi & 0xf;
+            const int8_t vi0 = vi & 0x0F;
             const int8_t vi1 = vi >> 4;
 
             const float v0 = vi0*d + m;
@@ -1759,7 +1886,7 @@ static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, in
         for (int l = 0; l < QK4_2; l += 2) {
             const uint8_t vi = pp[l/2];
 
-            const int8_t vi0 = vi & 0xf;
+            const int8_t vi0 = vi & 0x0F;
             const int8_t vi1 = vi >> 4;
 
             const float v0 = (vi0 - 8)*d;
@@ -1789,7 +1916,7 @@ static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, in
         for (int l = 0; l < QK4_3; l += 2) {
             const uint8_t vi = pp[l/2];
 
-            const int8_t vi0 = vi & 0xf;
+            const int8_t vi0 = vi & 0x0F;
             const int8_t vi1 = vi >> 4;
 
             const float v0 = vi0*d + m;
@@ -1804,6 +1931,79 @@ static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, in
     }
 }
 
+static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, int k) { // expands k/QK5_0 q5_0 blocks from vx into k floats in y
+    assert(k % QK5_0 == 0);
+    const int nb = k / QK5_0;
+
+    const block_q5_0 * restrict x = vx;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        const uint8_t * restrict pp = x[i].qs;
+
+        uint32_t qh; // 32-bit mask holding the 5-th bit of every quant in the block
+        memcpy(&qh, x[i].qh, sizeof(qh));
+
+        for (int l = 0; l < QK5_0; l += 2) {
+            const uint8_t vi = pp[l/2];
+
+            // extract the 5-th bit from qh (unsigned 1u: l + 1 reaches 31 and shifting a signed 1 that far is undefined)
+            const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
+            const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
+
+            const int8_t vi0 = (vi & 0x0F) | vh0; // 5-bit quant in [0, 31]
+            const int8_t vi1 = (vi >>   4) | vh1;
+
+            const float v0 = (vi0 - 16)*d; // undo the +16 offset applied by the quantizer, then scale
+            const float v1 = (vi1 - 16)*d;
+
+            y[i*QK5_0 + l + 0] = v0;
+            y[i*QK5_0 + l + 1] = v1;
+
+            assert(!isnan(y[i*QK5_0 + l + 0]));
+            assert(!isnan(y[i*QK5_0 + l + 1]));
+        }
+    }
+}
+
+static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, int k) { // expands k/QK5_1 q5_1 blocks from vx into k floats in y
+    assert(k % QK5_1 == 0);
+    const int nb = k / QK5_1;
+
+    const block_q5_1 * restrict x = vx;
+
+    for (int i = 0; i < nb; i++) {
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const float m = GGML_FP16_TO_FP32(x[i].m);
+
+        const uint8_t * restrict pp = x[i].qs;
+
+        uint32_t qh; // 32-bit mask holding the 5-th bit of every quant in the block
+        memcpy(&qh, x[i].qh, sizeof(qh));
+
+        for (int l = 0; l < QK5_1; l += 2) {
+            const uint8_t vi = pp[l/2];
+
+            // extract the 5-th bit from qh (unsigned 1u: l + 1 reaches 31 and shifting a signed 1 that far is undefined)
+            const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
+            const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
+
+            const uint8_t vi0 = (vi & 0x0F) | vh0; // 5-bit quant in [0, 31]
+            const uint8_t vi1 = (vi >>   4) | vh1;
+
+            const float v0 = vi0*d + m; // level * step + block minimum
+            const float v1 = vi1*d + m;
+
+            y[i*QK5_1 + l + 0] = v0;
+            y[i*QK5_1 + l + 1] = v1;
+
+            assert(!isnan(y[i*QK5_1 + l + 0]));
+            assert(!isnan(y[i*QK5_1 + l + 1]));
+        }
+    }
+}
+
 static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, int k) {
     assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;
@@ -1825,6 +2025,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
 static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 
 static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
@@ -1860,6 +2062,22 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .vec_dot_q                = ggml_vec_dot_q4_3_q8_1,
         .vec_dot_type             = GGML_TYPE_Q8_1,
     },
+    [GGML_TYPE_Q5_0] = {
+        .dequantize_row_q         = dequantize_row_q5_0,
+        .quantize_row_q           = quantize_row_q5_0,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_0_reference,
+        .quantize_row_q_dot       = quantize_row_q8_0,
+        .vec_dot_q                = ggml_vec_dot_q5_0_q8_0,
+        .vec_dot_type             = GGML_TYPE_Q8_0,
+    },
+    [GGML_TYPE_Q5_1] = {
+        .dequantize_row_q         = dequantize_row_q5_1,
+        .quantize_row_q           = quantize_row_q5_1,
+        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference,
+        .quantize_row_q_dot       = quantize_row_q8_1,
+        .vec_dot_q                = ggml_vec_dot_q5_1_q8_1,
+        .vec_dot_type             = GGML_TYPE_Q8_1,
+    },
     [GGML_TYPE_Q8_0] = {
         .dequantize_row_q         = dequantize_row_q8_0,
         .quantize_row_q           = quantize_row_q8_0,
@@ -2496,7 +2714,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const block_q8_0 * restrict y0 = &y[i + 0];
         const block_q8_0 * restrict y1 = &y[i + 1];
 
-        const uint8x16_t m4b   = vdupq_n_u8(0xf);
+        const uint8x16_t m4b   = vdupq_n_u8(0x0F);
         const int8x16_t  s8b   = vdupq_n_s8(0x8);
 
         const uint8x16_t v0_0 = vld1q_u8(x0->qs);
@@ -2632,8 +2850,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         for (int j = 0; j < QK8_0/2; j++) {
             const uint8_t v0 = p0[j];
 
-            const int i0 = (int8_t) (v0 & 0xf) - 8;
-            const int i1 = (int8_t) (v0 >> 4)  - 8;
+            const int i0 = (int8_t) (v0 & 0x0F) - 8;
+            const int i1 = (int8_t) (v0 >>   4) - 8;
 
             const int i2 = p1[2*j + 0];
             const int i3 = p1[2*j + 1];
@@ -2670,7 +2888,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
 
         summs += x0->m * (y0->s0 + y0->s1) + x1->m * (y1->s0 + y1->s1);
 
-        const uint8x16_t m4b = vdupq_n_u8(0xf);
+        const uint8x16_t m4b = vdupq_n_u8(0x0F);
 
         const uint8x16_t v0_0 = vld1q_u8(x0->qs);
         const uint8x16_t v0_1 = vld1q_u8(x1->qs);
@@ -2767,8 +2985,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
         for (int j = 0; j < QK8_1/2; j++) {
             const uint8_t v0 = p0[j];
 
-            const float f0 = d0*(v0 & 0xf) + m0;
-            const float f1 = d0*(v0 >> 4)  + m0;
+            const float f0 = d0*(v0 & 0x0F) + m0;
+            const float f1 = d0*(v0 >>   4) + m0;
 
             const float f2 = d1*p1[2*j + 0];
             const float f3 = d1*p1[2*j + 1];
@@ -2803,7 +3021,7 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
         const block_q8_0 * restrict y0 = &y[i + 0];
         const block_q8_0 * restrict y1 = &y[i + 1];
 
-        const uint8x16_t m4b   = vdupq_n_u8(0xf);
+        const uint8x16_t m4b   = vdupq_n_u8(0x0F);
         const int8x16_t  s8b   = vdupq_n_s8(0x8);
 
         const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs));
@@ -2914,11 +3132,11 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
             const uint8_t v0 = x0[j];
             const uint8_t v1 = x1[j];
 
-            const int i0_0 = (int8_t) (v0 & 0xf) - 8;
-            const int i1_0 = (int8_t) (v0 >> 4)  - 8;
+            const int i0_0 = (int8_t) (v0 & 0x0F) - 8;
+            const int i1_0 = (int8_t) (v0 >>   4) - 8;
 
-            const int i0_1 = (int8_t) (v1 & 0xf) - 8;
-            const int i1_1 = (int8_t) (v1 >> 4)  - 8;
+            const int i0_1 = (int8_t) (v1 & 0x0F) - 8;
+            const int i1_1 = (int8_t) (v1 >>   4) - 8;
 
             const int i2_0 = y0[2*j + 0];
             const int i3_0 = y0[2*j + 1];
@@ -2966,7 +3184,7 @@ static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void *
         const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs));
 
         // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, vdupq_n_u8(0xf)));
+        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, vdupq_n_u8(0x0F)));
         const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
 
         // interleave
@@ -3045,10 +3263,10 @@ static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void *
             const uint8_t v0 = x0[j];
             const uint8_t v1 = x1[j];
 
-            const int x0_0 = v0 & 0xf;
+            const int x0_0 = v0 & 0x0F;
             const int x1_0 = v0 >> 4;
 
-            const int x0_1 = v1 & 0xf;
+            const int x0_1 = v1 & 0x0F;
             const int x1_1 = v1 >> 4;
 
             const int y0_0 = y0[2*j + 0];
@@ -3067,6 +3285,273 @@ static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void *
 #endif
 }
 
+static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { // *s = dot product of n values: q5_0 row vx with q8_0 row vy
+    const int nb = n / QK8_0;
+
+    assert(n % QK8_0 == 0);
+    assert(nb % 2 == 0); // NOTE(review): every loop below steps one block at a time; confirm the even-count requirement is still needed
+    assert(QK8_0 == QK5_0);
+
+    const block_q5_0 * restrict x = vx;
+    const block_q8_0 * restrict y = vy;
+
+#if defined(__ARM_NEON)
+    float32x4_t sumv = vdupq_n_f32(0.0f);
+
+    uint64_t tmp[4]; // scratch: the 32 high bits expanded to 32 bytes
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q5_0 * restrict x0 = &x[i];
+        const block_q8_0 * restrict y0 = &y[i];
+
+        const uint8x16_t m4b  = vdupq_n_u8(0x0F);
+        const int8x16_t  s16b = vdupq_n_s8(0x10);
+
+        // extract the 5th bit: each byte of qh is expanded to 8 bytes holding 0x00 or 0x10
+        uint32_t qh;
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_u[(qh >> 24)       ];
+
+        const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0));
+        const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2));
+
+        const uint8x16_t v0 = vld1q_u8(x0->qs);
+
+        // 4-bit -> 8-bit
+        const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8  (v0, m4b));
+        const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4));
+
+        // interleave
+        const int8x16_t v0lz = vzip1q_s8(v0l, v0h);
+        const int8x16_t v0hz = vzip2q_s8(v0l, v0h);
+
+        // add high bit and sub 16
+        const int8x16_t v0lf = vsubq_s8(vorrq_s8(v0lz, qhl), s16b);
+        const int8x16_t v0hf = vsubq_s8(vorrq_s8(v0hz, qhh), s16b);
+
+        // load y
+        const int8x16_t v1l = vld1q_s8(y0->qs);
+        const int8x16_t v1h = vld1q_s8(y0->qs + 16);
+
+        const float x0d = GGML_FP16_TO_FP32(x0->d);
+
+#if defined(__ARM_FEATURE_DOTPROD)
+        sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(
+                        vdotq_s32(vdupq_n_s32(0), v0lf, v1l),
+                        vdotq_s32(vdupq_n_s32(0), v0hf, v1h))), x0d*y0->d);
+#else
+        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0lf), vget_low_s8 (v1l));
+        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0lf), vget_high_s8(v1l));
+        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0hf), vget_low_s8 (v1h));
+        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0hf), vget_high_s8(v1h));
+
+        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
+        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
+
+        sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
+#endif
+    }
+
+    *s = vaddvq_f32(sumv);
+#elif defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (int i = 0; i < nb; i++) {
+        /* Compute combined scale for the block */
+        const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
+
+        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i bxhi = _mm256_set_epi64x(
+            table_b2b_i[x[i].qh[3]], table_b2b_i[x[i].qh[2]],
+            table_b2b_i[x[i].qh[1]], table_b2b_i[x[i].qh[0]]);
+        bx = _mm256_or_si256(bx, bxhi); // ORs in 0xF0 where the 5th bit is clear and 0x00 where it is set, i.e. the int8 encoding of (quant - 16)
+
+        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+        /* Multiply q with scale and accumulate */
+        acc = _mm256_fmadd_ps(d, q, acc);
+    }
+
+    *s = hsum_float_8(acc);
+#else
+    // scalar
+    float sumf = 0.0;
+    for (int i = 0; i < nb; i++) {
+        const uint8_t * restrict x0 = x[i].qs;
+        const  int8_t * restrict y0 = y[i].qs;
+
+        uint32_t qh;
+        memcpy(&qh, x[i].qh, sizeof(qh));
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+
+        int sxy = 0; // integer dot product of this block, scaled once at the end
+
+        for (int j = 0; j < QK8_0/2; j++) {
+            const uint8_t v0 = x0[j];
+
+            const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4; // 1u: 2*j + 1 reaches 31 and shifting a signed 1 that far is undefined
+            const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
+
+            const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16;
+            const int x1_0 = ((v0 >>   4) | x1_0h) - 16;
+
+            const int y0_0 = y0[2*j + 0];
+            const int y1_0 = y0[2*j + 1];
+
+            sxy += x0_0*y0_0 + x1_0*y1_0;
+        }
+
+        sumf += (d*sxy)*y[i].d;
+    }
+    *s = sumf;
+#endif
+}
+
+static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { // *s = dot product of n values: q5_1 row vx with q8_1 row vy
+    const int nb = n / QK8_1;
+
+    assert(n % QK8_1 == 0);
+    assert(nb % 2 == 0); // NOTE(review): every loop below steps one block at a time; confirm the even-count requirement is still needed
+    assert(QK8_1 == QK5_1);
+
+    const block_q5_1 * restrict x = vx;
+    const block_q8_1 * restrict y = vy;
+
+#if defined(__ARM_NEON)
+    float32x4_t sumv = vdupq_n_f32(0.0f);
+
+    float summs = 0.0f; // accumulates the per-block min term m * (s0 + s1)
+
+    uint64_t tmp[4]; // scratch: the 32 high bits expanded to 32 bytes
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q5_1 * restrict x0 = &x[i];
+        const block_q8_1 * restrict y0 = &y[i];
+
+        summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
+
+        // extract the 5th bit: each byte of qh is expanded to 8 bytes holding 0x00 or 0x10
+        uint32_t qh;
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_u[(qh >> 24)       ];
+
+        const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0));
+        const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2));
+
+        const uint8x16_t v0 = vld1q_u8(x0->qs);
+
+        // 4-bit -> 8-bit
+        const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8  (v0, vdupq_n_u8(0x0F)));
+        const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4));
+
+        // interleave
+        const int8x16_t v0lz = vzip1q_s8(v0l, v0h);
+        const int8x16_t v0hz = vzip2q_s8(v0l, v0h);
+
+        // add the high bit (no offset is subtracted: q5_1 quants are unsigned levels)
+        const int8x16_t v0lf = vorrq_s8(v0lz, qhl);
+        const int8x16_t v0hf = vorrq_s8(v0hz, qhh);
+
+        // load y
+        const int8x16_t v1l = vld1q_s8(y0->qs);
+        const int8x16_t v1h = vld1q_s8(y0->qs + 16);
+
+        const float x0d = GGML_FP16_TO_FP32(x0->d);
+
+#if defined(__ARM_FEATURE_DOTPROD)
+        sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(
+                        vdotq_s32(vdupq_n_s32(0), v0lf, v1l),
+                        vdotq_s32(vdupq_n_s32(0), v0hf, v1h))), x0d*y0->d);
+#else
+        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0lf), vget_low_s8 (v1l));
+        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0lf), vget_high_s8(v1l));
+        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0hf), vget_low_s8 (v1h));
+        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0hf), vget_high_s8(v1h));
+
+        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
+        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
+
+        sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
+#endif
+    }
+
+    *s = vaddvq_f32(sumv) + summs;
+#elif defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+    float summs = 0.0f; // accumulates the per-block min term m * (s0 + s1)
+
+    // Main loop
+    for (int i = 0; i < nb; i++) {
+        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
+
+        summs += GGML_FP16_TO_FP32(x[i].m) * (y[i].s0 + y[i].s1);
+
+        __m256i bx = bytes_from_nibbles_32(x[i].qs);
+        const __m256i bxhi = _mm256_set_epi64x(
+            table_b2b_u[x[i].qh[3]], table_b2b_u[x[i].qh[2]],
+            table_b2b_u[x[i].qh[1]], table_b2b_u[x[i].qh[0]]);
+        bx = _mm256_or_si256(bx, bxhi); // ORs in 0x10 where the 5th bit is set, giving the unsigned 5-bit level
+
+        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
+        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
+    }
+
+    *s = hsum_float_8(acc) + summs;
+#else
+    float sumf = 0.0;
+
+    for (int i = 0; i < nb; i++) {
+        const uint8_t * restrict x0 = x[i].qs;
+        const  int8_t * restrict y0 = y[i].qs;
+
+        uint32_t qh;
+        memcpy(&qh, x[i].qh, sizeof(qh));
+
+        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const float m = GGML_FP16_TO_FP32(x[i].m);
+
+        int sxy = 0; // integer dot product of this block, scaled once at the end
+
+        for (int j = 0; j < QK8_1/2; j++) {
+            const uint8_t v0 = x0[j];
+
+            const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4; // 1u: 2*j + 1 reaches 31 and shifting a signed 1 that far is undefined
+            const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
+
+            const int x0_0 = (v0 & 0x0F) | x0_0h;
+            const int x1_0 = (v0 >>   4) | x1_0h;
+
+            const int y0_0 = y0[2*j + 0];
+            const int y1_0 = y0[2*j + 1];
+
+            sxy += x0_0*y0_0 + x1_0*y1_0;
+        }
+
+        sumf += (d*sxy)*y[i].d + m*(y[i].s0 + y[i].s1);
+    }
+
+    *s = sumf;
+#endif
+}
+
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     const int nb = n / QK8_0;
 
@@ -3409,13 +3894,15 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_1] = QK4_1,
     [GGML_TYPE_Q4_2] = QK4_2,
     [GGML_TYPE_Q4_3] = QK4_3,
+    [GGML_TYPE_Q5_0] = QK5_0,
+    [GGML_TYPE_Q5_1] = QK5_1,
     [GGML_TYPE_Q8_0] = QK8_0,
     [GGML_TYPE_Q8_1] = QK8_1,
     [GGML_TYPE_I8]   = 1,
     [GGML_TYPE_I16]  = 1,
     [GGML_TYPE_I32]  = 1,
 };
-static_assert(GGML_TYPE_COUNT == 11, "GGML_BLCK_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
 
 static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = sizeof(float),
@@ -3424,13 +3911,15 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
     [GGML_TYPE_Q4_2] = sizeof(block_q4_2),
     [GGML_TYPE_Q4_3] = sizeof(block_q4_3),
+    [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
+    [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
     [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
     [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
     [GGML_TYPE_I8]   = sizeof(int8_t),
     [GGML_TYPE_I16]  = sizeof(int16_t),
     [GGML_TYPE_I32]  = sizeof(int32_t),
 };
-static_assert(GGML_TYPE_COUNT == 11, "GGML_TYPE_SIZE is outdated");
+static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
 
 
 static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
@@ -3440,13 +3929,15 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_1] = "q4_1",
     [GGML_TYPE_Q4_2] = "q4_2",
     [GGML_TYPE_Q4_3] = "q4_3",
+    [GGML_TYPE_Q5_0] = "q5_0",
+    [GGML_TYPE_Q5_1] = "q5_1",
     [GGML_TYPE_Q8_0] = "q8_0",
     [GGML_TYPE_Q8_1] = "q8_1",
     [GGML_TYPE_I8]   = "i8",
     [GGML_TYPE_I16]  = "i16",
     [GGML_TYPE_I32]  = "i32",
 };
-static_assert(GGML_TYPE_COUNT == 11, "GGML_TYPE_NAME is outdated");
+static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
 
 static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F32]  = false,
@@ -3455,13 +3946,15 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_1] = true,
     [GGML_TYPE_Q4_2] = true,
     [GGML_TYPE_Q4_3] = true,
+    [GGML_TYPE_Q5_0] = true,
+    [GGML_TYPE_Q5_1] = true,
     [GGML_TYPE_Q8_0] = true,
     [GGML_TYPE_Q8_1] = true,
     [GGML_TYPE_I8]   = false,
     [GGML_TYPE_I16]  = false,
     [GGML_TYPE_I32]  = false,
 };
-static_assert(GGML_TYPE_COUNT == 11, "GGML_IS_QUANTIZED is outdated");
+static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
 
 static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "NONE",
@@ -6673,6 +7166,8 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
         case GGML_TYPE_Q4_3:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
             {
                 ggml_compute_forward_add_q_f32(params, src0, src1, dst);
@@ -8161,6 +8656,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
         else if (type == GGML_TYPE_Q4_3) {
             dequantize_row_q_cuda = dequantize_row_q4_3_cuda;
         }
+        else if (type == GGML_TYPE_Q5_0) {
+            dequantize_row_q_cuda = dequantize_row_q5_0_cuda;
+        }
+        else if (type == GGML_TYPE_Q5_1) {
+            dequantize_row_q_cuda = dequantize_row_q5_1_cuda;
+        }
         else if (type == GGML_TYPE_Q8_0) {
             dequantize_row_q_cuda = dequantize_row_q8_0_cuda;
         }
@@ -8319,6 +8820,8 @@ static void ggml_compute_forward_mul_mat(
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
         case GGML_TYPE_Q4_3:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
             {
@@ -8549,6 +9052,8 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
         case GGML_TYPE_Q4_3:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
         case GGML_TYPE_Q8_1:
             {
@@ -12261,7 +12766,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
 
         for (int i = 0; i < nb; i++) {
             for (int l = 0; l < QK4_0; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0xF;
+                const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
                 const uint8_t vi1 = y[i].qs[l/2] >> 4;
 
                 hist[vi0]++;
@@ -12284,7 +12789,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
 
         for (int i = 0; i < nb; i++) {
             for (int l = 0; l < QK4_1; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0xF;
+                const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
                 const uint8_t vi1 = y[i].qs[l/2] >> 4;
 
                 hist[vi0]++;
@@ -12307,7 +12812,7 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
 
         for (int i = 0; i < nb; i++) {
             for (int l = 0; l < QK4_2; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0xF;
+                const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
                 const uint8_t vi1 = y[i].qs[l/2] >> 4;
 
                 hist[vi0]++;
@@ -12330,7 +12835,7 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t *
 
         for (int i = 0; i < nb; i++) {
             for (int l = 0; l < QK4_3; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0xF;
+                const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
                 const uint8_t vi1 = y[i].qs[l/2] >> 4;
 
                 hist[vi0]++;
@@ -12342,6 +12847,66 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_3*sizeof(block_q4_3));
 }
 
+size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) { // quantizes n floats in rows of k to q5_0; returns the byte size of n/QK5_0 blocks
+    assert(k % QK5_0 == 0);
+    const int nb = k / QK5_0;
+
+    for (int j = 0; j < n; j += k) {
+        block_q5_0 * restrict y = (block_q5_0 *)dst + j/QK5_0;
+
+        quantize_row_q5_0_reference(src + j, y, k);
+
+        for (int i = 0; i < nb; i++) {
+            uint32_t qh; // 32-bit mask holding the 5-th bit of every quant in the block
+            memcpy(&qh, &y[i].qh, sizeof(qh));
+
+            for (int l = 0; l < QK5_0; l += 2) {
+                const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; // 1u: l + 1 reaches 31 and shifting a signed 1 that far is undefined
+                const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
+
+                // cast to 16 bins (5-bit quant halved, so hist is indexed 0..15)
+                const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
+                const uint8_t vi1 = ((y[i].qs[l/2] >>   4) | vh1) / 2;
+
+                hist[vi0]++;
+                hist[vi1]++;
+            }
+        }
+    }
+
+    return (n/QK5_0*sizeof(block_q5_0));
+}
+
+size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) { // quantizes n floats in rows of k to q5_1; returns the byte size of n/QK5_1 blocks
+    assert(k % QK5_1 == 0);
+    const int nb = k / QK5_1;
+
+    for (int j = 0; j < n; j += k) {
+        block_q5_1 * restrict y = (block_q5_1 *)dst + j/QK5_1;
+
+        quantize_row_q5_1_reference(src + j, y, k);
+
+        for (int i = 0; i < nb; i++) {
+            uint32_t qh; // 32-bit mask holding the 5-th bit of every quant in the block
+            memcpy(&qh, &y[i].qh, sizeof(qh));
+
+            for (int l = 0; l < QK5_1; l += 2) {
+                const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; // 1u: l + 1 reaches 31 and shifting a signed 1 that far is undefined
+                const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
+
+                // cast to 16 bins (5-bit quant halved, so hist is indexed 0..15)
+                const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
+                const uint8_t vi1 = ((y[i].qs[l/2] >>   4) | vh1) / 2;
+
+                hist[vi0]++;
+                hist[vi1]++;
+            }
+        }
+    }
+
+    return (n/QK5_1*sizeof(block_q5_1));
+}
+
 size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;
@@ -12390,6 +12955,18 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 block_q4_3 * block = (block_q4_3*)dst + start / QK4_3;
                 result = ggml_quantize_q4_3(src + start, block, n, n, hist);
             } break;
+        case GGML_TYPE_Q5_0:
+            {
+                GGML_ASSERT(start % QK5_0 == 0);
+                block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
+                result = ggml_quantize_q5_0(src + start, block, n, n, hist);
+            } break;
+        case GGML_TYPE_Q5_1:
+            {
+                GGML_ASSERT(start % QK5_1 == 0);
+                block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
+                result = ggml_quantize_q5_1(src + start, block, n, n, hist);
+            } break;
         case GGML_TYPE_Q8_0:
             {
                 GGML_ASSERT(start % QK8_0 == 0);
diff --git a/ggml.h b/ggml.h
index 8300a0c62..d9d3d214e 100644
--- a/ggml.h
+++ b/ggml.h
@@ -222,8 +222,10 @@ extern "C" {
         GGML_TYPE_Q4_1 = 3,
         GGML_TYPE_Q4_2 = 4,
         GGML_TYPE_Q4_3 = 5,
-        GGML_TYPE_Q8_0 = 6,
-        GGML_TYPE_Q8_1 = 7,
+        GGML_TYPE_Q5_0 = 6,
+        GGML_TYPE_Q5_1 = 7,
+        GGML_TYPE_Q8_0 = 8,
+        GGML_TYPE_Q8_1 = 9,
         GGML_TYPE_I8,
         GGML_TYPE_I16,
         GGML_TYPE_I32,
@@ -833,6 +835,8 @@ extern "C" {
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
 
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
diff --git a/llama.cpp b/llama.cpp
index 8334553a5..28a74b514 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -484,6 +484,8 @@ struct llama_file_loader {
                 case GGML_TYPE_Q4_1:
                 case GGML_TYPE_Q4_2:
                 case GGML_TYPE_Q4_3:
+                case GGML_TYPE_Q5_0:
+                case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q8_0:
                     break;
                 default: {
@@ -559,6 +561,8 @@ struct llama_file_saver {
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q4_3:
+            case GGML_TYPE_Q5_0:
+            case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
                 break;
             default: LLAMA_ASSERT(false);
@@ -850,6 +854,8 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
                                       return "mostly Q4_1, some F16";
         case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
         case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
+        case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
         default:                      return "unknown, may not work";
     }
@@ -1588,6 +1594,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
         default: throw format("invalid output file type %d\n", ftype);
     };
diff --git a/llama.h b/llama.h
index 24c48cce6..17dac0689 100644
--- a/llama.h
+++ b/llama.h
@@ -75,6 +75,8 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_3 = 6,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();

From f9be42add0bf3ce61814b7ede0e6d0dda9ff22c6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 26 Apr 2023 23:24:42 +0300
Subject: [PATCH 38/74] readme : add quantization info

---
 README.md | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index ddbd4c8b1..be0e49e47 100644
--- a/README.md
+++ b/README.md
@@ -7,31 +7,27 @@
 
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
-**Warnings**
-
-- `Q4_2` and `Q4_3` are still in development. Do not expect any kind of backward compatibility until they are finalized
-
 **Hot topics:**
 
+- [New quantization methods](https://github.com/ggerganov/llama.cpp#quantization)
 - [Added LoRA support](https://github.com/ggerganov/llama.cpp/pull/820)
 - [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915)
 - [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)
 
 ## Description
 
-The main goal of llama.cpp is to run the llama model using 4-bit quantization on a MacBook.
+The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quantization on a MacBook.
 
 - Plain C/C++ implementation without dependencies
 - Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
 - AVX2 support for x86 architectures
 - Mixed F16 / F32 precision
-- 4-bit quantization support
+- 4-bit integer quantization support
 - Runs on the CPU
 
-This was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022) - I have no idea if it works correctly.
-Please do not make conclusions about the models based on the results from this implementation.
-For all I know, it can be completely wrong. This project is for educational purposes.
-New features will probably be added mostly through community contributions.
+The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
+Since then, the project has improved significantly thanks to many contributions. This project is for educational purposes and serves
+as the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
 
 **Supported platforms:**
 
@@ -294,6 +290,24 @@ As the models are currently fully loaded into memory, you will need adequate dis
 | 30B   | 60 GB         | 19.5 GB                |
 | 65B   | 120 GB        | 38.5 GB                |
 
+### Quantization
+
+Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
+
+Model | F16 | Q4_0 | Q4_1 | Q4_2 | Q4_3 | Q5_0 | Q5_1 | Q8_0
+-- | -- | -- | -- | -- | -- | -- | -- | --
+7B (ppl) | 5.9565 | 6.2103 | 6.1286 | 6.1698 | 6.0617 | 6.0139 | 5.9934 | 5.9571
+7B (size) | 13.0G | 4.0G | 4.8G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G
+7B (ms/tok @ 4th) | 128 | 56 | 61 | 84 | 91 | 91 | 95 | 75
+7B (ms/tok @ 8th) | 128 | 47 | 55 | 48 | 53 | 53 | 59 | 75
+7B (bpw) | 16.0 | 5.0 | 6.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0
+-- | -- | -- | -- | -- | -- | -- | -- | --
+13B (ppl) | 5.2455 | 5.3748 | 5.3471 | 5.3433 | 5.3234 | 5.2768 | 5.2582 | 5.2458
+13B (size) | 25.0G | 7.6G | 9.1G | 7.6G | 9.1G | 8.4G | 9.1G | 14G
+13B (ms/tok @ 4th) | 239 | 104 | 113 | 160 | 175 | 176 | 185 | 141
+13B (ms/tok @ 8th) | 240 | 85 | 99 | 97 | 114 | 108 | 117 | 147
+13B (bpw) | 16.0 | 5.0 | 6.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0
+
 ### Interactive mode
 
 If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.

From 0b2da20538d01926b77ea237dd1c930c4d20b686 Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Wed, 26 Apr 2023 20:26:42 +0000
Subject: [PATCH 39/74] ggml : slightly faster AVX2 implementation for Q5
 (#1197)

---
 ggml.c | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/ggml.c b/ggml.c
index 03b4bd439..3422a9448 100644
--- a/ggml.c
+++ b/ggml.c
@@ -328,6 +328,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB)
 static float table_f32_f16[1 << 16];
 
+#if defined(__ARM_NEON)
 #define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
 #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
 #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
@@ -339,7 +340,7 @@ static float table_f32_f16[1 << 16];
 
 // precomputed tables for expanding 8bits to 8 bytes (shl 4)
 static const uint64_t table_b2b_u[1 << 8] = { B8(00, 10) };
-static const uint64_t table_b2b_i[1 << 8] = { B8(F0, 00) };
+#endif
 
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
@@ -490,6 +491,19 @@ static inline int hsum_i32_4(const __m128i a) {
 }
 
 #if __AVX2__ || __AVX512F__
+// spread 32 bits to 32 bytes { 0x00, 0xFF }: output byte 8*k+j is 0xFF iff bit j of x[k] is set
+static inline __m256i bytes_from_bits_32(const uint8_t * x) {
+    uint32_t x32;
+    memcpy(&x32, x, sizeof(uint32_t)); // read the 4 input bytes without requiring x to be aligned
+    const __m256i shuf_mask = _mm256_set_epi64x(
+        0x0303030303030303, 0x0202020202020202,
+        0x0101010101010101, 0x0000000000000000); // output bytes 8*k..8*k+7 all select source byte k
+    __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
+    const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe); // byte j of each group: every bit set except bit j
+    bytes = _mm256_or_si256(bytes, bit_mask); // a byte becomes 0xFF only if its one missing bit was set in the source
+    return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1)); // 0xFF stays 0xFF, anything else becomes 0x00
+}
+
 // Unpack 32 4-bit fields into 32 bytes
 // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
 static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
@@ -3367,9 +3381,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
 
         __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i bxhi = _mm256_set_epi64x(
-            table_b2b_i[x[i].qh[3]], table_b2b_i[x[i].qh[2]],
-            table_b2b_i[x[i].qh[1]], table_b2b_i[x[i].qh[0]]);
+        __m256i bxhi = bytes_from_bits_32(x[i].qh);
+        bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
         bx = _mm256_or_si256(bx, bxhi);
 
         __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
@@ -3501,9 +3514,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         summs += GGML_FP16_TO_FP32(x[i].m) * (y[i].s0 + y[i].s1);
 
         __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i bxhi = _mm256_set_epi64x(
-            table_b2b_u[x[i].qh[3]], table_b2b_u[x[i].qh[2]],
-            table_b2b_u[x[i].qh[1]], table_b2b_u[x[i].qh[0]]);
+        __m256i bxhi = bytes_from_bits_32(x[i].qh);
+        bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
         bx = _mm256_or_si256(bx, bxhi);
 
         const __m256 dy = _mm256_broadcast_ss(&y[i].d);

From 04aaae1d79482cad2564412f3b32e70298ac7789 Mon Sep 17 00:00:00 2001
From: Yann Follet <131855179+YannFollet@users.noreply.github.com>
Date: Fri, 28 Apr 2023 19:59:48 +0800
Subject: [PATCH 40/74] add avx2 for dot_q8_0_q8_0, 2x faster than scalar
 (#1211)

---
 ggml.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/ggml.c b/ggml.c
index 3422a9448..1fbf2955d 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3626,6 +3626,24 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
     }
 
     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
+#elif defined(__AVX2__)
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+
+    // Main loop
+    for (int i = 0; i < nb; ++i) {
+        // Compute combined scale for the block
+        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
+        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
+        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
+
+        const __m256 q = mul_sum_i8_pairs_float(bx, by);
+
+        // Multiply q with scale and accumulate
+        acc = _mm256_fmadd_ps( d, q, acc );
+    }
+
+    *s = hsum_float_8(acc);
 #else
     // scalar
     float sumf = 0.0;

From 92a6e13a31ba052abd9062af6cb8df2a293ce661 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= <johannesg@5d6.de>
Date: Fri, 28 Apr 2023 15:40:32 +0200
Subject: [PATCH 41/74] Add Manjaro CUDA include and lib dirs to Makefile
 (#1212)

---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 8fbb19c46..f7c8dbfdc 100644
--- a/Makefile
+++ b/Makefile
@@ -105,8 +105,8 @@ ifdef LLAMA_OPENBLAS
 	LDFLAGS += -lopenblas
 endif
 ifdef LLAMA_CUBLAS
-	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
-	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
+	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
 	NVCC      = nvcc
 	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native

From 78ec543733d10a1629f984fd0302fdaa4e87fe66 Mon Sep 17 00:00:00 2001
From: Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
Date: Fri, 28 Apr 2023 19:22:48 +0500
Subject: [PATCH 42/74] Correcting link to w64devkit (#1214)

Correcting link to w64devkit (change seeto to skeeto).
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index be0e49e47..2a20746c6 100644
--- a/README.md
+++ b/README.md
@@ -174,7 +174,7 @@ In order to build llama.cpp you have three different options.
 
   - On Windows:
 
-    1. Download the latest fortran version of [w64devkit](https://github.com/seeto/w64devkit/releases).
+    1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
     2. Extract `w64devkit` on your pc.
     3. Run `w64devkit.exe`.
     4. Use the `cd` command to reach the `llama.cpp` folder.

From 7296c961d9303010a2b98379f738da2a8a55aa1b Mon Sep 17 00:00:00 2001
From: 0cc4m <picard12@live.de>
Date: Fri, 28 Apr 2023 16:57:16 +0200
Subject: [PATCH 43/74] ggml : add CLBlast support (#1164)

* Allow use of OpenCL GPU-based BLAS using ClBlast instead of OpenBLAS for context processing

* Improve ClBlast implementation, avoid recreating buffers, remove redundant transfers

* Finish merge of ClBlast support

* Move CLBlast implementation to separate file

Add buffer reuse code (adapted from slaren's cuda implementation)

* Add q4_2 and q4_3 CLBlast support, improve code

* Double CLBlast speed by disabling OpenBLAS thread workaround

Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>

* Fix device selection env variable names

* Fix cast in opencl kernels

* Add CLBlast to CMakeLists.txt

* Replace buffer pool with static buffers a, b, qb, c

Fix compile warnings

* Fix typos, use GGML_TYPE defines, improve code

* Improve btype dequant kernel selection code, add error if type is unsupported

* Improve code quality

* Move internal stuff out of header
* Use internal enums instead of CLBlast enums
* Remove leftover C++ includes and defines
* Make event use easier to read

Co-authored-by: Henri Vasserman <henv@hot.ee>

* Use c compiler for opencl files

* Simplify code, fix include

* First check error, then release event

* Make globals static, fix indentation

* Rename dequant kernels file to conform with other file names

* Fix import cl file name

---------

Co-authored-by: Concedo <39025047+LostRuins@users.noreply.github.com>
Co-authored-by: slaren <2141330+slaren@users.noreply.github.com>
Co-authored-by: Henri Vasserman <henv@hot.ee>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 CMakeLists.txt         |  19 +++-
 Makefile               |  11 ++-
 ggml-opencl-dequant.cl |  84 ++++++++++++++++
 ggml-opencl.c          | 216 +++++++++++++++++++++++++++++++++++++++++
 ggml-opencl.h          |  24 +++++
 ggml.c                 |  68 ++++++++++---
 ggml.h                 |   3 +-
 llama.cpp              |   2 +-
 8 files changed, 411 insertions(+), 16 deletions(-)
 create mode 100644 ggml-opencl-dequant.cl
 create mode 100644 ggml-opencl.c
 create mode 100644 ggml-opencl.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 11ebe9eb6..5fdbeddfc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,6 +67,7 @@ endif()
 option(LLAMA_ACCELERATE             "llama: enable Accelerate framework"                    ON)
 option(LLAMA_OPENBLAS               "llama: use OpenBLAS"                                   OFF)
 option(LLAMA_CUBLAS                 "llama: use cuBLAS"                                     OFF)
+option(LLAMA_CLBLAST                "llama: use CLBlast"                                    OFF)
 
 option(LLAMA_BUILD_TESTS            "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})
@@ -168,6 +169,21 @@ if (LLAMA_CUBLAS)
     endif()
 endif()
 
+if (LLAMA_CLBLAST)
+    find_package(CLBlast)
+    if (CLBlast_FOUND)
+        message(STATUS "CLBlast found")
+
+        set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h)
+
+        add_compile_definitions(GGML_USE_CLBLAST)
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
+    else()
+        message(WARNING "CLBlast not found")
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
         set(c_flags
@@ -307,7 +323,8 @@ endif()
 add_library(ggml OBJECT
             ggml.c
             ggml.h
-            ${GGML_CUDA_SOURCES})
+            ${GGML_CUDA_SOURCES}
+            ${GGML_OPENCL_SOURCES})
 
 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
diff --git a/Makefile b/Makefile
index f7c8dbfdc..0715e857b 100644
--- a/Makefile
+++ b/Makefile
@@ -105,14 +105,21 @@ ifdef LLAMA_OPENBLAS
 	LDFLAGS += -lopenblas
 endif
 ifdef LLAMA_CUBLAS
-	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
 	NVCC      = nvcc
 	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif
+ifdef LLAMA_CLBLAST
+	CFLAGS  += -DGGML_USE_CLBLAST
+	LDFLAGS += -lclblast -lOpenCL
+	OBJS    += ggml-opencl.o
+ggml-opencl.o: ggml-opencl.c ggml-opencl.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif
 ifdef LLAMA_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
diff --git a/ggml-opencl-dequant.cl b/ggml-opencl-dequant.cl
new file mode 100644
index 000000000..191b2e575
--- /dev/null
+++ b/ggml-opencl-dequant.cl
@@ -0,0 +1,84 @@
+#define MULTILINE_QUOTE(...) #__VA_ARGS__ // stringifies its arguments so the kernels below can be written as plain code
+const char * clblast_dequant = MULTILINE_QUOTE( // OpenCL C source of the dequantize kernels, held as one string
+
+struct block_q4_0
+{
+    float d; // scale applied to every value of the block
+    uchar qs[16]; // 16 bytes, each packing two 4-bit values (low nibble is the first value)
+};
+// q4_0: a work-item unpacks byte l of block i into two consecutive floats, value = (nibble - 8) * d
+__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
+    const uint i = get_global_id(0) / 32; // block index; each block covers 32 output floats
+    const uint l = get_local_id(0); // byte index inside the block
+
+    const float d = blocks[i].d;
+
+    const uchar vi = blocks[i].qs[l];
+
+    const uint index = i*32 + l*2; // first of the two output slots written by this work-item
+    result[index + 0] = ((vi & 0xf) - 8)*d;
+    result[index + 1] = ((vi >> 4) - 8)*d;
+}
+
+struct block_q4_1
+{
+    float d; // scale
+    float m; // offset added after scaling
+    uchar qs[16]; // 16 bytes, each packing two 4-bit values
+};
+// q4_1: same layout as q4_0 plus a per-block offset, value = nibble * d + m
+__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
+    const uint i = get_global_id(0) / 32; // block index; each block covers 32 output floats
+    const uint l = get_local_id(0); // byte index inside the block
+
+    const float d = blocks[i].d;
+    const float m = blocks[i].m;
+
+    const uchar vi = blocks[i].qs[l];
+
+    const uint index = i*32 + l*2;
+    result[index + 0] = (vi & 0xf) * d + m;
+    result[index + 1] = (vi >> 4) * d + m;
+}
+
+struct block_q4_2
+{
+    ushort d; // scale kept as raw 16-bit storage and read back through vload_half
+    uchar qs[8]; // 8 bytes, each packing two 4-bit values
+};
+// q4_2: 16-value blocks with a half-precision scale, value = (nibble - 8) * d
+__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
+    const uint i = get_global_id(0) / 16; // block index; each block covers 16 output floats
+    const uint l = get_local_id(0); // byte index inside the block
+
+    const float d = vload_half(0, (__global half*) &blocks[i].d);;
+
+    const uchar vi = blocks[i].qs[l];
+
+    const uint index = i*16 + l*2;
+    result[index + 0] = ((vi & 0xf) - 8)*d;
+    result[index + 1] = ((vi >> 4) - 8)*d;
+}
+
+struct block_q4_3
+{
+    ushort d; // scale, raw 16-bit storage read through vload_half
+    ushort m; // offset, raw 16-bit storage read through vload_half
+    uchar qs[8]; // 8 bytes, each packing two 4-bit values
+};
+// q4_3: 16-value blocks with half-precision scale and offset, value = nibble * d + m
+__kernel void dequantize_row_q4_3(__global struct block_q4_3* blocks, __global float* result) {
+    const uint i = get_global_id(0) / 16; // block index; each block covers 16 output floats
+    const uint l = get_local_id(0); // byte index inside the block
+
+    const float d = vload_half(0, (__global half*) &(blocks[i].d));
+    const float m = vload_half(0, (__global half*) &(blocks[i].m));
+
+    const uchar vi = blocks[i].qs[l];
+
+    const uint index = i*16 + l*2;
+    result[index + 0] = (vi & 0xf) * d + m;
+    result[index + 1] = (vi >> 4) * d + m;
+}
+
+);
diff --git a/ggml-opencl.c b/ggml-opencl.c
new file mode 100644
index 000000000..1d68f19ee
--- /dev/null
+++ b/ggml-opencl.c
@@ -0,0 +1,216 @@
+#include "ggml-opencl.h"
+
+#define CL_TARGET_OPENCL_VERSION 110
+#include <clblast_c.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "ggml.h"
+#include "ggml-opencl-dequant.cl"
+
+#define CL_CHECK(err, name) /* report a failed OpenCL call (labelled by name) and terminate */  \
+    do {                                                                                        \
+        cl_int err_ = (err); /* evaluate the checked expression exactly once */                 \
+        if (err_ != CL_SUCCESS) {                                                               \
+            fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__);   \
+            exit(1); /* every OpenCL failure is treated as fatal */                             \
+        }                                                                                       \
+    } while (0)
+
+static cl_platform_id platform;
+static cl_device_id device;
+static cl_context context;
+static cl_command_queue queue;
+static cl_program program;
+static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3;
+static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
+static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
+
+// Compiles the OpenCL source program_buffer for ctx; on failure prints a diagnostic (with the build log for dev) to stderr and exits.
+static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
+    size_t program_size = strlen(program_buffer);
+    size_t log_size = 0; // stays 0 if the build-log size query itself fails
+    cl_int err = CL_SUCCESS;
+
+    cl_program p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "OpenCL error %d creating program\n", err);
+        exit(1);
+    }
+
+    err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        // Fetch the compiler output so the user can see why the kernels failed to build.
+        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+        char * program_log = (char*) malloc(log_size + 1);
+        if (program_log != NULL) {
+            program_log[log_size] = '\0';
+            clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
+            fprintf(stderr, "OpenCL error %d building program:\n%s\n", err, program_log);
+            free(program_log);
+        }
+        exit(1);
+    }
+
+    return p;
+}
+
+// One-time OpenCL setup: picks platform/device by index from GGML_CLBLAST_PLATFORM / GGML_CLBLAST_DEVICE (default 0, out-of-range is fatal), then creates context, queue and kernels.
+void ggml_cl_init(void) {
+    cl_int err = 0;
+    char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
+    char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
+    int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
+    int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
+    printf("\nInitializing CLBlast (First Run)...\nAttempting to use: Platform=%d, Device=%d (If invalid, program will exit)\n",plat_num,dev_num);
+    cl_uint num_platforms = 0, num_devices = 0;
+    char platform_buffer[1024], device_buffer[1024];
+
+    CL_CHECK(clGetPlatformIDs(0, NULL, &num_platforms), "clGetPlatformIDs");
+    CL_CHECK((plat_num >= 0 && (cl_uint)plat_num < num_platforms) ? CL_SUCCESS : CL_INVALID_PLATFORM, "GGML_CLBLAST_PLATFORM index");
+    cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
+    CL_CHECK(clGetPlatformIDs(num_platforms, platforms, NULL), "clGetPlatformIDs");
+    platform = platforms[plat_num];
+    clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
+
+    CL_CHECK(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices), "clGetDeviceIDs");
+    CL_CHECK((dev_num >= 0 && (cl_uint)dev_num < num_devices) ? CL_SUCCESS : CL_INVALID_DEVICE, "GGML_CLBLAST_DEVICE index");
+    cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
+    CL_CHECK(clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL), "clGetDeviceIDs");
+    device = devices[dev_num];
+    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
+    printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
+    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
+    CL_CHECK(err, "clCreateContext");
+    queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
+    CL_CHECK(err, "clCreateCommandQueue");
+    free(platforms);
+    free(devices);
+
+    program = build_program_from_source(context, device, clblast_dequant);
+    // Prepare dequantize kernels
+    kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
+    CL_CHECK(err, "clCreateKernel");
+    kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
+    CL_CHECK(err, "clCreateKernel");
+    kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
+    CL_CHECK(err, "clCreateKernel");
+    kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err);
+    CL_CHECK(err, "clCreateKernel");
+}
+
+// Grow-only allocation: makes sure *buf is a device buffer of at least req_size bytes (capacity tracked in *cur_size).
+static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
+    const size_t capacity = *cur_size;
+    if (capacity >= req_size) {
+        return; // the existing buffer is already big enough
+    }
+    if (capacity != 0) {
+        clReleaseMemObject(*buf); // a smaller buffer exists: release it before allocating the replacement
+    }
+    cl_int status;
+    *buf = clCreateBuffer(context, flags, req_size, NULL, &status);
+    *cur_size = req_size;
+    CL_CHECK(status, "clCreateBuffer");
+}
+
+// Runs SGEMM (C = alpha*op(A)*op(B) + beta*C) on the OpenCL device through CLBlast and reads C back into host_c.
+// host_b is either F32 data or Q4_0/Q4_1/Q4_2/Q4_3 blocks (selected by btype) that are dequantized on the device first.
+void ggml_cl_sgemm_wrapper(
+        const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
+        const int m, const int n, const int k,
+        const float alpha, const void *host_a, const int lda,
+        const float *host_b, const int ldb, const float beta,
+        float *host_c, const int ldc, const int btype) {
+    cl_int err = 0;
+    cl_kernel kernel = NULL; // kernel, local and size_qb are only meaningful when dequant is true
+    size_t global = (size_t) n * k, local = 0, size_qb = 0;
+    bool dequant;
+
+    switch (btype) {
+    case GGML_TYPE_F32:
+        dequant = false;
+        break;
+    case GGML_TYPE_Q4_0:
+        dequant = true;
+        kernel = kernel_q4_0;
+        local = 16;
+        size_qb = global * (sizeof(float) + local) / 32;
+        break;
+    case GGML_TYPE_Q4_1:
+        dequant = true;
+        kernel = kernel_q4_1;
+        local = 16;
+        size_qb = global * (sizeof(float) * 2 + local) / 32;
+        break;
+    case GGML_TYPE_Q4_2:
+        dequant = true;
+        kernel = kernel_q4_2;
+        local = 8;
+        size_qb = global * (sizeof(short) + local) / 16;
+        break;
+    case GGML_TYPE_Q4_3:
+        dequant = true;
+        kernel = kernel_q4_3;
+        local = 8;
+        size_qb = global * (sizeof(short) * 2 + local) / 16;
+        break;
+    default:
+        fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
+        abort();
+    }
+
+    const size_t size_a =  sizeof(float) * m * k; // sizeof first so the products are computed in size_t, not int
+    const size_t size_b =  sizeof(float) * n * k;
+    const size_t size_c =  sizeof(float) * m * n;
+
+    // Prepare buffers
+    ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
+    if (dequant) {
+        ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
+    }
+    ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
+    ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
+
+    cl_event ev_a, ev_qb, ev_b; // a failed enqueue leaves its event unset, so every enqueue below is checked
+    if (dequant) {
+        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
+        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
+        CL_CHECK(err, "clSetKernelArg");
+        CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb), "clEnqueueWriteBuffer");
+    } else {
+        CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b), "clEnqueueWriteBuffer");
+    }
+
+    CL_CHECK(clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a), "clEnqueueWriteBuffer");
+    if (dequant) {
+        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
+        CL_CHECK(err, "clEnqueueNDRangeKernel");
+        clReleaseEvent(ev_qb);
+    }
+    clWaitForEvents(1, &ev_a);
+    clWaitForEvents(1, &ev_b);
+    clReleaseEvent(ev_a);
+    clReleaseEvent(ev_b);
+
+    cl_event ev_sgemm;
+    const CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
+                 (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
+                 m, n, k,
+                 alpha,
+                 cl_buffer_a, 0, lda,
+                 cl_buffer_b, 0, ldb,
+                 beta,
+                 cl_buffer_c, 0, ldc,
+                 &queue, &ev_sgemm);
+    CL_CHECK((cl_int) status, "CLBlastSgemm");
+
+    cl_event ev_c;
+    CL_CHECK(clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c), "clEnqueueReadBuffer");
+    // Wait for completion
+    clWaitForEvents(1, &ev_c);
+    clReleaseEvent(ev_sgemm);
+    clReleaseEvent(ev_c);
+}
diff --git a/ggml-opencl.h b/ggml-opencl.h
new file mode 100644
index 000000000..7bcc603ef
--- /dev/null
+++ b/ggml-opencl.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+void ggml_cl_init(void); // sets up the OpenCL platform, device, context, queue and dequantize kernels
+
+enum ggml_blas_order { // matrix storage order; the implementation casts this value as-is to CLBlastLayout
+    GGML_BLAS_ORDER_ROW_MAJOR = 101,
+    GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
+};
+
+enum ggml_blas_op { // per-operand transpose mode; the implementation casts this value as-is to CLBlastTranspose
+    GGML_BLAS_OP_N = 111, // presumably "no transpose" (CBLAS-style numbering) - confirm against clblast_c.h
+    GGML_BLAS_OP_T = 112, // presumably "transpose"
+    GGML_BLAS_OP_C = 113, // presumably "conjugate transpose"
+};
+// SGEMM on the OpenCL device; host_b holds data of ggml type btype (F32 or Q4_0..Q4_3), any other btype aborts.
+void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml.c b/ggml.c
index 1fbf2955d..33fb1681e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -149,6 +149,8 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #include <cblas.h>
 #elif defined(GGML_USE_CUBLAS)
 #include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
 #endif
 
 #undef MIN
@@ -4363,6 +4365,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         // initialize cuBLAS
         #if defined(GGML_USE_CUBLAS)
         ggml_init_cublas();
+        #elif defined(GGML_USE_CLBLAST)
+        ggml_cl_init();
         #endif
 
         is_first_call = false;
@@ -8104,7 +8108,7 @@ static void ggml_compute_forward_rms_norm(
 
 // ggml_compute_forward_mul_mat
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
 static bool ggml_compute_forward_mul_mat_use_blas(
@@ -8129,6 +8133,7 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 
     return false;
 }
+
 #endif
 
 static void ggml_compute_forward_mul_mat_f32(
@@ -8144,7 +8149,7 @@ static void ggml_compute_forward_mul_mat_f32(
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int64_t ne10 = src1->ne[0];
 #endif
     const int64_t ne11 = src1->ne[1];
@@ -8201,7 +8206,7 @@ static void ggml_compute_forward_mul_mat_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -8250,8 +8255,15 @@ static void ggml_compute_forward_mul_mat_f32(
 
                 // copy data to host
                 CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
-#else
+#elif defined(GGML_USE_CLBLAST)
                 // zT = y * xT
+                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
+                        ne11, ne01, ne10,
+                        1.0f,    y, ne10,
+                                 x, ne10,
+                        0.0f,    d, ne01,
+                        GGML_TYPE_F32);
+#else
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
@@ -8395,7 +8407,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));
 
@@ -8472,6 +8484,19 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
                 // copy data to host
                 CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
+#elif defined(GGML_USE_CLBLAST)
+                const float * x = wdata;
+                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
+
+                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+
+                // zT = y * xT
+                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
+                        ne11, ne01, ne10,
+                        1.0f,    y, ne10,
+                                 x, ne10,
+                        0.0f,    d, ne01,
+                        GGML_TYPE_F32);
 #else
                 const float * x = wdata;
                 const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
@@ -8646,7 +8671,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -8698,7 +8723,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
         else {
             GGML_ASSERT(false);
         }
-#else
+#elif !defined(GGML_USE_CLBLAST)
         float * const wdata = params->wdata;
         dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
 #endif
@@ -8717,6 +8742,8 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
                 dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
                 CUDA_CHECK(cudaGetLastError());
+#elif defined(GGML_USE_CLBLAST)
+                const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
 #else
                 {
                     size_t id = 0;
@@ -8743,8 +8770,15 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
                 // copy data to host
                 CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
-#else
+#elif defined(GGML_USE_CLBLAST)
                 // zT = y * xT
+                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
+                        ne11, ne01, ne10,
+                        1.0f,    y, ne10,
+                                 x, ne10,
+                        0.0f,    d, ne01,
+                        type);
+#else
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
@@ -11583,7 +11617,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         size_t cur = 0;
 
                         if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1; // TODO: this actually is doing nothing
                                                    //       the threads are still spinning
@@ -11600,7 +11634,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                             cur = 0;
                         } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                                 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -13100,7 +13134,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     return 1;
 #else
     return 0;
@@ -13115,6 +13149,18 @@ int ggml_cpu_has_cublas(void) {
 #endif
 }
 
+int ggml_cpu_has_clblast(void) { // 1 when built with GGML_USE_CLBLAST, 0 otherwise
+#ifndef GGML_USE_CLBLAST
+    return 0;
+#else
+    return 1;
+#endif
+}
+
+int ggml_cpu_has_gpublas(void) { // 1 if either GPU BLAS backend (cuBLAS or CLBlast) reports as compiled in
+    return (ggml_cpu_has_cublas() || ggml_cpu_has_clblast()) ? 1 : 0;
+}
+
 int ggml_cpu_has_sse3(void) {
 #if defined(__SSE3__)
     return 1;
diff --git a/ggml.h b/ggml.h
index d9d3d214e..1bbe2db93 100644
--- a/ggml.h
+++ b/ggml.h
@@ -858,10 +858,11 @@ extern "C" {
     GGML_API int ggml_cpu_has_wasm_simd  (void);
     GGML_API int ggml_cpu_has_blas       (void);
     GGML_API int ggml_cpu_has_cublas     (void);
+    GGML_API int ggml_cpu_has_clblast    (void);
+    GGML_API int ggml_cpu_has_gpublas    (void);
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_vsx        (void);
 
-
     //
     // Internal types and functions exposed for tests and benchmarks
     //
diff --git a/llama.cpp b/llama.cpp
index 28a74b514..bfebf14bf 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1085,7 +1085,7 @@ static bool llama_eval_internal(
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, tokens, N*ggml_element_size(embd));

From 11d902364b0e3b503a02a4e757ee2dc38aacb68f Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 28 Apr 2023 17:58:44 +0300
Subject: [PATCH 44/74] ggml : add helper debug printf in soft_max

---
 ggml.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml.c b/ggml.c
index 33fb1681e..44293dac9 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9257,6 +9257,7 @@ static void ggml_compute_forward_soft_max_f32(
 
         uint16_t scvt;
         for (int i = 0; i < nc; i++) {
+            //printf("p[%3d] = %8.4f\n", i, p[i]);
             if (p[i] == -INFINITY) {
                 p[i] = 0.0f;
             } else {

From 1481a9cf25ea2e4abef6b13a57660a35f3e66af1 Mon Sep 17 00:00:00 2001
From: Evan Jones <evan.q.jones@gmail.com>
Date: Fri, 28 Apr 2023 11:59:37 -0400
Subject: [PATCH 45/74] llama : add session file format and saved sessions in
 main (#1169)

---
 examples/chat-13B.sh   |  4 +-
 examples/common.cpp    |  7 ++++
 examples/common.h      |  1 +
 examples/main/main.cpp | 89 ++++++++++++++++++++++++++++++++++++++++++
 llama.cpp              | 53 +++++++++++++++++++++++++
 llama.h                |  4 ++
 6 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/examples/chat-13B.sh b/examples/chat-13B.sh
index 4265d7b66..2fac37784 100755
--- a/examples/chat-13B.sh
+++ b/examples/chat-13B.sh
@@ -31,8 +31,6 @@ The transcript only includes text, it does not include markup like HTML and Mark
 
 $USER_NAME: Hello, $AI_NAME!
 $AI_NAME: Hello $USER_NAME! How may I help you today?
-$USER_NAME: What time is it?
-$AI_NAME: It is $(date +%H:%M).
 $USER_NAME: What year is it?
 $AI_NAME: We are in $(date +%Y).
 $USER_NAME: Please tell me the largest city in Europe.
@@ -50,4 +48,6 @@ $AI_NAME: The arguments are stored in process.argv.
     argv[3] is the second argument passed to the script and so on.
 $USER_NAME: Name a color.
 $AI_NAME: Blue
+$USER_NAME: What time is it?
+$AI_NAME: It is $(date +%H:%M).
 $USER_NAME:" "$@"
diff --git a/examples/common.cpp b/examples/common.cpp
index c0e87eb9f..9f10dc268 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -61,6 +61,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.prompt = argv[i];
+        } else if (arg == "--session") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.path_session = argv[i];
         } else if (arg == "-f" || arg == "--file") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -228,6 +234,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
     fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
+    fprintf(stderr, "  --session FNAME       file to cache model state in (may be large!) (default: none)\n");
     fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
     fprintf(stderr, "  -f FNAME, --file FNAME\n");
diff --git a/examples/common.h b/examples/common.h
index 6f26b514d..9d3697d79 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -31,6 +31,7 @@ struct gpt_params {
 
     std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
+    std::string path_session = "";       // path to file for saving/loading model eval state
     std::string input_prefix = "";       // string to prefix user inputs with
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index f9c9e9d98..fda65574f 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -157,6 +157,32 @@ int main(int argc, char ** argv) {
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
 
+    std::string path_session = params.path_session;
+    std::vector<llama_token> session_tokens;
+
+    if (!path_session.empty()) {
+        fprintf(stderr, "%s: attempting to load saved session from %s..\n", __func__, path_session.c_str());
+
+        // REVIEW - fopen to check for existing session
+        FILE * fp = std::fopen(path_session.c_str(), "rb");
+        if (fp != NULL) {
+            std::fclose(fp);
+
+            session_tokens.resize(params.n_ctx);
+            size_t n_token_count_out = 0;
+            const size_t n_session_bytes = llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out);
+            session_tokens.resize(n_token_count_out);
+
+            if (n_session_bytes > 0) {
+                fprintf(stderr, "%s: loaded %zu bytes of session data!\n", __func__, n_session_bytes);
+            } else {
+                fprintf(stderr, "%s: could not load session file, will recreate\n", __func__);
+            }
+        } else {
+            fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
+        }
+    }
+
     // tokenize the prompt
     auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
 
@@ -167,6 +193,26 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    // debug message about similarity of saved session, if applicable
+    size_t n_matching_session_tokens = 0;
+    if (session_tokens.size()) {
+        for (llama_token id : session_tokens) {
+            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
+                break;
+            }
+            n_matching_session_tokens++;
+        }
+        if (n_matching_session_tokens >= embd_inp.size()) {
+            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
+        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
+            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
+        } else {
+            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
+        }
+    }
+
     // number of tokens to keep when resetting context
     if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
         params.n_keep = (int)embd_inp.size();
@@ -252,9 +298,16 @@ int main(int argc, char ** argv) {
     bool is_antiprompt = false;
     bool input_noecho  = false;
 
+    // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
+    // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
+    // initial prompt so it doesn't need to be an exact match.
+    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
+
+
     int n_past     = 0;
     int n_remain   = params.n_predict;
     int n_consumed = 0;
+    int n_session_consumed = 0;
 
     // the first thing we will do is to output the prompt, so set color accordingly
     set_console_color(con_st, CONSOLE_COLOR_PROMPT);
@@ -276,6 +329,9 @@ int main(int argc, char ** argv) {
                 // insert n_left/2 tokens at the start of embd from last_n_tokens
                 embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
 
+                // REVIEW - stop saving session if we run out of context
+                path_session = "";
+
                 //printf("\n---\n");
                 //printf("resetting: '");
                 //for (int i = 0; i < (int) embd.size(); i++) {
@@ -285,6 +341,28 @@ int main(int argc, char ** argv) {
                 //printf("\n---\n");
             }
 
+            // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
+            // REVIEW
+            if (n_session_consumed < (int) session_tokens.size()) {
+                size_t i = 0;
+                for ( ; i < embd.size(); i++) {
+                    if (embd[i] != session_tokens[n_session_consumed]) {
+                        session_tokens.resize(n_session_consumed);
+                        break;
+                    }
+
+                    n_past++;
+                    n_session_consumed++;
+
+                    if (n_session_consumed >= (int) session_tokens.size()) {
+                        break;
+                    }
+                }
+                if (i > 0) {
+                    embd.erase(embd.begin(), embd.begin() + i);
+                }
+            }
+
             // evaluate tokens in batches
             // embd is typically prepared beforehand to fit within a batch, but not always
             for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
@@ -298,6 +376,11 @@ int main(int argc, char ** argv) {
                 }
                 n_past += n_eval;
             }
+
+            if (embd.size() > 0 && !path_session.empty()) {
+                session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
+                n_session_consumed = session_tokens.size();
+            }
         }
 
         embd.clear();
@@ -309,6 +392,12 @@ int main(int argc, char ** argv) {
             const float   temp           = params.temp;
             const float   repeat_penalty = params.repeat_penalty;
 
+            // optionally save the session on first sample (for faster prompt loading next time)
+            if (!path_session.empty() && need_to_save_session) {
+                need_to_save_session = false;
+                llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
+            }
+
             llama_token id = 0;
 
             {
diff --git a/llama.cpp b/llama.cpp
index bfebf14bf..dca017db6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2431,3 +2431,56 @@ std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_te
     return ctx->model.tensors_by_name;
 }
 
+size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    // Reads magic, version, hparams, token list and state blob; on any validation failure returns 0 with ctx and *n_token_count_out untouched.
+    llama_file file(path_session, "rb"); // TODO leverage mmap
+    const uint32_t magic   = file.read_u32();
+    const uint32_t version = file.read_u32();
+    if (!(magic == 'ggsn' && version == 0)) {
+        fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+        return 0;
+    }
+
+    llama_hparams session_hparams;
+    file.read_raw(&session_hparams, sizeof(llama_hparams));
+    if (session_hparams != ctx->model.hparams) {
+        fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+        return 0;
+    }
+
+    const uint32_t n_token_count = file.read_u32();
+    if (n_token_count > n_token_capacity) { // the count comes from the file: report it instead of asserting
+        fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+        return 0;
+    }
+    file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+
+    const size_t n_state_size = file.size - file.tell();
+    if (n_state_size != llama_get_state_size(ctx)) { // llama_set_state_data takes no length, so require the exact size
+        fprintf(stderr, "%s : failed to validate state size\n", __func__);
+        return 0;
+    }
+    std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
+    file.read_raw(state_data.get(), n_state_size);
+    *n_token_count_out = n_token_count; // published only once the state blob has passed validation
+    return llama_set_state_data(ctx, state_data.get());
+}
+
+size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    // File layout: magic, version, hparams, token count, tokens, state blob. Returns the state blob size in bytes.
+    llama_file file(path_session, "wb"); // TODO save temp & swap
+
+    const size_t state_bytes = llama_get_state_size(ctx);
+    std::unique_ptr<uint8_t[]> state_buf(new uint8_t[state_bytes]);
+    llama_copy_state_data(ctx, state_buf.get());
+
+    file.write_u32('ggsn'); // header: magic
+    file.write_u32(0);      // header: version
+    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+    file.write_u32((uint32_t) n_token_count); // REVIEW (count is stored as 32 bits)
+    file.write_raw(tokens, n_token_count * sizeof(llama_token));
+
+    file.write_raw(state_buf.get(), state_bytes);
+    return state_bytes; // REVIEW
+}
diff --git a/llama.h b/llama.h
index 17dac0689..86a7d279a 100644
--- a/llama.h
+++ b/llama.h
@@ -133,6 +133,10 @@ extern "C" {
     // Returns the number of bytes read
     LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
 
+    // Save/load session file
+    LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+    LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process
     // n_past is the number of tokens to use from previous eval calls

From 5fba3c016bfd1d73a070e7c93dac14162ce118d0 Mon Sep 17 00:00:00 2001
From: CRD716 <crd716@gmail.com>
Date: Fri, 28 Apr 2023 11:13:33 -0500
Subject: [PATCH 46/74] examples : add Jeopardy example (#1168)

* Basic Setup

* Prevent Results.txt from coming up

* Prefixes, Line separators, etc

* editorcheck

* introduction to give more consistent results

* Basic graph thing

* Grading, ready for testing!

* Y'all ready to get funky?

* fix column removal stuff

* missed a few
---
 .gitignore                      |   2 +
 examples/jeopardy/README.md     |  21 +++++++
 examples/jeopardy/graph.py      |  56 +++++++++++++++++
 examples/jeopardy/jeopardy.sh   |  30 ++++++++++
 examples/jeopardy/qasheet.csv   | 103 ++++++++++++++++++++++++++++++++
 examples/jeopardy/questions.txt | 100 +++++++++++++++++++++++++++++++
 6 files changed, 312 insertions(+)
 create mode 100644 examples/jeopardy/README.md
 create mode 100644 examples/jeopardy/graph.py
 create mode 100644 examples/jeopardy/jeopardy.sh
 create mode 100644 examples/jeopardy/qasheet.csv
 create mode 100644 examples/jeopardy/questions.txt

diff --git a/.gitignore b/.gitignore
index c7573bb3b..54dcebc4d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,3 +41,5 @@ zig-out/
 zig-cache/
 
 ppl-*.txt
+
+examples/jeopardy/results.txt
diff --git a/examples/jeopardy/README.md b/examples/jeopardy/README.md
new file mode 100644
index 000000000..4c42e3cdb
--- /dev/null
+++ b/examples/jeopardy/README.md
@@ -0,0 +1,21 @@
+# llama.cpp/example/jeopardy
+
+This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.
+
+The jeopardy test can be used to compare the fact knowledge of different models against each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
+
+
+Step 1: Open jeopardy.sh and modify the following:
+```
+MODEL=(path to your model)
+MODEL_NAME=(name of your model)
+prefix=(basically, if you use vicuna it's Human: , if you use something else it might be User: , etc)
+opts=(add --instruct here if needed for your model, or anything else you want to test out)
+```
+Step 2: Run `jeopardy.sh` from the llama.cpp folder
+
+Step 3: Repeat steps 1 and 2 until you have all the results you need.
+
+Step 4: Run `graph.py`, and follow the instructions. At the end, it will generate your final graph.
+
+Note: The Human bar is based off of the full, original 100 sample questions. If you modify the question count or questions, it will not be valid.
diff --git a/examples/jeopardy/graph.py b/examples/jeopardy/graph.py
new file mode 100644
index 000000000..d00b28652
--- /dev/null
+++ b/examples/jeopardy/graph.py
@@ -0,0 +1,56 @@
+import matplotlib.pyplot as plt
+import sys, os
+import csv
+
+labels = []
+numbers = []
+numEntries = 1
+
+rows = []
+
+def bar_chart(numbers, labels, pos):
+    plt.bar(pos, numbers, color='blue')
+    plt.xticks(ticks=pos, labels=labels)
+    plt.title("Jeopardy Results by Model")
+    plt.xlabel("Model")
+    plt.ylabel("Questions Correct")
+    plt.show()
+
+def calculatecorrect():
+    directory = os.fsencode("./examples/jeopardy/results/")
+    csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
+    for row in csv_reader:
+        global rows
+        rows.append(row)
+    for listing in os.listdir(directory):
+        filename = os.fsdecode(listing)
+        if filename.endswith(".txt"):
+            file = open("./examples/jeopardy/results/" + filename, "rt")
+            global labels
+            global numEntries
+            global numbers
+            labels.append(filename[:-4])
+            numEntries += 1
+            i = 1
+            totalcorrect = 0
+            for line in file.readlines():
+                if line.strip() != "------":
+                    print(line)
+                else:
+                    print("Correct answer: " + rows[i][2] + "\n")
+                    i+=1
+                    print("Did the AI get the question right? (y/n)")
+                    if input() == "y":
+                        totalcorrect += 1
+            numbers.append(totalcorrect)
+
+
+
+if __name__ == '__main__':
+    calculatecorrect()
+    pos = list(range(numEntries))
+    labels.append("Human")
+    numbers.append(48.11)
+    bar_chart(numbers, labels, pos)
+    print(labels)
+    print(numbers)
diff --git a/examples/jeopardy/jeopardy.sh b/examples/jeopardy/jeopardy.sh
new file mode 100644
index 000000000..9bdbc755c
--- /dev/null
+++ b/examples/jeopardy/jeopardy.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+set -e # abort on the first failing command
+
+MODEL=./models/ggml-vicuna-13b-1.1-q4_0.bin
+MODEL_NAME=Vicuna
+
+# exec options
+prefix="Human: " # Ex. Vicuna uses "Human: "
+opts="--temp 0 -n 80" # additional flags
+nl='
+'
+introduction="You will be playing a game of Jeopardy. Simply answer the question in the correct format (Ex. What is Paris, or Who is George Washington)."
+
+# file options
+question_file=./examples/jeopardy/questions.txt
+output_file=./examples/jeopardy/results/$MODEL_NAME.txt
+mkdir -p "$(dirname "$output_file")" # the results directory may not exist yet; touch would fail under set -e
+touch "$output_file"
+
+counter=1
+
+echo 'Running'
+while IFS= read -r question
+do
+  echo $counter
+  echo "Current Question: $question"
+  ./main -p "$prefix$introduction$nl$prefix$question" $opts -m "$MODEL" >> "$output_file" # no eval: question text is passed literally; $opts is split on purpose
+  echo -e "\n------" >> "$output_file"
+  counter=$((counter+1))
+done < "$question_file"
diff --git a/examples/jeopardy/qasheet.csv b/examples/jeopardy/qasheet.csv
new file mode 100644
index 000000000..35b084189
--- /dev/null
+++ b/examples/jeopardy/qasheet.csv
@@ -0,0 +1,103 @@
+Index,Original Category,Original Correct Question,Model Prompt
+1,The Oscars,Who is John Williams?,Which man born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
+2,English Literature,What is Paradise Lost?,"What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?"
+3,Writers’ Lesser-Known Works,Who is Niccolò Machiavelli?,"Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?"
+4,Exploration,What is Easter Island (Rapa Nui)?,"James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?"
+5,The Bill of Rights,What is the Eighth Amendment?,England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
+6,Nobel Peace Prize Winners,Who are Nelson Mandela & Desmond Tutu?,"Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?"
+7,Famous Names,Who is Walt Disney?,"In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?"
+8,Geography,What is Colombia?,"Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?"
+9,Fashion History,What are rhinestones?,"Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?"
+10,Movies of the ’80s,What is Driving Miss Daisy?,What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
+11,Novelists,Who is John Grisham?,"A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?"
+12,20th Century Eponyms,What is the Maginot Line?,"A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?"
+13,City History,What is Stockholm?,"Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?"
+14,Brand Names,What is Jacuzzi?,"The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?"
+15,American Authors,Who is Washington Irving?,"In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?"
+16,Symbols,What is “less than”?,What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
+17,Movie Theme Songs,Who is James Bond?,"Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?"
+18,American Novelists,Who is Joseph Heller?,"What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?"
+19,Medieval Places,"What is Canterbury, England? (Canterbury Cathedral)","In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?"
+20,Countries of Africa,What is Morocco?,"At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?"
+21,Statehood,What is Wyoming?,Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
+22,1980s Movies,What is Raiders of the Lost Ark?,"A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?"
+23,Art Exhibitions,Who is Rembrandt?,In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
+24,Countries of the World,What is Mongolia?,"Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?"
+25,Literature,What is “Howl”?,A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
+26,Invasions,Who is William of Orange?,"Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?"
+27,Landmarks,What is the Eiffel Tower?,"After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?"
+28,Geographic Name’s the Same,What is Dover?,"The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?"
+29,Names in the Bookstore,Who is Peter Mark Roget?,"This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?"
+30,U.S. History,Who is Dr. Samuel Mudd?,"An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?"
+31,American Literature,What is The Things They Carried?,"Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?"
+32,Nonfiction,What is The Communist Manifesto?,"What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?"
+33,Laws in U.S. History,What is the Civil Rights Act?,A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later?
+34,Names of Myth,Who is Helen of Troy?,"Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?"
+35,African Countries,What is Sudan?,"Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?"
+36,The Ancient World,What is Alexandria?,"The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?"
+37,Famous Names,Who is Andy Warhol?,"For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?"
+38,People & Places,What is Guam?,"Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory’s largest ethnic group?"
+39,Current World Leaders,What is the Philippines?,"In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?"
+40,Writers & The South,Who is Tennessee Williams?,In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
+41,National Parks,What is Yellowstone?,"What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?"
+42,Sports,Who are the Harlem Globetrotters?,"In 2010 who introduced the 4-point shot, 35 feet from the basket?"
+43,The U.S. Military,What is “Top Gun”?,Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
+44,Art & Science,What is Halley’s Comet?,"A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?"
+45,Words From World War I,What is “tank”?,"In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?"
+46,European History,What is Holy Roman Emperor?,"Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?"
+47,Theater History,Who is Peter Pan?,"In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?"
+48,European Cities,What is Aachen?,"Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?"
+49,Word Origins,What is mantra?,This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
+50,Inventions,What is barbed wire?,1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
+51,World War II,What is Schindler’s list?,"Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?"
+52,Mythology,What is the Golden Fleece?,Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object?
+53,Literature,What is Pride and Prejudice?,"Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?"
+54,U.S. State Names,What are Oregon & Nevada?,5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other?
+55,Word Origins,What is passion?,"Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?"
+56,World Cinema,What is La Vie en Rose?,"The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?"
+57,History,What is Santa Maria?,"Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?"
+58,Landmarks,What is a kremlin?,Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
+59,Foreign-Born Authors,Who is Vladimir Nabokov?,In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
+60,Astronomy & Geography,What is Capricorn?,"At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?"
+61,Television,What is Law & Order?,"Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?"
+62,British Landmarks,What is the Tower of London?,"Like Sir Thomas More, 3 16th century English queens are buried at what British location?"
+63,Early American History,What are witches?,"In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person … be condemned'?"
+64,Geography Mnemonics,What are Arkansas and Louisiana?,"The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?"
+65,Business Milestones,What is the Ford Model T?,"What was first sold in 1908, at a price equivalent to about $27,000 today?"
+66,In The Bookstore,Who is Tom Clancy?,The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
+67,Historic Art,What is the Bayeux Tapestry?,The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
+68,Pop Stars,Who is Madonna?,In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
+69,Classic Tale Characters,Who is Scheherazade?,"In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?"
+70,USA,What is Jack Daniel’s?,"Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?"
+71,Historic People,Who was William Bligh?,"After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?"
+72,The Movies,What is The Godfather?,Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
+73,Continental Geography,What is Colombia?,"Until a 1903 secession, what country's contiguous territory spanned 2 continents?"
+74,Foreign-Born Authors,Who is Isabel Allende?,"Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?"
+75,Historic Crimes,What is the Mona Lisa?,"Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?"
+76,U.S. Bodies of Water,What is Lake Mead?,"Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?"
+77,Gods & Goddesses,Who is Aurora (or Eos)?,"Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?"
+78,America At War,What is the Battle of New Orleans?,"Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?"
+79,Children’s Books,What is The Velveteen Rabbit?,"Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?"
+80,TV Finales,What is Grace and Frankie?,"In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?"
+81,American Poems,Who is Evangeline?,"In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?"
+82,Famous Names,Who is Banksy?,"In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?"
+83,Children’s Lit,What is Charlotte’s Web?,The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
+84,Classic Songs,What is “Here Comes Santa Claus”?,The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
+85,Brand Names,What are Milk Duds?,"Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?"
+86,Countries of the World,What is Italy?,"What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?"
+87,Action Movies,What is Die Hard?,"What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?"
+88,Presidential Facts,Who is Woodrow Wilson?,Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
+89,19th Century Americans,Who is Frederick Douglass?,"Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?"
+90,Latin Phrases,What is “quid pro quo”?,"Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?"
+91,1970s Movies,What is Monty Python and the Holy Grail?,The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
+92,Name’s The Same,What is Manhattan?,"A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?"
+93,U.S. Presidents,Who is Calvin Coolidge?,"Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?"
+94,Plays,What is The Tempest?,A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
+95,Landmarks,What is the Berlin Wall?,"In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?"
+96,World Capitals,"What is Vienna, Austria?","Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?"
+97,Language & Its Meanings,What is a night owl?,"Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?"
+98,Flags of Our Hemisphere,What is Brazil?,"The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?"
+99,Names in U.S. History,Who is Oliver Brown?,What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
+100,Children’s Authors,"Who is Sarah? (from Sarah, Plain and Tall)","Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?"
+,,,
+TOTALS,,,
diff --git a/examples/jeopardy/questions.txt b/examples/jeopardy/questions.txt
new file mode 100644
index 000000000..eea78a057
--- /dev/null
+++ b/examples/jeopardy/questions.txt
@@ -0,0 +1,100 @@
+Which man born in 1932 was the son of a percussionist in the CBS radio orchestra has been nominated for 53 Oscars?
+What work in English Literature says: 'The mind is its own place, & in itself can make a heaven of hell, a hell of heaven. What matter where, if I be still the same'?
+Known for more philosophical works, he wrote the play 'La Mandragola', in which Florentines are rewarded for immoral actions?
+James Cook's account of a 1774 visit where records an object 'near 27 feet long, and upwards of 8 feet over the breast or shoulders'?
+England's 'Bloody Assizes' & a 1685 life sentence for perjury were 2 main origins of which amendment to the U.S. Constitution?
+Which nobel peace price winners each lived at times on Vilakazi St. in Soweto , so it claims to be the world's only street home to 2 Nobel Peace Prize winners?
+In 1966, the year of who's death did he share plans for an experimental prototype community in Florida?
+Of the 13 nations through which the Equator passes, what is the only one whose coastline borders the Caribbean Sea?
+Which decorative items in fashion history get their name from their origin in the port city of Strasbourg, on the border of France & Germany?
+What 1980's movie is based on an off-Broadway play with just 3 characters and won the Best Picture Oscar & the actors in all 3 roles were nominated?
+A 2012 book review for which novelist noted subjects that 'sparked his ire': capital punishment, big tobacco & 'the plight of the unjustly convicted'?
+A 1940 headline about what 20th Century Eponym included 'failure', 'liability when it came to offense' & 'stout hearts no match for tanks'?
+Over 700 years after its traditional 1252 founding date, what port city became associated with a psychological response?
+The success of what brand has its roots with a hydrotherapy pump its cofounder created for his son, who had arthritis?
+In a periodical in 1807, what American Author called New York City 'Gotham, Gotham! Most enlightened of cities'?
+What symbol is a rotated V in math and a feeling of some marginalized or underrepresented people in society?
+Monty Norman, the composer of what character's theme, said the staccato riff conveyed sexiness, mystery & ruthlessness?
+What American Novelist served with an airman named Yohannan in World War II & despite what readers might think, he said he enjoyed his service?
+In what Medieval place did one of the participants in an 1170 event say, 'Let us away, knights; he will rise no more'?
+At one time a province of the Roman Empire, what African country kingdom is known to Arabic scholars as Al-Maghrib Al-Aqsa, 'the far west'?
+Congress relented in 1890 after what prospective state said it would wait 100 years rather than come in without the women?
+A writer & producer of what movie said he wanted it to be like a Western or James Bond film, 'only it takes place in the 30s'?
+In 1898 what's been called the first blockbuster art show was devoted to which artist & put on for Queen Wilhelmina's coronation?
+Part of the largest contiguous land empire during the 1200s & 1300s, today what is the world's second-largest landlocked country?
+A 2006 book was titled 'The Poem That Changed America:' What 'Fifty Years Later'?
+Backed by 14,000 troops, who invaded England to restore, in his words, its 'religion, laws, and liberties'?
+After its completion in the late 19th c., what was landmark was called 'a truly tragic street lamp' & a 'high & skinny pyramid of iron ladders'?
+The busiest passenger port in the U.K., what shares its name with a capital of one of the original 13 states?
+This man made lists, perhaps to cope with depression; a set of lists he published in 1852 made whose name synonymous with a type of book?
+An 1869 presidential pardon was granted to which man, due in part to a plea by the Medical Society of Harford County, Maryland?
+Letters, pocket knives, C rations & steel helmets are among the tangible items referred to in the title of what American literature modern war classic?
+What nonfiction book has the line, 'The discovery of America…opened up fresh ground for the rising bourgeoisie'?
+A radical Republican championed what 1875 act but the Supreme Court struck it down in 1883; a new version was passed 81 years later?
+Whose brothers, Castor & Pollux, saved her after Theseus stole her away as a kid; a larger force would seek her later in life?
+Once Africa's largest country in area, what African Country dropped to third in 2011 when a portion of it declared independence?
+The ancient writer Galen said books on ships arriving to what city's port were seized, originals kept & copies returned?
+For a special 1970s cookbook, who provided one simple recipe–a can of Campbell's tomato soup & 2 cans of milk?
+Thought to descend from people of Southeast Asia, the Chamorro make up what U.S. territory’s largest ethnic group?
+In office from 2022, the president of what country has taken so many foreign trips a play on his name is 'Ferdinand Magellan Jr.'?
+In 1939 which writer lived on Toulouse Street in the French Quarter & chose the professional name that bonded him to the South?
+What National Park is named for a river indigenous people called Mi tse a-da-zi, translated by French-speaking trappers as 'Pierre Jaune'?
+In 2010 who introduced the 4-point shot, 35 feet from the basket?
+Losses over Asia in the 1960s led to the establishment of the program known as what at a San Diego naval base in 1969?
+A craft that visited what was named for Giotto, based on the story that 680 years earlier, the painter depicted it as the Star of Bethlehem?
+In World War I, 'Cistern' & 'reservoir' were suggested names for what secret invention, but the British preferred this less clumsy monosyllable?
+Until 1806, some German nobles included among their honors the title of 'Elector' for their role in selecting this personage?
+In 1904, wearing a harness, actress Nina Boucicault became the first to play what character onstage?
+Alphabetically the first German city in encyclopedias, what was also the first one taken by the Allies in World War II?
+This Sanskrit word referring to a spoken word or phrase comes from a word for 'to think'?
+1917's 'Elements of Trench Warfare' said what Old West invention was 'difficult to destroy' & 'difficult to get through'?
+Mimi Reinhard, who never learned to type using more than 2 fingers, produced what in World War II with 1,100 names, including hers?
+Poseidon carried off the maiden Theophane & turned her into a ewe; their offspring was the source of what mythical object?
+Published in 2011, P.D. James' final novel, 'Death Comes to Pemberley', was a sequel to what novel from 200 years earlier?
+5 U.S. states have 6-letter names; only which 2 west of the Mississippi River border each other?
+Originally relating to a story of suffering, what word now more commonly refers to strong emotion of any kind?
+The 2007 biopic called 'La Môme' in France, meaning 'The Kid', was released in the U.S. under what other French title?
+Returning home in 1493, Columbus stopped in the Azores at an island with what name, also something he'd lost off the Haiti coast?
+Pskov & Nizhny Novgorod are 2 of the cities that have a fortress called what?
+In the 1950s the New York Times said what author 'is writing about all lust' & his lecherous narrator 'is all of us'?
+At the winter solstice, the sun is in Sagittarius; it once appeared in what constellation, giving a geographic feature its name?
+Mike Post combined the sound of a slamming jail door, an anvil & 100 men stomping on a floor for what television series that debuted in 1990?
+Like Sir Thomas More, 3 16th century English queens are buried at what British location?
+In 1692 Increase Mather wrote, 'It were better that ten suspected' of these who 'escape, than that one innocent person be condemned'?
+The Geography Mnemonic Mimal, sometimes said to be the silhouette of a chef or elf, stands for Minnesota, Iowa, Missouri, and what other 2 states?
+What was first sold in 1908, at a price equivalent to about $27,000 today?
+The name of what author dead since 2013 now appears on books written by a former U.S. marshal & a former Apache helicopter pilot?
+The artwork once known in France as 'la tapisserie de la Reine Mathilde' is better known as what?
+In 2022 which pop star became the first woman to have a Billboard Top 10 album in 5 decades starting with the 1980s?
+In one 19th century translation, what female classic tale character 'perceived the dawn of day and ceased' speaking nearly 1,000 times?
+Ironically, though what company founded in the 1860s is Moore County, Tennessee's largest employer, Moore is a dry county?
+After a 1789 event, who wrote, 'My first determination was to seek a supply of…water at Tofoa, & afterwards to sail for Tongataboo'?
+Laurence Olivier & Ernest Borgnine were considered for the lead role & Sergio Leone to direct for what film that turned 50 in 2022?
+Until a 1903 secession, what country's contiguous territory spanned 2 continents?
+Early in her career which foreign-born author translated romance novels into Spanish, often changing the dialogue to make the heroines smarter?
+Saying it was stolen by Napoleon, self-styled Italian patriot Vincenzo Peruggia took what in 1911?
+Continuing a downward trend, in July 2022 what US body of water was at 27% capacity, its lowest level since 1937 when it was first being filled?
+Each morning which goddess began her ride in her chariot across the sky ahead of her brother Sol, or Helios?
+Until the Civil War, the Jan. 8 date of what American battle of dubious military importance but big morale value was a national holiday?
+Which children's book title character is told 'By the time you are real, most of your hair has been loved off your eyes drop out & you get shabby'?
+In a TV reunion over 40 years in the making, Dolly Parton appeared as an angel named Agnes in the final episode of what comedy in 2022?
+In an 1847 American poem what character sees her town of Grand-Pré burned, but finally reunites with her beau for a kiss before his death?
+In 2001 who published a book called 'Banging Your Head Against a Brick Wall'; in 2002, 'Existencilism'?
+The title object of what childrens book 'never looked more beautiful each strand held dozens of bright drops of early morning dew'?
+The shouts of excited children at a 1946 holiday parade are said to have inspired what perennial classic song favorite?
+Unable to make what candies perfectly round, the confectioner embraced this flawed name for the product?
+What country is home to 58 UNESCO World Heritage Sites, more than any other country; the sites include a volcano & a lagoon?
+What action movie's last line is 'If this is their idea of Christmas, I gotta be here for New Years'?
+Only 3 presidents have married while in office— John Tyler was the first & which one was the last?
+Demonstrating the dignity & humanity of Black Americans, who sat for 160 known photographs, the most of any American in the 19th century?
+Originally, which Latin 3-word phrase referred to when a doctor or apothecary substituted one medicine for another?
+The 1975 premiere of what movie comedy advertised free coconuts for the first thousand in the audience?
+A cocktail, an island & a WWII venture originally called 'Development of Substitute Materials' all bear what name?
+Which US President was sworn in twice as President within 2 years, first by his father & then later by a former U.S. President?
+A 1609 story in which an exiled king of Bulgaria creates a sea palace with his magic may have inspired the plot of what play?
+In 2009, during a 20th anniversary celebration, what landmark was called 'an edifice of fear. On Nov. 9, it became a place of joy'?
+Among what world capital's nicknames are the 'City of Classical Music' &, possibly in honor of a famous resident from 1860 to 1938, the 'City of Dreams'?
+Now meaning someone with nocturnal habits, what catches a sleeping dove in Shakespeare's 'Lucrece'?
+The stars on what country's flag represent states, 26 of them; unlike the USA's, its 'federal district' gets its own 27th star?
+What father was the only man among the 13 plaintiffs in a US class-action case filed in 1951?
+Reversing the story of what heroine she created, childrens author Patricia Maclachlan was born on the prairie but spent much of her life in New England?

From 55390bcaf2579a5435564d7267ae3ed367837fd6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 28 Apr 2023 20:37:43 +0300
Subject: [PATCH 47/74] ggml : sync ggml (ggml_alibi)

---
 ggml.c | 203 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 ggml.h |   9 +++
 2 files changed, 210 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index 44293dac9..53796bd97 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4034,7 +4034,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "MAP_BINARY",
 };
 
-static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
+static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -4082,7 +4082,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "f(x,y)",
 };
 
-static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
+static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39");
 
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -6080,6 +6080,37 @@ struct ggml_tensor * ggml_rope(
     return result;
 }
 
+// ggml_alibi: tag a view of `a` with GGML_OP_ALIBI; n_past and n_head reach the compute step through src1
+
+struct ggml_tensor * ggml_alibi(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_past,
+        int                   n_head) {
+    GGML_ASSERT(n_past >= 0);
+    GGML_ASSERT(n_head >  0); // the forward pass evaluates log2(n_head), which needs a positive head count
+    bool is_node = false;
+    if (a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    // TODO: when implement backward, fix this:
+    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
+    // pack the two integer parameters into a 2-element I32 tensor: [0] = n_past, [1] = n_head
+    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
+    ((int32_t *) b->data)[0] = n_past;
+    ((int32_t *) b->data)[1] = n_head;
+
+    result->op   = GGML_OP_ALIBI;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = b;
+
+    return result;
+}
+
 // ggml_conv_1d_1s
 
 struct ggml_tensor * ggml_conv_1d_1s(
@@ -9300,6 +9331,162 @@ static void ggml_compute_forward_soft_max(
     }
 }
 
+// ggml_compute_forward_alibi: adds a per-head linear position bias (ALiBi) to src0 and stores it in dst; src1 holds {n_past, n_head} as I32
+
+static void ggml_compute_forward_alibi_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0); // written for a single worker; there is no row partitioning below
+    assert(src1->type == GGML_TYPE_I32);
+    assert(ggml_nelements(src1) == 2);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n_past = ((int32_t *) src1->data)[0];
+    const int n_head = ((int32_t *) src1->data)[1];
+
+    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+    const int ne1 = src0->ne[1]; // seq_len_without_past
+    //const int ne2 = src0->ne[2]; // n_head -> this is k
+    //const int ne3 = src0->ne[3]; // 1 -> bsz
+
+    const int n  = ggml_nrows(src0);
+    const int ne2_ne3 = n/ne1; // ne2*ne3
+
+    const int nb0 = src0->nb[0];
+    const int nb1 = src0->nb[1];
+    const int nb2 = src0->nb[2];
+    //const int nb3 = src0->nb[3];
+
+    assert(nb0 == sizeof(float));
+    assert(ne1+n_past == ne0);
+
+    // add alibi to src0 (KQ_scaled)
+    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); // largest power of two <= n_head
+
+    const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor); // slope base for heads below n_heads_log2_floor
+    const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor); // slope base for the remaining heads
+
+    for (int i = 0; i < ne0; i++) {
+        for (int j = 0; j < ne1; j++) {
+            for (int k = 0; k < ne2_ne3; k++) {
+                float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
+                float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
+
+                // TODO: k*nb2 or k*nb3
+
+                float m_k;
+
+                if (k < n_heads_log2_floor) {
+                    m_k = powf(m0, k + 1);
+                } else {
+                    m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+                }
+                // bias = slope * (key pos i - query pos (n_past + j)); it has to vary along ne0, a per-row constant would cancel in a row-wise softmax
+                pdst[0] = (i - n_past - j) * m_k + src[0];
+            }
+        }
+    }
+}
+
+// F16 variant: same bias as the F32 kernel, computed in F32 and stored back as F16 because dst shares the layout (nb0..nb2) of src0
+static void ggml_compute_forward_alibi_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    assert(params->ith == 0); // written for a single worker; there is no row partitioning below
+    assert(src1->type == GGML_TYPE_I32);
+    assert(ggml_nelements(src1) == 2);
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    const int n_past = ((int32_t *) src1->data)[0];
+    const int n_head = ((int32_t *) src1->data)[1];
+
+    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+    const int ne1 = src0->ne[1]; // seq_len_without_past
+    //const int ne2 = src0->ne[2]; // n_head -> this is k
+    //const int ne3 = src0->ne[3]; // 1 -> bsz
+
+    const int n  = ggml_nrows(src0);
+    const int ne2_ne3 = n/ne1; // ne2*ne3
+
+    const int nb0 = src0->nb[0];
+    const int nb1 = src0->nb[1];
+    const int nb2 = src0->nb[2];
+    //const int nb3 = src0->nb[3];
+
+    assert(nb0 == sizeof(ggml_fp16_t));
+    assert(ne1+n_past == ne0);
+
+    // add alibi to src0 (KQ_scaled)
+    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); // largest power of two <= n_head
+
+    const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor); // slope base for heads below n_heads_log2_floor
+    const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor); // slope base for the remaining heads
+
+    for (int i = 0; i < ne0; i++) {
+        for (int j = 0; j < ne1; j++) {
+            for (int k = 0; k < ne2_ne3; k++) {
+                ggml_fp16_t * const src  = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
+                ggml_fp16_t *      pdst  = (ggml_fp16_t *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
+
+                // TODO: k*nb2 or k*nb3
+
+                float m_k;
+
+                if (k < n_heads_log2_floor) {
+                    m_k = powf(m0, k + 1);
+                } else {
+                    m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
+                }
+
+                // elements are nb0 == 2 bytes apart, so a 4-byte float store would clobber the next element; bias = slope * (key pos i - query pos (n_past + j))
+                pdst[0] = GGML_FP32_TO_FP16((i - n_past - j) * m_k + GGML_FP16_TO_FP32(src[0]));
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_alibi(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        const struct ggml_tensor * src1,
+        struct ggml_tensor * dst) {
+    switch (src0->type) { // choose the kernel by the element type of src0; only F16 and F32 are implemented
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_alibi_f16(params, src0, src1, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_alibi_f32(params, src0, src1, dst);
+            } break;
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q4_2:
+        case GGML_TYPE_Q4_3:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+        case GGML_TYPE_Q8_1:
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_COUNT:
+            {
+                GGML_ASSERT(false); // every other type listed above is rejected: no alibi kernel exists for it
+            } break;
+    }
+}
+
 // ggml_compute_forward_rope
 
 static void ggml_compute_forward_rope_f32(
@@ -10938,6 +11125,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor);
             } break;
+        case GGML_OP_ALIBI:
+            {
+                ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
+            } break;
         case GGML_OP_CONV_1D_1S:
             {
                 ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
@@ -11140,6 +11331,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_ALIBI:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_SILU:
             {
                 GGML_ASSERT(false); // TODO: not implemented
@@ -11673,6 +11868,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     {
                         node->n_tasks = n_threads;
                     } break;
+                case GGML_OP_ALIBI:
+                    {
+                        node->n_tasks = 1; //TODO
+                    } break;
                 case GGML_OP_CONV_1D_1S:
                 case GGML_OP_CONV_1D_2S:
                     {
diff --git a/ggml.h b/ggml.h
index 1bbe2db93..540901f15 100644
--- a/ggml.h
+++ b/ggml.h
@@ -269,6 +269,7 @@ extern "C" {
         GGML_OP_DIAG_MASK_INF,
         GGML_OP_SOFT_MAX,
         GGML_OP_ROPE,
+        GGML_OP_ALIBI,
         GGML_OP_CONV_1D_1S,
         GGML_OP_CONV_1D_2S,
 
@@ -662,6 +663,14 @@ extern "C" {
             int                   n_dims,
             int                   mode);
 
+    // alibi position embedding
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_alibi(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_head);
+
     // padding = 1
     // TODO: we don't support extra parameters for now
     //       that's why we are hard-coding the stride, padding, and dilation

From 7f15c5c477d9933689a9d1c40794483e350c2f19 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 28 Apr 2023 21:32:52 +0300
Subject: [PATCH 48/74] readme : update hot topics

---
 README.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 2a20746c6..731f491ca 100644
--- a/README.md
+++ b/README.md
@@ -9,10 +9,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 **Hot topics:**
 
+- [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)
 - [New quantization methods](https://github.com/ggerganov/llama.cpp#quantization)
-- [Added LoRA support](https://github.com/ggerganov/llama.cpp/pull/820)
-- [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915)
-- [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)
 
 ## Description
 

From 36d19a603b221d1bd7897fcb10e823e2103b052d Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Fri, 28 Apr 2023 23:10:43 +0000
Subject: [PATCH 49/74] Remove Q4_3 which is no better than Q5 (#1218)

---
 README.md                      |  37 +++--
 SHA256SUMS                     |   4 -
 examples/quantize/quantize.cpp |   1 -
 ggml-cuda.cu                   |  37 -----
 ggml-cuda.h                    |   1 -
 ggml-opencl-dequant.cl         |  21 ---
 ggml-opencl.c                  |  10 +-
 ggml.c                         | 260 ---------------------------------
 ggml.h                         |   3 +-
 llama.cpp                      |   4 -
 llama.h                        |   2 +-
 11 files changed, 21 insertions(+), 359 deletions(-)

diff --git a/README.md b/README.md
index 731f491ca..f55c576ab 100644
--- a/README.md
+++ b/README.md
@@ -281,30 +281,29 @@ When running the larger models, make sure you have enough disk space to store al
 
 As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
 
-| model | original size | quantized size (4-bit) |
-|-------|---------------|------------------------|
-| 7B    | 13 GB         | 3.9 GB                 |
-| 13B   | 24 GB         | 7.8 GB                 |
-| 30B   | 60 GB         | 19.5 GB                |
-| 65B   | 120 GB        | 38.5 GB                |
+| Model | Original size | Quantized size (4-bit) |
+|------:|--------------:|-----------------------:|
+|    7B |         13 GB |                 3.9 GB |
+|   13B |         24 GB |                 7.8 GB |
+|   30B |         60 GB |                19.5 GB |
+|   65B |        120 GB |                38.5 GB |
 
 ### Quantization
 
 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
 
-Model | F16 | Q4_0 | Q4_1 | Q4_2 | Q4_3 | Q5_0 | Q5_1 | Q8_0
--- | -- | -- | -- | -- | -- | -- | -- | --
-7B (ppl) | 5.9565 | 6.2103 | 6.1286 | 6.1698 | 6.0617 | 6.0139 | 5.9934 | 5.9571
-7B (size) | 13.0G | 4.0G | 4.8G | 4.0G | 4.8G | 4.4G | 4.8G | 7.1G
-7B (ms/tok @ 4th) | 128 | 56 | 61 | 84 | 91 | 91 | 95 | 75
-7B (ms/tok @ 8th) | 128 | 47 | 55 | 48 | 53 | 53 | 59 | 75
-7B (bpw) | 16.0 | 5.0 | 6.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0
--- | -- | -- | -- | -- | -- | -- | -- | --
-13B (ppl) | 5.2455 | 5.3748 | 5.3471 | 5.3433 | 5.3234 | 5.2768 | 5.2582 | 5.2458
-13B (size) | 25.0G | 7.6G | 9.1G | 7.6G | 9.1G | 8.4G | 9.1G | 14G
-13B (ms/tok @ 4th) | 239 | 104 | 113 | 160 | 175 | 176 | 185 | 141
-13B (ms/tok @ 8th) | 240 | 85 | 99 | 97 | 114 | 108 | 117 | 147
-13B (bpw) | 16.0 | 5.0 | 6.0 | 5.0 | 6.0 | 5.5 | 6.0 | 9.0
+| Model | Measure      | F16    | Q4_0   | Q4_1   | Q4_2   | Q5_0   | Q5_1   | Q8_0   |
+|------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|
+|    7B | perplexity   | 5.9565 | 6.2103 | 6.1286 | 6.1698 | 6.0139 | 5.9934 | 5.9571 |
+|    7B | file size    |  13.0G |   4.0G |   4.8G |   4.0G |   4.4G |   4.8G |   7.1G |
+|    7B | ms/tok @ 4th |    128 |     56 |     61 |     84 |     91 |     95 |     75 |
+|    7B | ms/tok @ 8th |    128 |     47 |     55 |     48 |     53 |     59 |     75 |
+|    7B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.0 |    5.5 |    6.0 |    9.0 |
+|   13B | perplexity   | 5.2455 | 5.3748 | 5.3471 | 5.3433 | 5.2768 | 5.2582 | 5.2458 |
+|   13B | file size    |  25.0G |   7.6G |   9.1G |   7.6G |   8.4G |   9.1G |    14G |
+|   13B | ms/tok @ 4th |    239 |    104 |    113 |    160 |    176 |    185 |    141 |
+|   13B | ms/tok @ 8th |    240 |     85 |     99 |     97 |    108 |    117 |    147 |
+|   13B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.0 |    5.5 |    6.0 |    9.0 |
 
 ### Interactive mode
 
diff --git a/SHA256SUMS b/SHA256SUMS
index 87faa7f1b..e487bdca6 100644
--- a/SHA256SUMS
+++ b/SHA256SUMS
@@ -3,7 +3,6 @@
 99aeb35f26b577fa2732716cca4d8b5ada39a78ea9b2dca2651fc632b5d101b6  models/7B/ggml-model-q4_0.bin
 cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe  models/7B/ggml-model-q4_1.bin
 25b050337a87344da687a7f2adddc03bd99b7f6c140450e836649f3585fb6496  models/7B/ggml-model-q4_2.bin
-3429bf198ec771886cf81a574df45245f3ebf04f0ce0956b73ef5d0ab01ff48b  models/7B/ggml-model-q4_3.bin
 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265  models/7B/params.json
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08  models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/consolidated.01.pth
@@ -11,7 +10,6 @@ d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/con
 eecb575d325d935157761172e2bf05984dad216eb2b06777b73463cf9b818bab  models/13B/ggml-model-q4_0.bin
 d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb  models/13B/ggml-model-q4_1.bin
 75a218a47df03f5f96354656329864613abcb67779412b9bc2282b28c1c3cbaa  models/13B/ggml-model-q4_2.bin
-4208cdec9788ffa48dc1a17af2c36a0299f5bf3eb0e2b87889dda7fad591fca3  models/13B/ggml-model-q4_3.bin
 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f  models/13B/params.json
 e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/consolidated.00.pth
 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff  models/30B/consolidated.01.pth
@@ -21,7 +19,6 @@ e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/con
 517b9e525742c42b5478a6280a4b41ec66f46298c57aba7f0453d491682fe42d  models/30B/ggml-model-q4_0.bin
 7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd  models/30B/ggml-model-q4_1.bin
 aadbc9cf806313a55be570f62884eed289d30c313fac3b7838717e01bd553204  models/30B/ggml-model-q4_2.bin
-a6188660199dbcb8d5658abe7d89169869e50423494385830d9e6b330ea7fc33  models/30B/ggml-model-q4_3.bin
 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb  models/30B/params.json
 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe  models/65B/consolidated.00.pth
 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde  models/65B/consolidated.01.pth
@@ -35,6 +32,5 @@ d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638  models/65B/con
 01672072136f8be6ca9d7cebe5f86ed316e8b85851b9fe3de951809233cea4f2  models/65B/ggml-model-q4_0.bin
 4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f  models/65B/ggml-model-q4_1.bin
 1b6f6588d0e2ecfe6c4d849088e48e5e3083466b962daa32e3261363e21fc5e9  models/65B/ggml-model-q4_2.bin
-305e91a4608b4f627b9b8ad5b4af75187d2684254bfd76dcb9db571618ef293c  models/65B/ggml-model-q4_3.bin
 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b  models/65B/params.json
 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347  models/tokenizer.model
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 60966595e..dd175c690 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -9,7 +9,6 @@ static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
   {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
   {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
   {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
-  {"q4_3", LLAMA_FTYPE_MOSTLY_Q4_3},
   {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
   {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
   {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index b1bd29b10..d619f5da4 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -29,14 +29,6 @@ typedef struct {
 } block_q4_2;
 static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
 
-#define QK4_3 16
-typedef struct {
-    __half  d;              // delta
-    __half  m;              // min
-    uint8_t qs[QK4_3 / 2];  // nibbles / quants
-} block_q4_3;
-static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
-
 #define QK5_0 32
 typedef struct {
     __half d;               // delta
@@ -131,30 +123,6 @@ static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
     }
 }
 
-static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
-    const block_q4_3 * x = (const block_q4_3 *) vx;
-
-    const int i = blockIdx.x;
-
-    const float d = x[i].d;
-    const float m = x[i].m;
-
-    const uint8_t * pp = x[i].qs;
-
-    for (int l = 0; l < QK4_3; l += 2) {
-        const uint8_t vi = pp[l/2];
-
-        const int8_t vi0 = vi & 0xf;
-        const int8_t vi1 = vi >> 4;
-
-        const float v0 = vi0*d + m;
-        const float v1 = vi1*d + m;
-
-        y[i*QK4_3 + l + 0] = v0;
-        y[i*QK4_3 + l + 1] = v1;
-    }
-}
-
 static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
     const block_q5_0 * x = (const block_q5_0 *) vx;
 
@@ -244,11 +212,6 @@ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t st
     dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
-    const int nb = k / QK4_3;
-    dequantize_block_q4_3<<<nb, 1, 0, stream>>>(vx, y);
-}
-
 void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK5_0;
     dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
diff --git a/ggml-cuda.h b/ggml-cuda.h
index ed9b44184..b105ed0c2 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -34,7 +34,6 @@ void   ggml_cuda_pool_free(void * ptr, size_t size);
 void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
-void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
diff --git a/ggml-opencl-dequant.cl b/ggml-opencl-dequant.cl
index 191b2e575..a65a79f4d 100644
--- a/ggml-opencl-dequant.cl
+++ b/ggml-opencl-dequant.cl
@@ -60,25 +60,4 @@ __kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global f
     result[index + 1] = ((vi >> 4) - 8)*d;
 }
 
-struct block_q4_3
-{
-    ushort d;
-    ushort m;
-    uchar qs[8];
-};
-
-__kernel void dequantize_row_q4_3(__global struct block_q4_3* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 16;
-    const uint l = get_local_id(0);
-
-    const float d = vload_half(0, (__global half*) &(blocks[i].d));
-    const float m = vload_half(0, (__global half*) &(blocks[i].m));
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*16 + l*2;
-    result[index + 0] = (vi & 0xf) * d + m;
-    result[index + 1] = (vi >> 4) * d + m;
-}
-
 );
diff --git a/ggml-opencl.c b/ggml-opencl.c
index 1d68f19ee..b748f86b7 100644
--- a/ggml-opencl.c
+++ b/ggml-opencl.c
@@ -24,7 +24,7 @@ static cl_device_id device;
 static cl_context context;
 static cl_command_queue queue;
 static cl_program program;
-static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3;
+static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2;
 static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
 static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
 
@@ -97,8 +97,6 @@ void ggml_cl_init(void) {
     CL_CHECK(err, "clCreateKernel");
     kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
     CL_CHECK(err, "clCreateKernel");
-    kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err);
-    CL_CHECK(err, "clCreateKernel");
 }
 
 static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
@@ -150,12 +148,6 @@ void ggml_cl_sgemm_wrapper(
         local = 8;
         size_qb = global * (sizeof(short) + local) / 16;
         break;
-    case GGML_TYPE_Q4_3:
-        dequant = true;
-        kernel = kernel_q4_3;
-        local = 8;
-        size_qb = global * (sizeof(short) * 2 + local) / 16;
-        break;
     default:
         fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
         abort();
diff --git a/ggml.c b/ggml.c
index 53796bd97..0c6eb7482 100644
--- a/ggml.c
+++ b/ggml.c
@@ -694,14 +694,6 @@ typedef struct {
 } block_q4_2;
 static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
 
-#define QK4_3 16
-typedef struct {
-    ggml_fp16_t d;         // delta
-    ggml_fp16_t m;         // min
-    uint8_t qs[QK4_3 / 2]; // nibbles / quants
-} block_q4_3;
-static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
-
 #define QK5_0 32
 typedef struct {
     ggml_fp16_t d;         // delta
@@ -1291,49 +1283,6 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int
     quantize_row_q4_2_reference(x, y, k);
 }
 
-static void quantize_row_q4_3_reference(const float * restrict x, block_q4_3 * restrict y, int k) {
-    assert(k % QK4_3 == 0);
-    const int nb = k / QK4_3;
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int l = 0; l < QK4_3; l++) {
-            const float v = x[i*QK4_3 + l];
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
-
-        for (int l = 0; l < QK4_3; l += 2) {
-            const float v0 = (x[i*QK4_3 + l + 0] - min)*id;
-            const float v1 = (x[i*QK4_3 + l + 1] - min)*id;
-
-            const uint8_t vi0 = (int) (v0 + 0.5f);
-            const uint8_t vi1 = (int) (v1 + 0.5f);
-
-            assert(vi0 < 16);
-            assert(vi1 < 16);
-
-            y[i].qs[l/2] = vi0 | (vi1 << 4);
-        }
-    }
-}
-
-static void quantize_row_q4_3(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK4_3 == 0);
-
-    block_q4_3 * restrict y = vy;
-
-    quantize_row_q4_3_reference(x, y, k);
-}
-
 static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
     assert(k % QK5_0 == 0);
     const int nb = k / QK5_0;
@@ -1917,36 +1866,6 @@ static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, in
     }
 }
 
-static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, int k) {
-    assert(k % QK4_3 == 0);
-    const int nb = k / QK4_3;
-
-    const block_q4_3 * restrict x = vx;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < QK4_3; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            const int8_t vi0 = vi & 0x0F;
-            const int8_t vi1 = vi >> 4;
-
-            const float v0 = vi0*d + m;
-            const float v1 = vi1*d + m;
-
-            y[i*QK4_3 + l + 0] = v0;
-            y[i*QK4_3 + l + 1] = v1;
-
-            assert(!isnan(y[i*QK4_3 + l + 0]));
-            assert(!isnan(y[i*QK4_3 + l + 1]));
-        }
-    }
-}
-
 static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, int k) {
     assert(k % QK5_0 == 0);
     const int nb = k / QK5_0;
@@ -2040,7 +1959,6 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
 static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -2070,14 +1988,6 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .vec_dot_q                = ggml_vec_dot_q4_2_q8_0,
         .vec_dot_type             = GGML_TYPE_Q8_0,
     },
-    [GGML_TYPE_Q4_3] = {
-        .dequantize_row_q         = dequantize_row_q4_3,
-        .quantize_row_q           = quantize_row_q4_3,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference,
-        .quantize_row_q_dot       = quantize_row_q8_1,
-        .vec_dot_q                = ggml_vec_dot_q4_3_q8_1,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-    },
     [GGML_TYPE_Q5_0] = {
         .dequantize_row_q         = dequantize_row_q5_0,
         .quantize_row_q           = quantize_row_q5_0,
@@ -3171,136 +3081,6 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
 #endif
 }
 
-static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK8_1;
-
-    assert(n % QK8_1 == 0);
-    assert(nb % 2 == 0);
-    assert(QK8_1 == 2*QK4_3);
-
-    const block_q4_3 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    float summs0 = 0.0f;
-    float summs1 = 0.0f;
-
-    for (int i = 0; i < nb; ++i) {
-        const block_q4_3 * restrict x0_0 = &x[2*(i + 0) + 0];
-        const block_q4_3 * restrict x0_1 = &x[2*(i + 0) + 1];
-
-        const block_q8_1 * restrict y0 = &y[i + 0];
-
-        summs0 += GGML_FP16_TO_FP32(x0_0->m) * y0->s0;
-        summs1 += GGML_FP16_TO_FP32(x0_1->m) * y0->s1;
-
-        const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs));
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, vdupq_n_u8(0x0F)));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-
-        // interleave
-        const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h);
-        const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-
-        const float x0_0d = GGML_FP16_TO_FP32(x0_0->d);
-        const float x0_1d = GGML_FP16_TO_FP32(x0_1->d);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), x0_0d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), x0_1d*y0->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(pl0), x0_0d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(ph0), x0_1d*y0->d);
-#endif
-    }
-
-    *s = vaddvq_f32(vaddq_f32(sumv0, sumv1)) + summs0 + summs1;
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-    float summs = 0.0f;
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d));
-        const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d));
-        const __m256 dx = _mm256_set_m128(d1, d0);
-
-        summs += GGML_FP16_TO_FP32(x[2*i + 0].m) * y[i].s0
-               + GGML_FP16_TO_FP32(x[2*i + 1].m) * y[i].s1;
-
-        const __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
-        const __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
-        const __m256i bx = _mm256_set_m128i(bx1, bx0);
-
-        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#else
-    // scalar
-    float sumf = 0.0;
-    for (int i = 0; i < nb; i++) {
-        const uint8_t * restrict x0 = x[2*i + 0].qs;
-        const uint8_t * restrict x1 = x[2*i + 1].qs;
-        const  int8_t * restrict y0 = y[i].qs;
-
-        const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d);
-        const float m0 = GGML_FP16_TO_FP32(x[2*i + 0].m);
-        const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d);
-        const float m1 = GGML_FP16_TO_FP32(x[2*i + 1].m);
-
-        int sxy_0 = 0;
-        int sxy_1 = 0;
-
-        for (int j = 0; j < QK8_1/4; j++) {
-            const uint8_t v0 = x0[j];
-            const uint8_t v1 = x1[j];
-
-            const int x0_0 = v0 & 0x0F;
-            const int x1_0 = v0 >> 4;
-
-            const int x0_1 = v1 & 0x0F;
-            const int x1_1 = v1 >> 4;
-
-            const int y0_0 = y0[2*j + 0];
-            const int y1_0 = y0[2*j + 1];
-
-            const int y0_1 = y0[2*(j + QK8_1/4) + 0];
-            const int y1_1 = y0[2*(j + QK8_1/4) + 1];
-
-            sxy_0 += x0_0*y0_0 + x1_0*y1_0;
-            sxy_1 += x0_1*y0_1 + x1_1*y1_1;
-        }
-
-        sumf += (d0*sxy_0 + d1*sxy_1)*y[i].d + m0*y[i].s0 + m1*y[i].s1;
-    }
-    *s = sumf;
-#endif
-}
-
 static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
     const int nb = n / QK8_0;
 
@@ -3925,7 +3705,6 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = QK4_0,
     [GGML_TYPE_Q4_1] = QK4_1,
     [GGML_TYPE_Q4_2] = QK4_2,
-    [GGML_TYPE_Q4_3] = QK4_3,
     [GGML_TYPE_Q5_0] = QK5_0,
     [GGML_TYPE_Q5_1] = QK5_1,
     [GGML_TYPE_Q8_0] = QK8_0,
@@ -3942,7 +3721,6 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
     [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
     [GGML_TYPE_Q4_2] = sizeof(block_q4_2),
-    [GGML_TYPE_Q4_3] = sizeof(block_q4_3),
     [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
     [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
     [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
@@ -3960,7 +3738,6 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = "q4_0",
     [GGML_TYPE_Q4_1] = "q4_1",
     [GGML_TYPE_Q4_2] = "q4_2",
-    [GGML_TYPE_Q4_3] = "q4_3",
     [GGML_TYPE_Q5_0] = "q5_0",
     [GGML_TYPE_Q5_1] = "q5_1",
     [GGML_TYPE_Q8_0] = "q8_0",
@@ -3977,7 +3754,6 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = true,
     [GGML_TYPE_Q4_1] = true,
     [GGML_TYPE_Q4_2] = true,
-    [GGML_TYPE_Q4_3] = true,
     [GGML_TYPE_Q5_0] = true,
     [GGML_TYPE_Q5_1] = true,
     [GGML_TYPE_Q8_0] = true,
@@ -7230,7 +7006,6 @@ static void ggml_compute_forward_add(
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
-        case GGML_TYPE_Q4_3:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
@@ -8739,9 +8514,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
         else if (type == GGML_TYPE_Q4_2) {
             dequantize_row_q_cuda = dequantize_row_q4_2_cuda;
         }
-        else if (type == GGML_TYPE_Q4_3) {
-            dequantize_row_q_cuda = dequantize_row_q4_3_cuda;
-        }
         else if (type == GGML_TYPE_Q5_0) {
             dequantize_row_q_cuda = dequantize_row_q5_0_cuda;
         }
@@ -8914,7 +8686,6 @@ static void ggml_compute_forward_mul_mat(
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
-        case GGML_TYPE_Q4_3:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
@@ -9146,7 +8917,6 @@ static void ggml_compute_forward_get_rows(
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
-        case GGML_TYPE_Q4_3:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
@@ -9472,7 +9242,6 @@ static void ggml_compute_forward_alibi(
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
         case GGML_TYPE_Q4_2:
-        case GGML_TYPE_Q4_3:
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
@@ -13088,29 +12857,6 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK4_2*sizeof(block_q4_2));
 }
 
-size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK4_3 == 0);
-    const int nb = k / QK4_3;
-
-    for (int j = 0; j < n; j += k) {
-        block_q4_3 * restrict y = (block_q4_3 *)dst + j/QK4_3;
-
-        quantize_row_q4_3_reference(src + j, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < QK4_3; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[l/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK4_3*sizeof(block_q4_3));
-}
-
 size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK5_0 == 0);
     const int nb = k / QK5_0;
@@ -13213,12 +12959,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
                 block_q4_2 * block = (block_q4_2*)dst + start / QK4_2;
                 result = ggml_quantize_q4_2(src + start, block, n, n, hist);
             } break;
-        case GGML_TYPE_Q4_3:
-            {
-                GGML_ASSERT(start % QK4_3 == 0);
-                block_q4_3 * block = (block_q4_3*)dst + start / QK4_3;
-                result = ggml_quantize_q4_3(src + start, block, n, n, hist);
-            } break;
         case GGML_TYPE_Q5_0:
             {
                 GGML_ASSERT(start % QK5_0 == 0);
diff --git a/ggml.h b/ggml.h
index 540901f15..38ae9a6ee 100644
--- a/ggml.h
+++ b/ggml.h
@@ -221,7 +221,7 @@ extern "C" {
         GGML_TYPE_Q4_0 = 2,
         GGML_TYPE_Q4_1 = 3,
         GGML_TYPE_Q4_2 = 4,
-        GGML_TYPE_Q4_3 = 5,
+        // GGML_TYPE_Q4_3 (5) support has been removed
         GGML_TYPE_Q5_0 = 6,
         GGML_TYPE_Q5_1 = 7,
         GGML_TYPE_Q8_0 = 8,
@@ -843,7 +843,6 @@ extern "C" {
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
diff --git a/llama.cpp b/llama.cpp
index dca017db6..45f0d44ac 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -483,7 +483,6 @@ struct llama_file_loader {
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
                 case GGML_TYPE_Q4_2:
-                case GGML_TYPE_Q4_3:
                 case GGML_TYPE_Q5_0:
                 case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q8_0:
@@ -560,7 +559,6 @@ struct llama_file_saver {
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
             case GGML_TYPE_Q4_2:
-            case GGML_TYPE_Q4_3:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
@@ -853,7 +851,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
         case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
-        case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -1593,7 +1590,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
diff --git a/llama.h b/llama.h
index 86a7d279a..936c52139 100644
--- a/llama.h
+++ b/llama.h
@@ -73,7 +73,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
         LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_3 = 6,  // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors

From b1ee8f59b4101b46999a0995d9a34506f7285466 Mon Sep 17 00:00:00 2001
From: Henri Vasserman <henv@hot.ee>
Date: Sat, 29 Apr 2023 02:31:56 +0300
Subject: [PATCH 50/74] cuBLAS: non-contiguous tensor support (#1215)

* Cuda: non-contiguous tensor support

* remove extra stuff

* rename

* fix error

* more fixes, now OpenBLAS and CLBlast build too

* now then?
---
 ggml-cuda.cu | 28 ++++++++++++++++++++++++++++
 ggml-cuda.h  |  3 +++
 ggml.c       | 24 +++++++++++++-----------
 3 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index d619f5da4..eb244f409 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -302,3 +302,31 @@ void ggml_init_cublas(void) {
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
     }
 }
+
+cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
+    const uint64_t ne0 = src->ne[0];
+    const uint64_t ne1 = src->ne[1];
+    const uint64_t nb0 = src->nb[0];
+    const uint64_t nb1 = src->nb[1];
+    const uint64_t nb2 = src->nb[2];
+    const uint64_t nb3 = src->nb[3];
+    const enum ggml_type type = src->type;
+    const size_t ts = ggml_type_size(type);
+    const size_t bs = ggml_blck_size(type);
+
+    const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
+    if (nb0 == ts && nb1 == ts*ne0/bs) {
+        return cudaMemcpyAsync(dst, x, ne1*nb1, cudaMemcpyHostToDevice, stream);
+    } else if (nb0 == ts) {
+        return cudaMemcpy2DAsync(dst, ts*ne0/bs, x, nb1, ts*ne0/bs, ne1, cudaMemcpyHostToDevice, stream);
+    } else {
+        for (uint64_t i1 = 0; i1 < ne1; i1++) {
+            const void * rx = (const void *) ((const char *) x + i1*nb1);
+            void * rd = (void *) ((char *) dst + i1*ts*ne0/bs);
+            // pretend the row is a matrix with cols=1
+            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream);
+            if (r != cudaSuccess) return r;
+        }
+        return cudaSuccess;
+    }
+}
diff --git a/ggml-cuda.h b/ggml-cuda.h
index b105ed0c2..1fd67ebeb 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -1,5 +1,6 @@
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
+#include "ggml.h"
 
 #ifdef  __cplusplus
 extern "C" {
@@ -38,6 +39,8 @@ void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t st
 void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
 
+cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream);
+
 #ifdef  __cplusplus
 }
 #endif
diff --git a/ggml.c b/ggml.c
index 0c6eb7482..4ec637ee1 100644
--- a/ggml.c
+++ b/ggml.c
@@ -7930,8 +7930,12 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     const int64_t ne1 = dst->ne[1];
 
     // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
+    if (
+#if !defined(GGML_USE_CUBLAS)
+        ggml_is_contiguous(src0) &&
+        ggml_is_contiguous(src1) &&
+#endif
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
 
         /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
         return true;
@@ -8041,15 +8045,16 @@ static void ggml_compute_forward_mul_mat_f32(
 
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
+#if !defined(GGML_USE_CUBLAS)
                 const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
                 const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
+#endif
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
 #if defined(GGML_USE_CUBLAS)
                 // copy data to device
-                CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
-                CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
 
                 // compute
                 CUBLAS_CHECK(
@@ -8269,13 +8274,12 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 #endif
 
 #if defined(GGML_USE_CUBLAS)
-                const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03);
                 const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
 
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
                 // copy data to device
-                CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
                 CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
 
                 // compute
@@ -8539,9 +8543,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
 #if defined(GGML_USE_CUBLAS)
                 // copy and dequantize on device
-                CUDA_CHECK(
-                    cudaMemcpyAsync(d_Q, (char *) src0->data + i03*nb03 + i02*nb02,
-                        GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], cudaMemcpyHostToDevice, g_cudaStream));
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream));
 
                 dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
                 CUDA_CHECK(cudaGetLastError());
@@ -8561,7 +8563,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
 #if defined(GGML_USE_CUBLAS)
                 // copy data to device
-                CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
 
                 // compute
                 CUBLAS_CHECK(

From 7fc50c051ae8a78e9643fdf172d12e20f2dd9b6c Mon Sep 17 00:00:00 2001
From: slaren <2141330+slaren@users.noreply.github.com>
Date: Sat, 29 Apr 2023 02:04:18 +0200
Subject: [PATCH 51/74] cuBLAS: use host pinned memory and dequantize while
 copying (#1207)

* cuBLAS: dequantize simultaneously while copying memory

* cuBLAS: use host pinned memory

* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory

* cuBLAS: also pin kv cache

* fix rebase
---
 Makefile     |  5 ++--
 ggml-cuda.cu | 45 ++++++++++++++++++++++++++++-----
 ggml-cuda.h  | 10 +++++++-
 ggml.c       | 70 ++++++++++++++++++++++------------------------------
 llama.cpp    |  8 +++---
 llama_util.h | 26 +++++++++++++++++++
 6 files changed, 110 insertions(+), 54 deletions(-)

diff --git a/Makefile b/Makefile
index 0715e857b..5a1cb3e83 100644
--- a/Makefile
+++ b/Makefile
@@ -106,6 +106,7 @@ ifdef LLAMA_OPENBLAS
 endif
 ifdef LLAMA_CUBLAS
 	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
 	NVCC      = nvcc
@@ -164,10 +165,10 @@ $(info )
 # Build library
 #
 
-ggml.o: ggml.c ggml.h
+ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
 
-llama.o: llama.cpp ggml.h llama.h llama_util.h
+llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama_util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 common.o: examples/common.cpp examples/common.h
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index eb244f409..5a2701cfe 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -227,6 +227,25 @@ void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t st
     dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
 }
 
+dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_row_q4_0_cuda;
+        case GGML_TYPE_Q4_1:
+            return dequantize_row_q4_1_cuda;
+        case GGML_TYPE_Q4_2:
+            return dequantize_row_q4_2_cuda;
+        case GGML_TYPE_Q5_0:
+            return dequantize_row_q5_0_cuda;
+        case GGML_TYPE_Q5_1:
+            return dequantize_row_q5_1_cuda;
+        case GGML_TYPE_Q8_0:
+            return dequantize_row_q8_0_cuda;
+        default:
+            return nullptr;
+    }
+}
+
 // buffer pool for cuda
 #define MAX_CUDA_BUFFERS 16
 
@@ -286,18 +305,22 @@ void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
 }
 
-cublasHandle_t g_cublasH = NULL;
-cudaStream_t g_cudaStream = NULL;
+cublasHandle_t g_cublasH = nullptr;
+cudaStream_t g_cudaStream = nullptr;
+cudaStream_t g_cudaStream2 = nullptr;
+cudaEvent_t g_cudaEvent = nullptr;
 
-void ggml_init_cublas(void) {
-    if (g_cublasH == NULL) {
+void ggml_init_cublas() {
+    if (g_cublasH == nullptr) {
         // create cublas handle, bind a stream
         CUBLAS_CHECK(cublasCreate(&g_cublasH));
-
         CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
-
         CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
 
+        // create additional stream and event for synchronization
+        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking));
+        CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming));
+
         // configure logging to stdout
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
     }
@@ -330,3 +353,13 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
         return cudaSuccess;
     }
 }
+
+void * ggml_cuda_host_malloc(size_t size) {
+    void * ptr;
+    CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
+    return ptr;
+}
+
+void ggml_cuda_host_free(void * ptr) {
+    CUDA_CHECK(cudaFreeHost(ptr));
+}
diff --git a/ggml-cuda.h b/ggml-cuda.h
index 1fd67ebeb..36782d9e7 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -26,9 +26,14 @@ extern "C" {
     } while (0)
 
 extern cublasHandle_t g_cublasH;
-extern cudaStream_t   g_cudaStream;
+extern cudaStream_t g_cudaStream;
+extern cudaStream_t g_cudaStream2;
+extern cudaEvent_t g_cudaEvent;
 
 void   ggml_init_cublas(void);
+void * ggml_cuda_host_malloc(size_t size);
+void   ggml_cuda_host_free(void * ptr);
+
 void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
 void   ggml_cuda_pool_free(void * ptr, size_t size);
 
@@ -41,6 +46,9 @@ void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t st
 
 cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream);
 
+typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
+dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type);
+
 #ifdef  __cplusplus
 }
 #endif
diff --git a/ggml.c b/ggml.c
index 4ec637ee1..64ecd0867 100644
--- a/ggml.c
+++ b/ggml.c
@@ -8033,7 +8033,7 @@ static void ggml_compute_forward_mul_mat_f32(
 #if defined(GGML_USE_CUBLAS)
         const float alpha = 1.0f;
         const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
         const int y_ne = ne11 * ne10;
         const int d_ne = ne11 * ne01;
 
@@ -8235,25 +8235,27 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         }
 
 #if defined(GGML_USE_CUBLAS)
-        ggml_fp16_t * const wdata = params->wdata;
-
         const float alpha = 1.0f;
         const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
         const int y_ne = ne11 * ne10;
         const int d_ne = ne11 * ne01;
 
         size_t x_size, y_size, d_size;
-        float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
+        ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
+        ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
+        float       * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
 #else
         float * const wdata = params->wdata;
 #endif
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
 #if defined(GGML_USE_CUBLAS)
+                // copy src0 while converting src1
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
+
                 // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
+                ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02);
                 {
                     size_t id = 0;
                     for (int64_t i01 = 0; i01 < ne11; ++i01) {
@@ -8275,11 +8277,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
 #if defined(GGML_USE_CUBLAS)
                 const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
-
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
                 // copy data to device
-                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
                 CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
 
                 // compute
@@ -8498,39 +8498,19 @@ static void ggml_compute_forward_mul_mat_q_f32(
 #if defined(GGML_USE_CUBLAS)
         const float alpha = 1.0f;
         const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
         const int y_ne = ne11 * ne10;
         const int d_ne = ne11 * ne01;
 
         size_t x_size, y_size, d_size, q_size;
-        float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-        float *d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
+        float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
+        float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
+        float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
+        void  * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
 
-        void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream)  = NULL;
-        if (type == GGML_TYPE_Q4_0) {
-            dequantize_row_q_cuda = dequantize_row_q4_0_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_1) {
-            dequantize_row_q_cuda = dequantize_row_q4_1_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_2) {
-            dequantize_row_q_cuda = dequantize_row_q4_2_cuda;
-        }
-        else if (type == GGML_TYPE_Q5_0) {
-            dequantize_row_q_cuda = dequantize_row_q5_0_cuda;
-        }
-        else if (type == GGML_TYPE_Q5_1) {
-            dequantize_row_q_cuda = dequantize_row_q5_1_cuda;
-        }
-        else if (type == GGML_TYPE_Q8_0) {
-            dequantize_row_q_cuda = dequantize_row_q8_0_cuda;
-        }
-        else {
-            GGML_ASSERT(false);
-        }
-#elif !defined(GGML_USE_CLBLAST)
+        const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type);
+        GGML_ASSERT(dequantize_row_q_cuda != NULL);
+#else
         float * const wdata = params->wdata;
         dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
 #endif
@@ -8543,10 +8523,11 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
 #if defined(GGML_USE_CUBLAS)
                 // copy and dequantize on device
-                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream));
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream2));
 
-                dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
+                dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2);
                 CUDA_CHECK(cudaGetLastError());
+                CUDA_CHECK(cudaEventRecord(g_cudaEvent, g_cudaStream2));
 #elif defined(GGML_USE_CLBLAST)
                 const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
 #else
@@ -8560,11 +8541,13 @@ static void ggml_compute_forward_mul_mat_q_f32(
                 const float * x = wdata;
 #endif
 
-
 #if defined(GGML_USE_CUBLAS)
                 // copy data to device
                 CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
 
+                // wait for dequantization
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0));
+
                 // compute
                 CUBLAS_CHECK(
                     cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
@@ -11588,7 +11571,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1; // TODO: this actually is doing nothing
                                                    //       the threads are still spinning
-                                cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
+                                cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*MAX(ggml_nelements(node->src1), ggml_nelements(node->src0));
                                 //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
                                 //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
                                 //printf("cur = %zu\n", cur);
@@ -11600,6 +11583,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 #endif
                         } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                             cur = 0;
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+                            if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
+                                node->n_tasks = 1;
+                            }
+#endif
                         } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
diff --git a/llama.cpp b/llama.cpp
index 45f0d44ac..4699e5cf1 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -136,7 +136,7 @@ struct llama_kv_cache {
 
     struct ggml_context * ctx = NULL;
 
-    llama_buffer buf;
+    llama_ctx_buffer buf;
 
     int n; // number of tokens currently in the cache
 
@@ -167,7 +167,7 @@ struct llama_model {
     struct llama_kv_cache kv_self;
 
     // the model memory buffer
-    llama_buffer buf;
+    llama_ctx_buffer buf;
 
     // model memory mapped file
     std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +228,8 @@ struct llama_context {
 
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
-    llama_buffer buf_compute;
-    llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    llama_ctx_buffer buf_compute;
+    llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
     int    buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
diff --git a/llama_util.h b/llama_util.h
index acb207e65..6e66d12a8 100755
--- a/llama_util.h
+++ b/llama_util.h
@@ -405,4 +405,30 @@ struct llama_buffer {
         delete[] addr;
     }
 };
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+struct llama_ctx_buffer {
+    uint8_t * addr = NULL;
+    size_t size = 0;
+
+    void resize(size_t size) {
+        if (addr) {
+            ggml_cuda_host_free(addr);
+        }
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        this->size = size;
+    }
+
+    ~llama_ctx_buffer() {
+        if (addr) {
+            ggml_cuda_host_free(addr);
+        }
+    }
+};
+#else
+typedef llama_buffer llama_ctx_buffer;
+#endif
+
+
 #endif

From dd7eff57d8491792010b1002b8de6a4b54912e5c Mon Sep 17 00:00:00 2001
From: Ivan Stepanov <ivanstepanovftw@gmail.com>
Date: Sat, 29 Apr 2023 08:34:41 +0300
Subject: [PATCH 52/74] llama : new sampling algorithms (#1126)

* Sample interface, new samplers.

New samplers:
- locally typical sampling
- tail free sampling
- frequency and presence penalty
- mirostat

Ignore EOS fix: the EOS token's logit bias should be set to -inf.

* mirostat

* Added --logit-bias and --no-penalize-nl, removed std::span

* Use C++11, clarify llama API documentation, rename Mirostat parameters to --mirostat_lr and --mirostat_ent, add temperature sampling for Mirostat, simplify Mirostat sampling API parameters (removed N and *k)

* Adjust the save-load-state example

* Tests

* Windows build fix

* Windows test fix
---
 examples/common.cpp                          |  91 +++-
 examples/common.h                            |  21 +-
 examples/main/main.cpp                       |  71 ++-
 examples/save-load-state/save-load-state.cpp |  34 +-
 llama.cpp                                    | 491 ++++++++++++++-----
 llama.h                                      |  64 ++-
 tests/CMakeLists.txt                         |   1 +
 tests/test-sampling.cpp                      | 199 ++++++++
 8 files changed, 812 insertions(+), 160 deletions(-)
 create mode 100644 tests/test-sampling.cpp

diff --git a/examples/common.cpp b/examples/common.cpp
index 9f10dc268..6c712c713 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -6,6 +6,8 @@
 #include <string>
 #include <iterator>
 #include <algorithm>
+#include <sstream>
+#include <iostream>
 
 #if defined (_WIN32)
 #include <fcntl.h>
@@ -114,6 +116,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.temp = std::stof(argv[i]);
+        } else if (arg == "--tfs") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.tfs_z = std::stof(argv[i]);
+        } else if (arg == "--typical") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.typical_p = std::stof(argv[i]);
         } else if (arg == "--repeat_last_n") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -126,6 +140,36 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.repeat_penalty = std::stof(argv[i]);
+        } else if (arg == "--frequency_penalty") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.frequency_penalty = std::stof(argv[i]);
+        } else if (arg == "--presence_penalty") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.presence_penalty = std::stof(argv[i]);
+        } else if (arg == "--mirostat") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.mirostat = std::stoi(argv[i]);
+        } else if (arg == "--mirostat_lr") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.mirostat_eta = std::stof(argv[i]);
+        } else if (arg == "--mirostat_ent") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.mirostat_tau = std::stof(argv[i]);
         } else if (arg == "-b" || arg == "--batch_size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -185,7 +229,28 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--perplexity") {
             params.perplexity = true;
         } else if (arg == "--ignore-eos") {
-            params.ignore_eos = true;
+            params.logit_bias[llama_token_eos()] = -INFINITY;
+        } else if (arg == "--no-penalize-nl") {
+            params.penalize_nl = false;
+        } else if (arg == "-l" || arg == "--logit-bias") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::stringstream ss(argv[i]);
+            llama_token key;
+            char sign;
+            std::string value_str;
+            try {
+                if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
+                    params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                } else {
+                    throw std::exception();
+                }
+            } catch (const std::exception &e) {
+                invalid_param = true;
+                break;
+            }
         } else if (arg == "--n_parts") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -240,12 +305,26 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -f FNAME, --file FNAME\n");
     fprintf(stderr, "                        prompt file to start generation.\n");
     fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
-    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
-    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", (double)params.top_p);
-    fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
-    fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
+    fprintf(stderr, "  --top_k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    fprintf(stderr, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+    fprintf(stderr, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+    fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+    fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    fprintf(stderr, "  --presence_penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    fprintf(stderr, "  --frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    fprintf(stderr, "  --mirostat N          use Mirostat sampling.\n");
+    fprintf(stderr, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
+    fprintf(stderr, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+    fprintf(stderr, "  --mirostat_lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+    fprintf(stderr, "  --mirostat_ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    fprintf(stderr, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
+    fprintf(stderr, "                        modifies the likelihood of token appearing in the completion,\n");
+    fprintf(stderr, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
+    fprintf(stderr, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
     fprintf(stderr, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating\n");
+    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
+    fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
     fprintf(stderr, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
     fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
     fprintf(stderr, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
diff --git a/examples/common.h b/examples/common.h
index 9d3697d79..14e6b1ba7 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -8,6 +8,7 @@
 #include <vector>
 #include <random>
 #include <thread>
+#include <unordered_map>
 
 //
 // CLI argument parsing
@@ -17,17 +18,25 @@ struct gpt_params {
     int32_t seed          = -1;   // RNG seed
     int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
     int32_t n_predict     = 128;  // new tokens to predict
-    int32_t repeat_last_n = 64;   // last n tokens to penalize
     int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx         = 512;  // context size
     int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
 
     // sampling parameters
-    int32_t top_k = 40;
-    float   top_p = 0.95f;
-    float   temp  = 0.80f;
-    float   repeat_penalty  = 1.10f;
+    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+    int32_t top_k = 0;              // <= 0 to use vocab size
+    float   top_p = 1.0f;           // 1.0 = disabled
+    float   tfs_z = 1.0f;           // 1.0 = disabled
+    float   typical_p = 1.0f;       // 1.0 = disabled
+    float   temp = 1.0f;            // 1.0 = disabled
+    float   repeat_penalty  = 1.0f; // 1.0 = disabled
+    int32_t repeat_last_n = -1;     // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   frequency_penalty = 0.0f; // 0.0 = disabled
+    float   presence_penalty = 0.0f;  // 0.0 = disabled
+    int     mirostat = 0;           // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau = 5.0f;    // target entropy
+    float   mirostat_eta = 0.1f;    // learning rate
 
     std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
@@ -47,7 +56,7 @@ struct gpt_params {
     bool interactive_first = false; // wait for user input immediately
 
     bool instruct          = false; // instruction mode (used for Alpaca models)
-    bool ignore_eos        = false; // do not stop generating after eos
+    bool penalize_nl       = true;  // consider newlines as a repeatable token
     bool perplexity        = false; // compute perplexity over the prompt
     bool use_mmap          = true;  // use mmap for faster loads
     bool use_mlock         = false; // use mlock to keep model in memory
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index fda65574f..674920b8a 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -276,8 +276,8 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
         }
     }
-    fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
-        params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
+            params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
     fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     fprintf(stderr, "\n\n");
 
@@ -387,10 +387,19 @@ int main(int argc, char ** argv) {
 
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
             // out of user input, sample next token
-            const int32_t top_k          = params.top_k;
-            const float   top_p          = params.top_p;
             const float   temp           = params.temp;
+            const int32_t top_k          = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+            const float   top_p          = params.top_p;
+            const float   tfs_z          = params.tfs_z;
+            const float   typical_p      = params.typical_p;
+            const int32_t repeat_last_n  = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
             const float   repeat_penalty = params.repeat_penalty;
+            const float   alpha_presence = params.presence_penalty;
+            const float   alpha_frequency = params.frequency_penalty;
+            const int     mirostat       = params.mirostat;
+            const float   mirostat_tau   = params.mirostat_tau;
+            const float   mirostat_eta   = params.mirostat_eta;
+            const bool    penalize_nl   = params.penalize_nl;
 
             // optionally save the session on first sample (for faster prompt loading next time)
             if (!path_session.empty() && need_to_save_session) {
@@ -402,14 +411,58 @@ int main(int argc, char ** argv) {
 
             {
                 auto logits = llama_get_logits(ctx);
+                auto n_vocab = llama_n_vocab(ctx);
 
-                if (params.ignore_eos) {
-                    logits[llama_token_eos()] = 0;
+                // Apply params.logit_bias map
+                for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+                    logits[it->first] += it->second;
                 }
 
-                id = llama_sample_top_p_top_k(ctx,
-                        last_n_tokens.data() + n_ctx - params.repeat_last_n,
-                        params.repeat_last_n, top_k, top_p, temp, repeat_penalty);
+                std::vector<llama_token_data> candidates;
+                candidates.reserve(n_vocab);
+                for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                    candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+                }
+
+                llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+                // Apply penalties
+                float nl_logit = logits[llama_token_nl()];
+                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
+                llama_sample_repetition_penalty(ctx, &candidates_p,
+                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+                    last_n_repeat, repeat_penalty);
+                llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
+                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+                    last_n_repeat, alpha_frequency, alpha_presence);
+                if (!penalize_nl) {
+                    logits[llama_token_nl()] = nl_logit;
+                }
+
+                if (temp <= 0) {
+                    // Greedy sampling
+                    id = llama_sample_token_greedy(ctx, &candidates_p);
+                } else {
+                    if (mirostat == 1) {
+                        static float mirostat_mu = 2.0f * mirostat_tau;
+                        const int mirostat_m = 100;
+                        llama_sample_temperature(ctx, &candidates_p, temp);
+                        id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+                    } else if (mirostat == 2) {
+                        static float mirostat_mu = 2.0f * mirostat_tau;
+                        llama_sample_temperature(ctx, &candidates_p, temp);
+                        id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+                    } else {
+                        // Temperature sampling
+                        llama_sample_top_k(ctx, &candidates_p, top_k);
+                        llama_sample_tail_free(ctx, &candidates_p, tfs_z);
+                        llama_sample_typical(ctx, &candidates_p, typical_p);
+                        llama_sample_top_p(ctx, &candidates_p, top_p);
+                        llama_sample_temperature(ctx, &candidates_p, temp);
+                        id = llama_sample_token(ctx, &candidates_p);
+                    }
+                }
+                // printf("`%d`", candidates_p.size);
 
                 last_n_tokens.erase(last_n_tokens.begin());
                 last_n_tokens.push_back(id);
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 39aa7f82c..07dfa2c74 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -64,14 +64,15 @@ int main(int argc, char ** argv) {
     // first run
     printf("\n%s", params.prompt.c_str());
     for (auto i = 0; i < params.n_predict; i++) {
-        auto next_token = llama_sample_top_p_top_k(
-            ctx,
-            &last_n_tokens_data.back() - params.repeat_last_n,
-            params.repeat_last_n,
-            40,
-            1.0,
-            1.0,
-            1.1);
+        auto logits = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        auto next_token = llama_sample_token(ctx, &candidates_p);
         auto next_token_str = llama_token_to_str(ctx, next_token);
         last_n_tokens_data.push_back(next_token);
         printf("%s", next_token_str);
@@ -106,14 +107,15 @@ int main(int argc, char ** argv) {
 
     // second run
     for (auto i = 0; i < params.n_predict; i++) {
-        auto next_token = llama_sample_top_p_top_k(
-            ctx2,
-            &last_n_tokens_data.back() - params.repeat_last_n,
-            params.repeat_last_n,
-            40,
-            1.0,
-            1.0,
-            1.1);
+        auto logits = llama_get_logits(ctx2);
+        auto n_vocab = llama_n_vocab(ctx2);
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+        auto next_token = llama_sample_token(ctx2, &candidates_p);
         auto next_token_str = llama_token_to_str(ctx2, next_token);
         last_n_tokens_data.push_back(next_token);
         printf("%s", next_token_str);
diff --git a/llama.cpp b/llama.cpp
index 4699e5cf1..1032fb9fa 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -28,6 +28,7 @@
 #include <atomic>
 #include <mutex>
 #include <sstream>
+#include <numeric>
 
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -1475,109 +1476,402 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
 // sampling
 //
 
-static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
-    // find the top k tokens
-    std::partial_sort(
-            logits_id.begin(),
-            logits_id.begin() + top_k, logits_id.end(),
-            [](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
-        return a.first > b.first;
-    });
+void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
+    assert(candidates->size > 0);
 
-    logits_id.resize(top_k);
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Sort the logits in descending order
+    if (!candidates->sorted) {
+        std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+            return a.logit > b.logit;
+        });
+        candidates->sorted = true;
+    }
+
+    float max_l = candidates->data[0].logit;
+    float cum_sum = 0.0f;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        float p = expf(candidates->data[i].logit - max_l);
+        candidates->data[i].p = p;
+        cum_sum += p;
+    }
+    for (size_t i = 0; i < candidates->size; ++i) {
+        candidates->data[i].p /= cum_sum;
+    }
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
 }
 
-static llama_vocab::id llama_sample_top_p_top_k(
-        llama_context & lctx,
-        const std::vector<llama_vocab::id> & last_n_tokens,
-        int top_k,
-        float top_p,
-        float temp,
-        float repeat_penalty) {
-    auto & rng = lctx.rng;
+void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
+    const int64_t t_start_sample_us = ggml_time_us();
 
-    const int n_logits = lctx.model.hparams.n_vocab;
+    k = std::max(k, (int) min_keep);
+    k = std::min(k, (int) candidates->size);
 
-    const auto & logits = lctx.logits;
-    const auto * plogits = logits.data() + logits.size() - n_logits;
-
-    if (temp <= 0) {
-        // select the token with the highest logit directly
-        float max_logit = plogits[0];
-        llama_vocab::id max_id = 0;
-
-        for (int i = 1; i < n_logits; ++i) {
-            if (plogits[i] > max_logit) {
-                max_logit = plogits[i];
-                max_id = i;
-            }
+    // Sort scores in descending order
+    if (!candidates->sorted) {
+        auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+            return a.logit > b.logit;
+        };
+        if (k == (int) candidates->size) {
+            std::sort(candidates->data, candidates->data + candidates->size, comp);
+        } else {
+            std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
         }
-        return max_id;
+        candidates->sorted = true;
+    }
+    candidates->size = k;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+    if (p >= 1.0f) {
+        return;
     }
 
-    std::vector<std::pair<float, llama_vocab::id>> logits_id;
-    logits_id.reserve(n_logits);
+    const int64_t t_start_sample_us = ggml_time_us();
 
-    {
-        const float scale = 1.0f/temp;
-        for (int i = 0; i < n_logits; ++i) {
-            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
-            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
-            if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
-                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
-                if (plogits[i] < 0.0f) {
-                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
-                } else {
-                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
-                }
-            } else {
-                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
-            }
+    llama_sample_softmax(ctx, candidates);
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = candidates->size;
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        cum_sum += candidates->data[i].p;
+
+        // Stop once the running sum is greater than p and at least min_keep tokens have been kept
+        if (cum_sum > p && i >= min_keep) {
+            last_idx = i;
+            break;
         }
     }
 
-    sample_top_k(logits_id, top_k > 0 ? std::min(top_k, n_logits) : n_logits);
+    // Resize the output vector to keep only the top-p tokens
+    candidates->size = last_idx;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
+    if (z >= 1.0f || candidates->size <= 2) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    llama_sample_softmax(nullptr, candidates);
+
+    // Compute the first and second derivatives
+    std::vector<float> first_derivatives(candidates->size - 1);
+    std::vector<float> second_derivatives(candidates->size - 2);
+
+    for (size_t i = 0; i < first_derivatives.size(); ++i) {
+        first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
+    }
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
+    }
+
+    // Calculate absolute value of second derivatives
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        second_derivatives[i] = abs(second_derivatives[i]);
+    }
+
+    // Normalize the second derivatives
+    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
+    for (float & value : second_derivatives) {
+        value /= second_derivatives_sum;
+    }
+
+    float cum_sum = 0.0f;
+    size_t last_idx = candidates->size;
+    for (size_t i = 0; i < second_derivatives.size(); ++i) {
+        cum_sum += second_derivatives[i];
+
+        // Stop once the running sum is greater than z and at least min_keep tokens have been kept
+        if (cum_sum > z && i >= min_keep) {
+            last_idx = i;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the tokens above the tail location
+    candidates->size = last_idx;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+
+void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
+    // Reference implementation:
+    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
+    if (p >= 1.0f) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Compute the softmax of logits and calculate entropy
+    llama_sample_softmax(nullptr, candidates);
+
+    float entropy = 0.0f;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        entropy += -candidates->data[i].p * logf(candidates->data[i].p);
+    }
+
+    // Compute the absolute difference between negative log probability and entropy for each candidate
+    std::vector<float> shifted_scores;
+    for (size_t i = 0; i < candidates->size; ++i) {
+        float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
+        shifted_scores.push_back(shifted_score);
+    }
+
+    // Sort tokens based on the shifted_scores and their corresponding indices
+    std::vector<size_t> indices(candidates->size);
+    std::iota(indices.begin(), indices.end(), 0);
+
+    std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
+        return shifted_scores[a] < shifted_scores[b];
+    });
+
+    // Compute the cumulative probabilities
+    float cum_sum = 0.0f;
+    size_t last_idx = indices.size();
+
+    for (size_t i = 0; i < indices.size(); ++i) {
+        size_t idx = indices[i];
+        cum_sum += candidates->data[idx].p;
+
+        // Stop once the running sum is greater than p and at least min_keep tokens have been kept
+        if (cum_sum > p && i >= min_keep - 1) {
+            last_idx = i + 1;
+            break;
+        }
+    }
+
+    // Resize the output vector to keep only the locally typical tokens
+    std::vector<llama_token_data> new_candidates;
+    for (size_t i = 0; i < last_idx; ++i) {
+        size_t idx = indices[i];
+        new_candidates.push_back(candidates->data[idx]);
+    }
+
+    // Replace the data in candidates with the new_candidates data
+    std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
+    candidates->size = new_candidates.size();
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    for (size_t i = 0; i < candidates_p->size; ++i) {
+        candidates_p->data[i].logit /= temp;
+    }
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty) {
+    if (last_tokens_size == 0 || penalty == 1.0f) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    for (size_t i = 0; i < candidates->size; ++i) {
+        auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+        if (token_iter == last_tokens + last_tokens_size) {
+            continue;
+        }
+
+        // The academic publication that described this technique only divided by the penalty, but that would make tokens with negative logits more likely, which is obviously wrong.
+        // The common fix for this problem is to multiply non-positive logits by the penalty instead of dividing.
+        if (candidates->data[i].logit <= 0) {
+            candidates->data[i].logit *= penalty;
+        } else {
+            candidates->data[i].logit /= penalty;
+        }
+    }
+
+    candidates->sorted = false;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
+    if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
+        return;
+    }
+
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Create a frequency map to count occurrences of each token in last_tokens
+    std::unordered_map<llama_token, int> token_count;
+    for (size_t i = 0; i < last_tokens_size; ++i) {
+        token_count[last_tokens_p[i]]++;
+    }
+
+    // Apply frequency and presence penalties to the candidates
+    for (size_t i = 0; i < candidates->size; ++i) {
+        auto token_iter = token_count.find(candidates->data[i].id);
+        if (token_iter == token_count.end()) {
+            continue;
+        }
+
+        int count = token_iter->second;
+        candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
+    }
+
+    candidates->sorted = false;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+}
+
+
+llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
+    assert(ctx);
+    auto N = float(llama_n_vocab(ctx));
+    int64_t t_start_sample_us;
+    t_start_sample_us = ggml_time_us();
+
+    llama_sample_softmax(nullptr, candidates);
+
+    // Estimate s_hat using the most probable m tokens
+    float s_hat = 0.0;
+    float sum_ti_bi = 0.0;
+    float sum_ti_sq = 0.0;
+    for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
+        float t_i = logf(float(i + 2) / float(i + 1));
+        float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
+        sum_ti_bi += t_i * b_i;
+        sum_ti_sq += t_i * t_i;
+    }
+    s_hat = sum_ti_bi / sum_ti_sq;
+
+    // Compute k from the estimated s_hat and target surprise value
+    float epsilon_hat = s_hat - 1;
+    float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
+
+    // Sample the next word X using top-k sampling
+    llama_sample_top_k(nullptr, candidates, int(k));
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+    llama_token X = llama_sample_token(ctx, candidates);
+    t_start_sample_us = ggml_time_us();
+
+    // Compute error as the difference between observed surprise and target surprise value
+    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+        return candidate.id == X;
+    }));
+    float observed_surprise = -log2f(candidates->data[X_idx].p);
+    float e = observed_surprise - tau;
+
+    // Update mu using the learning rate and error
+    *mu = *mu - eta * e;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+        ctx->n_sample++;
+    }
+    return X;
+}
+
+llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
+    assert(ctx);
+    int64_t t_start_sample_us;
+    t_start_sample_us = ggml_time_us();
+
+    llama_sample_softmax(ctx, candidates);
+
+    // Truncate the words with surprise values greater than mu
+    candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+        return -log2f(candidate.p) > *mu;
+    }));
+
+    // Normalize the probabilities of the remaining words
+    llama_sample_softmax(ctx, candidates);
+
+    // Sample the next word X from the remaining words
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+    llama_token X = llama_sample_token(ctx, candidates);
+    t_start_sample_us = ggml_time_us();
+
+    // Compute error as the difference between observed surprise and target surprise value
+    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const llama_token_data & candidate) {
+        return candidate.id == X;
+    }));
+    float observed_surprise = -log2f(candidates->data[X_idx].p);
+    float e = observed_surprise - tau;
+
+    // Update mu using the learning rate and error
+    *mu = *mu - eta * e;
+
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+    return X;
+}
+
+llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates) {
+    const int64_t t_start_sample_us = ggml_time_us();
+
+    // Find max element
+    auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+        return a.logit < b.logit;
+    });
+
+    llama_token result = max_iter->id;
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+        ctx->n_sample++;
+    }
+    return result;
+}
+
+llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
+    assert(ctx);
+    const int64_t t_start_sample_us = ggml_time_us();
+    llama_sample_softmax(nullptr, candidates);
 
-    // compute probs for the top k tokens
     std::vector<float> probs;
-    probs.reserve(logits_id.size());
-
-    float maxl = logits_id[0].first;
-    double sum = 0.0;
-    for (const auto & kv : logits_id) {
-        const float p = expf(kv.first - maxl);
-        probs.push_back(p);
-        sum += p;
+    probs.reserve(candidates->size);
+    for (size_t i = 0; i < candidates->size; ++i) {
+        probs.push_back(candidates->data[i].p);
     }
 
-    // normalize the probs
-    for (auto & p : probs) {
-        p /= sum;
-    }
-
-    if (top_p < 1.0) {
-        double cumsum = 0.0;
-        for (int i = 0; i < (int) probs.size(); i++) {
-            cumsum += probs[i];
-            if (cumsum >= top_p) {
-                probs.resize(i + 1);
-                logits_id.resize(i + 1);
-                break;
-            }
-        }
-    }
-
-    //printf("\n");
-    //for (int i = 0; i < (int) 10; i++) {
-    //    printf("%d: '%s' %f\n", i, lctx.vocab.id_to_token.at(logits_id[i].second).tok.c_str(), probs[i]);
-    //}
-    //printf("\n\n");
-    //exit(0);
-
     std::discrete_distribution<> dist(probs.begin(), probs.end());
+    auto & rng = ctx->rng;
     int idx = dist(rng);
 
-    return logits_id[idx].second;
+    llama_token result = candidates->data[idx].id;
+
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    ctx->n_sample++;
+    return result;
 }
 
 //
@@ -2348,33 +2642,8 @@ llama_token llama_token_eos() {
     return 2;
 }
 
-llama_token llama_sample_top_p_top_k(
-          llama_context * ctx,
-      const llama_token * last_n_tokens_data,
-                    int   last_n_tokens_size,
-                    int   top_k,
-                  float   top_p,
-                  float   temp,
-                  float   repeat_penalty) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    llama_token result = 0;
-
-    // TODO: avoid this ...
-    const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
-
-    result = llama_sample_top_p_top_k(
-            *ctx,
-            last_n_tokens,
-            top_k,
-            top_p,
-            temp,
-            repeat_penalty);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    ctx->n_sample++;
-
-    return result;
+llama_token llama_token_nl() {
+    return 13;
 }
 
 
diff --git a/llama.h b/llama.h
index 936c52139..34a8f5b3c 100644
--- a/llama.h
+++ b/llama.h
@@ -39,12 +39,16 @@ extern "C" {
 
     typedef struct llama_token_data {
         llama_token id;  // token id
-
+        float logit; // log-odds of the token
         float p;     // probability of the token
-        float plog;  // log probability of the token
-
     } llama_token_data;
 
+    typedef struct llama_token_data_array {
+        llama_token_data * data;
+        size_t size;
+        bool sorted;
+    } llama_token_data_array;
+
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
@@ -181,16 +185,52 @@ extern "C" {
     // Special tokens
     LLAMA_API llama_token llama_token_bos();
     LLAMA_API llama_token llama_token_eos();
+    LLAMA_API llama_token llama_token_nl();
 
-    // TODO: improve the last_n_tokens interface ?
-    LLAMA_API llama_token llama_sample_top_p_top_k(
-       struct llama_context * ctx,
-          const llama_token * last_n_tokens_data,
-                        int   last_n_tokens_size,
-                        int   top_k,
-                      float   top_p,
-                      float   temp,
-                      float   repeat_penalty);
+    // Sampling functions
+
+    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty);
+
+    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+
+    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+
+    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
+
+    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+
+    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+
+    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+
+    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+
+    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+
+    /// @details Selects the token with the highest probability.
+    LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
+
+    /// @details Randomly selects a token from the candidates based on their probabilities.
+    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
 
     // Performance information
     LLAMA_API void llama_print_timings(struct llama_context * ctx);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 81eadbc4d..645648585 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -8,4 +8,5 @@ endfunction()
 # llama_add_test(test-double-float.c) # SLOW
 llama_add_test(test-quantize-fns.cpp)
 llama_add_test(test-quantize-perf.cpp)
+llama_add_test(test-sampling.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
new file mode 100644
index 000000000..7eee4f6d3
--- /dev/null
+++ b/tests/test-sampling.cpp
@@ -0,0 +1,199 @@
+#include "llama.h"
+#include "ggml.h"
+#include <cassert>
+#include <cmath>
+#include <numeric>
+#include <cassert>
+#include <iostream>
+#include <vector>
+#include <algorithm>
+
+
+void dump(const llama_token_data_array * candidates) {
+    for (size_t i = 0; i < candidates->size; i++) {
+        printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
+    }
+}
+
+#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
+
+
+void test_top_k(const std::vector<float> & probs,
+                const std::vector<float> & expected_probs,
+                int k) {
+    size_t n_vocab = probs.size();
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
+        float logit = log(probs[token_id]);
+        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+    }
+
+    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_sample_softmax(nullptr, &candidates_p);
+    DUMP(&candidates_p);
+    llama_sample_top_k(nullptr, &candidates_p, k);
+    DUMP(&candidates_p);
+
+    assert(candidates_p.size == expected_probs.size());
+    for (size_t i = 0; i < candidates_p.size; i++) {
+        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
+    }
+}
+
+
+void test_top_p(const std::vector<float> & probs,
+                const std::vector<float> & expected_probs,
+                float p) {
+
+    size_t n_vocab = probs.size();
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
+        float logit = log(probs[token_id]);
+        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+    }
+
+    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_sample_softmax(nullptr, &candidates_p);
+    DUMP(&candidates_p);
+    llama_sample_top_p(nullptr, &candidates_p, p);
+    DUMP(&candidates_p);
+
+    assert(candidates_p.size == expected_probs.size());
+    for (size_t i = 0; i < candidates_p.size; i++) {
+        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    }
+}
+
+
+void test_tfs(const std::vector<float> & probs,
+                const std::vector<float> & expected_probs,
+                float z) {
+    size_t n_vocab = probs.size();
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
+        float logit = log(probs[token_id]);
+        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+    }
+
+    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    DUMP(&candidates_p);
+    llama_sample_tail_free(nullptr, &candidates_p, z);
+    DUMP(&candidates_p);
+
+    assert(candidates_p.size == expected_probs.size());
+    for (size_t i = 0; i < candidates_p.size; i++) {
+        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    }
+}
+
+
+void test_typical(const std::vector<float> & probs,
+                const std::vector<float> & expected_probs,
+                float p) {
+    size_t n_vocab = probs.size();
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
+        float logit = log(probs[token_id]);
+        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+    }
+
+    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    DUMP(&candidates_p);
+    llama_sample_typical(nullptr, &candidates_p, p);
+    DUMP(&candidates_p);
+
+    assert(candidates_p.size == expected_probs.size());
+    for (size_t i = 0; i < candidates_p.size; i++) {
+        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    }
+}
+
+
+void test_repetition_penalty(
+                const std::vector<float> & probs,
+                const std::vector<llama_token> & last_tokens,
+                const std::vector<float> & expected_probs,
+                float penalty) {
+    assert(probs.size() == expected_probs.size());
+
+    size_t n_vocab = probs.size();
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
+        float logit = log(probs[token_id]);
+        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+    }
+
+    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_sample_softmax(nullptr, &candidates_p);
+    DUMP(&candidates_p);
+    llama_sample_repetition_penalty(nullptr, &candidates_p, (llama_token *)last_tokens.data(), last_tokens.size(), penalty);
+    llama_sample_softmax(nullptr, &candidates_p);
+    DUMP(&candidates_p);
+
+    assert(candidates_p.size == expected_probs.size());
+    for (size_t i = 0; i < candidates_p.size; i++) {
+        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-6);
+    }
+}
+
+
+void test_frequency_presence_penalty(
+                const std::vector<float> & probs,
+                const std::vector<llama_token> & last_tokens,
+                const std::vector<float> & expected_probs,
+                float alpha_frequency, float alpha_presence) {
+    assert(probs.size() == expected_probs.size());
+
+    size_t n_vocab = probs.size();
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
+        float logit = log(probs[token_id]);
+        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
+    }
+
+    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+    llama_sample_softmax(nullptr, &candidates_p);
+    // DUMP(&candidates_p);
+    llama_sample_frequency_and_presence_penalties(nullptr, &candidates_p, (llama_token *)last_tokens.data(), last_tokens.size(), alpha_frequency, alpha_presence);
+    llama_sample_softmax(nullptr, &candidates_p);
+    // DUMP(&candidates_p);
+
+    assert(candidates_p.size == expected_probs.size());
+    for (size_t i = 0; i < candidates_p.size; i++) {
+        assert(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
+    }
+}
+
+int main(void) {
+    ggml_time_init();
+
+    test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4}, 1);
+    test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2}, 3);
+
+    test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4}, 0);
+    test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3}, 0.7);
+    test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2, 0.1}, 1);
+
+    test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3}, 0.25);
+    test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.75);
+    test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.99);
+
+    test_typical({0.97, 0.01, 0.01, 0.01}, {0.97}, 0.5);
+    test_typical({0.4, 0.2, 0.2, 0.2}, {0.2, 0.2, 0.2}, 0.5);
+
+    test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.25, 0.25, 0.25, 0.25, 0}, 50.0);
+    test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.5, 0.5, 0, 0, 0}, 50.0);
+    test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.5, 0.5, 0, 0, 0}, 50.0);
+
+    test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0},             {0.249997, 0.249997, 0.249997, 0.249997, 0.000011}, 5.0, 5.0);
+    test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2},       {0.499966, 0.499966, 0.000023, 0.000023, 0.000023}, 5.0, 5.0);
+    test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.499977, 0.499977, 0.000023, 0.000023, 0.000000}, 5.0, 5.0);
+
+    printf("OK\n");
+}

From 334637e43e3a0529b4b50e2c22968b1ed1633353 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 29 Apr 2023 09:51:06 +0300
Subject: [PATCH 53/74] common : change default parameters to pre-#1126 (#1223)

---
 examples/common.h      | 26 +++++++++++++-------------
 examples/main/main.cpp | 24 ++++++++++++------------
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/examples/common.h b/examples/common.h
index 14e6b1ba7..fce1d42a9 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -17,7 +17,7 @@
 struct gpt_params {
     int32_t seed          = -1;   // RNG seed
     int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict     = 128;  // new tokens to predict
+    int32_t n_predict     = -1;   // new tokens to predict
     int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx         = 512;  // context size
     int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
@@ -25,18 +25,18 @@ struct gpt_params {
 
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-    int32_t top_k = 0;              // <= 0 to use vocab size
-    float   top_p = 1.0f;           // 1.0 = disabled
-    float   tfs_z = 1.0f;           // 1.0 = disabled
-    float   typical_p = 1.0f;       // 1.0 = disabled
-    float   temp = 1.0f;            // 1.0 = disabled
-    float   repeat_penalty  = 1.0f; // 1.0 = disabled
-    int32_t repeat_last_n = -1;     // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   frequency_penalty = 0.0f; // 0.0 = disabled
-    float   presence_penalty = 0.0f;  // 0.0 = disabled
-    int     mirostat = 0;           // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau = 5.0f;    // target entropy
-    float   mirostat_eta = 0.1f;    // learning rate
+    int32_t top_k             = 40;    // <= 0 to use vocab size
+    float   top_p             = 0.95f; // 1.0 = disabled
+    float   tfs_z             = 1.00f; // 1.0 = disabled
+    float   typical_p         = 1.00f; // 1.0 = disabled
+    float   temp              = 0.80f; // 1.0 = disabled
+    float   repeat_penalty    = 1.10f; // 1.0 = disabled
+    int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   frequency_penalty = 0.00f; // 0.0 = disabled
+    float   presence_penalty  = 0.00f; // 0.0 = disabled
+    int     mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau      = 5.00f; // target entropy
+    float   mirostat_eta      = 0.10f; // learning rate
 
     std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 674920b8a..990d0fa02 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -387,19 +387,19 @@ int main(int argc, char ** argv) {
 
         if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
             // out of user input, sample next token
-            const float   temp           = params.temp;
-            const int32_t top_k          = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
-            const float   top_p          = params.top_p;
-            const float   tfs_z          = params.tfs_z;
-            const float   typical_p      = params.typical_p;
-            const int32_t repeat_last_n  = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-            const float   repeat_penalty = params.repeat_penalty;
-            const float   alpha_presence = params.presence_penalty;
+            const float   temp            = params.temp;
+            const int32_t top_k           = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+            const float   top_p           = params.top_p;
+            const float   tfs_z           = params.tfs_z;
+            const float   typical_p       = params.typical_p;
+            const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+            const float   repeat_penalty  = params.repeat_penalty;
+            const float   alpha_presence  = params.presence_penalty;
             const float   alpha_frequency = params.frequency_penalty;
-            const int     mirostat       = params.mirostat;
-            const float   mirostat_tau   = params.mirostat_tau;
-            const float   mirostat_eta   = params.mirostat_eta;
-            const bool    penalize_nl   = params.penalize_nl;
+            const int     mirostat        = params.mirostat;
+            const float   mirostat_tau    = params.mirostat_tau;
+            const float   mirostat_eta    = params.mirostat_eta;
+            const bool    penalize_nl     = params.penalize_nl;
 
             // optionally save the session on first sample (for faster prompt loading next time)
             if (!path_session.empty() && need_to_save_session) {

From 84ca9c2ecf3391d911589d0fe2b483cbfb4b82a6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 29 Apr 2023 13:48:11 +0300
Subject: [PATCH 54/74] examples : fix save-load-state + rename llama-util.h

---
 examples/save-load-state/save-load-state.cpp | 72 +++++++++++---------
 llama_util.h => llama-util.h                 |  1 -
 llama.cpp                                    |  3 +-
 3 files changed, 42 insertions(+), 34 deletions(-)
 rename llama_util.h => llama-util.h (99%)
 mode change 100755 => 100644

diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 07dfa2c74..f5f02ec1d 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -1,13 +1,10 @@
+#include "common.h"
+#include "llama.h"
+
 #include <vector>
 #include <cstdio>
 #include <chrono>
 
-#include "common.h"
-#include "llama.h"
-#include "llama.cpp"
-
-using namespace std;
-
 int main(int argc, char ** argv) {
     gpt_params params;
     params.model = "models/llama-7B/ggml-model.bin";
@@ -20,21 +17,25 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (params.n_predict < 0) {
+        params.n_predict = 16;
+    }
+
     auto lparams = llama_context_default_params();
 
-    lparams.n_ctx      = params.n_ctx;
-    lparams.n_parts    = params.n_parts;
-    lparams.seed       = params.seed;
-    lparams.f16_kv     = params.memory_f16;
-    lparams.use_mmap   = params.use_mmap;
-    lparams.use_mlock  = params.use_mlock;
+    lparams.n_ctx     = params.n_ctx;
+    lparams.n_parts   = params.n_parts;
+    lparams.seed      = params.seed;
+    lparams.f16_kv    = params.memory_f16;
+    lparams.use_mmap  = params.use_mmap;
+    lparams.use_mlock = params.use_mlock;
 
     auto n_past = 0;
-    auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
+    auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
 
     // init
     auto ctx = llama_init_from_file(params.model.c_str(), lparams);
-    auto tokens = vector<llama_token>(params.n_ctx);
+    auto tokens = std::vector<llama_token>(params.n_ctx);
     auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
 
     if (n_prompt_tokens < 1) {
@@ -43,23 +44,25 @@ int main(int argc, char ** argv) {
     }
 
     // evaluate prompt
-
     llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
 
     last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
     n_past += n_prompt_tokens;
 
+    const size_t state_size = llama_get_state_size(ctx);
+    uint8_t * state_mem = new uint8_t[state_size];
+
     // Save state (rng, logits, embedding and kv_cache) to file
-    FILE *fp_write = fopen("dump_state.bin", "wb");
-    auto state_size = llama_get_state_size(ctx);
-    auto state_mem = new uint8_t[state_size];
-    llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
-    fwrite(state_mem, 1, state_size, fp_write);
-    fclose(fp_write);
+    {
+        FILE *fp_write = fopen("dump_state.bin", "wb");
+        llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
+        fwrite(state_mem, 1, state_size, fp_write);
+        fclose(fp_write);
+    }
 
     // save state (last tokens)
-    auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
-    auto n_past_saved = n_past;
+    const auto last_n_tokens_data_saved = std::vector<llama_token>(last_n_tokens_data);
+    const auto n_past_saved = n_past;
 
     // first run
     printf("\n%s", params.prompt.c_str());
@@ -75,6 +78,7 @@ int main(int argc, char ** argv) {
         auto next_token = llama_sample_token(ctx, &candidates_p);
         auto next_token_str = llama_token_to_str(ctx, next_token);
         last_n_tokens_data.push_back(next_token);
+
         printf("%s", next_token_str);
         if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
@@ -88,18 +92,21 @@ int main(int argc, char ** argv) {
     llama_free(ctx);
 
     // load new model
-
     auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
 
     // Load state (rng, logits, embedding and kv_cache) from file
-    FILE *fp_read = fopen("dump_state.bin", "rb");
-    auto state_size2 = llama_get_state_size(ctx2);
-    if (state_size != state_size2) {
-        fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+    {
+        FILE *fp_read = fopen("dump_state.bin", "rb");
+        if (state_size != llama_get_state_size(ctx2)) {
+            fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+            return 1;
+        }
+        fread(state_mem, 1, state_size, fp_read);
+        llama_set_state_data(ctx2, state_mem);  // could also read directly from memory mapped file
+        fclose(fp_read);
     }
-    fread(state_mem, 1, state_size, fp_read);
-    llama_set_state_data(ctx2, state_mem);  // could also read directly from memory mapped file
-    fclose(fp_read);
+
+    delete[] state_mem;
 
     // restore state (last tokens)
     last_n_tokens_data = last_n_tokens_data_saved;
@@ -118,6 +125,7 @@ int main(int argc, char ** argv) {
         auto next_token = llama_sample_token(ctx2, &candidates_p);
         auto next_token_str = llama_token_to_str(ctx2, next_token);
         last_n_tokens_data.push_back(next_token);
+
         printf("%s", next_token_str);
         if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
@@ -125,6 +133,8 @@ int main(int argc, char ** argv) {
         }
         n_past += 1;
     }
+
     printf("\n\n");
+
     return 0;
 }
diff --git a/llama_util.h b/llama-util.h
old mode 100755
new mode 100644
similarity index 99%
rename from llama_util.h
rename to llama-util.h
index 6e66d12a8..ca4dd162f
--- a/llama_util.h
+++ b/llama-util.h
@@ -430,5 +430,4 @@ struct llama_ctx_buffer {
 typedef llama_buffer llama_ctx_buffer;
 #endif
 
-
 #endif
diff --git a/llama.cpp b/llama.cpp
index 1032fb9fa..dc4bdc534 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5,7 +5,7 @@
 #include <cstdio>
 #endif
 
-#include "llama_util.h"
+#include "llama-util.h"
 #include "llama.h"
 
 #include "ggml.h"
@@ -33,7 +33,6 @@
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16
 
-
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,

From 305eb5afd51325e3142c01c17431febb7c67de87 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 29 Apr 2023 13:53:12 +0300
Subject: [PATCH 55/74] build : fix reference to old llama_util.h

---
 CMakeLists.txt                               |  2 +-
 Makefile                                     |  2 +-
 examples/save-load-state/save-load-state.cpp | 10 +++++++++-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5fdbeddfc..bbf599559 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -337,7 +337,7 @@ endif()
 add_library(llama
             llama.cpp
             llama.h
-            llama_util.h)
+            llama-util.h)
 
 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
diff --git a/Makefile b/Makefile
index 5a1cb3e83..fd695d7dd 100644
--- a/Makefile
+++ b/Makefile
@@ -168,7 +168,7 @@ $(info )
 ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC)  $(CFLAGS)   -c $< -o $@
 
-llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama_util.h
+llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 common.o: examples/common.cpp examples/common.h
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index f5f02ec1d..f1531ba39 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -66,6 +66,7 @@ int main(int argc, char ** argv) {
 
     // first run
     printf("\n%s", params.prompt.c_str());
+
     for (auto i = 0; i < params.n_predict; i++) {
         auto logits = llama_get_logits(ctx);
         auto n_vocab = llama_n_vocab(ctx);
@@ -86,6 +87,7 @@ int main(int argc, char ** argv) {
         }
         n_past += 1;
     }
+
     printf("\n\n");
 
     // free old model
@@ -101,7 +103,13 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
             return 1;
         }
-        fread(state_mem, 1, state_size, fp_read);
+
+        const size_t ret = fread(state_mem, 1, state_size, fp_read);
+        if (ret != state_size) {
+            fprintf(stderr, "\n%s : failed to read state\n", __func__);
+            return 1;
+        }
+
         llama_set_state_data(ctx2, state_mem);  // could also read directly from memory mapped file
         fclose(fp_read);
     }

From 214b6a35702a489e3738acd81fad6d46182d3036 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 29 Apr 2023 18:43:28 +0300
Subject: [PATCH 56/74] ggml : adjust mul_mat_f16 work memory (#1226)

* llama : minor - remove explicit int64_t cast

* ggml : reduce memory buffer for F16 mul_mat when not using cuBLAS

* ggml : add asserts to guard against incorrect wsize
---
 Makefile  |  9 +++++++--
 ggml.c    | 21 +++++++++++++++------
 llama.cpp |  2 +-
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/Makefile b/Makefile
index fd695d7dd..4516e8556 100644
--- a/Makefile
+++ b/Makefile
@@ -34,10 +34,15 @@ endif
 #
 
 # keep standard at C11 and C++11
-CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CFLAGS   = -I.              -O3 -std=c11   -fPIC
+CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
 LDFLAGS  =
 
+ifndef LLAMA_DEBUG
+	CFLAGS   += -DNDEBUG
+	CXXFLAGS += -DNDEBUG
+endif
+
 # warnings
 CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
diff --git a/ggml.c b/ggml.c
index 64ecd0867..0dc1939f6 100644
--- a/ggml.c
+++ b/ggml.c
@@ -8245,8 +8245,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
         ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
         float       * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-#else
-        float * const wdata = params->wdata;
 #endif
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -8263,8 +8261,11 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                             wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
                         }
                     }
+
+                    assert(id*sizeof(ggml_fp16_t) <= params->wsize);
                 }
 #else
+                float * const wdata = params->wdata;
                 {
                     size_t id = 0;
                     for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -8272,6 +8273,8 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                             wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
                         }
                     }
+
+                    assert(id*sizeof(float) <= params->wsize);
                 }
 #endif
 
@@ -8537,7 +8540,10 @@ static void ggml_compute_forward_mul_mat_q_f32(
                         dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
                         id += ne00;
                     }
+
+                    assert(id*sizeof(float) <= params->wsize);
                 }
+
                 const float * x = wdata;
 #endif
 
@@ -11571,10 +11577,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1; // TODO: this actually is doing nothing
                                                    //       the threads are still spinning
-                                cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*MAX(ggml_nelements(node->src1), ggml_nelements(node->src0));
-                                //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
-                                //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
-                                //printf("cur = %zu\n", cur);
+#if defined(GGML_USE_CUBLAS)
+                                // with cuBLAS, we need memory for the full 3D / 4D data of src1
+                                cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
+#else
+                                // here we need memory just for single 2D matrix from src0
+                                cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
+#endif
                             } else {
                                 cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
                             }
diff --git a/llama.cpp b/llama.cpp
index dc4bdc534..f8b4c8e46 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -780,7 +780,7 @@ static bool kv_cache_init(
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;
 
-    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
+    const int64_t n_mem      = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;
 
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

From ec728e44d7488c2da3560970317708b2b12b9c04 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 29 Apr 2023 18:43:42 +0300
Subject: [PATCH 57/74] ggml : fix #if for f32_f32 mul_mat (CLBlast) (#1229)

---
 ggml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index 0dc1939f6..6b9237186 100644
--- a/ggml.c
+++ b/ggml.c
@@ -11592,7 +11592,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 #endif
                         } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                             cur = 0;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                             }

From 0b5a9350993e6fc8be45dc2a3eafc1fd0812d392 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 29 Apr 2023 19:28:36 +0300
Subject: [PATCH 58/74] ggml : fix visibility and unused warnings

---
 ggml.c | 4 ++--
 ggml.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml.c b/ggml.c
index 6b9237186..ebbaf11c6 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9124,7 +9124,7 @@ static void ggml_compute_forward_alibi_f32(
     //const int nb3 = src0->nb[3];
 
     assert(nb0 == sizeof(float));
-    assert(ne1+n_past == ne0);
+    assert(ne1 + n_past == ne0); (void) n_past;
 
     // add alibi to src0 (KQ_scaled)
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -9185,7 +9185,7 @@ static void ggml_compute_forward_alibi_f16(
     //const int nb3 = src0->nb[3];
 
     assert(nb0 == sizeof(ggml_fp16_t));
-    assert(ne1+n_past == ne0);
+    assert(ne1 + n_past == ne0); (void) n_past;
 
     // add alibi to src0 (KQ_scaled)
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
diff --git a/ggml.h b/ggml.h
index 38ae9a6ee..c1c5495c6 100644
--- a/ggml.h
+++ b/ggml.h
@@ -701,8 +701,8 @@ extern "C" {
             struct ggml_tensor  * c1);
 
     // Mapping operations
-    GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
-    GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+    typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
 
     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context        * ctx,

From e8c051611abfc9a7f37fd4bba48217180893bd68 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 29 Apr 2023 21:12:56 +0300
Subject: [PATCH 59/74] ggml : use vzip instead of vuzp for consistency

---
 ggml.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/ggml.c b/ggml.c
index ebbaf11c6..c9f0f09ea 100644
--- a/ggml.c
+++ b/ggml.c
@@ -2658,35 +2658,35 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
         const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
 
+        // interleave
+        const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs);
+        const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs);
+        const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs);
+        const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs);
+
         // load y
         const int8x16_t v1_0l = vld1q_s8(y0->qs);
         const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
         const int8x16_t v1_1l = vld1q_s8(y1->qs);
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
 
-        // interleave
-        const int8x16_t v1_0ls = vuzp1q_s8(v1_0l, v1_0h);
-        const int8x16_t v1_0hs = vuzp2q_s8(v1_0l, v1_0h);
-        const int8x16_t v1_1ls = vuzp1q_s8(v1_1l, v1_1h);
-        const int8x16_t v1_1hs = vuzp2q_s8(v1_1l, v1_1h);
-
 #if defined(__ARM_FEATURE_DOTPROD)
         // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls), v0_0hs, v1_0hs);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls), v0_1hs, v1_1hs);
+        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h);
+        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h);
 
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
 #else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
+        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
+        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
+        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
+        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
 
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
+        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
+        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
+        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
+        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
 
         const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
         const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));

From c3ca7a5f0546c561eb278be3f2fe335795679e01 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 29 Apr 2023 21:34:23 +0300
Subject: [PATCH 60/74] ggml : fix 32-bit ARM NEON

---
 ggml.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/ggml.c b/ggml.c
index c9f0f09ea..4d53b4628 100644
--- a/ggml.c
+++ b/ggml.c
@@ -668,6 +668,33 @@ uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
     return vget_high_u8(vcombine_u8(a, b));
 }
 
+int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
+    return vcombine_s8(vget_low_s8(a), vget_low_s8(b));
+}
+
+int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
+    return vcombine_s8(vget_high_s8(a), vget_high_s8(b));
+}
+
+uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
+    return vcombine_u8(vget_low_u8(a), vget_low_u8(b));
+}
+
+uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
+    return vcombine_u8(vget_high_u8(a), vget_high_u8(b));
+}
+
+int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+    int32x4_t res;
+
+    res[0] = roundf(vgetq_lane_f32(v, 0));
+    res[1] = roundf(vgetq_lane_f32(v, 1));
+    res[2] = roundf(vgetq_lane_f32(v, 2));
+    res[3] = roundf(vgetq_lane_f32(v, 3));
+
+    return res;
+}
+
 #endif
 #endif
 

From 3e5aa8a1c44051153d6d7b3eeca2f4b4e5fb310c Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 30 Apr 2023 10:25:46 +0300
Subject: [PATCH 61/74] ggml : fix labels for GGML_OP_ALIBI

---
 ggml.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml.c b/ggml.c
index 4d53b4628..50685f662 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3827,6 +3827,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "DIAG_MASK_INF",
     "SOFT_MAX",
     "ROPE",
+    "ALIBI",
     "CONV_1D_1S",
     "CONV_1D_2S",
 
@@ -3875,6 +3876,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "diag_mask_inf(x)",
     "soft_max(x)",
     "rope(x)",
+    "alibi(x)",
     "conv_1d_1s(x)",
     "conv_1d_2s(x)",
 

From f0d70f147d969e41fa410b8af2965a27aa901eb9 Mon Sep 17 00:00:00 2001
From: Stephan Walter <stephan@walter.name>
Date: Sun, 30 Apr 2023 12:32:37 +0000
Subject: [PATCH 62/74] Various fixes to mat_mul benchmark (#1253)

---
 .gitignore                                    |  2 +-
 Makefile                                      |  8 ++---
 examples/CMakeLists.txt                       |  1 +
 examples/benchmark/CMakeLists.txt             |  4 +++
 ...k-q4_0-matmult.c => benchmark-matmult.cpp} | 30 +++++++------------
 5 files changed, 20 insertions(+), 25 deletions(-)
 create mode 100644 examples/benchmark/CMakeLists.txt
 rename examples/benchmark/{benchmark-q4_0-matmult.c => benchmark-matmult.cpp} (92%)

diff --git a/.gitignore b/.gitignore
index 54dcebc4d..565866fd4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,7 +28,7 @@ models/*
 /result
 /perplexity
 /embedding
-/benchmark-q4_0-matmult
+/benchmark-matmult
 /vdot
 /Pipfile
 
diff --git a/Makefile b/Makefile
index 4516e8556..6d89401c8 100644
--- a/Makefile
+++ b/Makefile
@@ -180,7 +180,7 @@ common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult
 
 main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@@ -210,9 +210,9 @@ libllama.so: llama.o ggml.o $(OBJS)
 # Tests
 #
 
-benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
-	./benchmark-q4_0-matmult
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+	./$@
 
 .PHONY: tests
 tests:
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index be35363f5..0973a3fa1 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -35,4 +35,5 @@ else()
     add_subdirectory(perplexity)
     add_subdirectory(embedding)
     add_subdirectory(save-load-state)
+    add_subdirectory(benchmark)
 endif()
diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt
new file mode 100644
index 000000000..05deebcd1
--- /dev/null
+++ b/examples/benchmark/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET benchmark)
+add_executable(${TARGET} benchmark-matmult.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/benchmark/benchmark-q4_0-matmult.c b/examples/benchmark/benchmark-matmult.cpp
similarity index 92%
rename from examples/benchmark/benchmark-q4_0-matmult.c
rename to examples/benchmark/benchmark-matmult.cpp
index 84b06766c..19cbab1c3 100644
--- a/examples/benchmark/benchmark-q4_0-matmult.c
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -1,11 +1,3 @@
-/*
-    License: MIT License
-
-    Changelog:
-    - 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
-
-*/
-
 #include <locale.h>
 #include "ggml.h"
 #include <assert.h>
@@ -45,7 +37,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
 
 #define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
 
-#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
+#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5ld x %5ld x %5ld, nb = (%5li, %5li, %5li) - ", #TENSOR, \
         TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
         TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
     { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
@@ -98,12 +90,9 @@ int main(int argc, char ** argv)  {
         }
     }
 
-
     // create the ggml context
     printf("Starting Test\n");
 
-
-
     struct ggml_context * ctx;
     //const int sizex = 4096;
     //const int sizey = 11008;
@@ -125,16 +114,18 @@ int main(int argc, char ** argv)  {
 #endif
 
     //printf("Memsize required = %i\n", sizex*sizex);
-    ggml_type wtype = GGML_TYPE_F32;
 
     size_t ctx_size = 0;
-    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
-    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizeof(float);
-    ctx_size += 1024*1024*100;
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += 1024*1024*16;
 
-    printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024));
+    printf("Allocating Memory of size %li bytes, %li MB\n",ctx_size, (ctx_size/1024/1024));
 
     struct ggml_init_params params = {
         /*.mem_size   =*/ ctx_size,
@@ -217,7 +208,7 @@ int main(int argc, char ** argv)  {
     const int dimz = sizez;
     long long int flops_per_dot_product = dimy + dimy;
     long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
-    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
+    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
 
 
     // Let's use the F32 result from above as a reference for the q4_0 multiplication
@@ -234,7 +225,6 @@ int main(int argc, char ** argv)  {
         ggml_graph_compute(ctx, &gf31);
         long long int stop = ggml_time_us();
         long long int usec = stop-start;
-        float sec = usec/1000000;
         float flops_per_usec = (1.0f*flops_per_matrix)/usec;
         printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
             i,

From 6bc4400e67e6bc4faad3ad3d5e9d8a6576a9752d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 30 Apr 2023 19:07:00 +0300
Subject: [PATCH 63/74] ggml : add Q5 WASM SIMD + GGML_FTYPE

---
 ggml.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 ggml.h |  17 ++++++
 2 files changed, 177 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index 50685f662..17c03ad40 100644
--- a/ggml.c
+++ b/ggml.c
@@ -330,7 +330,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
 // precomputed f32 table for f16 (256 KB)
 static float table_f32_f16[1 << 16];
 
-#if defined(__ARM_NEON)
+#if defined(__ARM_NEON) || defined(__wasm_simd128__)
 #define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
 #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
 #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
@@ -1087,7 +1087,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
             const v128_t v  = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
             const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
             const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
-            const v128_t vc = wasm_i32x4_min_u(vi, wasm_i32x4_splat(15));
+            const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15));
 
             y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4);
             y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4);
@@ -3180,6 +3180,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
     }
 
     *s = vaddvq_f32(sumv);
+#elif defined(__wasm_simd128__)
+    v128_t sumv = wasm_f32x4_splat(0.0f);
+
+    uint64_t tmp[4];
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q5_0 * restrict x0 = &x[i];
+        const block_q8_0 * restrict y0 = &y[i];
+
+        const v128_t m4b  = wasm_i8x16_splat(0x0F);
+        const v128_t s16b = wasm_i8x16_splat(0x10);
+
+        // extract the 5th bit
+        uint32_t qh;
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_u[(qh >> 24)       ];
+
+        const v128_t qhl = wasm_v128_load(tmp + 0);
+        const v128_t qhh = wasm_v128_load(tmp + 2);
+
+        const v128_t v0 = wasm_v128_load(x0->qs);
+
+        // 4-bit -> 8-bit
+        const v128_t v0l = wasm_v128_and (v0, m4b);
+        const v128_t v0h = wasm_u8x16_shr(v0, 4);
+
+        // interleave
+        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h,  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23);
+        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h,  8, 24,  9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+
+        // add high bit and sub 16
+        const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
+        const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
+
+        // load y
+        const v128_t v1l = wasm_v128_load(y0->qs);
+        const v128_t v1h = wasm_v128_load(y0->qs + 16);
+
+        // int8x16 -> int16x8
+        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
+        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
+        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
+        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
+
+        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
+        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
+        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
+        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
+
+        const float x0d = GGML_FP16_TO_FP32(x0->d);
+
+        // dot product
+        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
+                        wasm_i32x4_add(
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
+    }
+
+    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -3311,6 +3377,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
     }
 
     *s = vaddvq_f32(sumv) + summs;
+#elif defined(__wasm_simd128__)
+    v128_t sumv = wasm_f32x4_splat(0.0f);
+
+    float summs = 0.0f;
+
+    uint64_t tmp[4];
+
+    for (int i = 0; i < nb; ++i) {
+        const block_q5_1 * restrict x0 = &x[i];
+        const block_q8_1 * restrict y0 = &y[i];
+
+        summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
+
+        const v128_t m4b = wasm_i8x16_splat(0x0F);
+
+        // extract the 5th bit
+        uint32_t qh;
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_u[(qh >> 24)       ];
+
+        const v128_t qhl = wasm_v128_load(tmp + 0);
+        const v128_t qhh = wasm_v128_load(tmp + 2);
+
+        const v128_t v0 = wasm_v128_load(x0->qs);
+
+        // 4-bit -> 8-bit
+        const v128_t v0l = wasm_v128_and (v0, m4b);
+        const v128_t v0h = wasm_u8x16_shr(v0, 4);
+
+        static bool x = true;
+
+        // interleave
+        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h,  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23);
+        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h,  8, 24,  9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+
+        // add high bit
+        const v128_t v0lf = wasm_v128_or(v0lz, qhl);
+        const v128_t v0hf = wasm_v128_or(v0hz, qhh);
+
+        // load y
+        const v128_t v1l = wasm_v128_load(y0->qs);
+        const v128_t v1h = wasm_v128_load(y0->qs + 16);
+
+        // int8x16 -> int16x8
+        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
+        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
+        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
+        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
+
+        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
+        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
+        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
+        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
+
+        const float x0d = GGML_FP16_TO_FP32(x0->d);
+
+        // dot product
+        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
+                        wasm_i32x4_add(
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
+                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
+                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
+                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
+    }
+
+    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
+         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
 #elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
@@ -4057,6 +4194,27 @@ bool ggml_is_quantized(enum ggml_type type) {
     return GGML_IS_QUANTIZED[type];
 }
 
+enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
+    enum ggml_type wtype = GGML_TYPE_COUNT;
+
+    switch (ftype) {
+        case GGML_FTYPE_ALL_F32:              wtype = GGML_TYPE_F32;   break;
+        case GGML_FTYPE_MOSTLY_F16:           wtype = GGML_TYPE_F16;   break;
+        case GGML_FTYPE_MOSTLY_Q4_0:          wtype = GGML_TYPE_Q4_0;  break;
+        case GGML_FTYPE_MOSTLY_Q4_1:          wtype = GGML_TYPE_Q4_1;  break;
+        case GGML_FTYPE_MOSTLY_Q4_2:          wtype = GGML_TYPE_Q4_2;  break;
+        case GGML_FTYPE_MOSTLY_Q5_0:          wtype = GGML_TYPE_Q5_0;  break;
+        case GGML_FTYPE_MOSTLY_Q5_1:          wtype = GGML_TYPE_Q5_1;  break;
+        case GGML_FTYPE_MOSTLY_Q8_0:          wtype = GGML_TYPE_Q8_0;  break;
+        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
+        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
+    }
+
+    GGML_ASSERT(wtype != GGML_TYPE_COUNT);
+
+    return wtype;
+}
+
 static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
diff --git a/ggml.h b/ggml.h
index c1c5495c6..d6feacd78 100644
--- a/ggml.h
+++ b/ggml.h
@@ -232,6 +232,20 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
+    // model file types
+    enum ggml_ftype {
+        GGML_FTYPE_UNKNOWN     = -1,
+        GGML_FTYPE_ALL_F32     = 0,
+        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        GGML_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+    };
+
     // available tensor operations:
     enum ggml_op {
         GGML_OP_NONE = 0,
@@ -385,6 +399,9 @@ extern "C" {
 
     GGML_API bool    ggml_is_quantized(enum ggml_type type);
 
+    // TODO: temporary until model loading of ggml examples is refactored
+    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);

From 76a884920aa1d2fc0dc7a7ac12dfc5ec5816377c Mon Sep 17 00:00:00 2001
From: 0cc4m <picard12@live.de>
Date: Sun, 30 Apr 2023 20:34:52 +0200
Subject: [PATCH 64/74] ggml : add CLBlast q5_0, q5_1, q8_0 dequant kernels
 (#1225)

* Implement q5_0, q5_1 and q8_0

* Work around q5_0 OpenCL issue

* Fix q8_0 dequant kernel

* Move cl kernels into ggml-opencl.c

* Use two memcpy calls for q5_0 buffer transfer
---
 ggml-opencl-dequant.cl |  63 ------------
 ggml-opencl.c          | 220 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 205 insertions(+), 78 deletions(-)
 delete mode 100644 ggml-opencl-dequant.cl

diff --git a/ggml-opencl-dequant.cl b/ggml-opencl-dequant.cl
deleted file mode 100644
index a65a79f4d..000000000
--- a/ggml-opencl-dequant.cl
+++ /dev/null
@@ -1,63 +0,0 @@
-#define MULTILINE_QUOTE(...) #__VA_ARGS__
-const char * clblast_dequant = MULTILINE_QUOTE(
-
-struct block_q4_0
-{
-    float d;
-    uchar qs[16];
-};
-
-__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*32 + l*2;
-    result[index + 0] = ((vi & 0xf) - 8)*d;
-    result[index + 1] = ((vi >> 4) - 8)*d;
-}
-
-struct block_q4_1
-{
-    float d;
-    float m;
-    uchar qs[16];
-};
-
-__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-    const float m = blocks[i].m;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*32 + l*2;
-    result[index + 0] = (vi & 0xf) * d + m;
-    result[index + 1] = (vi >> 4) * d + m;
-}
-
-struct block_q4_2
-{
-    ushort d;
-    uchar qs[8];
-};
-
-__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 16;
-    const uint l = get_local_id(0);
-
-    const float d = vload_half(0, (__global half*) &blocks[i].d);;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*16 + l*2;
-    result[index + 0] = ((vi & 0xf) - 8)*d;
-    result[index + 1] = ((vi >> 4) - 8)*d;
-}
-
-);
diff --git a/ggml-opencl.c b/ggml-opencl.c
index b748f86b7..4389eca39 100644
--- a/ggml-opencl.c
+++ b/ggml-opencl.c
@@ -3,12 +3,141 @@
 #define CL_TARGET_OPENCL_VERSION 110
 #include <clblast_c.h>
 
+#include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 
 #include "ggml.h"
 
-#include "ggml-opencl-dequant.cl"
+#define MULTILINE_QUOTE(...) #__VA_ARGS__
+const char * clblast_dequant = MULTILINE_QUOTE(
+
+struct block_q4_0
+{
+    float d;
+    uchar qs[16];
+};
+
+__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
+    const uint i = get_global_id(0) / 32;
+    const uint l = get_local_id(0);
+
+    const float d = blocks[i].d;
+
+    const uchar vi = blocks[i].qs[l];
+
+    const uint index = i*32 + l*2;
+    result[index + 0] = ((vi & 0xf) - 8)*d;
+    result[index + 1] = ((vi >> 4) - 8)*d;
+}
+
+struct block_q4_1
+{
+    float d;
+    float m;
+    uchar qs[16];
+};
+
+__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
+    const uint i = get_global_id(0) / 32;
+    const uint l = get_local_id(0);
+
+    const float d = blocks[i].d;
+    const float m = blocks[i].m;
+
+    const uchar vi = blocks[i].qs[l];
+
+    const uint index = i*32 + l*2;
+    result[index + 0] = (vi & 0xf) * d + m;
+    result[index + 1] = (vi >> 4) * d + m;
+}
+
+struct block_q4_2
+{
+    ushort d;
+    uchar qs[8];
+};
+
+__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
+    const uint i = get_global_id(0) / 16;
+    const uint l = get_local_id(0);
+
+    const float d = vload_half(0, (__global half*) &blocks[i].d);
+
+    const uchar vi = blocks[i].qs[l];
+
+    const uint index = i*16 + l*2;
+    result[index + 0] = ((vi & 0xf) - 8)*d;
+    result[index + 1] = ((vi >> 4) - 8)*d;
+}
+
+
+struct block_q5_0
+{
+    float d;
+    uint qh;
+    uchar qs[16];
+};
+
+__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
+    const uint i = get_global_id(0) / 32;
+    const uint l = get_local_id(0);
+
+    const float d = blocks[i].d;
+
+    const uchar vi = blocks[i].qs[l];
+
+    const uint l2 = l * 2;
+
+    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
+    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
+
+    const uint index = i*32 + l2;
+    result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
+    result[index + 1] = (((vi >>  4) | vh1) - 16)*d;
+}
+
+struct block_q5_1
+{
+    ushort d;
+    ushort m;
+    uint qh;
+    uchar qs[16];
+};
+
+__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
+    const uint i = get_global_id(0) / 32;
+    const uint l = get_local_id(0);
+
+    const float d = vload_half(0, (__global half*) &blocks[i].d);
+    const float m = vload_half(0, (__global half*) &blocks[i].m);
+
+    const uchar vi = blocks[i].qs[l];
+
+    const uint l2 = l * 2;
+
+    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
+    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
+
+    const uint index = i*32 + l2;
+    result[index + 0] = ((vi & 0xf) | vh0)*d + m;
+    result[index + 1] = ((vi >>  4) | vh1)*d + m;
+}
+
+struct block_q8_0
+{
+    float d;
+    char qs[32];
+};
+
+__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
+    const uint i = get_global_id(0) / 32;
+    const uint l = get_local_id(0);
+
+    result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
+}
+
+);
 
 #define CL_CHECK(err, name)                                                                     \
     do {                                                                                        \
@@ -19,12 +148,26 @@
         }                                                                                       \
     } while (0)
 
+#define QK5_0 32 // number of values in one Q5_0 block
+typedef struct {
+    ggml_fp16_t d;         // delta (scale), stored as fp16
+    uint8_t qh[4];         // 5-th bit of each quant, one bit per value
+    uint8_t qs[QK5_0 / 2]; // low 4 bits of each quant, two nibbles per byte
+} block_q5_0;
+
+// block_q5_0 with d widened to fp32 and qh as one word; this layout is what dequantize_row_q5_0 reads.
+typedef struct {
+    float d;                // delta (scale), converted from fp16 on the host
+    uint32_t qh;            // 5-th bit of each quant, one bit per value
+    uint8_t qs[QK5_0 / 2];  // low 4 bits of each quant, two nibbles per byte
+} cl_block_q5_0;
+
 static cl_platform_id platform;
 static cl_device_id device;
 static cl_context context;
 static cl_command_queue queue;
 static cl_program program;
-static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2;
+static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
 static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
 static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
 
@@ -97,6 +240,12 @@ void ggml_cl_init(void) {
     CL_CHECK(err, "clCreateKernel");
     kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
     CL_CHECK(err, "clCreateKernel");
+    kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
+    CL_CHECK(err, "clCreateKernel");
+    kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
+    CL_CHECK(err, "clCreateKernel");
+    kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
+    CL_CHECK(err, "clCreateKernel");
 }
 
 static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
@@ -125,6 +274,7 @@ void ggml_cl_sgemm_wrapper(
     cl_kernel kernel;
     size_t global = n * k, local, size_qb;
     bool dequant;
+    cl_block_q5_0* cl_host_b;
 
     switch (btype) {
     case GGML_TYPE_F32:
@@ -146,7 +296,36 @@ void ggml_cl_sgemm_wrapper(
         dequant = true;
         kernel = kernel_q4_2;
         local = 8;
-        size_qb = global * (sizeof(short) + local) / 16;
+        size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
+        break;
+    case GGML_TYPE_Q5_0:
+        dequant = true;
+        kernel = kernel_q5_0;
+        local = 16;
+        // For some reason OpenCL seems to be incapable of working with structs of size 22.
+        // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
+        // TODO Find the reason, fix and remove workaround.
+        const block_q5_0* b = (const block_q5_0*) host_b;
+        cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
+        for (size_t i = 0; i < global / 32; i++) {
+            cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
+            memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
+            memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
+        }
+        host_b = (const float*) cl_host_b;
+        size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
+        break;
+    case GGML_TYPE_Q5_1:
+        dequant = true;
+        kernel = kernel_q5_1;
+        local = 16;
+        size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
+        break;
+    case GGML_TYPE_Q8_0:
+        dequant = true;
+        kernel = kernel_q8_0;
+        local = 32;
+        size_qb = global * (sizeof(float) + local) / 32;
         break;
     default:
         fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
@@ -171,12 +350,15 @@ void ggml_cl_sgemm_wrapper(
         err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
         err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
         CL_CHECK(err, "clSetKernelArg");
-        clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
+        err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
+        CL_CHECK(err, "clEnqueueWriteBuffer qb");
     } else {
-        clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
+        err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
+        CL_CHECK(err, "clEnqueueWriteBuffer b");
     }
 
-    clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
+    err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
+    CL_CHECK(err, "clEnqueueWriteBuffer a");
     if (dequant) {
         err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
         CL_CHECK(err, "clEnqueueNDRangeKernel");
@@ -188,15 +370,20 @@ void ggml_cl_sgemm_wrapper(
     clReleaseEvent(ev_b);
 
     cl_event ev_sgemm;
-    CLBlastSgemm((CLBlastLayout)order,
-                 (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
-                 m, n, k,
-                 alpha,
-                 cl_buffer_a, 0, lda,
-                 cl_buffer_b, 0, ldb,
-                 beta,
-                 cl_buffer_c, 0, ldc,
-                 &queue, &ev_sgemm);
+    CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
+                                            (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
+                                            m, n, k,
+                                            alpha,
+                                            cl_buffer_a, 0, lda,
+                                            cl_buffer_b, 0, ldb,
+                                            beta,
+                                            cl_buffer_c, 0, ldc,
+                                            &queue, &ev_sgemm);
+
+    if (status != CLBlastSuccess) {
+        fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
+        abort();
+    }
 
     cl_event ev_c;
     clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
@@ -205,4 +392,7 @@ void ggml_cl_sgemm_wrapper(
     clWaitForEvents(1, &ev_c);
     clReleaseEvent(ev_sgemm);
     clReleaseEvent(ev_c);
+    if (btype == GGML_TYPE_Q5_0) {
+        free((void*) cl_host_b);
+    }
 }

From a5d30b1f53677cb50791fec41c43e93274347303 Mon Sep 17 00:00:00 2001
From: jon-chuang <9093549+jon-chuang@users.noreply.github.com>
Date: Sun, 30 Apr 2023 14:41:35 -0400
Subject: [PATCH 65/74] common : better default number of threads (#934)

* commit

* fix

* try-catch

* apply code review

* improve

* improve

* add macos headers

* done

* remove color

* fix windows

* minor

* fix

* Apply suggestions from code review

Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>

* remove

* minor

* minor

---------

Co-authored-by: jon-chuang <jon-chuang@users.noreply.github.com>
Co-authored-by: DannyDaemonic <DannyDaemonic@gmail.com>
---
 examples/common.cpp | 49 ++++++++++++++++++++++++++++++++++++---------
 examples/common.h   |  5 +++--
 2 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 6c712c713..ad7b0bba3 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -1,13 +1,18 @@
 #include "common.h"
 
 #include <cassert>
+#include <iostream>
 #include <cstring>
 #include <fstream>
 #include <string>
 #include <iterator>
 #include <algorithm>
 #include <sstream>
-#include <iostream>
+
+#if defined(__APPLE__) && defined(__MACH__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
 
 #if defined (_WIN32)
 #include <fcntl.h>
@@ -25,19 +30,43 @@ extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int
 #define CP_UTF8 65001
 #endif
 
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
-    // determine sensible default number of threads.
-    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+int32_t get_num_physical_cores() {
 #ifdef __linux__
     std::ifstream cpuinfo("/proc/cpuinfo");
-    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
-                                  std::istream_iterator<std::string>(),
-                                  std::string("processor"));
-#endif
-    if (params.n_threads == 0) {
-        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+    std::string line;
+    while (std::getline(cpuinfo, line)) {
+        std::size_t pos = line.find("cpu cores");
+        if (pos != std::string::npos) {
+            pos = line.find(": ", pos);
+            if (pos != std::string::npos) {
+                try {
+                    // "cpu cores" line found: parse the count that follows ": " and return it
+                    return static_cast<int32_t>(std::stoul(line.substr(pos + 2)));
+                } catch (const std::exception &) {
+                    // stoul throws invalid_argument or out_of_range; on either, keep scanning
+                }
+            }
+        }
     }
+#elif defined(__APPLE__) && defined(__MACH__)
+    int32_t num_physical_cores;
+    size_t len = sizeof(num_physical_cores);
+    int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
+    if (result == 0) {
+        return num_physical_cores;
+    }
+    result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
+    if (result == 0) {
+        return num_physical_cores;
+    }
+#elif defined(_WIN32)
+    // TODO: implement for Windows; until then this falls through to the heuristic below
+#endif
+    unsigned int n_threads = std::thread::hardware_concurrency();
+    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
+}
 
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;
     gpt_params default_params;
diff --git a/examples/common.h b/examples/common.h
index fce1d42a9..627696e30 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -13,11 +13,12 @@
 //
 // CLI argument parsing
 //
+int32_t get_num_physical_cores();
 
 struct gpt_params {
     int32_t seed          = -1;   // RNG seed
-    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_predict     = -1;   // new tokens to predict
+    int32_t n_threads     = get_num_physical_cores();
+    int32_t n_predict     = -1;  // new tokens to predict
     int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx         = 512;  // context size
     int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)

From 6f796992869f306c48484d62a39f2a181ae2fd6f Mon Sep 17 00:00:00 2001
From: Pavol Rusnak <pavol@rusnak.io>
Date: Sun, 30 Apr 2023 20:48:38 +0200
Subject: [PATCH 66/74] build: add armv{6,7,8} support to cmake (#1251)

- flags copied from Makefile
- updated comments in both CMakeLists.txt and Makefile to match reality
---
 CMakeLists.txt | 15 ++++++++++++++-
 Makefile       |  8 +++++---
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bbf599559..098306126 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -258,9 +258,22 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
         # TODO: arm msvc?
     else()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+            # Apple M1, M2, etc.
+            # Raspberry Pi 3, 4, Zero 2 (64-bit)
             add_compile_options(-mcpu=native)
         endif()
-        # TODO: armv6,7,8 version specific flags
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
+            # Raspberry Pi 1, Zero
+            add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
+            # Raspberry Pi 2
+            add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+            # Raspberry Pi 3, 4, Zero 2 (32-bit)
+            add_compile_options(-mfp16-format=ieee -mno-unaligned-access)
+        endif()
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
     message(STATUS "x86 detected")
diff --git a/Makefile b/Makefile
index 6d89401c8..1d62a4438 100644
--- a/Makefile
+++ b/Makefile
@@ -135,19 +135,21 @@ ifdef LLAMA_PERF
 	CXXFLAGS += -DGGML_PERF
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
+	# Apple M1, M2, etc.
+	# Raspberry Pi 3, 4, Zero 2 (64-bit)
 	CFLAGS   += -mcpu=native
 	CXXFLAGS += -mcpu=native
 endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
-	# Raspberry Pi 1, 2, 3
+	# Raspberry Pi 1, Zero
 	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
 endif
 ifneq ($(filter armv7%,$(UNAME_M)),)
-	# Raspberry Pi 4
+	# Raspberry Pi 2
 	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
 endif
 ifneq ($(filter armv8%,$(UNAME_M)),)
-	# Raspberry Pi 4
+	# Raspberry Pi 3, 4, Zero 2 (32-bit)
 	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif
 

From 7ff0dcd32091c703a12adb0c57c32c565ce17664 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 30 Apr 2023 22:28:51 +0300
Subject: [PATCH 67/74] ggml : fix UB (int << 31)

---
 ggml.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/ggml.c b/ggml.c
index 17c03ad40..8cc48344e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1911,8 +1911,8 @@ static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, in
             const uint8_t vi = pp[l/2];
 
             // extract the 5-th bit from qh
-            const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
-            const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+            const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
+            const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
 
             const int8_t vi0 = (vi & 0x0F) | vh0;
             const int8_t vi1 = (vi >>   4) | vh1;
@@ -1948,8 +1948,8 @@ static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, in
             const uint8_t vi = pp[l/2];
 
             // extract the 5-th bit from qh
-            const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
-            const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+            const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
+            const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
 
             const uint8_t vi0 = (vi & 0x0F) | vh0;
             const uint8_t vi1 = (vi >>   4) | vh1;
@@ -3286,8 +3286,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         for (int j = 0; j < QK8_0/2; j++) {
             const uint8_t v0 = x0[j];
 
-            const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4;
-            const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4;
+            const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
+            const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
 
             const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16;
             const int x1_0 = ((v0 >>   4) | x1_0h) - 16;
@@ -3491,8 +3491,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         for (int j = 0; j < QK8_1/2; j++) {
             const uint8_t v0 = x0[j];
 
-            const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4;
-            const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4;
+            const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
+            const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
 
             const int x0_0 = (v0 & 0x0F) | x0_0h;
             const int x1_0 = (v0 >>   4) | x1_0h;
@@ -13057,8 +13057,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
             memcpy(&qh, &y[i].qh, sizeof(qh));
 
             for (int l = 0; l < QK5_0; l += 2) {
-                const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
-                const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+                const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
+                const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
 
                 // cast to 16 bins
                 const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
@@ -13087,8 +13087,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
             memcpy(&qh, &y[i].qh, sizeof(qh));
 
             for (int l = 0; l < QK5_1; l += 2) {
-                const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
-                const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
+                const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
+                const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
 
                 // cast to 16 bins
                 const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;

From 90b19bd6eee943832584f9cac0b6f9ea29cc42a4 Mon Sep 17 00:00:00 2001
From: Alex Klinkhamer <git@grencez.dev>
Date: Mon, 1 May 2023 00:24:20 -0700
Subject: [PATCH 68/74] llama : let context be const when accessing const data
 (#1261)

---
 llama.cpp | 12 ++++++------
 llama.h   | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index f8b4c8e46..3d82113a0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2373,7 +2373,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }
 
-int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     return ctx->model.kv_self.n;
 }
 
@@ -2387,7 +2387,7 @@ void llama_set_rng_seed(struct llama_context * ctx, int seed) {
 }
 
 // Returns the size of the state
-size_t llama_get_state_size(struct llama_context * ctx) {
+size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
     // for reference, std::mt19937(1337) serializes to 6701 bytes.
     const size_t s_rng_size        = sizeof(size_t);
@@ -2605,15 +2605,15 @@ int llama_tokenize(
     return res.size();
 }
 
-int llama_n_vocab(struct llama_context * ctx) {
+int llama_n_vocab(const struct llama_context * ctx) {
     return ctx->vocab.id_to_token.size();
 }
 
-int llama_n_ctx(struct llama_context * ctx) {
+int llama_n_ctx(const struct llama_context * ctx) {
     return ctx->model.hparams.n_ctx;
 }
 
-int llama_n_embd(struct llama_context * ctx) {
+int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
@@ -2625,7 +2625,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }
 
-const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     if (token >= llama_n_vocab(ctx)) {
         return nullptr;
     }
diff --git a/llama.h b/llama.h
index 34a8f5b3c..9fbba7643 100644
--- a/llama.h
+++ b/llama.h
@@ -120,13 +120,13 @@ extern "C" {
                              int   n_threads);
 
     // Returns the number of tokens in the KV cache
-    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
 
     // Sets the current rng seed.
     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
 
     // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
-    LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
 
     // Copies the state to the specified destination address.
     // Destination needs to have allocated enough memory.
@@ -164,9 +164,9 @@ extern "C" {
                              int   n_max_tokens,
                             bool   add_bos);
 
-    LLAMA_API int llama_n_vocab(struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (struct llama_context * ctx);
+    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
     // Token logits obtained from the last call to llama_eval()
     // The logits for the last token are stored in the last row
@@ -180,7 +180,7 @@ extern "C" {
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
     // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
 
     // Special tokens
     LLAMA_API llama_token llama_token_bos();

From b925f1f1b082319ee69943f8d1a83ac9b6ff09ca Mon Sep 17 00:00:00 2001
From: slaren <2141330+slaren@users.noreply.github.com>
Date: Mon, 1 May 2023 13:32:22 +0200
Subject: [PATCH 69/74] cuBLAS: fall back to pageable memory if pinned alloc
 fails (#1233)

* cuBLAS: fall back to pageable memory if pinned alloc fails

* cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set
---
 ggml-cuda.cu | 14 ++++++++++++--
 llama-util.h | 44 +++++++++++++++++++++++++++++++++++++++-----
 llama.cpp    |  3 +--
 3 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 5a2701cfe..c1ec306f0 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -355,8 +355,18 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
 }
 
 void * ggml_cuda_host_malloc(size_t size) {
-    void * ptr;
-    CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    cudaError_t err = cudaMallocHost((void **) &ptr, size);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+            size/1024.0/1024.0, cudaGetErrorString(err));
+        return nullptr;
+    }
+
     return ptr;
 }
 
diff --git a/llama-util.h b/llama-util.h
index ca4dd162f..5f9f70ecc 100644
--- a/llama-util.h
+++ b/llama-util.h
@@ -395,6 +395,8 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
 
+    llama_buffer() = default;
+
     void resize(size_t size) {
         delete[] addr;
         addr = new uint8_t[size];
@@ -404,27 +406,59 @@ struct llama_buffer {
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
 
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
+    bool is_cuda = false; // true when addr came from ggml_cuda_host_malloc; selects the deallocator in free()
     size_t size = 0;
 
+    llama_ctx_buffer() = default;
+
     void resize(size_t size) {
-        if (addr) {
-            ggml_cuda_host_free(addr);
-        }
+        free();
+
         addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        if (addr) {
+            is_cuda = true;
+        }
+        else {
+            // host malloc returned NULL: fall back to pageable memory from new[]
+            addr = new uint8_t[size];
+            is_cuda = false;
+        }
         this->size = size;
     }
 
-    ~llama_ctx_buffer() {
+    void free() {
         if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
         }
+        addr = NULL;
     }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move: the struct owns addr and releases it in free()
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
 };
 #else
 typedef llama_buffer llama_ctx_buffer;
diff --git a/llama.cpp b/llama.cpp
index 3d82113a0..0d094a52f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -727,8 +727,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(offset == lt.size);
         } else if (lt.split_type == SPLIT_BY_COLUMNS) {
             // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs;
-            tmp_bufs.resize(lt.shards.size());
+            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
             for (size_t i = 0; i < lt.shards.size(); i++) {
                 llama_load_tensor_shard & shard = lt.shards.at(i);
                 llama_file & file = file_loaders.at(shard.file_idx)->file;

From 70269cae37538461ff816e714afbb3ebcdcdc26b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 1 May 2023 14:54:59 +0300
Subject: [PATCH 70/74] llama : fix session load / save (#1263)

---
 examples/main/main.cpp |  20 +++----
 llama.cpp              | 133 ++++++++++++++++++++++++-----------------
 llama.h                |  12 ++--
 3 files changed, 96 insertions(+), 69 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 990d0fa02..78fc9a197 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -161,23 +161,22 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> session_tokens;
 
     if (!path_session.empty()) {
-        fprintf(stderr, "%s: attempting to load saved session from %s..\n", __func__, path_session.c_str());
+        fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
 
-        // REVIEW - fopen to check for existing session
+        // fopen to check for existing session
         FILE * fp = std::fopen(path_session.c_str(), "rb");
         if (fp != NULL) {
             std::fclose(fp);
 
             session_tokens.resize(params.n_ctx);
             size_t n_token_count_out = 0;
-            const size_t n_session_bytes = llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out);
+            if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
+                fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
+                return 1;
+            }
             session_tokens.resize(n_token_count_out);
 
-            if (n_session_bytes > 0) {
-                fprintf(stderr, "%s: loaded %zu bytes of session data!\n", __func__, n_session_bytes);
-            } else {
-                fprintf(stderr, "%s: could not load session file, will recreate\n", __func__);
-            }
+            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
         } else {
             fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
         }
@@ -214,7 +213,7 @@ int main(int argc, char ** argv) {
     }
 
     // number of tokens to keep when resetting context
-    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
+    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
         params.n_keep = (int)embd_inp.size();
     }
 
@@ -329,7 +328,7 @@ int main(int argc, char ** argv) {
                 // insert n_left/2 tokens at the start of embd from last_n_tokens
                 embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
 
-                // REVIEW - stop saving session if we run out of context
+                // stop saving session if we run out of context
                 path_session = "";
 
                 //printf("\n---\n");
@@ -355,6 +354,7 @@ int main(int argc, char ** argv) {
                     n_session_consumed++;
 
                     if (n_session_consumed >= (int) session_tokens.size()) {
+                        ++i;
                         break;
                     }
                 }
diff --git a/llama.cpp b/llama.cpp
index 0d094a52f..868a58a8b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2566,6 +2566,85 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
     return nread;
 }
 
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    llama_file file(path_session, "rb");
+
+    // sanity checks
+    {
+        const uint32_t magic   = file.read_u32();
+        const uint32_t version = file.read_u32();
+
+        if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+            fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+            return false;
+        }
+
+        llama_hparams session_hparams;
+        file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+        if (session_hparams != ctx->model.hparams) {
+            fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+            return false;
+        }
+    }
+
+    // load the prompt
+    {
+        const uint32_t n_token_count = file.read_u32();
+
+        if (n_token_count > n_token_capacity) {
+            fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+            return false;
+        }
+
+        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+        *n_token_count_out = n_token_count;
+    }
+
+    // restore the context state
+    {
+        const size_t n_state_size_cur = file.size - file.tell();
+        const size_t n_state_size_exp = llama_get_state_size(ctx);
+
+        if (n_state_size_cur != n_state_size_exp) {
+            fprintf(stderr, "%s : the state size in session file didn't match! expected %zu, got %zu\n", __func__, n_state_size_exp, n_state_size_cur);
+            return false;
+        }
+
+        std::vector<uint8_t> state_data(n_state_size_cur);
+        file.read_raw(state_data.data(), n_state_size_cur);
+
+        llama_set_state_data(ctx, state_data.data());
+    }
+
+    return true;
+}
+
+bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    llama_file file(path_session, "wb");
+
+    file.write_u32(LLAMA_SESSION_MAGIC);
+    file.write_u32(LLAMA_SESSION_VERSION);
+
+    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+    // save the prompt
+    file.write_u32((uint32_t) n_token_count);
+    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+    // save the context state
+    {
+        const size_t n_state_size = llama_get_state_size(ctx);
+
+        std::vector<uint8_t> state_data(n_state_size);
+        llama_copy_state_data(ctx, state_data.data());
+
+        file.write_raw(state_data.data(), n_state_size);
+    }
+
+    return true;
+}
+
 int llama_eval(
         struct llama_context * ctx,
            const llama_token * tokens,
@@ -2693,57 +2772,3 @@ const char * llama_print_system_info(void) {
 std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }
-
-size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
-    // TODO leverage mmap
-    llama_file file(path_session, "rb");
-    const uint32_t magic = file.read_u32();
-    const uint32_t version = file.read_u32();
-
-    if (!(magic == 'ggsn' && version == 0)) {
-        fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
-        return 0;
-    }
-
-    llama_hparams session_hparams;
-    file.read_raw(&session_hparams, sizeof(llama_hparams));
-
-    // REVIEW
-    if (session_hparams != ctx->model.hparams) {
-        fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
-        return 0;
-    }
-
-    const uint32_t n_token_count = file.read_u32();
-    LLAMA_ASSERT(n_token_capacity >= n_token_count);
-    file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
-    *n_token_count_out = n_token_count;
-
-    const size_t n_state_size = file.size - file.tell();
-    const size_t n_orig_state_size = llama_get_state_size(ctx);
-    if (n_state_size != n_orig_state_size) {
-        fprintf(stderr, "%s : failed to validate state size\n", __func__);
-    }
-    std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
-    file.read_raw(state_data.get(), n_state_size);
-    return llama_set_state_data(ctx, state_data.get());
-}
-
-size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
-    // TODO save temp & swap
-    llama_file file(path_session, "wb");
-
-    const size_t n_state_size = llama_get_state_size(ctx);
-    std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
-    llama_copy_state_data(ctx, state_data.get());
-
-    file.write_u32('ggsn'); // magic
-    file.write_u32(0); // version
-    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
-
-    file.write_u32((uint32_t) n_token_count); // REVIEW
-    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
-
-    file.write_raw(state_data.get(), n_state_size);
-    return n_state_size; // REVIEW
-}
diff --git a/llama.h b/llama.h
index 9fbba7643..2f6ce8d83 100644
--- a/llama.h
+++ b/llama.h
@@ -19,9 +19,11 @@
 #    define LLAMA_API
 #endif
 
-#define LLAMA_FILE_VERSION 1
-#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
-#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
+#define LLAMA_FILE_VERSION           1
+#define LLAMA_FILE_MAGIC             'ggjt'
+#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
+#define LLAMA_SESSION_MAGIC          'ggsn'
+#define LLAMA_SESSION_VERSION        0
 
 #ifdef __cplusplus
 extern "C" {
@@ -138,8 +140,8 @@ extern "C" {
     LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
 
     // Save/load session file
-    LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
-    LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
+    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+    LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
 
     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process

From 2bdc09646d8c6cb74a6f573e9081586b4b83b9d1 Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Mon, 1 May 2023 05:56:07 -0600
Subject: [PATCH 71/74] ggml : fix ggml_used_mem() (#1264)

---
 ggml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index 8cc48344e..5b5ed925e 100644
--- a/ggml.c
+++ b/ggml.c
@@ -4411,7 +4411,7 @@ void ggml_free(struct ggml_context * ctx) {
 }
 
 size_t ggml_used_mem(const struct ggml_context * ctx) {
-    return ctx->objects_end->offs + ctx->objects_end->size;
+    return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
 }
 
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {

From ea3a0ad6b6b5ca4693b94acd4cb32e2803f66fae Mon Sep 17 00:00:00 2001
From: xloem <0xloem@gmail.com>
Date: Mon, 1 May 2023 08:58:51 -0400
Subject: [PATCH 72/74] llama : update stubs for systems without mmap and mlock
 (#1266)

Co-authored-by: John Doe <john.doe@example.com>
---
 llama-util.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/llama-util.h b/llama-util.h
index 5f9f70ecc..d531588d5 100644
--- a/llama-util.h
+++ b/llama-util.h
@@ -243,7 +243,8 @@ struct llama_mmap {
 #else
     static constexpr bool SUPPORTED = false;
 
-    llama_mmap(struct llama_file *) {
+    llama_mmap(struct llama_file *, bool prefetch = true) {
+        (void)prefetch;
         throw std::string("mmap not supported");
     }
 #endif
@@ -382,8 +383,13 @@ struct llama_mlock {
 #else
     static constexpr bool SUPPORTED = false;
 
-    void raw_lock(const void * addr, size_t size) {
+    size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    bool raw_lock(const void * addr, size_t size) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
     }
 
     void raw_unlock(const void * addr, size_t size) {}

From 58b367c2d757c0ea12aec672382462b42204c724 Mon Sep 17 00:00:00 2001
From: slaren <2141330+slaren@users.noreply.github.com>
Date: Mon, 1 May 2023 18:11:07 +0200
Subject: [PATCH 73/74] cuBLAS: refactor and optimize f16 mat mul performance
 (#1259)

* cuBLAS: refactor, convert fp16 to fp32 on device

* cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16

* fix build

* cuBLAS: update block_q5_1
---
 ggml-cuda.cu | 429 +++++++++++++++++++++++++++++++++++++++++++++------
 ggml-cuda.h  |  47 +-----
 ggml.c       | 252 ++++++++++--------------------
 ggml.h       |  11 ++
 4 files changed, 480 insertions(+), 259 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index c1ec306f0..e8a1e77cb 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1,11 +1,38 @@
+#include <cstddef>
+#include <cstdint>
 #include <stdint.h>
 #include <stdio.h>
-#include <cuda_fp16.h>
 #include <atomic>
-#include "ggml-cuda.h"
 
-typedef uint16_t ggml_fp16_t;
-static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cuda_fp16.h>
+
+#include "ggml-cuda.h"
+#include "ggml.h"
+
+static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+#define CUDA_CHECK(err)                                                                 \
+    do {                                                                                \
+        cudaError_t err_ = (err);                                                       \
+        if (err_ != cudaSuccess) {                                                      \
+            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__,   \
+                cudaGetErrorString(err_));                                              \
+            exit(1);                                                                    \
+        }                                                                               \
+    } while (0)
+
+#define CUBLAS_CHECK(err)                                                               \
+    do {                                                                                \
+        cublasStatus_t err_ = (err);                                                    \
+        if (err_ != CUBLAS_STATUS_SUCCESS) {                                            \
+            fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);    \
+            exit(1);                                                                    \
+        }                                                                               \
+    } while (0)
+
+typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
 
 #define QK4_0 32
 typedef struct {
@@ -24,14 +51,14 @@ static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 b
 
 #define QK4_2 16
 typedef struct {
-    __half  d;              // delta
+    half  d;                // delta
     uint8_t qs[QK4_2 / 2];  // nibbles / quants
 } block_q4_2;
 static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
 
 #define QK5_0 32
 typedef struct {
-    __half d;               // delta
+    half d;                 // delta
     uint8_t qh[4];          // 5-th bit of quants
     uint8_t qs[QK5_0 / 2];  // nibbles / quants
 } block_q5_0;
@@ -39,9 +66,9 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 
 #define QK5_1 32
 typedef struct {
-    __half d;               // delta
-    __half m;               // min
-    uint32_t qh;            // 5-th bit of quants
+    half d;                 // delta
+    half m;                 // min
+    uint8_t qh[4];          // 5-th bit of quants
     uint8_t qs[QK5_1 / 2];  // nibbles / quants
 } block_q5_1;
 static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
@@ -162,7 +189,8 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
 
     const uint8_t * pp = x[i].qs;
 
-    const uint32_t qh = x[i].qh;
+    uint32_t qh;
+    memcpy(&qh, x[i].qh, sizeof(qh));
 
     for (int l = 0; l < QK5_1; l += 2) {
         const uint8_t vi = pp[l/2];
@@ -197,37 +225,50 @@ static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
     }
 }
 
-void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK4_0;
     dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK4_1;
     dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK4_2;
     dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK5_0;
     dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK5_1;
     dequantize_block_q5_1<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
     const int nb = k / QK8_0;
     dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
 }
 
-dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) {
+// TODO: optimize
+static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
+    const half * x = (const half *) vx;
+
+    const int i = blockIdx.x;
+
+    y[i] = __half2float(x[i]);
+}
+
+static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStream_t stream) {
+    convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
+}
+
+static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
             return dequantize_row_q4_0_cuda;
@@ -241,6 +282,8 @@ dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) {
             return dequantize_row_q5_1_cuda;
         case GGML_TYPE_Q8_0:
             return dequantize_row_q8_0_cuda;
+        case GGML_TYPE_F16:
+            return convert_fp16_to_fp32_cuda;
         default:
             return nullptr;
     }
@@ -271,7 +314,7 @@ struct cuda_buffer {
 static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
 
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -290,7 +333,7 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     return ptr;
 }
 
-void ggml_cuda_pool_free(void * ptr, size_t size) {
+static void ggml_cuda_pool_free(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -305,28 +348,55 @@ void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
 }
 
-cublasHandle_t g_cublasH = nullptr;
-cudaStream_t g_cudaStream = nullptr;
-cudaStream_t g_cudaStream2 = nullptr;
-cudaEvent_t g_cudaEvent = nullptr;
+#define GGML_CUDA_MAX_STREAMS 8
+#define GGML_CUDA_MAX_EVENTS 64
+static cublasHandle_t g_cublasH = nullptr;
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
+static cudaStream_t g_cudaStreams2[GGML_CUDA_MAX_STREAMS] = { nullptr };
+static cudaEvent_t g_cudaEvents[GGML_CUDA_MAX_EVENTS] = { nullptr };
 
 void ggml_init_cublas() {
     if (g_cublasH == nullptr) {
-        // create cublas handle, bind a stream
-        CUBLAS_CHECK(cublasCreate(&g_cublasH));
-        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
-        CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
+        // create streams
+        for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
+            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[i], cudaStreamNonBlocking));
+            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams2[i], cudaStreamNonBlocking));
+        }
+        // create events
+        for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents[i], cudaEventDisableTiming));
+        }
 
-        // create additional stream and event for synchronization
-        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking));
-        CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming));
+        // create cublas handle
+        CUBLAS_CHECK(cublasCreate(&g_cublasH));
+        CUBLAS_CHECK(cublasSetMathMode(g_cublasH, CUBLAS_TF32_TENSOR_OP_MATH));
 
         // configure logging to stdout
-        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
+        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
     }
 }
 
-cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
+void * ggml_cuda_host_malloc(size_t size) {
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    cudaError_t err = cudaMallocHost((void **) &ptr, size);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+            size/1024.0/1024.0, cudaGetErrorString(err));
+        return nullptr;
+    }
+
+    return ptr;
+}
+
+void ggml_cuda_host_free(void * ptr) {
+    CUDA_CHECK(cudaFreeHost(ptr));
+}
+
+static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
     const uint64_t ne0 = src->ne[0];
     const uint64_t ne1 = src->ne[1];
     const uint64_t nb0 = src->nb[0];
@@ -354,22 +424,293 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
     }
 }
 
-void * ggml_cuda_host_malloc(size_t size) {
-    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
-        return nullptr;
+static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    const int nb2  = dst->nb[2];
+    const int nb3  = dst->nb[3];
+
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+    const int n_mm = ne03 * ne02;
+
+    size_t x_size, y_size, d_size;
+    float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
+    float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            int i = i03*ne02 + i02;
+            cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
+
+            float * c_X = d_X + i * x_ne;
+            float * c_Y = d_Y + i * y_ne;
+            float * c_D = d_D + i * d_ne;
+
+            // copy data to device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
+
+            // compute
+            CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
+            CUBLAS_CHECK(
+                cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                        ne01, ne11, ne10,
+                        &alpha, c_X, ne00,
+                                c_Y, ne10,
+                        &beta,  c_D, ne01));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+        }
     }
 
-    void * ptr = nullptr;
-    cudaError_t err = cudaMallocHost((void **) &ptr, size);
-    if (err != cudaSuccess) {
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
-            size/1024.0/1024.0, cudaGetErrorString(err));
-        return nullptr;
+    CUDA_CHECK(cudaDeviceSynchronize());
+    ggml_cuda_pool_free(d_X, x_size);
+    ggml_cuda_pool_free(d_Y, y_size);
+    ggml_cuda_pool_free(d_D, d_size);
+}
+
+static void ggml_cuda_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    const int nb10 = src1->nb[0];
+    const int nb11 = src1->nb[1];
+    const int nb12 = src1->nb[2];
+    const int nb13 = src1->nb[3];
+
+    const int nb2  = dst->nb[2];
+    const int nb3  = dst->nb[3];
+
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+    const int n_mm = ne03 * ne02;
+
+    size_t x_size, y_size, d_size;
+    half  * d_X =  (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * x_ne, &x_size);
+    half  * d_Y =  (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * y_ne, &y_size);
+    float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
+
+    bool src1_cont_rows = nb10 == sizeof(float);
+    bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            int i = i03*ne02 + i02;
+            cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
+
+            half  * c_X = d_X + i * x_ne;
+            half  * c_Y = d_Y + i * y_ne;
+            float * c_D = d_D + i * d_ne;
+
+            // copy src0 to device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
+
+            // convert src1 to fp16
+            // TODO: use multiple threads
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
+            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            if (src1_cont_rows) {
+                if (src1_cont_cols) {
+                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+                }
+                else {
+                    for (int64_t i01 = 0; i01 < ne11; i01++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                    }
+                }
+            }
+            else {
+                for (int64_t i01 = 0; i01 < ne11; i01++) {
+                    for (int64_t i00 = 0; i00 < ne10; i00++) {
+                        // very slow due to no inlining
+                        tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+                    }
+                }
+            }
+
+            // copy src1 to device
+            CUDA_CHECK(cudaMemcpyAsync(c_Y, tmp, sizeof(half) * y_ne, cudaMemcpyHostToDevice, cudaStream));
+
+            // compute
+            CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
+            CUBLAS_CHECK(
+                cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                        ne01, ne11, ne10,
+                        &alpha, c_X, CUDA_R_16F, ne00,
+                                c_Y, CUDA_R_16F, ne10,
+                        &beta,  c_D, CUDA_R_32F, ne01,
+                        CUBLAS_COMPUTE_32F_FAST_16F,
+                        CUBLAS_GEMM_DEFAULT));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+        }
     }
 
-    return ptr;
+    CUDA_CHECK(cudaDeviceSynchronize());
+    ggml_cuda_pool_free(d_X, x_size);
+    ggml_cuda_pool_free(d_Y, y_size);
+    ggml_cuda_pool_free(d_D, d_size);
 }
 
-void ggml_cuda_host_free(void * ptr) {
-    CUDA_CHECK(cudaFreeHost(ptr));
+static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    const int nb2  = dst->nb[2];
+    const int nb3  = dst->nb[3];
+    const ggml_type type = src0->type;
+
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+    const int n_mm = ne03 * ne02;
+    const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
+
+    size_t x_size, y_size, d_size, q_size;
+    float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
+    float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
+    char  * d_Q = (char  *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
+    GGML_ASSERT(to_fp32_cuda != nullptr);
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            int i = i03*ne02 + i02;
+            cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
+            cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS];
+            cudaEvent_t  cudaEvent = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS];
+
+            float * c_X = d_X + i * x_ne;
+            float * c_Y = d_Y + i * y_ne;
+            float * c_D = d_D + i * d_ne;
+            char  * c_Q = d_Q + i * q_sz;
+
+            // copy src0 and convert to fp32 on device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
+            to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
+            CUDA_CHECK(cudaGetLastError());
+            CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
+
+            // copy src1 to device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
+
+            // wait for conversion
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
+
+            // compute
+            CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
+            CUBLAS_CHECK(
+                cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                        ne01, ne11, ne10,
+                        &alpha, c_X, ne00,
+                                c_Y, ne10,
+                        &beta,  c_D, ne01));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+        }
+    }
+
+    CUDA_CHECK(cudaDeviceSynchronize());
+    ggml_cuda_pool_free(d_X, x_size);
+    ggml_cuda_pool_free(d_Y, y_size);
+    ggml_cuda_pool_free(d_D, d_size);
+    ggml_cuda_pool_free(d_Q, q_size);
+}
+
+bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    const int64_t ne10 = src1->ne[0];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    // TODO: find the optimal values for these
+    if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+        src1->type == GGML_TYPE_F32 &&
+        dst->type == GGML_TYPE_F32 &&
+        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
+
+        return true;
+    }
+
+    return false;
+}
+
+bool ggml_cuda_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
+    size_t src0_sz = ggml_nbytes(src0);
+    size_t src1_sz = ggml_nbytes(src1);
+
+    // mul_mat_q: src0 is converted to fp32 on device
+    size_t mul_mat_q_transfer = src0_sz + src1_sz;
+
+    // mul_mat_f16: src1 is converted to fp16 on cpu
+    size_t mul_mat_f16_transfer = src0_sz + sizeof(half) * ggml_nelements(src1);
+
+    // choose the smaller one to transfer to the device
+    // TODO: this is not always the best choice due to the overhead of converting to fp16
+    return mul_mat_f16_transfer < mul_mat_q_transfer;
+}
+
+void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
+    GGML_ASSERT(ggml_cuda_can_mul_mat(src0, src1, dst));
+
+    if (src0->type == GGML_TYPE_F32) {
+        ggml_cuda_mul_mat_f32(src0, src1, dst);
+    }
+    else if (src0->type == GGML_TYPE_F16) {
+        if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
+            ggml_cuda_mul_mat_f16(src0, src1, dst, wdata, wsize);
+        }
+        else {
+            ggml_cuda_mul_mat_q_f32(src0, src1, dst);
+        }
+    }
+    else if (ggml_is_quantized(src0->type)) {
+        ggml_cuda_mul_mat_q_f32(src0, src1, dst);
+    }
+    else {
+        GGML_ASSERT(false);
+    }
+}
+
+size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
+        return ggml_nelements(src1) * sizeof(ggml_fp16_t);
+    }
+    else {
+        return 0;
+    }
 }
diff --git a/ggml-cuda.h b/ggml-cuda.h
index 36782d9e7..f7d6a8bc1 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -1,54 +1,19 @@
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
 #include "ggml.h"
 
 #ifdef  __cplusplus
 extern "C" {
 #endif
 
-#define CUDA_CHECK(err)                                                                 \
-    do {                                                                                \
-        cudaError_t err_ = (err);                                                       \
-        if (err_ != cudaSuccess) {                                                      \
-            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__,   \
-                cudaGetErrorString(err_));                                              \
-            exit(1);                                                                    \
-        }                                                                               \
-    } while (0)
-
-#define CUBLAS_CHECK(err)                                                               \
-    do {                                                                                \
-        cublasStatus_t err_ = (err);                                                    \
-        if (err_ != CUBLAS_STATUS_SUCCESS) {                                            \
-            fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);    \
-            exit(1);                                                                    \
-        }                                                                               \
-    } while (0)
-
-extern cublasHandle_t g_cublasH;
-extern cudaStream_t g_cudaStream;
-extern cudaStream_t g_cudaStream2;
-extern cudaEvent_t g_cudaEvent;
-
 void   ggml_init_cublas(void);
+
+bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+
+// TODO: export these with GGML_API
 void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);
 
-void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
-void   ggml_cuda_pool_free(void * ptr, size_t size);
-
-void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
-void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
-void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
-void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
-void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
-void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
-
-cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream);
-
-typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
-dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type);
-
 #ifdef  __cplusplus
 }
 #endif
diff --git a/ggml.c b/ggml.c
index 5b5ed925e..bce7a7a57 100644
--- a/ggml.c
+++ b/ggml.c
@@ -135,14 +135,6 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #define UNUSED(x) (void)(x)
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
 
-#define GGML_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
 #elif defined(GGML_USE_OPENBLAS)
@@ -370,6 +362,32 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
     return GGML_FP32_TO_FP16(x);
 }
 
+void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) { // converts n fp16 values read from x into fp32 values written to y
+    for (size_t i = 0; i < n; i++) {
+        y[i] = GGML_FP16_TO_FP32(x[i]); // element-wise conversion via the GGML_FP16_TO_FP32 macro
+    }
+}
+
+void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) { // converts n fp32 values read from x into fp16 values written to y
+    size_t i = 0; // shared index: each loop below resumes where the previous one stopped
+#if defined(__F16C__)
+    for (; i + 7 < n; i += 8) { // 8 elements per iteration while at least 8 remain
+        __m256 x_vec = _mm256_loadu_ps(x + i);
+        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storeu_si128((__m128i *)(y + i), y_vec); // unaligned 128-bit store
+    }
+    for(; i + 3 < n; i += 4) { // then 4 elements per iteration while at least 4 remain
+        __m128 x_vec = _mm_loadu_ps(x + i);
+        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
+        _mm_storel_epi64((__m128i *)(y + i), y_vec); // stores only the low 64 bits of y_vec
+    }
+#endif
+    for (; i < n; i++) { // scalar loop: the remaining tail, or every element when __F16C__ is not defined
+        y[i] = GGML_FP32_TO_FP16(x[i]);
+    }
+}
+
+
 //
 // timing
 //
@@ -4325,12 +4343,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
             GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }
 
-        // initialize cuBLAS
-        #if defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_CUBLAS)
         ggml_init_cublas();
-        #elif defined(GGML_USE_CLBLAST)
+#elif defined(GGML_USE_CLBLAST)
         ggml_cl_init();
-        #endif
+#endif
 
         is_first_call = false;
     }
@@ -8101,7 +8118,7 @@ static void ggml_compute_forward_rms_norm(
 
 // ggml_compute_forward_mul_mat
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
 static bool ggml_compute_forward_mul_mat_use_blas(
@@ -8117,12 +8134,9 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     const int64_t ne1 = dst->ne[1];
 
     // TODO: find the optimal values for these
-    if (
-#if !defined(GGML_USE_CUBLAS)
-        ggml_is_contiguous(src0) &&
+    if (ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
-#endif
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
+        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
 
         /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
         return true;
@@ -8130,7 +8144,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 
     return false;
 }
-
 #endif
 
 static void ggml_compute_forward_mul_mat_f32(
@@ -8146,7 +8159,7 @@ static void ggml_compute_forward_mul_mat_f32(
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
     const int64_t ne10 = src1->ne[0];
 #endif
     const int64_t ne11 = src1->ne[1];
@@ -8203,7 +8216,16 @@ static void ggml_compute_forward_mul_mat_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS)
+    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
+#endif
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -8217,43 +8239,13 @@ static void ggml_compute_forward_mul_mat_f32(
             return;
         }
 
-#if defined(GGML_USE_CUBLAS)
-        const float alpha = 1.0f;
-        const float beta = 0.0f;
-        const int x_ne = ne01 * ne00;
-        const int y_ne = ne11 * ne10;
-        const int d_ne = ne11 * ne01;
-
-        size_t x_size, y_size, d_size;
-        float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-#endif
-
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
-#if !defined(GGML_USE_CUBLAS)
                 const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
                 const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-#endif
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
-#if defined(GGML_USE_CUBLAS)
-                // copy data to device
-                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
-                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
-
-                // compute
-                CUBLAS_CHECK(
-                    cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            &alpha, d_X, ne00,
-                                    d_Y, ne10,
-                            &beta,  d_D, ne01));
-
-                // copy data to host
-                CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
                 // zT = y * xT
                 ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
                         ne11, ne01, ne10,
@@ -8270,12 +8262,6 @@ static void ggml_compute_forward_mul_mat_f32(
 #endif
             }
         }
-#if defined(GGML_USE_CUBLAS)
-        CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
-        ggml_cuda_pool_free(d_X, x_size);
-        ggml_cuda_pool_free(d_Y, y_size);
-        ggml_cuda_pool_free(d_D, d_size);
-#endif
         //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
         return;
@@ -8405,7 +8391,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS)
+    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
+#endif
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));
 
@@ -8421,37 +8416,8 @@ static void ggml_compute_forward_mul_mat_f16_f32(
             return;
         }
 
-#if defined(GGML_USE_CUBLAS)
-        const float alpha = 1.0f;
-        const float beta = 0.0f;
-        const int x_ne = ne01 * ne00;
-        const int y_ne = ne11 * ne10;
-        const int d_ne = ne11 * ne01;
-
-        size_t x_size, y_size, d_size;
-        ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float       * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-#endif
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
-#if defined(GGML_USE_CUBLAS)
-                // copy src0 while converting src1
-                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
-
-                // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
-                ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02);
-                {
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne11; ++i01) {
-                        for (int64_t i00 = 0; i00 < ne10; ++i00) {
-                            wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
-                        }
-                    }
-
-                    assert(id*sizeof(ggml_fp16_t) <= params->wsize);
-                }
-#else
                 float * const wdata = params->wdata;
                 {
                     size_t id = 0;
@@ -8463,28 +8429,8 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
                     assert(id*sizeof(float) <= params->wsize);
                 }
-#endif
 
-#if defined(GGML_USE_CUBLAS)
-                const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
-                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
-                // copy data to device
-                CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
-
-                // compute
-                CUBLAS_CHECK(
-                    cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            &alpha, d_X, CUDA_R_16F, ne00,
-                                    d_Y, CUDA_R_16F, ne10,
-                            &beta,  d_D, CUDA_R_32F, ne01,
-                            CUBLAS_COMPUTE_32F,
-                            CUBLAS_GEMM_DEFAULT));
-
-                // copy data to host
-                CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
                 const float * x = wdata;
                 const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
 
@@ -8513,12 +8459,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
             }
         }
 
-#if defined(GGML_USE_CUBLAS)
-        CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
-        ggml_cuda_pool_free(d_X, x_size);
-        ggml_cuda_pool_free(d_Y, y_size);
-        ggml_cuda_pool_free(d_D, d_size);
-#endif
         /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/
 
         return;
@@ -8671,7 +8611,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS)
+    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
+#endif
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -8685,25 +8634,8 @@ static void ggml_compute_forward_mul_mat_q_f32(
             return;
         }
 
-#if defined(GGML_USE_CUBLAS)
-        const float alpha = 1.0f;
-        const float beta = 0.0f;
-        const int x_ne = ne01 * ne00;
-        const int y_ne = ne11 * ne10;
-        const int d_ne = ne11 * ne01;
-
-        size_t x_size, y_size, d_size, q_size;
-        float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-        void  * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
-
-        const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type);
-        GGML_ASSERT(dequantize_row_q_cuda != NULL);
-#else
         float * const wdata = params->wdata;
         dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
-#endif
 
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -8711,14 +8643,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
-#if defined(GGML_USE_CUBLAS)
-                // copy and dequantize on device
-                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream2));
-
-                dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2);
-                CUDA_CHECK(cudaGetLastError());
-                CUDA_CHECK(cudaEventRecord(g_cudaEvent, g_cudaStream2));
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
                 const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
 #else
                 {
@@ -8734,24 +8659,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
                 const float * x = wdata;
 #endif
 
-#if defined(GGML_USE_CUBLAS)
-                // copy data to device
-                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
-
-                // wait for dequantization
-                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0));
-
-                // compute
-                CUBLAS_CHECK(
-                    cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            &alpha, d_X, ne00,
-                                    d_Y, ne10,
-                            &beta,  d_D, ne01));
-
-                // copy data to host
-                CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
                 // zT = y * xT
                 ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
                         ne11, ne01, ne10,
@@ -8769,13 +8677,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
             }
         }
 
-#if defined(GGML_USE_CUBLAS)
-        CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
-        ggml_cuda_pool_free(d_X, x_size);
-        ggml_cuda_pool_free(d_Y, y_size);
-        ggml_cuda_pool_free(d_D, d_size);
-        ggml_cuda_pool_free(d_Q, q_size);
-#endif
         //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
         return;
@@ -11759,18 +11660,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 
                         size_t cur = 0;
 
+#if defined(GGML_USE_CUBLAS)
+                        if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
+                            node->n_tasks = 1; // TODO: this actually is doing nothing
+                                                //       the threads are still spinning
+                            cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
+                        }
+                        else
+#endif
                         if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1; // TODO: this actually is doing nothing
                                                    //       the threads are still spinning
-#if defined(GGML_USE_CUBLAS)
-                                // with cuBLAS, we need memory for the full 3D / 4D data of src1
-                                cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
-#else
                                 // here we need memory just for single 2D matrix from src0
                                 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
-#endif
                             } else {
                                 cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
                             }
@@ -11779,13 +11683,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 #endif
                         } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                             cur = 0;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                             }
 #endif
                         } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                                 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
diff --git a/ggml.h b/ggml.h
index d6feacd78..ef5a048c3 100644
--- a/ggml.h
+++ b/ggml.h
@@ -197,6 +197,14 @@
 #define GGML_MAX_OPT           4
 #define GGML_DEFAULT_N_THREADS 4
 
+#define GGML_ASSERT(x) \
+    do { /* do/while(0) wrapper so the macro expands to a single statement */ \
+        if (!(x)) { \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); /* prints file, line and the failed expression text */ \
+            abort(); /* no NDEBUG guard in this definition: the check is always active */ \
+        } \
+    } while (0)
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -212,6 +220,9 @@ extern "C" {
     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
 
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+
     struct ggml_object;
     struct ggml_context;
 

From f4cef87edfd1b2f8d5befd4fde54ca2e03987bea Mon Sep 17 00:00:00 2001
From: DannyDaemonic <DannyDaemonic@gmail.com>
Date: Mon, 1 May 2023 09:23:47 -0700
Subject: [PATCH 74/74] Add git-based build information for better issue
 tracking (#1232)

* Add git-based build information for better issue tracking

* macOS fix

* "build (hash)" and "CMAKE_SOURCE_DIR" changes

* Redo "CMAKE_CURRENT_SOURCE_DIR" and clearer build messages

* Fix conditional dependency on missing target

* Broke out build-info.cmake, added find_package fallback, added build info to all examples, and added dependencies to Makefile

* 4 space indenting for cmake, attempt to clean up my mess in Makefile

* Short hash, less fancy Makefile, and don't modify build-info.h if it wouldn't change it
---
 .gitignore                                   |  1 +
 CMakeLists.txt                               | 35 +++++++++++++
 Makefile                                     | 51 ++++++++++++-------
 examples/benchmark/CMakeLists.txt            |  3 ++
 examples/benchmark/benchmark-matmult.cpp     |  4 +-
 examples/embedding/CMakeLists.txt            |  3 ++
 examples/embedding/embedding.cpp             |  5 +-
 examples/main/CMakeLists.txt                 |  3 ++
 examples/main/main.cpp                       |  5 +-
 examples/perplexity/CMakeLists.txt           |  3 ++
 examples/perplexity/perplexity.cpp           |  5 +-
 examples/quantize-stats/quantize-stats.cpp   |  3 ++
 examples/quantize/CMakeLists.txt             |  3 ++
 examples/quantize/quantize.cpp               |  3 ++
 examples/save-load-state/CMakeLists.txt      |  3 ++
 examples/save-load-state/save-load-state.cpp |  3 ++
 scripts/build-info.cmake                     | 53 ++++++++++++++++++++
 scripts/build-info.sh                        | 22 ++++++++
 18 files changed, 186 insertions(+), 22 deletions(-)
 create mode 100644 scripts/build-info.cmake
 create mode 100755 scripts/build-info.sh

diff --git a/.gitignore b/.gitignore
index 565866fd4..e479c6180 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,7 @@ models/*
 /vdot
 /Pipfile
 
+build-info.h
 arm_neon.h
 compile_commands.json
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 098306126..f6a66daa3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,6 +72,41 @@ option(LLAMA_CLBLAST                "llama: use CLBlast"
 option(LLAMA_BUILD_TESTS            "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})
 
+#
+# Build info header
+#
+
+# Write the header template to the binary dir to keep the source directory clean; @BUILD_NUMBER@ and @BUILD_COMMIT@ are placeholders (presumably substituted by scripts/build-info.cmake - confirm)
+file(WRITE "${CMAKE_BINARY_DIR}/BUILD_INFO.h.in" "\
+#ifndef BUILD_INFO_H\n\
+#define BUILD_INFO_H\n\
+\n\
+#define BUILD_NUMBER @BUILD_NUMBER@\n\
+#define BUILD_COMMIT \"@BUILD_COMMIT@\"\n\
+\n\
+#endif // BUILD_INFO_H\n\
+")
+
+# Generate the initial build-info.h at configure time by including the build-info script directly
+include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
+
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
+    # Add a custom target for build-info.h; it is part of ALL and targets can depend on it via add_dependencies
+    add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
+
+    # Add a custom command that re-runs the script in script mode (-P) to rebuild build-info.h when .git/index changes
+    add_custom_command(
+        OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h"
+        COMMENT "Generating build details from Git"
+        COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake"
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/.git/index"
+        VERBATIM
+    )
+else() # no .git entry in the source dir: the BUILD_INFO target is not created
+    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
+endif()
+
 #
 # Compile flags
 #
diff --git a/Makefile b/Makefile
index 1d62a4438..6ebc3c5b9 100644
--- a/Makefile
+++ b/Makefile
@@ -181,41 +181,56 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult
+libllama.so: llama.o ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
-main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+clean:
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
+
+#
+# Examples
+#
+
+main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo
 
-quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-libllama.so: llama.o ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+build-info.h: $(wildcard .git/index) scripts/build-info.sh
+	@scripts/build-info.sh > $@.tmp
+	@if ! cmp -s $@.tmp $@; then \
+		mv $@.tmp $@; \
+	else \
+		rm $@.tmp; \
+	fi
 
 #
 # Tests
 #
 
-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	./$@
 
+vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+
 .PHONY: tests
 tests:
 	bash ./tests/run-tests.sh
diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt
index 05deebcd1..037696194 100644
--- a/examples/benchmark/CMakeLists.txt
+++ b/examples/benchmark/CMakeLists.txt
@@ -2,3 +2,6 @@ set(TARGET benchmark)
 add_executable(${TARGET} benchmark-matmult.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 19cbab1c3..2cc1a1477 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -1,5 +1,6 @@
 #include <locale.h>
 #include "ggml.h"
+#include "build-info.h"
 #include <assert.h>
 #include <math.h>
 #include <cstring>
@@ -90,9 +91,10 @@ int main(int argc, char ** argv)  {
         }
     }
 
-    // create the ggml context
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
     printf("Starting Test\n");
 
+    // create the ggml context
     struct ggml_context * ctx;
     //const int sizex = 4096;
     //const int sizey = 11008;
diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt
index 88c425d4a..db73b6b44 100644
--- a/examples/embedding/CMakeLists.txt
+++ b/examples/embedding/CMakeLists.txt
@@ -2,3 +2,6 @@ set(TARGET embedding)
 add_executable(${TARGET} embedding.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index e10de619c..b3e001476 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,5 +1,6 @@
 #include "common.h"
 #include "llama.h"
+#include "build-info.h"
 
 #include <ctime>
 
@@ -18,11 +19,13 @@ int main(int argc, char ** argv) {
                 "expect poor results\n", __func__, params.n_ctx);
     }
 
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+
     if (params.seed <= 0) {
         params.seed = time(NULL);
     }
 
-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    fprintf(stderr, "%s: seed  = %d\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt
index b2dcc2910..c364242fb 100644
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -2,3 +2,6 @@ set(TARGET main)
 add_executable(${TARGET} main.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 78fc9a197..7dc100512 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -5,6 +5,7 @@
 
 #include "common.h"
 #include "llama.h"
+#include "build-info.h"
 
 #include <cassert>
 #include <cinttypes>
@@ -81,11 +82,13 @@ int main(int argc, char ** argv) {
                 "expect poor results\n", __func__, params.n_ctx);
     }
 
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+
     if (params.seed <= 0) {
         params.seed = time(NULL);
     }
 
-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    fprintf(stderr, "%s: seed  = %d\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt
index 5836df8b2..61b17b828 100644
--- a/examples/perplexity/CMakeLists.txt
+++ b/examples/perplexity/CMakeLists.txt
@@ -2,3 +2,6 @@ set(TARGET perplexity)
 add_executable(${TARGET} perplexity.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 615157e7b..2ca338835 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1,5 +1,6 @@
 #include "common.h"
 #include "llama.h"
+#include "build-info.h"
 
 #include <cmath>
 #include <ctime>
@@ -106,11 +107,13 @@ int main(int argc, char ** argv) {
                 "expect poor results\n", __func__, params.n_ctx);
     }
 
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+
     if (params.seed <= 0) {
         params.seed = time(NULL);
     }
 
-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+    fprintf(stderr, "%s: seed  = %d\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index 4e6c2c831..9a2aa7c64 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "build-info.h"
 
 #define LLAMA_API_INTERNAL
 #include "llama.h"
@@ -308,6 +309,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+
     // load the model
     fprintf(stderr, "Loading model\n");
 
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt
index fb27d4517..475fc8be8 100644
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -2,3 +2,6 @@ set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index dd175c690..198bd5fcb 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
+#include "build-info.h"
 
 #include <cstdio>
 #include <map>
@@ -50,6 +51,8 @@ int main(int argc, char ** argv) {
         ftype = (enum llama_ftype)atoi(argv[3]);
     }
 
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+
     int nthread = argc > 4 ? atoi(argv[4]) : 0;
 
     const int64_t t_main_start_us = ggml_time_us();
diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt
index cff79fa1f..08dbe5c2b 100644
--- a/examples/save-load-state/CMakeLists.txt
+++ b/examples/save-load-state/CMakeLists.txt
@@ -2,3 +2,6 @@ set(TARGET save-load-state)
 add_executable(${TARGET} save-load-state.cpp)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index f1531ba39..ea0a984d9 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -1,5 +1,6 @@
 #include "common.h"
 #include "llama.h"
+#include "build-info.h"
 
 #include <vector>
 #include <cstdio>
@@ -17,6 +18,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+
     if (params.n_predict < 0) {
         params.n_predict = 16;
     }
diff --git a/scripts/build-info.cmake b/scripts/build-info.cmake
new file mode 100644
index 000000000..fb46ed2b5
--- /dev/null
+++ b/scripts/build-info.cmake
@@ -0,0 +1,61 @@
+# Configures HEADER_FILE from TEMPLATE_FILE with BUILD_NUMBER (git commit count
+# of HEAD) and BUILD_COMMIT (abbreviated hash of HEAD). Both keep the fallback
+# values set below when git is unavailable or either git query fails.
+set(TEMPLATE_FILE "${CMAKE_BINARY_DIR}/BUILD_INFO.h.in")
+set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
+set(BUILD_NUMBER 0)
+set(BUILD_COMMIT "unknown")
+
+# Look for git
+find_package(Git)
+if(NOT Git_FOUND)
+    execute_process(
+        COMMAND which git
+        OUTPUT_VARIABLE GIT_EXECUTABLE
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if(NOT GIT_EXECUTABLE STREQUAL "")
+        set(Git_FOUND TRUE)
+        message(STATUS "Found Git using 'which': ${GIT_EXECUTABLE}")
+    else()
+        message(WARNING "Git not found using 'find_package' or 'which'. Build info will not be accurate. Consider installing Git or ensuring it is in the PATH.")
+    endif()
+endif()
+
+# Get the commit count and hash
+if(Git_FOUND)
+    execute_process(
+        COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE HEAD
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE GIT_HEAD_RESULT
+    )
+    execute_process(
+        COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+        OUTPUT_VARIABLE COUNT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE GIT_COUNT_RESULT
+    )
+    # Only replace the fallbacks when both git commands succeeded
+    if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0)
+        set(BUILD_COMMIT ${HEAD})
+        set(BUILD_NUMBER ${COUNT})
+    endif()
+endif()
+
+# Only write the header if it's changed to prevent unnecessary recompilation.
+# EXISTING stays empty when the header is missing or contains no BUILD_COMMIT
+# line, so the header is regenerated instead of list(GET) failing on an empty list.
+set(EXISTING "")
+if(EXISTS ${HEADER_FILE})
+    file(STRINGS ${HEADER_FILE} CONTENTS REGEX "BUILD_COMMIT \"([^\"]*)\"")
+    list(LENGTH CONTENTS CONTENTS_LENGTH)
+    if(CONTENTS_LENGTH GREATER 0)
+        list(GET CONTENTS 0 EXISTING)
+    endif()
+endif()
+if(NOT EXISTING STREQUAL "#define BUILD_COMMIT \"${BUILD_COMMIT}\"")
+    configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
+endif()
diff --git a/scripts/build-info.sh b/scripts/build-info.sh
new file mode 100755
index 000000000..507d7e153
--- /dev/null
+++ b/scripts/build-info.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+#
+# Prints a C header (include guard BUILD_INFO_H) to stdout that defines
+# BUILD_NUMBER as the commit count of HEAD and BUILD_COMMIT as its abbreviated
+# hash. Each default below is kept when the corresponding git command fails.
+
+BUILD_NUMBER="0"
+BUILD_COMMIT="unknown"
+
+if COMMIT_COUNT=$(git rev-list --count HEAD); then
+  BUILD_NUMBER=$COMMIT_COUNT
+fi
+
+if SHORT_HASH=$(git rev-parse --short HEAD); then
+  BUILD_COMMIT=$SHORT_HASH
+fi
+
+cat <<EOF
+#ifndef BUILD_INFO_H
+#define BUILD_INFO_H
+
+#define BUILD_NUMBER $BUILD_NUMBER
+#define BUILD_COMMIT "$BUILD_COMMIT"
+
+#endif // BUILD_INFO_H
+EOF