From ecbe466a364876927994e2f1ec14f4d82301d201 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 25 Mar 2023 19:47:21 +0200 Subject: [PATCH 1/8] Retire the ggml_mul_mat() branch for transposed src0 (#500) * Retire the ggml_mul_mat() for transposed src0 - It can always be made contiguous with ggml_cpy() - The code is now simplified - The results are deterministic in respect to num threads * SIMD-ify dequantize_row_q4_0() for ARM_NEON (#502) * Attempt to SIMD-ify dequantize_row_q4_0() for ARM_NEON * Fix dequantization - forgot to interleave the quants --- ggml.c | 1033 +++++++++++++++----------------------------------------- 1 file changed, 277 insertions(+), 756 deletions(-) diff --git a/ggml.c b/ggml.c index d8e1fbd4e..291e12a0a 100644 --- a/ggml.c +++ b/ggml.c @@ -496,7 +496,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, void * restric void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { assert(k % QK == 0); -#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__) +#if defined(__ARM_NEON) || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__) const int nb = k / QK; const size_t bs = sizeof(float) + QK/2; @@ -507,7 +507,6 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { #endif #if defined(__POWER9_VECTOR__) -#if QK == 32 const vector float v85 = vec_splats(8.5f); for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -548,11 +547,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { //memcpy(pb, pp, sizeof(pp)); pb += bs; } -#else -#error "not implemented for QK" -#endif #elif __ARM_NEON -#if QK == 32 for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -589,11 +584,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { memcpy(pb, pp, sizeof(pp)); pb += bs; } -#else -#error "not implemented for QK" -#endif #elif defined(__AVX2__) -#if QK == 32 for (int i = 0; i < nb; i++) { // Load elements into 4 AVX vectors __m256 v0 = _mm256_loadu_ps( x ); @@ -660,11 +651,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { _mm_storeu_si128( ( __m128i* )pb, res ); pb += bs; } -#else -#error "not implemented for QK" -#endif #elif defined(__wasm_simd128__) -#if QK == 32 for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -701,9 +688,6 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { memcpy(pb, pp, sizeof(pp)); pb += bs; } -#else -#error "not implemented for QK" -#endif #else // scalar quantize_row_q4_0_reference(x, y, k); @@ -771,7 +755,7 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) { const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float)); -#if defined(__AVX2__) && QK % 32 == 0 +#if defined(__AVX2__) for (int i = 0; i < nb; i++) { // scale factor const __m256 d_v = _mm256_broadcast_ss((const float *) (pd + i*bs)); @@ -804,6 +788,59 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) { } } } +#elif defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + const float d = *(const float *) (pd + i*bs); + + const uint8_t * restrict pp = pb + i*bs; + + const float32x4_t vd = vdupq_n_f32(d); + + for (int l = 0; l < QK; l += 16) { + // Load 16x4-bit integers into 8x8-bit integers + const uint8x8_t v8 = vld1_u8(pp + l/2); + + // Expand 4-bit nibbles to 8-bit bytes + const 
uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f)); + const uint8x8_t v1 = vshr_n_u8(v8, 4); + + // Convert to signed 8-bit integers + const int8x8_t vs_0 = vreinterpret_s8_u8(v0); + const int8x8_t vs_1 = vreinterpret_s8_u8(v1); + + // Subtract 8 from each byte + const int8x8_t vb_0 = vsub_s8(vs_0, vdup_n_s8(8)); + const int8x8_t vb_1 = vsub_s8(vs_1, vdup_n_s8(8)); + + // Interleave and combine + const int8x8_t vx_0 = vzip1_s8(vb_0, vb_1); + const int8x8_t vx_1 = vzip2_s8(vb_0, vb_1); + + const int8x16_t vq = vcombine_s8(vx_0, vx_1); + + // convert to 2x int16x8_t + const int16x8_t vi_0 = vmovl_s8(vget_low_s8 (vq)); + const int16x8_t vi_1 = vmovl_s8(vget_high_s8(vq)); + + // convert to 4x float32x4_t + const float32x4_t vf_0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_0))); + const float32x4_t vf_1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_0))); + const float32x4_t vf_2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_1))); + const float32x4_t vf_3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_1))); + + // Multiply by d + const float32x4_t r0 = vmulq_f32(vf_0, vd); + const float32x4_t r1 = vmulq_f32(vf_1, vd); + const float32x4_t r2 = vmulq_f32(vf_2, vd); + const float32x4_t r3 = vmulq_f32(vf_3, vd); + + // Store + vst1q_f32(y + i*QK + l + 0, r0); + vst1q_f32(y + i*QK + l + 4, r1); + vst1q_f32(y + i*QK + l + 8, r2); + vst1q_f32(y + i*QK + l + 12, r3); + } + } #else // scalar for (int i = 0; i < nb; i++) { @@ -1500,8 +1537,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void float sumf = 0.0; -#ifdef __ARM_NEON -#if QK == 32 +#if defined(__ARM_NEON) float sum0 = 0.0f; float sum1 = 0.0f; @@ -1600,12 +1636,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void } sumf = sum0 + sum1; -#else -#error "not implemented for QK" -#endif #elif defined(__AVX512F__) - -#if QK == 32 // Initialize accumulator with zeros __m512 acc0 = _mm512_setzero_ps(); __m512 acc1 = _mm512_setzero_ps(); @@ -1634,11 +1665,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void // Horizontal sum of all lanes of the accumulator sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 ); -#else -#error "not implemented for QK" -#endif #elif defined(__AVX2__) -#if QK == 32 const size_t countBlocks = nb; // Initialize accumulator with zeros @@ -1689,11 +1716,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); sumf = _mm_cvtss_f32( res ); -#else -#error "not implemented for QK" -#endif #elif defined(__wasm_simd128__) -#if QK == 32 // wasm simd float sum0 = 0.0f; float sum1 = 0.0f; @@ -1776,9 +1799,6 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void } sumf = sum0 + sum1; -#else -#error "not implemented for QK" -#endif #else // scalar for (int i = 0; i < nb; i++) { @@ -1823,7 +1843,6 @@ inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void float sumf = 0.0; #if defined(__AVX2__) -#if QK == 32 // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); // Accumulator for constant offsets @@ -1898,9 +1917,6 @@ inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void res = _mm_add_ss( res, _mm_movehdup_ps( res ) ); sumf = _mm_cvtss_f32( res ) + acc_offset * QK; -#else -#error "not implemented for QK" -#endif #else // scalar for (int i = 0; i < nb; i++) { @@ -2017,167 +2033,6 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float #endif } 
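// For reference, the NEON dequantize_row_q4_0() path added above mirrors the
// scalar fallback: each Q4_0 block is one float scale d followed by QK/2
// bytes, and every byte packs two 4-bit quants (low nibble = even element,
// high nibble = odd element). That pairing is why the vzip1_s8/vzip2_s8
// interleave above is needed: it is exactly what the "forgot to interleave
// the quants" fix restores. A minimal standalone C sketch of the same
// unpacking, assuming QK == 32 as in this patch (the helper name below is
// illustrative, not part of ggml):

#include <stdint.h>
#include <string.h>

#define QK 32

static void dequantize_block_q4_0_sketch(const uint8_t * restrict block, float * restrict y) {
    float d;
    memcpy(&d, block, sizeof(float));           // per-block scale factor
    const uint8_t * pp = block + sizeof(float); // QK/2 bytes of packed quants

    for (int l = 0; l < QK; l += 2) {
        const uint8_t vi = pp[l/2];
        const int8_t vi0 = vi & 0x0f;           // low nibble
        const int8_t vi1 = vi >> 4;             // high nibble
        y[l + 0] = (vi0 - 8)*d;                 // recenter to [-8, 7], then scale
        y[l + 1] = (vi1 - 8)*d;                 // adjacent outputs share one byte
    }
}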
-inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) { -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F16_STEP - 1)); - - GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); - - GGML_F16_VEC ax[GGML_F16_ARR]; - GGML_F16_VEC ay[GGML_F16_ARR]; - - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx); - - GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); - } - } - - // leftovers - for (int i = np; i < n; ++i) { - GGML_ASSERT(false); - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); - } -#else - for (int i = 0; i < n; ++i) { - y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); - } -#endif -} - -inline static void ggml_vec_mad_q4_0(const int n, float * restrict y, void * restrict x, const float v) { - assert(n % QK == 0); - - const int nb = n / QK; - const size_t bs = sizeof(float) + QK/2; - - const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); - const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float)); - -#if __ARM_NEON -#if QK == 32 - for (int i = 0; i < nb; ++i) { - const float d0 = v*(*(const float *) (pd + i*bs)); - - const uint8_t * restrict pp = pb + i*bs; - - const uint8x8_t m4b = vdup_n_u8(0xf); - const int8x8_t s8b = vdup_n_s8(0x8); - - const float32x4_t vd = vdupq_n_f32(d0); - - for (int j = 0; j < 2; j++) { - const uint8x8_t vx = vld1_u8(pp + j*8); - - const int8x8_t vxl = vreinterpret_s8_u8(vand_u8(vx, m4b)); - const int8x8_t vxh = vreinterpret_s8_u8(vshr_n_u8(vx, 4)); - - // sub 8 - const int8x8_t vxls = vsub_s8(vxl, s8b); - const int8x8_t vxhs = vsub_s8(vxh, s8b); - - //const int8x8_t vxlt = vzip_s8(vxls, vxhs)[0]; - //const int8x8_t vxht = vzip_s8(vxls, vxhs)[1]; - const int8x8_t vxlt = vzip1_s8(vxls, vxhs); - const int8x8_t vxht = vzip2_s8(vxls, vxhs); - - const int8x16_t vxq = vcombine_s8(vxlt, vxht); - - // convert to 2x int16x8_t - const int16x8_t vxq0 = vmovl_s8(vget_low_s8 (vxq)); - const int16x8_t vxq1 = vmovl_s8(vget_high_s8(vxq)); - - // convert to 4x float32x4_t - const float32x4_t vx0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vxq0))); - const float32x4_t vx1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxq0))); - const float32x4_t vx2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vxq1))); - const float32x4_t vx3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxq1))); - - const float32x4_t vy0 = vld1q_f32(y + i*32 + j*16 + 0); - const float32x4_t vy1 = vld1q_f32(y + i*32 + j*16 + 4); - const float32x4_t vy2 = vld1q_f32(y + i*32 + j*16 + 8); - const float32x4_t vy3 = vld1q_f32(y + i*32 + j*16 + 12); - - const float32x4_t vr0 = vfmaq_f32(vy0, vx0, vd); - const float32x4_t vr1 = vfmaq_f32(vy1, vx1, vd); - const float32x4_t vr2 = vfmaq_f32(vy2, vx2, vd); - const float32x4_t vr3 = vfmaq_f32(vy3, vx3, vd); - - vst1q_f32(y + i*32 + j*16 + 0, vr0); - vst1q_f32(y + i*32 + j*16 + 4, vr1); - vst1q_f32(y + i*32 + j*16 + 8, vr2); - vst1q_f32(y + i*32 + j*16 + 12, vr3); - } - } -#endif -#else - // scalar - for (int i = 0; i < nb; i++) { - const float d = *(const float *) (pd + i*bs); - - const uint8_t * restrict pp = pb + i*bs; - - for (int l = 0; l < QK; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vi0 = vi & 0xf; - const int8_t vi1 = vi >> 4; - - const float v0 = (vi0 - 8)*d; - const float v1 = (vi1 - 8)*d; - - y[i*QK + l + 0] += v0*v; - y[i*QK + l + 1] += 
v1*v; - - assert(!isnan(y[i*QK + l + 0])); - assert(!isnan(y[i*QK + l + 1])); - assert(!isinf(y[i*QK + l + 0])); - assert(!isinf(y[i*QK + l + 1])); - } - } -#endif -} - -inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * restrict x, const float v) { - assert(n % QK == 0); - - const int nb = n / QK; - const size_t bs = 2*sizeof(float) + QK/2; - - const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); - const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); - const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); - - for (int i = 0; i < nb; i++) { - const float d = *(const float *) (pd + i*bs); - const float m = *(const float *) (pm + i*bs); - - const uint8_t * restrict pp = pb + i*bs; - - for (int l = 0; l < QK; l += 2) { - const uint8_t vi = pp[l/2]; - - const uint8_t vi0 = vi & 0xf; - const uint8_t vi1 = vi >> 4; - - const float v0 = d*vi0 + m; - const float v1 = d*vi1 + m; - - y[i*QK + l + 0] += v0*v; - y[i*QK + l + 1] += v1*v; - - assert(!isnan(y[i*QK + l + 0])); - assert(!isnan(y[i*QK + l + 1])); - assert(!isinf(y[i*QK + l + 0])); - assert(!isinf(y[i*QK + l + 1])); - //printf("mad: v0 %f v1 %f, i = %d, l = %d, d = %f, vi = %d, vi0 = %d, vi1 = %d\n", v0, v1, i, l, d, vi, vi0, vi1); - } - } -} - //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { #if defined(GGML_SIMD) @@ -2612,9 +2467,13 @@ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return - (t0->ne[0] == t1->ne[0]) && - (t0->ne[2] == t1->ne[2]) && - (t0->ne[3] == t1->ne[3]); + (t0->ne[0] == t1->ne[0]) && + (t0->ne[2] == t1->ne[2]) && + (t0->ne[3] == t1->ne[3]); +} + +static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { + return tensor->nb[0] > tensor->nb[1]; } static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) { @@ -4010,6 +3869,7 @@ struct ggml_tensor * ggml_mul_mat( struct ggml_tensor * a, struct ggml_tensor * b) { GGML_ASSERT(ggml_can_mul_mat(a, b)); + GGML_ASSERT(!ggml_is_transposed(a)); bool is_node = false; @@ -5949,7 +5809,7 @@ static void ggml_compute_forward_mul_mat_f32( assert(ne3 == ne13); // TODO: we don't support permuted src0 - assert(nb00 == sizeof(float) || nb01 == sizeof(float)); + assert(nb00 == sizeof(float)); // dst cannot be transposed or permuted assert(nb0 == sizeof(float)); @@ -5964,9 +5824,6 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // - // nb00 < nb01 - src0 is transposed - // compute by src0 columns #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { @@ -6007,126 +5864,50 @@ static void ggml_compute_forward_mul_mat_f32( #endif if (params->type == GGML_TASK_INIT) { - if (nb01 >= nb00) { - return; - } - - // TODO: fix this memset (wsize is overestimated) - memset(params->wdata, 0, params->wsize); return; } if (params->type == GGML_TASK_FINALIZE) { - if (nb01 >= nb00) { - return; - } - - // TODO: fix this memset (wsize is overestimated) - //assert(params->wsize == (ggml_nbytes(dst) + CACHE_LINE_SIZE)*nth); - - float * const wdata = params->wdata; - - // cols per thread - const int dc = (ne + nth - 1)/nth; - - // col range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 
+ dc, ne); - - ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0); - - for (int k = 1; k < nth; k++) { - ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0); - } - return; } - if (nb01 >= nb00) { - // TODO: do not support transposed src1 - assert(nb10 == sizeof(float)); + // TODO: do not support transposed src1 + assert(nb10 == sizeof(float)); - // parallelize by src0 rows using ggml_vec_dot_f32 + // parallelize by src0 rows using ggml_vec_dot_f32 - // total rows in src0 - const int nr = ne01*ne02*ne03; + // total rows in src0 + const int nr = ne01*ne02*ne03; - // rows per thread - const int dr = (nr + nth - 1)/nth; + // rows per thread + const int dr = (nr + nth - 1)/nth; - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - for (int ic = 0; ic < ne11; ++ic) { - // src1 indices - const int i13 = i03; - const int i12 = i02; - const int i11 = ic; + for (int ic = 0; ic < ne11; ++ic) { + // src1 indices + const int i13 = i03; + const int i12 = i02; + const int i11 = ic; - // dst indices - const int i0 = i01; - const int i1 = i11; - const int i2 = i02; - const int i3 = i03; + // dst indices + const int i0 = i01; + const int i1 = i11; + const int i2 = i02; + const int i3 = i03; - ggml_vec_dot_f32(ne00, - (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)), - (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13))); - } - } - } else { - // parallelize by src1 columns using ggml_vec_mad_f32 - // each thread has its own work data - // during FINALIZE we accumulate all work data into dst - - // total columns in src1 - const int nc = ne10; - - // columns per thread - const int dc = (nc + nth - 1)/nth; - - // column range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 + dc, nc); - - // work data for thread - const int wo = (ne + CACHE_LINE_SIZE_F32)*ith; - float * const wdata = params->wdata; - - for (int i13 = 0; i13 < ne13; ++i13) { - for (int i12 = 0; i12 < ne12; ++i12) { - for (int i11 = 0; i11 < ne11; ++i11) { - for (int ic = ic0; ic < ic1; ++ic) { - // src1 indices - const int i10 = ic; - - // src0 indices - const int i03 = i13; - const int i02 = i12; - const int i00 = ic; - - // dst indices - const int i1 = i11; - const int i2 = i12; - const int i3 = i13; - - assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize); - - ggml_vec_mad_f32(ne01, - (float *) (wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0), - (float *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)), - *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13))); - } - } - } + ggml_vec_dot_f32(ne00, + (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), + (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)), + (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13))); } } @@ -6192,7 +5973,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( 
GGML_ASSERT(ne3 == ne13); // TODO: we don't support permuted src0 - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t) || nb01 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); // dst cannot be transposed or permuted GGML_ASSERT(nb0 == sizeof(float)); @@ -6207,9 +5988,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // - // nb00 < nb01 - src0 is transposed - // compute by src0 columns #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { @@ -6261,148 +6039,66 @@ static void ggml_compute_forward_mul_mat_f16_f32( #endif if (params->type == GGML_TASK_INIT) { - if (nb01 >= nb00) { - ggml_fp16_t * const wdata = params->wdata; + ggml_fp16_t * const wdata = params->wdata; - size_t id = 0; - for (int i13 = 0; i13 < ne13; ++i13) { - for (int i12 = 0; i12 < ne12; ++i12) { - for (int i11 = 0; i11 < ne11; ++i11) { - for (int i10 = 0; i10 < ne10; ++i10) { - wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); - } + size_t id = 0; + for (int i13 = 0; i13 < ne13; ++i13) { + for (int i12 = 0; i12 < ne12; ++i12) { + for (int i11 = 0; i11 < ne11; ++i11) { + for (int i10 = 0; i10 < ne10; ++i10) { + wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); } } } - - GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize); - - return; } - // TODO: fix this memset (wsize is overestimated) - memset(params->wdata, 0, params->wsize); + GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize); + return; } if (params->type == GGML_TASK_FINALIZE) { - if (nb01 >= nb00) { - return; - } - - // TODO: fix this memset (wsize is overestimated) - //assert(params->wsize == (ggml_nbytes(dst) + CACHE_LINE_SIZE)*nth); - - ggml_fp16_t * const wdata = params->wdata; - - // cols per thread - const int dc = (ne + nth - 1)/nth; - - // col range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 + dc, ne); - - for (int i = ic0; i < ic1; ++i) { - ((float *) dst->data)[i] = GGML_FP16_TO_FP32(wdata[i]); - } - - for (int k = 1; k < nth; k++) { - for (int i = ic0; i < ic1; ++i) { - ((float *) dst->data)[i] += GGML_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]); - } - } - return; } - if (nb01 >= nb00) { - // fp16 -> half the size, so divide by 2 - // TODO: do not support transposed src1 - assert(nb10/2 == sizeof(ggml_fp16_t)); + // fp16 -> half the size, so divide by 2 + // TODO: do not support transposed src1 + assert(nb10/2 == sizeof(ggml_fp16_t)); - // parallelize by src0 rows using ggml_vec_dot_f16 + // parallelize by src0 rows using ggml_vec_dot_f16 - // total rows in src0 - const int nr = ne01*ne02*ne03; + // total rows in src0 + const int nr = ne01*ne02*ne03; - // rows per thread - const int dr = (nr + nth - 1)/nth; + // rows per thread + const int dr = (nr + nth - 1)/nth; - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); - ggml_fp16_t * wdata = params->wdata; + ggml_fp16_t * wdata = params->wdata; - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const 
int i01 = (ir - i03*ne02*ne01 - i02*ne01); - const int i13 = i03; - const int i12 = i02; + const int i13 = i03; + const int i12 = i02; - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; + const int i0 = i01; + const int i2 = i02; + const int i3 = i03; - ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - ggml_fp16_t * src1_col = wdata + ( 0 + i12*ne11 + i13*ne12*ne11)*ne00; + ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + ggml_fp16_t * src1_col = wdata + ( 0 + i12*ne11 + i13*ne12*ne11)*ne00; - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); + float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - for (int ic = 0; ic < ne11; ++ic) { - ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00); - } - } - } else { - // parallelize by src1 columns using ggml_vec_mad_f16 - // each thread has its own work data - // during FINALIZE we accumulate all work data into dst - - // total columns in src1 - const int nc = ne10; - - // columns per thread - const int dc = (nc + nth - 1)/nth; - - // column range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 + dc, nc); - - // work data for thread - const int wo = (ne + CACHE_LINE_SIZE_F32)*ith; - ggml_fp16_t * const wdata = params->wdata; - - for (int i13 = 0; i13 < ne13; ++i13) { - for (int i12 = 0; i12 < ne12; ++i12) { - for (int i11 = 0; i11 < ne11; ++i11) { - // dst indices - const int i1 = i11; - const int i2 = i12; - const int i3 = i13; - - ggml_fp16_t * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0; - - for (int ic = ic0; ic < ic1; ++ic) { - // src1 indices - const int i10 = ic; - - // src0 indices - const int i03 = i13; - const int i02 = i12; - const int i00 = ic; - - assert(sizeof(ggml_fp16_t)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize); - - ggml_fp16_t * src0_col = (ggml_fp16_t *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)); - float src1_val = * (float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - - ggml_vec_mad_f16(ne01, dst_row, src0_col, src1_val); - } - } - } + for (int ic = 0; ic < ne11; ++ic) { + ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00); } } @@ -6467,7 +6163,7 @@ static void ggml_compute_forward_mul_mat_q4_0_f32( GGML_ASSERT(ne3 == ne13); // TODO: we don't support permuted src0 - GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0] || nb01 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0]); + GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0]); // dst cannot be transposed or permuted GGML_ASSERT(nb0 == sizeof(float)); @@ -6482,9 +6178,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // - // nb00 < nb01 - src0 is transposed - // compute by src0 columns #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { @@ -6509,9 +6202,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32( { size_t id = 0; for (int i01 = 0; i01 < ne01; ++i01) { - //for (int i00 = 0; i00 < ne00; ++i00) { - // wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); - //} dequantize_row_q4_0((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); id += ne00; } @@ -6538,143 +6228,63 @@ static void 
ggml_compute_forward_mul_mat_q4_0_f32( #endif if (params->type == GGML_TASK_INIT) { - //printf("HHHHHHHHH ith = %d, nth = %d\n", ith, nth); - if (nb01 >= nb00) { - char * wdata = params->wdata; - - for (int i13 = 0; i13 < ne13; ++i13) { - for (int i12 = 0; i12 < ne12; ++i12) { - for (int i11 = 0; i11 < ne11; ++i11) { - //for (int i10 = 0; i10 < ne10; ++i10) { - // wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); - //} - quantize_row_q4_0((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); - wdata += (ne10*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]; - } - } - } - - return; - } - - // TODO: fix this memset (wsize is overestimated) - memset(params->wdata, 0, params->wsize); - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - if (nb01 >= nb00) { - return; - } - - float * const wdata = params->wdata; - - // cols per thread - const int dc = (ne + nth - 1)/nth; - - // col range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 + dc, ne); - - ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0); - - for (int k = 1; k < nth; k++) { - ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0); - } - - return; - } - - if (nb01 >= nb00) { - // TODO: do not support transposed src1 - - // parallelize by src0 rows using ggml_vec_dot_q4_0 - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - void * wdata = params->wdata; - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int i13 = i03; - const int i12 = i02; - - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; - - void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]); - - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - - assert(ne00 % 32 == 0); - - for (int ic = 0; ic < ne11; ++ic) { - ggml_vec_dot_q4_0(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]))); - } - } - } else { - //printf("AAAAA ith = %d, nth = %d\n", ith, nth); - // parallelize by src1 columns using ggml_vec_mad_q4_0 - // each thread has its own work data - // during FINALIZE we accumulate all work data into dst - - // total columns in src1 - const int nc = ne10; - - // columns per thread - const int dc = (nc + nth - 1)/nth; - - // column range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 + dc, nc); - - // work data for thread - const int wo = (ne + CACHE_LINE_SIZE_F32)*ith; - float * const wdata = params->wdata; + char * wdata = params->wdata; for (int i13 = 0; i13 < ne13; ++i13) { for (int i12 = 0; i12 < ne12; ++i12) { for (int i11 = 0; i11 < ne11; ++i11) { - // dst indices - const int i1 = i11; - const int i2 = i12; - const int i3 = i13; - - float * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0; - - for (int ic = ic0; ic < ic1; ++ic) { - // src1 indices - const int i10 = ic; - - // src0 indices - const int i03 = i13; - const int i02 = i12; - const 
int i00 = ic; - - assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize); - - void * src0_col = (void *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)); - float src1_val = *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - - ggml_vec_mad_q4_0(ne01, dst_row, src0_col, src1_val); - } + quantize_row_q4_0((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += (ne10*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]; } } } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: do not support transposed src1 + + // parallelize by src0 rows using ggml_vec_dot_q4_0 + + // total rows in src0 + const int nr = ne01*ne02*ne03; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + void * wdata = params->wdata; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int i13 = i03; + const int i12 = i02; + + const int i0 = i01; + const int i2 = i02; + const int i3 = i03; + + void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]); + + float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); + + assert(ne00 % 32 == 0); + + for (int ic = 0; ic < ne11; ++ic) { + ggml_vec_dot_q4_0(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]))); + } } //int64_t t1 = ggml_time_us(); @@ -6738,7 +6348,7 @@ static void ggml_compute_forward_mul_mat_q4_1_f32( GGML_ASSERT(ne3 == ne13); // TODO: we don't support permuted src0 - GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1] || nb01 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1]); + GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1]); // dst cannot be transposed or permuted GGML_ASSERT(nb0 == sizeof(float)); @@ -6753,9 +6363,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // - // nb00 < nb01 - src0 is transposed - // compute by src0 columns #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { @@ -6780,9 +6387,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32( { size_t id = 0; for (int i01 = 0; i01 < ne01; ++i01) { - //for (int i00 = 0; i00 < ne00; ++i00) { - // wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); - //} dequantize_row_q4_1((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); id += ne00; } @@ -6809,143 +6413,66 @@ static void ggml_compute_forward_mul_mat_q4_1_f32( #endif if (params->type == GGML_TASK_INIT) { - //printf("HHHHHHHHH ith = %d, nth = %d\n", ith, nth); - if (nb01 >= nb00) { - char * wdata = params->wdata; - - for (int i13 = 0; i13 < ne13; ++i13) { - for (int i12 = 0; i12 < ne12; ++i12) { - for (int i11 = 0; i11 < ne11; ++i11) { - //for (int i10 = 0; i10 < ne10; ++i10) { - // wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); - //} - quantize_row_q4_1((float *)((char *) 
src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); - wdata += (ne10*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]; - } - } - } - - return; - } - - // TODO: fix this memset (wsize is overestimated) - memset(params->wdata, 0, params->wsize); - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - if (nb01 >= nb00) { - return; - } - - float * const wdata = params->wdata; - - // cols per thread - const int dc = (ne + nth - 1)/nth; - - // col range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 + dc, ne); - - ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0); - - for (int k = 1; k < nth; k++) { - ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0); - } - - return; - } - - if (nb01 >= nb00) { - // TODO: do not support transposed src1 - - // parallelize by src0 rows using ggml_vec_dot_q4_1 - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - void * wdata = params->wdata; - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int i13 = i03; - const int i12 = i02; - - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; - - void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]); - - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - - assert(ne00 % 32 == 0); - - for (int ic = 0; ic < ne11; ++ic) { - ggml_vec_dot_q4_1(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]))); - } - } - } else { - //printf("AAAAA ith = %d, nth = %d\n", ith, nth); - // parallelize by src1 columns using ggml_vec_mad_q4_1 - // each thread has its own work data - // during FINALIZE we accumulate all work data into dst - - // total columns in src1 - const int nc = ne10; - - // columns per thread - const int dc = (nc + nth - 1)/nth; - - // column range for this thread - const int ic0 = dc*ith; - const int ic1 = MIN(ic0 + dc, nc); - - // work data for thread - const int wo = (ne + CACHE_LINE_SIZE_F32)*ith; - float * const wdata = params->wdata; + char * wdata = params->wdata; for (int i13 = 0; i13 < ne13; ++i13) { for (int i12 = 0; i12 < ne12; ++i12) { for (int i11 = 0; i11 < ne11; ++i11) { - // dst indices - const int i1 = i11; - const int i2 = i12; - const int i3 = i13; - - float * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0; - - for (int ic = ic0; ic < ic1; ++ic) { - // src1 indices - const int i10 = ic; - - // src0 indices - const int i03 = i13; - const int i02 = i12; - const int i00 = ic; - - assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize); - - void * src0_col = (void *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)); - float src1_val = *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13)); - - ggml_vec_mad_q4_1(ne01, dst_row, src0_col, src1_val); - } + //for (int i10 = 0; i10 < ne10; ++i10) { + // wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); + //} 
+ quantize_row_q4_1((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += (ne10*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]; } } } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // TODO: do not support transposed src1 + + // parallelize by src0 rows using ggml_vec_dot_q4_1 + + // total rows in src0 + const int nr = ne01*ne02*ne03; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + void * wdata = params->wdata; + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 indices + const int i03 = ir/(ne02*ne01); + const int i02 = (ir - i03*ne02*ne01)/ne01; + const int i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int i13 = i03; + const int i12 = i02; + + const int i0 = i01; + const int i2 = i02; + const int i3 = i03; + + void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); + char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]); + + float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); + + assert(ne00 % 32 == 0); + + for (int ic = 0; ic < ne11; ++ic) { + ggml_vec_dot_q4_1(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]))); + } } //int64_t t1 = ggml_time_us(); @@ -9588,57 +9115,51 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) size_t cur = 0; - // TODO: better way to determine if the matrix is transposed - if (node->src0->nb[1] < node->src0->nb[0]) { - cur = ggml_nbytes(node)*node->n_tasks; // TODO: this can become (n_tasks-1) - // TODO: overestimated by factor of x2 for FP16 - } else { - if (node->src0->type == GGML_TYPE_F16 && + if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]); - //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]); - //printf("cur = %zu\n", cur); - } else { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); - } -#else - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); -#endif - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = 0; - } else if (node->src0->type == GGML_TYPE_Q4_0 && - node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else { - cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_0]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]; - } -#else - cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_0]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]; -#endif - } else if (node->src0->type == GGML_TYPE_Q4_1 && - node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || 
defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else { - cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_1]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]; - } -#else - cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_1]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]; -#endif + if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + node->n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]); + //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]); + //printf("cur = %zu\n", cur); } else { - GGML_ASSERT(false); + cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); } +#else + cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); +#endif + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = 0; + } else if (node->src0->type == GGML_TYPE_Q4_0 && + node->src1->type == GGML_TYPE_F32) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + node->n_tasks = 1; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + } else { + cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_0]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]; + } +#else + cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_0]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]; +#endif + } else if (node->src0->type == GGML_TYPE_Q4_1 && + node->src1->type == GGML_TYPE_F32) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + node->n_tasks = 1; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + } else { + cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_1]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]; + } +#else + cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_1]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]; +#endif + } else { + GGML_ASSERT(false); } work_size = MAX(work_size, cur); From a316a425d04027453dc0fd45f003b647c12f66f9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 25 Mar 2023 20:26:40 +0200 Subject: [PATCH 2/8] Overhaul the examples structure - main -> examples - utils -> examples (renamed to "common") - quantize -> examples - separate tools for "perplexity" and "embedding" Hope I didn't break something ! 
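Each tool now builds on its own, e.g. `make perplexity` (or the new per-example CMake targets), then `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`; the `-m`/`-f` flags come from the shared common code and keep their old meaning. (Illustrative invocation: the model path is just the conventional one used in the sources.)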
--- .gitignore | 1 + CMakeLists.txt | 29 +--- Makefile | 19 ++- examples/CMakeLists.txt | 36 +++++ utils.cpp => examples/common.cpp | 4 +- utils.h => examples/common.h | 0 examples/embedding/CMakeLists.txt | 4 + examples/embedding/README.md | 3 + examples/embedding/embedding.cpp | 106 +++++++++++++ examples/main/CMakeLists.txt | 4 + examples/main/README.md | 3 + main.cpp => examples/main/main.cpp | 119 ++------------ examples/perplexity/CMakeLists.txt | 4 + examples/perplexity/README.md | 3 + examples/perplexity/perplexity.cpp | 146 ++++++++++++++++++ examples/quantize/CMakeLists.txt | 4 + examples/quantize/README.md | 3 + .../quantize/quantize.cpp | 0 ggml.c | 26 ++-- tests/CMakeLists.txt | 2 +- tests/test-tokenizer-0.cpp | 6 +- 21 files changed, 361 insertions(+), 161 deletions(-) create mode 100644 examples/CMakeLists.txt rename utils.cpp => examples/common.cpp (99%) rename utils.h => examples/common.h (100%) create mode 100644 examples/embedding/CMakeLists.txt create mode 100644 examples/embedding/README.md create mode 100644 examples/embedding/embedding.cpp create mode 100644 examples/main/CMakeLists.txt create mode 100644 examples/main/README.md rename main.cpp => examples/main/main.cpp (76%) create mode 100644 examples/perplexity/CMakeLists.txt create mode 100644 examples/perplexity/README.md create mode 100644 examples/perplexity/perplexity.cpp create mode 100644 examples/quantize/CMakeLists.txt create mode 100644 examples/quantize/README.md rename quantize.cpp => examples/quantize/quantize.cpp (100%) diff --git a/.gitignore b/.gitignore index 3087b0ea5..ce01fd541 100644 --- a/.gitignore +++ b/.gitignore @@ -19,6 +19,7 @@ models/* /main /quantize /result +/perplexity arm_neon.h compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index 51af97c4d..a1ff5a44e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -211,17 +211,6 @@ endif() # Build libraries # -add_library(utils OBJECT - utils.cpp - utils.h) - -target_include_directories(utils PUBLIC .) -target_compile_features(utils PUBLIC cxx_std_11) # don't bump -target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS}) -if (BUILD_SHARED_LIBS) - set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON) -endif() - add_library(ggml OBJECT ggml.c ggml.h) @@ -239,22 +228,12 @@ add_library(llama target_include_directories(llama PUBLIC .) 
target_compile_features(llama PUBLIC cxx_std_11) # don't bump -target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS}) +target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS}) if (BUILD_SHARED_LIBS) set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) endif() -# -# Executables -# - -add_executable(main main.cpp) -target_link_libraries(main PRIVATE llama ggml utils) - -add_executable(quantize quantize.cpp) -target_link_libraries(quantize PRIVATE llama ggml utils) - # # programs, examples and tests # @@ -264,6 +243,6 @@ if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION) add_subdirectory(tests) endif () -#if (LLAMA_BUILD_EXAMPLES) -# add_subdirectory(examples) -#endif() +if (LLAMA_BUILD_EXAMPLES) + add_subdirectory(examples) +endif() diff --git a/Makefile b/Makefile index e8b128cb8..98a2d85f3 100644 --- a/Makefile +++ b/Makefile @@ -212,7 +212,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main quantize +default: main quantize perplexity # # Build library @@ -224,20 +224,23 @@ ggml.o: ggml.c ggml.h llama.o: llama.cpp llama.h $(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o -utils.o: utils.cpp utils.h - $(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o +common.o: examples/common.cpp examples/common.h + $(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o clean: - rm -f *.o main quantize + rm -vf *.o main quantize perplexity -main: main.cpp ggml.o llama.o utils.o - $(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o -o main $(LDFLAGS) +main: examples/main/main.cpp ggml.o llama.o common.o + $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo -quantize: quantize.cpp ggml.o llama.o utils.o - $(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS) +quantize: examples/quantize/quantize.cpp ggml.o llama.o + $(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS) + +perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o + $(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS) # # Tests diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 000000000..ce3a34710 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,36 @@ +# dependencies + +find_package(Threads REQUIRED) + +# third-party + +# ... + +# common + +set(TARGET common) + +add_library(${TARGET} OBJECT + common.h + common.cpp + ) + +if (BUILD_SHARED_LIBS) + set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() + +target_include_directories(${TARGET} PUBLIC .) 
+target_compile_features(${TARGET} PUBLIC cxx_std_11) +target_link_libraries(${TARGET} PRIVATE llama) + +# examples + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +if (EMSCRIPTEN) +else() + add_subdirectory(main) + add_subdirectory(quantize) + add_subdirectory(perplexity) + add_subdirectory(embedding) +endif() diff --git a/utils.cpp b/examples/common.cpp similarity index 99% rename from utils.cpp rename to examples/common.cpp index cea309628..afa7d4026 100644 --- a/utils.cpp +++ b/examples/common.cpp @@ -1,6 +1,6 @@ -#include "ggml.h" +#include "common.h" -#include "utils.h" +#include "ggml.h" #include #include diff --git a/utils.h b/examples/common.h similarity index 100% rename from utils.h rename to examples/common.h diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt new file mode 100644 index 000000000..88c425d4a --- /dev/null +++ b/examples/embedding/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET embedding) +add_executable(${TARGET} embedding.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/embedding/README.md b/examples/embedding/README.md new file mode 100644 index 000000000..21d8be65f --- /dev/null +++ b/examples/embedding/README.md @@ -0,0 +1,3 @@ +# embedding + +TODO diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp new file mode 100644 index 000000000..3015293f7 --- /dev/null +++ b/examples/embedding/embedding.cpp @@ -0,0 +1,106 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char ** argv) { + gpt_params params; + params.model = "models/llama-7B/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + params.embedding = true; + + if (params.n_ctx > 2048) { + fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" + "expect poor results\n", __func__, params.n_ctx); + } + + if (params.seed <= 0) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_context * ctx; + + // load the model + { + auto lparams = llama_context_default_params(); + + lparams.n_ctx = params.n_ctx; + lparams.n_parts = params.n_parts; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.logits_all = params.perplexity; + lparams.use_mlock = params.use_mlock; + lparams.embedding = params.embedding; + + ctx = llama_init_from_file(params.model.c_str(), lparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", + params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); + } + + int n_past = 0; + + // Add a space in front of the first character to match OG llama tokenizer behavior + params.prompt.insert(0, 1, ' '); + + // tokenize the prompt + auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); + + // determine newline token + auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); + + if (params.verbose_prompt) { + fprintf(stderr, "\n"); + fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + 
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); + } + fprintf(stderr, "\n"); + } + + if (params.embedding){ + if (embd_inp.size() > 0) { + if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + } + + const auto embeddings = llama_get_embeddings(ctx); + + // TODO: print / use the embeddings + } + + llama_print_timings(ctx); + llama_free(ctx); + + return 0; +} diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt new file mode 100644 index 000000000..b2dcc2910 --- /dev/null +++ b/examples/main/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET main) +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/main/README.md b/examples/main/README.md new file mode 100644 index 000000000..4701aa558 --- /dev/null +++ b/examples/main/README.md @@ -0,0 +1,3 @@ +# main + +TODO diff --git a/main.cpp b/examples/main/main.cpp similarity index 76% rename from main.cpp rename to examples/main/main.cpp index 77260bb71..b5f1a7b5c 100644 --- a/main.cpp +++ b/examples/main/main.cpp @@ -1,5 +1,4 @@ -#include "utils.h" -#include "ggml.h" +#include "common.h" #include "llama.h" #include @@ -65,79 +64,6 @@ void set_console_state(console_state new_st) } } -std::vector softmax(const std::vector& logits) { - std::vector probs(logits.size()); - float max_logit = logits[0]; - for (float v : logits) max_logit = std::max(max_logit, v); - double sum_exp = 0.0; - for (size_t i = 0; i < logits.size(); i++) { - // Subtract the maximum logit value from the current logit value for numerical stability - float logit = logits[i] - max_logit; - double exp_logit = std::exp(logit); - sum_exp += exp_logit; - probs[i] = exp_logit; - } - for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp; - return probs; -} - -void perplexity(llama_context * ctx, const gpt_params & params) { - // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research - // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` - // Output: `perplexity: 13.5106 [114/114]` - auto tokens = ::llama_tokenize(ctx, params.prompt, true); - - int count = 0; - double nll = 0.0; - int seq_count = tokens.size() / params.n_ctx; - - fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count); - - for (int i = 0; i < seq_count; ++i) { - int start = i * params.n_ctx; - int end = start + params.n_ctx - 1; - std::vector embd(tokens.begin() + start, tokens.begin() + end); - auto start_t = std::chrono::high_resolution_clock::now(); - if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return; - } - auto end_t = std::chrono::high_resolution_clock::now(); - if (i == 0) { - double seconds = std::chrono::duration(end_t - start_t).count(); - printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0)); - } - // We get the logits for all the tokens in the context window (params.n_ctx) - // from llama_eval above. 
Now, based on https://huggingface.co/docs/transformers/perplexity, - // calculate the perplexity over the last half the window (so the model always has - // some context to predict the token). - // - // We rely on the fact that attention in the forward pass only looks at previous - // tokens here, so the logits returned for each token are an accurate representation - // of what the model would have predicted at that point. - // - // Example, we have a context window of 512, we will compute perplexity for each of the - // last 256 tokens. Then, we split the input up into context window size chunks to - // process the entire prompt. - - auto logits = llama_get_logits(ctx); - for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) { - // Calculate probability of next token, given the previous ones. - int n_vocab = llama_n_vocab(ctx); - std::vector tok_logits( - logits + j * n_vocab, - logits + (j + 1) * n_vocab); - double prob = softmax(tok_logits)[tokens[start + j + 1]]; - nll += -std::log(prob); - ++count; - } - // perplexity is e^(average negative log-likelihood) - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); - fflush(stdout); - } - printf("\n"); -} - static bool is_interacting = false; #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) @@ -155,9 +81,6 @@ void sigint_handler(int signo) { #endif int main(int argc, char ** argv) { - // has to be called once at the start of the program to init ggml stuff - ggml_time_init(); - gpt_params params; params.model = "models/llama-7B/ggml-model.bin"; @@ -165,6 +88,14 @@ int main(int argc, char ** argv) { return 1; } + if (params.perplexity) { + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + if (params.n_ctx > 2048) { fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" "expect poor results\n", __func__, params.n_ctx); @@ -198,9 +129,7 @@ int main(int argc, char ** argv) { lparams.n_parts = params.n_parts; lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; - lparams.logits_all = params.perplexity; lparams.use_mlock = params.use_mlock; - lparams.embedding = params.embedding; ctx = llama_init_from_file(params.model.c_str(), lparams); @@ -236,11 +165,6 @@ int main(int argc, char ** argv) { return 0; } - if (params.perplexity) { - perplexity(ctx, params); - exit(0); - } - int n_past = 0; // Add a space in front of the first character to match OG llama tokenizer behavior @@ -346,27 +270,6 @@ int main(int argc, char ** argv) { // the first thing we will do is to output the prompt, so set color accordingly set_console_state(CONSOLE_STATE_PROMPT); - if (params.embedding){ - embd = embd_inp; - - if (embd.size() > 0) { - if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) { - fprintf(stderr, "%s : failed to eval\n", __func__); - return 1; - } - } - - const auto embeddings = llama_get_embeddings(ctx); - - // TODO: print / use the embeddings - - if (params.use_color) { - printf(ANSI_COLOR_RESET); - } - - return 0; - } - while (remaining_tokens > 0 || params.interactive) { // predict if (embd.size() > 0) { @@ -392,10 +295,6 @@ int main(int argc, char ** argv) { auto logits = llama_get_logits(ctx); if (params.ignore_eos) { - // set the logit of the eos token to zero to avoid sampling it - //logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0; - // TODO: this does not work of params.logits_all == true - 
assert(params.perplexity == false); logits[llama_token_eos()] = 0; } diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt new file mode 100644 index 000000000..5836df8b2 --- /dev/null +++ b/examples/perplexity/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET perplexity) +add_executable(${TARGET} perplexity.cpp) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md new file mode 100644 index 000000000..a932275c2 --- /dev/null +++ b/examples/perplexity/README.md @@ -0,0 +1,3 @@ +# perplexity + +TODO diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp new file mode 100644 index 000000000..f0266a01f --- /dev/null +++ b/examples/perplexity/perplexity.cpp @@ -0,0 +1,146 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include + +std::vector softmax(const std::vector& logits) { + std::vector probs(logits.size()); + float max_logit = logits[0]; + for (float v : logits) max_logit = std::max(max_logit, v); + double sum_exp = 0.0; + for (size_t i = 0; i < logits.size(); i++) { + // Subtract the maximum logit value from the current logit value for numerical stability + float logit = logits[i] - max_logit; + double exp_logit = std::exp(logit); + sum_exp += exp_logit; + probs[i] = exp_logit; + } + for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp; + return probs; +} + +void perplexity(llama_context * ctx, const gpt_params & params) { + // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research + // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` + // Output: `perplexity: 13.5106 [114/114]` + auto tokens = ::llama_tokenize(ctx, params.prompt, true); + + int count = 0; + double nll = 0.0; + int seq_count = tokens.size() / params.n_ctx; + + fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count); + + for (int i = 0; i < seq_count; ++i) { + int start = i * params.n_ctx; + int end = start + params.n_ctx - 1; + std::vector embd(tokens.begin() + start, tokens.begin() + end); + auto start_t = std::chrono::high_resolution_clock::now(); + if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return; + } + auto end_t = std::chrono::high_resolution_clock::now(); + if (i == 0) { + double seconds = std::chrono::duration(end_t - start_t).count(); + printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0)); + } + // We get the logits for all the tokens in the context window (params.n_ctx) + // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity, + // calculate the perplexity over the last half the window (so the model always has + // some context to predict the token). + // + // We rely on the fact that attention in the forward pass only looks at previous + // tokens here, so the logits returned for each token are an accurate representation + // of what the model would have predicted at that point. + // + // Example, we have a context window of 512, we will compute perplexity for each of the + // last 256 tokens. Then, we split the input up into context window size chunks to + // process the entire prompt. 
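The arithmetic in the loop that follows reduces to exp of the mean negative log-likelihood over the scored tokens. A minimal standalone sketch of just that reduction (the function and variable names here are illustrative, not part of the patch):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // perplexity = exp( -(1/N) * sum_i log p(token_i | previous tokens) )
    double perplexity_from_probs(const std::vector<double> & probs) {
        double nll = 0.0;
        for (const double p : probs) {
            nll += -std::log(p);
        }
        return std::exp(nll / probs.size());
    }

    int main() {
        // model-assigned probabilities of three scored tokens
        const std::vector<double> probs = { 0.25, 0.50, 0.10 };
        printf("perplexity: %.4f\n", perplexity_from_probs(probs));
        return 0;
    }

This is equivalently the reciprocal geometric mean of the token probabilities (about 4.31 for the sample values); a uniform model over a vocabulary of size V would score exactly V, which is why lower perplexity indicates a tighter predictive distribution.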
+
+        auto logits = llama_get_logits(ctx);
+        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
+            // Calculate probability of next token, given the previous ones.
+            int n_vocab = llama_n_vocab(ctx);
+            std::vector<float> tok_logits(
+                logits + j * n_vocab,
+                logits + (j + 1) * n_vocab);
+            double prob = softmax(tok_logits)[tokens[start + j + 1]];
+            nll += -std::log(prob);
+            ++count;
+        }
+        // perplexity is e^(average negative log-likelihood)
+        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        fflush(stdout);
+    }
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    params.perplexity = true;
+
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+    }
+
+    if (params.seed <= 0) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    llama_context * ctx;
+
+    // load the model
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.n_ctx = params.n_ctx;
+        lparams.n_parts = params.n_parts;
+        lparams.seed = params.seed;
+        lparams.f16_kv = params.memory_f16;
+        lparams.logits_all = params.perplexity;
+        lparams.use_mlock = params.use_mlock;
+        lparams.embedding = params.embedding;
+
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+
+    perplexity(ctx, params);
+
+    llama_print_timings(ctx);
+    llama_free(ctx);
+
+    return 0;
+}
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt
new file mode 100644
index 000000000..fb27d4517
--- /dev/null
+++ b/examples/quantize/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET quantize)
+add_executable(${TARGET} quantize.cpp)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/quantize/README.md b/examples/quantize/README.md
new file mode 100644
index 000000000..f349e913e
--- /dev/null
+++ b/examples/quantize/README.md
@@ -0,0 +1,3 @@
+# quantize
+
+TODO
diff --git a/quantize.cpp b/examples/quantize/quantize.cpp
similarity index 100%
rename from quantize.cpp
rename to examples/quantize/quantize.cpp
diff --git a/ggml.c b/ggml.c
index 291e12a0a..b566b5684 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5741,8 +5741,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
+    //const int ne00 = src0->ne[0];
+    //const int ne01 = src0->ne[1];
 
     const int ne10 = src1->ne[0];
 
@@ -5776,16 +5776,16 @@ static void ggml_compute_forward_mul_mat_f32(
     const int ne10 = src1->ne[0];
     const int ne11 = src1->ne[1];
-    const int ne12 = src1->ne[2];
-    const int ne13 = src1->ne[3];
+    //const int ne12 = src1->ne[2];
+    //const int ne13 = src1->ne[3];
 
-    const int ne0 = dst->ne[0];
-    const int ne1 = dst->ne[1];
-    const int ne2 = dst->ne[2];
-    const int ne3 = dst->ne[3];
-    const int ne = ne0*ne1*ne2*ne3;
+    //const int ne0 = dst->ne[0];
+    //const int ne1 = dst->ne[1];
+    //const int ne2 = dst->ne[2];
+    //const int ne3 = dst->ne[3];
+    //const int ne = ne0*ne1*ne2*ne3;
 
-    const int nb00 = src0->nb[0];
+    //const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
     const int nb02 = src0->nb[2];
     const int nb03 = src0->nb[3];
@@ -5947,7 +5947,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     const int ne1 = dst->ne[1];
     const int ne2 = dst->ne[2];
     const int ne3 = dst->ne[3];
-    const int ne = ne0*ne1*ne2*ne3;
+    //const int ne = ne0*ne1*ne2*ne3;
 
     const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
@@ -6137,7 +6137,7 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
     const int ne1 = dst->ne[1];
     const int ne2 = dst->ne[2];
     const int ne3 = dst->ne[3];
-    const int ne = ne0*ne1*ne2*ne3;
+    //const int ne = ne0*ne1*ne2*ne3;
 
     const int nb00 = src0->nb[0];
    const int nb01 = src0->nb[1];
@@ -6322,7 +6322,7 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
     const int ne1 = dst->ne[1];
     const int ne2 = dst->ne[2];
     const int ne3 = dst->ne[3];
-    const int ne = ne0*ne1*ne2*ne3;
+    //const int ne = ne0*ne1*ne2*ne3;
 
     const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 6a4170f80..b44d7fe7e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,7 +1,7 @@
 function(llama_add_test source)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
     add_executable(${TEST_TARGET} ${source})
-    target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
+    target_link_libraries(${TEST_TARGET} PRIVATE llama)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()
diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp
index 49bc232b6..382055324 100644
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -1,9 +1,9 @@
-#include "utils.h"
 #include "llama.h"
 
 #include <cstdio>
 #include <string>
 #include <map>
+#include <vector>
 
 static const std::map<std::string, std::vector<llama_token>> k_tests = {
     { "Hello World", { 1, 10994, 2787, }, },
@@ -48,7 +48,9 @@ int main(int argc, char **argv) {
     }
 
     for (const auto & test_kv : k_tests) {
-        const auto res = ::llama_tokenize(ctx, test_kv.first, true);
+        std::vector<llama_token> res(test_kv.first.size());
+        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
+        res.resize(n);
 
         bool correct = res.size() == test_kv.second.size();
 
From 459e93cce07cab9052c06b5bf360819893442e1e Mon Sep 17 00:00:00 2001
From: slaren <2141330+slaren@users.noreply.github.com>
Date: Sat, 25 Mar 2023 19:31:48 +0100
Subject: [PATCH 3/8] Add AVX2 implementation of dequantize_row_q4_1 (#505)

---
 ggml.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index b566b5684..c9a4e8675 100644
--- a/ggml.c
+++ b/ggml.c
@@ -783,7 +783,7 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {
 
             // Scale and store
             for (int j = 0; j < 4; j++) {
-                __m256 result = _mm256_mul_ps(vf[j], d_v);
+                const __m256 result = _mm256_mul_ps(vf[j], d_v);
                 _mm256_storeu_ps(y + i * QK + l + j*8, result);
             }
         }
@@ -879,6 +879,37 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
     const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
     const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
 
+#if defined(__AVX2__)
+    for (int i = 0; i < nb; i++) {
+        const __m256 d_v = _mm256_broadcast_ss((const float *) (pd + i*bs));
+        const __m256 d_m =
_mm256_broadcast_ss((const float *) (pm + i*bs)); + + const uint8_t * restrict pp = pb + i*bs; + + for (int l = 0; l < QK; l += 32) { + // Load 32x4-bit integers into 32x8-bit integers + __m256i vx8 = bytesFromNibbles(pp+l/2); + + // Convert to 16-bit int + const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0)); + const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1)); + + // Convert to 32-bit int -> float 32 + const __m256 vf[4] = { + _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))), + _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))), + _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))), + _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1))) + }; + + // Scale, add m and store + for (int j = 0; j < 4; j++) { + const __m256 result = _mm256_add_ps(_mm256_mul_ps(vf[j], d_v), d_m); + _mm256_storeu_ps(y + i * QK + l + j*8, result); + } + } + } +#else for (int i = 0; i < nb; i++) { const float d = *(const float *) (pd + i*bs); const float m = *(const float *) (pm + i*bs); @@ -901,6 +932,7 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) { assert(!isnan(y[i*QK + l + 1])); } } +#endif } // From 55ad42af845127bd0eb0c1f36f327ecec83f4bca Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 25 Mar 2023 20:36:52 +0200 Subject: [PATCH 4/8] Move chat scripts into "./examples" --- README.md | 7 +++++-- alpaca.sh => examples/alpaca.sh | 4 ++++ examples/{chatLLaMa => chat-13B.sh} | 0 chat.sh => examples/chat.sh | 4 ++++ 4 files changed, 13 insertions(+), 2 deletions(-) rename alpaca.sh => examples/alpaca.sh (89%) rename examples/{chatLLaMa => chat-13B.sh} (100%) rename chat.sh => examples/chat.sh (89%) diff --git a/README.md b/README.md index 8a84324b1..9ba6241da 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,10 @@ Here is an example few-shot interaction, invoked with the command ```bash # default arguments using 7B model -./chat.sh +./examples/chat.sh + +# advanced chat with 13B model +./examples/chat-13B.sh # custom arguments using 13B model ./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt @@ -195,7 +198,7 @@ Note the use of `--color` to distinguish between user input and generated text. 2. Run the `main` tool like this: ``` -./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins +./examples/alpaca.sh ``` Sample run: diff --git a/alpaca.sh b/examples/alpaca.sh similarity index 89% rename from alpaca.sh rename to examples/alpaca.sh index d8a9f456a..4c9aa5077 100755 --- a/alpaca.sh +++ b/examples/alpaca.sh @@ -1,6 +1,10 @@ #!/bin/bash + # # Temporary script - will be removed in the future # +cd `dirname $0` +cd .. + ./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 diff --git a/examples/chatLLaMa b/examples/chat-13B.sh similarity index 100% rename from examples/chatLLaMa rename to examples/chat-13B.sh diff --git a/chat.sh b/examples/chat.sh similarity index 89% rename from chat.sh rename to examples/chat.sh index 5531315b3..2b6a63e3f 100755 --- a/chat.sh +++ b/examples/chat.sh @@ -1,6 +1,10 @@ #!/bin/bash + # # Temporary script - will be removed in the future # +cd `dirname $0` +cd .. 
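Stepping back to patch 3 for a moment: the AVX2 path above widens 32 packed 4-bit quants to floats in one pass, but the per-element math is exactly the scalar fallback's y = q*d + m. A hedged, self-contained sketch of that scalar math for one simplified q4_1-style block (the real ggml block layout also carries the scale and minimum in-line, which is omitted here):

    #include <cstdint>

    // Dequantize one simplified q4_1-style block: 32 quants packed two per
    // byte, plus a scale d and a minimum m. Low nibble -> even element,
    // high nibble -> odd element, matching the scalar loop in ggml.c.
    void dequantize_block_q4_1(const uint8_t qs[16], float d, float m, float y[32]) {
        for (int l = 0; l < 16; l++) {
            const uint8_t lo = qs[l] & 0x0F;
            const uint8_t hi = qs[l] >> 4;
            y[2*l + 0] = lo*d + m;
            y[2*l + 1] = hi*d + m;
        }
    }

bytesFromNibbles() performs the same low/high-nibble split for all 32 values at once, and the four _mm256_cvtepi32_ps conversions then yield eight dequantized floats each.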
+
 ./main -m ./models/7B/ggml-model-q4_0.bin -b 128 -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt

From 03f7e335604b3d68f74995aa2ccb4955833ee423 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 25 Mar 2023 20:51:14 +0200
Subject: [PATCH 5/8] Cleanup STL headers + fix embedding examples + minor
 stuff

---
 examples/embedding/embedding.cpp   | 15 +++++----------
 examples/perplexity/perplexity.cpp |  8 --------
 llama.cpp                          | 22 ++++++++++++++--------
 llama.h                            |  1 +
 4 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 3015293f7..d397f35fd 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,15 +1,6 @@
 #include "common.h"
 #include "llama.h"
 
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <string>
-#include <vector>
-
 int main(int argc, char ** argv) {
     gpt_params params;
     params.model = "models/llama-7B/ggml-model.bin";
@@ -94,9 +85,13 @@ int main(int argc, char ** argv) {
         }
     }
 
+        const int n_embd = llama_n_embd(ctx);
         const auto embeddings = llama_get_embeddings(ctx);
 
-        // TODO: print / use the embeddings
+        for (int i = 0; i < n_embd; i++) {
+            printf("%f ", embeddings[i]);
+        }
+        printf("\n");
     }
 
     llama_print_timings(ctx);
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index f0266a01f..f617ba365 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1,14 +1,6 @@
 #include "common.h"
 #include "llama.h"
 
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <string>
-#include <vector>
-
 std::vector<float> softmax(const std::vector<float>& logits) {
     std::vector<float> probs(logits.size());
     float max_logit = logits[0];
diff --git a/llama.cpp b/llama.cpp
index 0015edec1..2bd520353 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1261,10 +1261,10 @@ static llama_vocab::id llama_sample_top_p_top_k(
         double repeat_penalty) {
     auto & rng = lctx.rng;
 
-    const auto & vocab = lctx.vocab;
-    const auto & logits = lctx.logits;
+    const int n_logits = lctx.model.hparams.n_vocab;
 
-    int n_logits = vocab.id_to_token.size();
+    const auto & logits = lctx.logits;
+    const auto * plogits = logits.data() + logits.size() - n_logits;
 
     std::vector<std::pair<double, llama_vocab::id>> logits_id;
     logits_id.reserve(n_logits);
 
@@ -1276,13 +1276,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
             // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
             if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
                 // if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
-                if (logits[i] < 0.0) {
-                    logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i));
-                } else {
-                    logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i));
+                if (plogits[i] < 0.0) {
+                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
+                } else {
+                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
                 }
             } else {
-                logits_id.push_back(std::make_pair(logits[i]*scale, i));
+                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
             }
         }
     }
@@ -1677,6 +1677,8 @@ struct llama_context * llama_init_from_file(
     }
 
     const auto & hparams = ctx->model.hparams;
+
+    // resized during inference
     if (params.logits_all) {
         ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
     } else {
         ctx->logits.reserve(hparams.n_ctx);
     }
 
     if (params.embedding){
-        ctx->embedding.reserve(hparams.n_embd);
+        ctx->embedding.resize(hparams.n_embd);
     }
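The plogits change in patch 5 is subtle but important: once logits_all is enabled (as the perplexity tool does), lctx.logits holds one row of n_vocab values per evaluated token, and sampling must read only the final row. A short illustration of the indexing, under that same layout assumption (the helper name is ours, not the library's):

    #include <vector>

    // logits laid out as n_tokens consecutive rows of n_vocab floats each;
    // the row for the most recent token starts n_vocab floats before the end
    const float * last_token_logits(const std::vector<float> & logits, int n_vocab) {
        return logits.data() + logits.size() - n_vocab;
    }

With logits_all disabled the vector contains a single row, so the same expression degenerates to logits.data() and one code path covers both configurations.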
ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type)); @@ -1761,6 +1763,10 @@ int llama_n_ctx(struct llama_context * ctx) { return ctx->model.hparams.n_ctx; } +int llama_n_embd(struct llama_context * ctx) { + return ctx->model.hparams.n_embd; +} + float * llama_get_logits(struct llama_context * ctx) { return ctx->logits.data(); } diff --git a/llama.h b/llama.h index 827abc1f2..ebf55f41c 100644 --- a/llama.h +++ b/llama.h @@ -109,6 +109,7 @@ extern "C" { LLAMA_API int llama_n_vocab(struct llama_context * ctx); LLAMA_API int llama_n_ctx (struct llama_context * ctx); + LLAMA_API int llama_n_embd (struct llama_context * ctx); // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row From e2d490dafd860eaaaf9aa8008ab790527d556daf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 25 Mar 2023 21:36:22 +0200 Subject: [PATCH 6/8] Inifinite generation via context swapping (#71) --- examples/chat.sh | 8 ++- examples/common.cpp | 9 ++- examples/common.h | 1 + examples/main/main.cpp | 122 ++++++++++++++++++++++++++++------------- 4 files changed, 100 insertions(+), 40 deletions(-) diff --git a/examples/chat.sh b/examples/chat.sh index 2b6a63e3f..97973d056 100755 --- a/examples/chat.sh +++ b/examples/chat.sh @@ -7,4 +7,10 @@ cd `dirname $0` cd .. -./main -m ./models/7B/ggml-model-q4_0.bin -b 128 -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt +# Important: +# +# "--keep 48" is based on the contents of prompts/chat-with-bob.txt +# +./main -m ./models/7B/ggml-model-q4_0.bin -c 2048 -b 1024 -n 256 --keep 48 \ + --repeat_penalty 1.0 --color -i \ + -r "User:" -f prompts/chat-with-bob.txt diff --git a/examples/common.cpp b/examples/common.cpp index afa7d4026..866a6b063 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -112,6 +112,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } params.n_batch = std::stoi(argv[i]); params.n_batch = std::min(512, params.n_batch); + } else if (arg == "--keep") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_keep = std::stoi(argv[i]); } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; @@ -134,7 +140,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.use_mlock = true; } else if (arg == "--mtest") { params.mem_test = true; - } else if (arg == "--verbose_prompt") { + } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { if (++i >= argc) { @@ -210,6 +216,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n"); fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); + fprintf(stderr, " --keep number of tokens to keep from the initial prompt\n"); if (ggml_mlock_supported()) { fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } diff --git a/examples/common.h b/examples/common.h index dede80385..8caefd859 100644 --- a/examples/common.h +++ b/examples/common.h @@ -21,6 +21,7 @@ struct gpt_params { int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) int32_t n_ctx = 512; // context size int32_t n_batch = 8; // batch size for prompt processing + int32_t n_keep = 0; // 
number of tokens to keep from initial prompt // sampling parameters int32_t top_k = 40; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b5f1a7b5c..f78936d45 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -44,8 +44,20 @@ enum console_state { static console_state con_st = CONSOLE_STATE_DEFAULT; static bool con_use_color = false; -void set_console_state(console_state new_st) -{ +void enable_console_colors() { +#if defined (_WIN32) + if (params.use_color) { + // Enable ANSI colors on Windows 10+ + unsigned long dwMode = 0; + void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11) + if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) { + SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4) + } + } +#endif +} + +void set_console_state(console_state new_st) { if (!con_use_color) return; // only emit color code if state changed if (new_st != con_st) { @@ -96,6 +108,14 @@ int main(int argc, char ** argv) { return 0; } + if (params.embedding) { + printf("\n************\n"); + printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + if (params.n_ctx > 2048) { fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" "expect poor results\n", __func__, params.n_ctx); @@ -165,8 +185,6 @@ int main(int argc, char ** argv) { return 0; } - int n_past = 0; - // Add a space in front of the first character to match OG llama tokenizer behavior params.prompt.insert(0, 1, ' '); @@ -175,7 +193,13 @@ int main(int argc, char ** argv) { const int n_ctx = llama_n_ctx(ctx); - params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size()); + if ((int) embd_inp.size() > n_ctx - 4) { + fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + return 1; + } + + params.n_keep = std::min(params.n_keep, (int) embd_inp.size()); + //params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size()); // prefix & suffix for instruct mode const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true); @@ -206,6 +230,13 @@ int main(int argc, char ** argv) { for (int i = 0; i < (int) embd_inp.size(); i++) { fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); } + if (params.n_keep > 0) { + fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.n_keep; i++) { + fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i])); + } + fprintf(stderr, "'\n"); + } fprintf(stderr, "\n"); } @@ -222,7 +253,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: interactive mode on.\n", __func__); - if(params.antiprompt.size()) { + if (params.antiprompt.size()) { for (auto antiprompt : params.antiprompt) { fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); } @@ -232,14 +263,12 @@ int main(int argc, char ** argv) { fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); } } - fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); + fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); + fprintf(stderr, 
"generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); fprintf(stderr, "\n\n"); - std::vector embd; - - - int last_n_size = params.repeat_last_n; - std::vector last_n_tokens(last_n_size); + // TODO: replace with ring-buffer + std::vector last_n_tokens(n_ctx); std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); if (params.interactive) { @@ -252,27 +281,42 @@ int main(int argc, char ** argv) { is_interacting = params.interactive_start || params.instruct; } - int input_consumed = 0; bool input_noecho = false; - int remaining_tokens = params.n_predict; + int n_past = 0; + int n_remain = params.n_predict; + int n_consumed = 0; -#if defined (_WIN32) - if (params.use_color) { - // Enable ANSI colors on Windows 10+ - unsigned long dwMode = 0; - void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11) - if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) { - SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4) - } - } -#endif // the first thing we will do is to output the prompt, so set color accordingly + enable_console_colors(); set_console_state(CONSOLE_STATE_PROMPT); - while (remaining_tokens > 0 || params.interactive) { + std::vector embd; + + while (n_remain > 0 || params.interactive) { // predict if (embd.size() > 0) { + // infinite text generation via context swapping + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch + if (n_past + (int) embd.size() > n_ctx) { + const int n_left = n_past - params.n_keep; + + n_past = params.n_keep; + + // insert n_left/2 tokens at the start of embd from last_n_tokens + embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); + + //printf("\n---\n"); + //printf("resetting: '"); + //for (int i = 0; i < (int) embd.size(); i++) { + // printf("%s", llama_token_to_str(ctx, embd[i])); + //} + //printf("'\n"); + //printf("\n---\n"); + } + if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) { fprintf(stderr, "%s : failed to eval\n", __func__); return 1; @@ -282,7 +326,7 @@ int main(int argc, char ** argv) { n_past += embd.size(); embd.clear(); - if ((int) embd_inp.size() <= input_consumed && !is_interacting) { + if ((int) embd_inp.size() <= n_consumed && !is_interacting) { // out of user input, sample next token const float top_k = params.top_k; const float top_p = params.top_p; @@ -298,7 +342,9 @@ int main(int argc, char ** argv) { logits[llama_token_eos()] = 0; } - id = llama_sample_top_p_top_k(ctx, last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_penalty); + id = llama_sample_top_p_top_k(ctx, + last_n_tokens.data() + n_ctx - params.repeat_last_n, + params.repeat_last_n, top_k, top_p, temp, repeat_penalty); last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.push_back(id); @@ -321,14 +367,14 @@ int main(int argc, char ** argv) { input_noecho = false; // decrement remaining sampling budget - --remaining_tokens; + --n_remain; } else { // some user input remains from prompt or interaction, forward it to processing - while ((int) embd_inp.size() > input_consumed) { - embd.push_back(embd_inp[input_consumed]); + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); last_n_tokens.erase(last_n_tokens.begin()); - 
last_n_tokens.push_back(embd_inp[input_consumed]); - ++input_consumed; + last_n_tokens.push_back(embd_inp[n_consumed]); + ++n_consumed; if ((int) embd.size() >= params.n_batch) { break; } @@ -343,13 +389,13 @@ int main(int argc, char ** argv) { fflush(stdout); } // reset color to default if we there is no pending user input - if (!input_noecho && (int)embd_inp.size() == input_consumed) { + if (!input_noecho && (int)embd_inp.size() == n_consumed) { set_console_state(CONSOLE_STATE_DEFAULT); } // in interactive mode, and not currently processing queued inputs; // check if we should prompt the user for more - if (params.interactive && (int) embd_inp.size() <= input_consumed) { + if (params.interactive && (int) embd_inp.size() <= n_consumed) { // check for reverse prompt std::string last_output; for (auto id : last_n_tokens) { @@ -371,7 +417,7 @@ int main(int argc, char ** argv) { set_console_state(CONSOLE_STATE_USER_INPUT); if (params.instruct) { - input_consumed = embd_inp.size(); + n_consumed = embd_inp.size(); embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); printf("\n> "); @@ -405,7 +451,7 @@ int main(int argc, char ** argv) { embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); } - remaining_tokens -= line_inp.size(); + n_remain -= line_inp.size(); input_noecho = true; // do not echo this again } @@ -426,8 +472,8 @@ int main(int argc, char ** argv) { } // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. - if (params.interactive && remaining_tokens <= 0) { - remaining_tokens = params.n_predict; + if (params.interactive && n_remain <= 0) { + n_remain = params.n_predict; is_interacting = true; } } From 79b2b266db6b198b5af450982c3cd61120fac951 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 25 Mar 2023 21:51:41 +0200 Subject: [PATCH 7/8] If n_predict == -1, generate forever --- examples/chat.sh | 2 +- examples/common.cpp | 2 +- examples/main/main.cpp | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/chat.sh b/examples/chat.sh index 97973d056..9a928ef05 100755 --- a/examples/chat.sh +++ b/examples/chat.sh @@ -11,6 +11,6 @@ cd .. 
 #
 # "--keep 48" is based on the contents of prompts/chat-with-bob.txt
 #
-./main -m ./models/7B/ggml-model-q4_0.bin -c 2048 -b 1024 -n 256 --keep 48 \
+./main -m ./models/7B/ggml-model-q4_0.bin -c 512 -b 1024 -n 256 --keep 48 \
     --repeat_penalty 1.0 --color -i \
     -r "User:" -f prompts/chat-with-bob.txt
diff --git a/examples/common.cpp b/examples/common.cpp
index 866a6b063..2ab000f4f 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -204,7 +204,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
     fprintf(stderr, "  -f FNAME, --file FNAME\n");
     fprintf(stderr, "                        prompt file to start generation.\n");
-    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
+    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
     fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
     fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
     fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index f78936d45..a453743a5 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -199,7 +199,6 @@ int main(int argc, char ** argv) {
     }
 
     params.n_keep = std::min(params.n_keep, (int) embd_inp.size());
-    //params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size());
 
     // prefix & suffix for instruct mode
     const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
@@ -293,7 +292,7 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd;
 
-    while (n_remain > 0 || params.interactive) {
+    while (n_remain != 0 || params.interactive) {
         // predict
         if (embd.size() > 0) {

From c2b25b6912662d2637d9c6e6df3a5de931e0d7ce Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sat, 25 Mar 2023 21:53:39 +0200
Subject: [PATCH 8/8] Fix colors enabling on WIN32

---
 examples/main/main.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index a453743a5..7bb2b6bc4 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -46,13 +46,11 @@ static bool con_use_color = false;
 
 void enable_console_colors() {
 #if defined (_WIN32)
-    if (params.use_color) {
-        // Enable ANSI colors on Windows 10+
-        unsigned long dwMode = 0;
-        void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
-        if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) {
-            SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
-        }
+    // Enable ANSI colors on Windows 10+
+    unsigned long dwMode = 0;
+    void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
+    if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) {
+        SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
     }
 #endif
 }
@@ -287,7 +285,9 @@ int main(int argc, char ** argv) {
     int n_consumed = 0;
 
     // the first thing we will do is to output the prompt, so set color accordingly
-    enable_console_colors();
+    if (params.use_color) {
+        enable_console_colors();
+    }
     set_console_state(CONSOLE_STATE_PROMPT);
 
     std::vector<llama_token> embd;
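Taken together, patches 6 and 7 turn the example into an open-ended generator: when the window fills up, the first n_keep prompt tokens stay pinned and half of the remaining context is re-fed as a single batch. A condensed sketch of just that bookkeeping (int stands in for llama_token; this mirrors, but is not, the main.cpp code):

    #include <vector>

    // keep the first n_keep tokens, then re-feed the most recent n_left/2
    // tokens (excluding those already pending in embd) at the old positions
    void swap_context(int & n_past, int n_ctx, int n_keep,
                      const std::vector<int> & last_n_tokens,
                      std::vector<int> & embd) {
        if (n_past + (int) embd.size() <= n_ctx) {
            return; // still fits in the window, nothing to do
        }
        const int n_left = n_past - n_keep;
        n_past = n_keep;
        embd.insert(embd.begin(),
                    last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(),
                    last_n_tokens.end() - embd.size());
    }

Because last_n_tokens is kept at exactly n_ctx entries, the iterator arithmetic stays in range whenever n_keep <= n_past, and with n_predict == -1 the "n_remain != 0" loop condition from patch 7 never terminates on its own.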