From cd4a7c4cb49f8b0cc45e907c6b1bac3978ef0e53 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Date: Thu, 21 Mar 2024 10:37:38 +0200
Subject: [PATCH 1/3] Make quantize_row_iq4_nl do the same thing as
 quantization on CUDA

---
 ggml-quants.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index 109dd6660..61989d135 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -11705,9 +11705,8 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
         ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
         float * scales, float * weight, uint8_t * L,
         const int8_t * values,
-        const float * quant_weights) {
-
-    const int ntry = 7;
+        const float * quant_weights,
+        const int ntry) {
 
     float sigma2 = 0;
     for (int j = 0; j < super_block_size; ++j) sigma2 += x[j]*x[j];
@@ -11823,7 +11822,7 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
         for (int ibl = 0; ibl < nblock; ++ibl) {
             const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL;
             quantize_row_iq4_nl_impl(QK4_NL, 32, src + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
-                    &scale, weight, L, kvalues_iq4nl, qw);
+                    &scale, weight, L, kvalues_iq4nl, qw, 7);
         }
         src += n_per_row;
         qrow += nblock*sizeof(block_iq4_nl);
@@ -11832,9 +11831,21 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow
 }
 
 void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK4_NL == 0);
-    block_iq4_nl * restrict y = vy;
-    quantize_row_iq4_nl_reference(x, y, k);
+    GGML_ASSERT(k%QK4_NL == 0);
+    int nblock = k/QK4_NL;
+    uint8_t L[QK4_NL];
+    float weight[QK4_NL];
+    uint16_t unused_h;
+    uint8_t * unused_l = NULL;
+    float scale;
+    block_iq4_nl * iq4 = (block_iq4_nl *)vy;
+    for (int ibl = 0; ibl < nblock; ++ibl) {
+        quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
+                &scale, weight, L, kvalues_iq4nl, NULL, -1);
+    }
+    //assert(k % QK4_NL == 0);
+    //block_iq4_nl * restrict y = vy;
+    //quantize_row_iq4_nl_reference(x, y, k);
 }
 
 void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
@@ -11857,7 +11868,7 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow
         for (int ibl = 0; ibl < nblock; ++ibl) {
             const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL;
             quantize_row_iq4_nl_impl(QK_K, 32, src + QK_K*ibl, &iq4[ibl].d, iq4[ibl].qs, &iq4[ibl].scales_h, iq4[ibl].scales_l,
-                    scales, weight, L, kvalues_iq4nl, qw);
+                    scales, weight, L, kvalues_iq4nl, qw, 7);
         }
         src += n_per_row;
         qrow += nblock*sizeof(block_iq4_xs);

From 30eef31b070666ebe9537fd4c28cf4b1946a9fb6 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Date: Thu, 21 Mar 2024 12:19:16 +0200
Subject: [PATCH 2/3] Make quantize_row_iq4_nl do the same thing as
 quantization on CUDA

This time for real. backend-ops tests pass.
---
 ggml-quants.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index 61989d135..29cfafa3a 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -11718,6 +11718,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
     float max_scale = 0, amax_scale = 0;
     for (int ib = 0; ib < super_block_size/block_size; ++ib) {
         const float * xb = x + ib*block_size;
+        uint8_t * Lb = L + ib*block_size;
         if (quant_weights) {
             const float * qw = quant_weights + ib*block_size;
             for (int j = 0; j < block_size; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
@@ -11735,12 +11736,13 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
             scales[ib] = 0;
             continue;
         }
-        float d = -max/values[0];
+        float d = ntry > 0 ? -max/values[0] : max/values[0];
         float id = 1/d;
         float sumqx = 0, sumq2 = 0;
         for (int j = 0; j < block_size; ++j) {
             float al = id*xb[j];
             int l = best_index_int8(16, values, al);
+            Lb[j] = l;
             float q = values[l];
             float w = weight[j];
             sumqx += w*q*xb[j];
@@ -11795,9 +11797,11 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
         }
     } else {
         dh[0] = GGML_FP32_TO_FP16(scales[0]);
-        float id = scales[0] ? 1/scales[0] : 0;
-        for (int j = 0; j < super_block_size; ++j) {
-            L[j] = best_index_int8(16, values, id*x[j]);
+        if (ntry > 0) {
+            float id = scales[0] ? 1/scales[0] : 0;
+            for (int j = 0; j < super_block_size; ++j) {
+                L[j] = best_index_int8(16, values, id*x[j]);
+            }
         }
     }
 

From 68e4fed4d93d886ae4b4d5d6d9cc6c8602f83a7d Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Date: Thu, 21 Mar 2024 12:18:03 +0100
Subject: [PATCH 3/3] Now fix test-quantize-fns

---
 ggml-quants.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index 29cfafa3a..2eaca0593 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -11847,14 +11847,11 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
         quantize_row_iq4_nl_impl(QK4_NL, 32, x + QK4_NL*ibl, &iq4[ibl].d, iq4[ibl].qs, &unused_h, unused_l,
                 &scale, weight, L, kvalues_iq4nl, NULL, -1);
     }
-    //assert(k % QK4_NL == 0);
-    //block_iq4_nl * restrict y = vy;
-    //quantize_row_iq4_nl_reference(x, y, k);
 }
 
 void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
     assert(k % QK4_NL == 0);
-    quantize_iq4_nl(x, y, 1, k, NULL);
+    quantize_row_iq4_nl(x, y, k);
 }
 
 size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {