From d6313d83857c3e481a70c5bd7b371c284d19ce34 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Wed, 27 Dec 2023 12:57:43 +0200
Subject: [PATCH] Refactor Q4_K quantization into quantize_q_k_1 helper

---
 ggml-quants.c | 271 +++++++++++++++++++++++++-------------------------
 1 file changed, 135 insertions(+), 136 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index 0c3b0d42d..a6b6c6a7b 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1303,6 +1303,135 @@ static inline int nearest_int(float fval) {
     return (i & 0x007fffff) - 0x00400000;
 }
 
+static void quantize_q_k_1(const float * x, int bits, int scale_bits, int block_size, int * block_scales, int * block_mins, uint8_t * L, ggml_fp16_t * block_scale, ggml_fp16_t * block_min) { // Quantizes the QK_K floats at x in sub-blocks of block_size values: integer scale/min per sub-block go to block_scales/block_mins, one code per value to L, and the shared fp16 scale/min to *block_scale/*block_min.
+    float max_scale = 0.0f; // largest per-sub-block scale found in the first loop
+    float max_min = 0.0f;   // extreme per-sub-block min: the largest, or the smallest when all_positive
+    // bits: codes in L are clamped to [0, 2^bits - 1]; scale_bits: the scale/min candidates searched lie in [0, 2^scale_bits - 1].
+    // If all the weights are positive we can invert the sign of min.
+    // Otherwise blocks with all positive weights need to be quantized with zero
+    // min, because min scale is unsigned.
+    int all_positive = 1;
+    for (int j = 0; j < QK_K; j++) {
+        if (x[j] < 0.0f) {
+            all_positive = 0;
+            break;
+        }
+    }
+
+    float scales[QK_K]; // only the first QK_K/block_size entries are used
+    float mins[QK_K];   // only the first QK_K/block_size entries are used
+
+    for (int j = 0; j < QK_K/block_size; j++) {
+        uint8_t q[QK_K/block_size]; // NOTE(review): never read here; sized QK_K/block_size although the calls below pass block_size as the length - presumably q needs block_size entries, confirm against quantize_1
+        // First find least squares solution for min and scale for each block.
+        quantize_1(&x[block_size*j], block_size, bits, q, &mins[j], &scales[j]);
+        // Flip the sign because quantize_1 assumes that min is added, but min
+        // is subtracted in k-quants.
+        mins[j] = -mins[j];
+        if (!all_positive && mins[j] < 0) {
+            // All weights are positive in this block, but some blocks have
+            // negative weights. Find new least squares scale with zero min.
+            mins[j] = 0.0f;
+            quantize_1_0min(&x[block_size*j], block_size, bits, q, &scales[j]);
+        }
+        if (scales[j] > max_scale) {
+            max_scale = scales[j];
+        }
+        if (j == 0) { // seed with the first sub-block's min
+            max_min = mins[j];
+        } else if (!all_positive && mins[j] > max_min) { // mixed signs: track the largest min
+            max_min = mins[j];
+        } else if (all_positive && mins[j] < max_min) { // all positive: track the smallest min
+            max_min = mins[j];
+        }
+    }
+
+    int max_group = (1 << scale_bits) - 1; // largest value that fits in scale_bits bits
+
+    // More passes would decrease the RMS error by a minuscule amount, with the
+    // drawback of taking a lot more time.
+    for(int pass = 0; pass < 2; pass++) {
+        float inv_scale = max_scale == 0.0f ? 0.0f : max_group/max_scale; // zero check avoids dividing by zero
+        float inv_min   = max_min == 0.0f ? 0.0f : max_group/max_min;
+        for (int j = 0; j < QK_K/block_size; ++j) {
+            uint8_t ls = nearest_int(inv_scale*scales[j]); // the int result is narrowed to uint8_t here
+            uint8_t lm = nearest_int(inv_min*mins[j]);
+            uint8_t best_lm = lm; // taken before the clamp below; kept only if no candidate scores under FLT_MAX
+            uint8_t best_ls = ls;
+            ls = MIN(max_group, ls);
+            lm = MIN(max_group, lm);
+            float best_rms = FLT_MAX;
+            const float d1 = max_scale / max_group;   // step of the shared scale
+            const float dmin1 = max_min / max_group;  // step of the shared min
+            int limit = 1; // search radius around the rounded ls/lm
+            // A wider search on later passes gives a minor RMS error decrease
+            // but increases the quantization run time.
+            if (pass > 0) limit = 8;
+            // Due to quantization the best ls and lm might not be the nearest
+            // to the ones obtained by the round to nearest.
+            // Loop through few nearby choices and choose lm and ls that
+            // minimize RMS error.
+            for (int lst = MAX(0, ls-limit); lst <= MIN(max_group, ls+limit); lst++) {
+                for (int lmt = MAX(0, lm-limit); lmt <= MIN(max_group, lm+limit); lmt++) {
+                    float rms = 0.0f; // accumulates the sum of squared errors; no mean or root is taken
+                    for (int ii = 0; ii < block_size; ii++) {
+                        const float d = d1 * lst;
+                        const float dm1 = dmin1 * lmt;
+                        int l1 = 0;
+                        if (d) { // a zero scale leaves the code at 0
+                            l1 = nearest_int((x[block_size*j + ii] + dm1)/d);
+                            l1 = MAX(0, MIN((1 << bits) - 1, l1));
+                        }
+                        const float e = (d*l1 - dm1) - x[block_size*j + ii]; // reconstructed value minus original
+                        rms += e*e;
+                    }
+                    if (rms < best_rms) {
+                        best_lm = lmt;
+                        best_ls = lst;
+                        best_rms = rms;
+                    }
+                }
+            }
+            block_scales[j] = best_ls;
+            block_mins[j] = best_lm;
+        }
+
+        float block_d = max_scale/max_group;
+        float block_dmin = max_min/max_group;
+        float q_fit[QK_K]; // per value: coefficient of the shared scale in d*l - dm
+        float q_m[QK_K];   // per sub-block: coefficient of the shared min; only the first QK_K/block_size entries are written
+
+        // Quantize elements and populate arrays needed for least squares fit.
+        for (int j = 0; j < QK_K/block_size; ++j) {
+            const float d = block_d * block_scales[j];
+            const float dm = block_dmin * block_mins[j];
+            q_m[j] = block_mins[j];
+            for (int ii = 0; ii < block_size; ++ii) {
+                int l = 0;
+                if (d) { // a zero scale leaves the code at 0
+                    l = nearest_int((x[block_size*j + ii] + dm)/d);
+                    l = MAX(0, MIN((1 << bits) - 1, l));
+                }
+                L[block_size*j + ii] = l;
+                q_fit[block_size*j + ii] = block_scales[j] * l;
+            }
+        }
+
+        // Least squares fit min and scale.
+        float min, scale;
+        lstsq_q_k(q_fit, x, q_m, block_size, &min, &scale);
+        // Check for nans.
+        assert(min == min); // NaN is the only value that compares unequal to itself; asserts vanish when NDEBUG is defined
+        assert(scale == scale);
+        // Quantize to fp16 for the next pass.
+        max_scale = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(scale)) * max_group;
+        max_min = GGML_FP16_TO_FP32(GGML_FP32_TO_FP16(min)) * max_group;
+
+        *block_scale = GGML_FP32_TO_FP16(scale); // overwritten on every pass; the last pass's fit is what the caller receives
+        *block_min = GGML_FP32_TO_FP16(min);
+    }
+}
+
 static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) {
     float max = 0;
     float amax = 0;
@@ -1844,94 +1973,16 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
     const int nb = k / QK_K;
 
     uint8_t L[QK_K];
-    float mins[QK_K/32];
-    float scales[QK_K/32];
 
     for (int i = 0; i < nb; i++) {
-        float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = 0;
-
-        int all_positive = 1;
-        for (int j = 0; j < QK_K; j++) {
-            if (x[j] < 0.0f) {
-                all_positive = 0;
-                break;
-            }
-        }
-
-        for (int j = 0; j < QK_K/32; j++) {
-            uint8_t q[QK_K/32];
-            quantize_1(&x[32*j], 32, 4, q, &mins[j], &scales[j]);
-            mins[j] = -mins[j];
-            if ((!all_positive) && (mins[j] < 0)) {
-                mins[j] = 0.0f;
-                quantize_1_0min(&x[32*j], 32, 4, q, &scales[j]);
-            }
-
-            if (j == 0 || scales[j] > max_scale) {
-                max_scale = scales[j];
-            }
-            if (j == 0) {
-                max_min = mins[j];
-            }
-            if (!all_positive && mins[j] > max_min) {
-                max_min = mins[j];
-            } else if (all_positive && mins[j] < max_min) {
-                max_min = mins[j];
-            }
-        }
-
-        int ql_loop = 0;
-quant_loop: ;
 #if QK_K == 256
-        float inv_scale = max_scale == 0.0f ? 0.0f : 63.f/max_scale;
-        float inv_min   = max_min  == 0.0f ? 0.0f : 63.f/max_min;
+        int block_scales[QK_K/32];
+        int block_mins[QK_K/32];
+        quantize_q_k_1(x, 4, 6, 32, block_scales, block_mins, L, &y[i].d, &y[i].dmin);
+
         for (int j = 0; j < QK_K/32; ++j) {
-            uint8_t ls = nearest_int(inv_scale*scales[j]);
-            uint8_t lm = nearest_int(inv_min*mins[j]);
-            uint8_t best_lm = lm;
-            uint8_t best_ls = ls;
-            ls = MIN(63, ls);
-            lm = MIN(63, lm);
-            float best_rms = FLT_MAX;
-            const float d1 = max_scale / 63.0f;
-            const float dmin1 = max_min / 63.0f;
-            int limit = 1;
-            if (ql_loop) limit = 4;
-            for (int lst = MAX(0, ls-limit); lst <= MIN(63, ls+limit); lst++) {
-                for (int lmt = MAX(0, lm-limit); lmt <= MIN(63, lm+limit); lmt++) {
-                    float rms = 0.0f;
-                    for (int ii = 0; ii < 32; ii++) {
-                        const float d = d1 * lst;
-                        const float dm1 = dmin1 * lmt;
-                        int l1 = 0;
-                        if (d) {
-                            l1 = nearest_int((x[32*j + ii] + dm1)/d);
-                            l1 = MAX(0, MIN(15, l1));
-                        }
-                        float e = ((d*l1 - dm1) - x[32*j + ii]);
-                        rms += e*e;
-                    }
-                    if (rms < best_rms) {
-                        best_lm = lmt;
-                        best_ls = lst;
-                        best_rms = rms;
-                    }
-                }
-            }
-            //if (lm != best_lm) {
-            //    printf("best %d, orig %d\n", best_lm, lm);
-            //}
-            lm = best_lm;
-            ls = best_ls;
-            //if (rms2 < rms && rms2 < rms3) {
-            //    printf("rms2 %f %f %f, lm %d %d %d\n", rms, rms2, rms3, lm, lm2, lm3);
-            //    lm = lm2;
-            //}
-            //if (rms3 < rms && rms3 < rms2) {
-            //    printf("rms3 %f %f %f, lm %d %d %d\n", rms, rms2, rms3, lm, lm2, lm3);
-            //    lm = lm3;
-            //}
+            int ls = block_scales[j];
+            int lm = block_mins[j];
             if (j < 4) {
                 y[i].scales[j] = ls;
                 y[i].scales[j+4] = lm;
@@ -1941,58 +1992,6 @@ quant_loop: ;
                 y[i].scales[j-0] |= ((lm >> 4) << 6);
             }
         }
-
-        //if (all_zero_lm && !all_positive && !ql_loop) {
-        //    all_positive = 1;
-        //    //printf("**********red_pos4\n");
-        //    goto redo_pos4;
-        //}
-        //} else if (all_zero_lm) {
-        //    //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min);
-        //}
-
-        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
-        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
-        float q_fit[QK_K];
-        float q_m[QK_K/32];
-
-        uint8_t sc, m;
-        for (int j = 0; j < QK_K/32; ++j) {
-            get_scale_min_k4(j, y[i].scales, &sc, &m);
-            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
-            q_m[j] = m;
-            for (int ii = 0; ii < 32; ++ii) {
-                int l = 0;
-                if (d) {
-                    l = nearest_int((x[32*j + ii] + dm)/d);
-                    l = MAX(0, MIN(15, l));
-                }
-                L[32*j + ii] = l;
-                q_fit[32*j + ii] = sc * l;
-            }
-        }
-
-        //printf("%d orig: %f %f, ", ql_loop, max_min, max_scale);
-        float min, scale;
-        lstsq_q_k(q_fit, x, q_m, 32, &min, &scale);
-        if (min != min) {
-            printf("min nan\n");
-        }
-        if (scale != scale) {
-            printf("scale nan\n");
-        }
-        //printf("fit: %f %f\n", max_min, max_scale);
-        y[i].d = GGML_FP32_TO_FP16(scale);
-        y[i].dmin = GGML_FP32_TO_FP16(min);
-        //printf("%f %f, %f %f\n", max_min, min * 63.0f, max_scale, scale * 63.0f);
-        max_scale = GGML_FP16_TO_FP32(y[i].d) * 63.0f;
-        max_min = GGML_FP16_TO_FP32(y[i].dmin) * 63.0f;
-
-        ql_loop++;
-        if (ql_loop == 1) {
-            goto quant_loop;
-        }
 #else
         const float s_factor = 15.f;
         float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f;