From 433c4f505004afb0e60416f3c1514b2438f7c587 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Forst=C3=A9n?= <henrik.forsten@gmail.com>
Date: Tue, 26 Dec 2023 17:37:46 +0200
Subject: [PATCH] Least squares quantization

---
 ggml-quants.c | 606 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 360 insertions(+), 246 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index a15a24048..aa793ffc5 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -5,6 +5,7 @@
 #include <string.h>
 #include <assert.h>
 #include <float.h>
+#include <stdio.h>
 
 #ifdef __ARM_NEON
 
@@ -424,6 +425,138 @@ static const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
 static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
 #endif
 
+// Least-squares fit of `d * q + m = x` over qk samples, solving for the scale d
+// and the offset m (returned through *d and *min).
+// Degenerate inputs: constant x yields d = 0, m = x[0]; constant q (singular
+// normal equations) yields d = 0, m = mean(x), the optimum for that case.
+static void lstsq_q_1(const uint8_t * restrict q, const float * restrict x, int qk, float *min, float *d) {
+    float qs = 0.0f, qs2 = 0.0f;  // sum of q, sum of q^2
+    float xs = 0.0f, xq = 0.0f;   // sum of x, sum of x*q
+    float minx = x[0], maxx = x[0];
+    for (int i = 0; i < qk; i++) {
+        const float qf = q[i];
+        qs += qf;
+        qs2 += qf*qf;
+        xs += x[i];
+        xq += x[i]*qf;
+        if (x[i] < minx) minx = x[i];
+        if (x[i] > maxx) maxx = x[i];
+    }
+    // Negated determinant of the 2x2 normal equations; the numerators below are negated to match.
+    const float denom = qs*qs - qs2*qk;
+    if (minx == maxx) {
+        *min = x[0];
+        *d = 0.0f;
+    } else if (denom == 0.0f) {
+        *min = xs / qk;
+        *d = 0.0f;
+    } else {
+        *min = (qs*xq - qs2*xs) / denom;
+        *d = (qs*xs - qk*xq) / denom;
+    }
+}
+
+// Scale-only least squares: returns the d that minimises sum((d*q[i] - x[i])^2)
+// over qk samples, or 0 when the sum of q[i]^2 is zero.
+static float lstsq_q_0(const float * restrict q, const float * restrict x, int qk) {
+    float dot_qq = 0.0f;
+    float dot_xq = 0.0f;
+    int k = 0;
+    while (k < qk) {
+        dot_qq += q[k]*q[k];
+        dot_xq += x[k]*q[k];
+        k++;
+    }
+    return dot_qq != 0.0f ? dot_xq / dot_qq : 0.0f;
+}
+
+// Scale-only least squares fit of `d * q = x` for codes stored as unsigned bytes.
+// Returns sum(x*q) / sum(q*q) over qk samples, or 0 when every code is zero.
+static float lstsq_q_0_u8(const uint8_t * restrict q, const float * restrict x, int qk) {
+    float sum_qq = 0.0f;
+    float sum_xq = 0.0f;
+    for (int k = 0; k < qk; ++k) {
+        const float code = q[k];
+        sum_qq += code*code;
+        sum_xq += x[k]*code;
+    }
+    if (sum_qq != 0.0f) return sum_xq / sum_qq;
+    return 0.0f;
+}
+
+// Least-squares fit of `d * q[i] - m * s[i/bs] = x[i]` over QK_K samples, solving for
+// d and m (returned in *d and *min); s holds one value per bs-sized sub-block.
+// A singular system (q and s collinear, or one of them all zero) falls back to a one-term fit.
+static void lstsq_q_k(const float * restrict q, const float * restrict x, const float * restrict s, int bs, float *min, float *d) {
+    float s2 = 0.0f, qs = 0.0f, q2 = 0.0f;  // sum s^2, sum q*s, sum q^2
+    float sx = 0.0f, qx = 0.0f;             // sum s*x, sum q*x
+    for (int i = 0; i < QK_K; i++) {
+        s2 += s[i/bs]*s[i/bs];
+        qs += q[i]*s[i/bs];
+        q2 += q[i]*q[i];
+        sx += s[i/bs]*x[i];
+        qx += q[i]*x[i];
+    }
+    const float denom = qs*qs - q2*s2;
+    if (denom != 0.0f) {
+        *min = (q2*sx - qs*qx) / denom;
+        *d = (qs*sx - qx*s2) / denom;
+    } else if (q2 != 0.0f) {
+        // s is all zero or proportional to q: a pure scale fit is already optimal.
+        *min = 0.0f;
+        *d = qx / q2;
+    } else {
+        // q is all zero: only the offset term can explain x (and avoid 0/0 when s is zero too).
+        *min = s2 != 0.0f ? -sx / s2 : 0.0f;
+        *d = 0.0f;
+    }
+}
+
+// Quantizes x[0..n) to unsigned `bits`-bit codes q for the offset-free model `d * q = x`.
+// Codes are x / (max|x| / (2^bits - 1)), rounded and clamped to [0, 2^bits - 1];
+// *scale receives the least-squares optimal d for those codes (via lstsq_q_0_u8).
+static void quantize_1_0min(const float * restrict x, int n, int bits, uint8_t * restrict q, float * scale) {
+    const int qmax = (1 << bits) - 1;
+    float amax = 0.0f;
+    for (int l = 0; l < n; l++) {
+        const float v = fabsf(x[l]);
+        if (v > amax) amax = v;
+    }
+    const float d  = amax / qmax;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    for (int l = 0; l < n; l++) {
+        // Clamp on both sides: a negative input has no code in an offset-free unsigned
+        // model, and letting it wrap to a large uint8_t value would corrupt the fit below.
+        const int xi0 = (int)(x[l]*id + 0.5f);
+        q[l] = (uint8_t)MAX(0, MIN(qmax, xi0));
+    }
+    *scale = lstsq_q_0_u8(q, x, n);
+}
+
+// Min/max quantization of x[0..n) to `bits`-bit codes q, followed by a
+// least-squares refit of the affine model `scale * q + m = x` (via lstsq_q_1).
+static void quantize_1(const float * restrict x, int n, int bits, uint8_t * restrict q, float * m, float * scale) {
+    const int top = (1 << bits) - 1;   // largest code
+    float lo = FLT_MAX;
+    float hi = -FLT_MAX;
+    for (int k = 0; k < n; k++) {
+        if (x[k] < lo) lo = x[k];
+        if (x[k] > hi) hi = x[k];
+    }
+
+    // Initial uniform grid between lo and hi; a flat block maps everything to code 0.
+    const float step = (hi - lo) / top;
+    const float inv_step = step ? 1.0f/step : 0.0f;
+
+    for (int k = 0; k < n; k++) {
+        const float pos = (x[k] - lo)*inv_step;
+        q[k] = (uint8_t)MIN(top, (int8_t)(pos + 0.5f));
+    }
+
+    // Replace the grid parameters with the least-squares optimum for these codes.
+    lstsq_q_1(q, x, n, m, scale);
+}
+
 // reference implementation for deterministic creation of model files
 void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
     static const int qk = QK4_0;
@@ -474,32 +607,18 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
     const int nb = k / qk;
 
     for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d  = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
+        uint8_t q[qk];
+        float min;
+        float scale;
+        quantize_1(&x[i*qk], qk, 4, q, &min, &scale);
 
         for (int j = 0; j < qk/2; ++j) {
-            const float x0 = (x[i*qk + 0    + j] - min)*id;
-            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
-
-            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
-            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
-
-            y[i].qs[j]  = xi0;
-            y[i].qs[j] |= xi1 << 4;
+            y[i].qs[j]  = q[j];
+            y[i].qs[j] |= q[j + qk/2] << 4;
         }
+
+        y[i].d = GGML_FP32_TO_FP16(scale);
+        y[i].m = GGML_FP32_TO_FP16(min);
     }
 }
 
@@ -563,30 +682,16 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
     const int nb = k / qk;
 
     for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d  = (max - min) / ((1 << 5) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
+        uint8_t q[qk];
+        float min;
+        float scale;
+        quantize_1(&x[i*qk], qk, 5, q, &min, &scale);
 
         uint32_t qh = 0;
 
         for (int j = 0; j < qk/2; ++j) {
-            const float x0 = (x[i*qk + 0    + j] - min)*id;
-            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
-
-            const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
-            const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
+            const uint8_t xi0 = q[j];
+            const uint8_t xi1 = q[j + qk/2];
 
             y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
 
@@ -595,6 +700,9 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
             qh |= ((xi1 & 0x10u) >> 4) << (j + qk/2);
         }
 
+        y[i].d = GGML_FP32_TO_FP16(scale);
+        y[i].m = GGML_FP32_TO_FP16(min);
+
         memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
     }
 }
@@ -1318,130 +1426,6 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
     return 1/iscale;
 }
 
-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
-        int ntry, float alpha) {
-    float min = x[0];
-    float max = x[0];
-    for (int i = 1; i < n; ++i) {
-        if (x[i] < min) min = x[i];
-        if (x[i] > max) max = x[i];
-    }
-    if (max == min) {
-        for (int i = 0; i < n; ++i) L[i] = 0;
-        *the_min = 0;
-        return 0.f;
-    }
-    if (min > 0) min = 0;
-    float iscale = nmax/(max - min);
-    float scale = 1/iscale;
-    for (int itry = 0; itry < ntry; ++itry) {
-        float sumlx = 0; int suml2 = 0;
-        bool did_change = false;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale*(x[i] - min));
-            l = MAX(0, MIN(nmax, l));
-            if (l != L[i]) {
-                L[i] = l;
-                did_change = true;
-            }
-            sumlx += (x[i] - min)*l;
-            suml2 += l*l;
-        }
-        scale = sumlx/suml2;
-        float sum = 0;
-        for (int i = 0; i < n; ++i) {
-            sum += x[i] - scale*L[i];
-        }
-        min = alpha*min + (1 - alpha)*sum/n;
-        if (min > 0) min = 0;
-        iscale = 1/scale;
-        if (!did_change) break;
-    }
-    *the_min = -min;
-    return scale;
-}
-
-static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
-        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
-        float rmin, float rdelta, int nstep, bool use_mad) {
-    float min = x[0];
-    float max = x[0];
-    float sum_w = weights[0];
-    float sum_x = sum_w * x[0];
-#ifdef HAVE_BUGGY_APPLE_LINKER
-    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
-    for (volatile int i = 1; i < n; ++i) {
-#else
-    for (int i = 1; i < n; ++i) {
-#endif
-        if (x[i] < min) min = x[i];
-        if (x[i] > max) max = x[i];
-        float w = weights[i];
-        sum_w += w;
-        sum_x += w * x[i];
-    }
-    if (min > 0) min = 0;
-    if (max == min) {
-        for (int i = 0; i < n; ++i) L[i] = 0;
-        *the_min = -min;
-        return 0.f;
-    }
-    float iscale = nmax/(max - min);
-    float scale = 1/iscale;
-    float best_mad = 0;
-    for (int i = 0; i < n; ++i) {
-        int l = nearest_int(iscale*(x[i] - min));
-        L[i] = MAX(0, MIN(nmax, l));
-        float diff = scale * L[i] + min - x[i];
-        diff = use_mad ? fabsf(diff) : diff * diff;
-        float w = weights[i];
-        best_mad += w * diff;
-    }
-    if (nstep < 1) {
-        *the_min = -min;
-        return scale;
-    }
-    for (int is = 0; is <= nstep; ++is) {
-        iscale = (rmin + rdelta*is + nmax)/(max - min);
-        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale*(x[i] - min));
-            l = MAX(0, MIN(nmax, l));
-            Laux[i] = l;
-            float w = weights[i];
-            sum_l += w*l;
-            sum_l2 += w*l*l;
-            sum_xl += w*l*x[i];
-        }
-        float D = sum_w * sum_l2 - sum_l * sum_l;
-        if (D > 0) {
-            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
-            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
-            if (this_min > 0) {
-                this_min = 0;
-                this_scale = sum_xl / sum_l2;
-            }
-            float mad = 0;
-            for (int i = 0; i < n; ++i) {
-                float diff = this_scale * Laux[i] + this_min - x[i];
-                diff = use_mad ? fabsf(diff) : diff * diff;
-                float w = weights[i];
-                mad += w * diff;
-            }
-            if (mad < best_mad) {
-                for (int i = 0; i < n; ++i) {
-                    L[i] = Laux[i];
-                }
-                best_mad = mad;
-                scale = this_scale;
-                min = this_min;
-            }
-        }
-    }
-    *the_min = -min;
-    return scale;
-}
-
 #if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
     if (j < 4) {
@@ -1460,8 +1444,6 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
     const int nb = k / QK_K;
 
     uint8_t L[QK_K];
-    uint8_t Laux[16];
-    float   weights[16];
     float mins[QK_K/16];
     float scales[QK_K/16];
 
@@ -1469,24 +1451,43 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
 
     for (int i = 0; i < nb; i++) {
         float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = 0;
-        for (int j = 0; j < QK_K/16; ++j) {
-            for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
-            scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
-            }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
+        float max_min = FLT_MAX;
+
+        int all_positive = 1;
+        for (int j = 0; j < QK_K; j++) {
+            if (x[j] < 0.0f) {
+                all_positive = 0;
+                max_min = -FLT_MAX;
+                break;
             }
         }
 
-        if (max_scale > 0) {
+redo_pos2:
+
+        for (int j = 0; j < QK_K/16; j++) {
+            uint8_t q[QK_K/16];
+            quantize_1(&x[16*j], 16, 2, q, &mins[j], &scales[j]);
+            mins[j] = -mins[j];
+            if ((!all_positive) && (mins[j] < 0)) {
+                mins[j] = 0.0f;
+                quantize_1_0min(&x[16*j], 16, 2, q, &scales[j]);
+            }
+            if (scales[j] > max_scale) {
+                max_scale = scales[j];
+            }
+            if (!all_positive && mins[j] > max_min) {
+                max_min = mins[j];
+            } else if (all_positive && mins[j] < max_min) {
+                max_min = mins[j];
+            }
+        }
+
+        int all_zero_lm = 1;
+        if (max_scale != 0) {
             float iscale = q4scale/max_scale;
             for (int j = 0; j < QK_K/16; ++j) {
-                int l = nearest_int(iscale*scales[j]);
+                int l = MAX(0, MIN(63, nearest_int(iscale*scales[j])));
+                if (l != 0) all_zero_lm = 0;
                 y[i].scales[j] = l;
             }
             y[i].d = GGML_FP32_TO_FP16(max_scale/q4scale);
@@ -1494,27 +1495,52 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
             for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0;
             y[i].d = GGML_FP32_TO_FP16(0.f);
         }
-        if (max_min > 0) {
+        if (max_min != 0) {
             float iscale = q4scale/max_min;
             for (int j = 0; j < QK_K/16; ++j) {
-                int l = nearest_int(iscale*mins[j]);
+                int l = MAX(0, MIN(63, nearest_int(iscale*mins[j])));
                 y[i].scales[j] |= (l << 4);
             }
             y[i].dmin = GGML_FP32_TO_FP16(max_min/q4scale);
         } else {
             y[i].dmin = GGML_FP32_TO_FP16(0.f);
         }
+
+        if (all_zero_lm && !all_positive) {
+            all_positive = 1;
+            //printf("**********red_pos2\n");
+            goto redo_pos2;
+        } else if (all_zero_lm) {
+            //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min);
+        }
+
+
+        float q_fit[QK_K];
+        float q_m[QK_K/16];
+
         for (int j = 0; j < QK_K/16; ++j) {
-            const float d = GGML_FP16_TO_FP32(y[i].d) * (y[i].scales[j] & 0xF);
-            if (!d) continue;
-            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * (y[i].scales[j] >> 4);
+            uint8_t sc = y[i].scales[j] & 0xF;
+            uint8_t m = y[i].scales[j] >> 4;
+            const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
+            const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
+            q_m[j] = (y[i].scales[j] >> 4);
             for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int((x[16*j + ii] + dm)/d);
-                l = MAX(0, MIN(3, l));
+                int l = 0;
+                if (d) {
+                    l = nearest_int((x[16*j + ii] + dm)/d);
+                    l = MAX(0, MIN(3, l));
+                }
                 L[16*j + ii] = l;
+                q_fit[16*j + ii] = sc * l;
             }
         }
 
+        float min;
+        float d;
+        lstsq_q_k(q_fit, x, q_m, 16, &min, &d);
+        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].dmin = GGML_FP32_TO_FP16(min);
+
 #if QK_K == 256
         for (int j = 0; j < QK_K; j += 128) {
             for (int l = 0; l < 32; ++l) {
@@ -1634,17 +1660,19 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
         }
 
         int8_t sc;
+        float q_fit[QK_K];
         for (int j = 0; j < QK_K/16; ++j) {
             sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4;
             sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32;
             float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) {
-                continue;
-            }
             for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-4, MIN(3, l));
+                int l = 0;
+                if (d) {
+                    l = nearest_int(x[16*j + ii]/d);
+                    l = MAX(-4, MIN(3, l));
+                }
                 L[16*j + ii] = l + 4;
+                q_fit[16*j + ii] = l * sc;
             }
         }
 #else
@@ -1664,20 +1692,24 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict
             }
             y[i].d = GGML_FP32_TO_FP16(0.f);
         }
+        float q_fit[QK_K];
         for (int j = 0; j < QK_K/16; ++j) {
             int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4;
             float d = GGML_FP16_TO_FP32(y[i].d) * (s - 8);
-            if (!d) {
-                continue;
-            }
             for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-4, MIN(3, l));
+                int l = 0;
+                if (d) {
+                    l = nearest_int(x[16*j + ii]/d);
+                    l = MAX(-4, MIN(3, l));
+                }
                 L[16*j + ii] = l + 4;
+                q_fit[16*j + ii] = l * (s - 8);
             }
         }
 #endif
 
+        y[i].d = GGML_FP32_TO_FP16(lstsq_q_0(q_fit, x, QK_K));
+
         memset(y[i].hmask, 0, QK_K/8);
         // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc.
         int m = 0;
@@ -1812,40 +1844,55 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
     const int nb = k / QK_K;
 
     uint8_t L[QK_K];
-    uint8_t Laux[32];
-    float   weights[32];
     float mins[QK_K/32];
     float scales[QK_K/32];
 
     for (int i = 0; i < nb; i++) {
-
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
-            float sum_x2 = 0;
-            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
-            float av_x = sqrtf(sum_x2/32);
-            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
-            scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
+
+        int all_positive = 1;
+        for (int j = 0; j < QK_K; j++) {
+            if (x[j] < 0.0f) {
+                all_positive = 0;
+                break;
             }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
+        }
+
+redo_pos4:
+
+        for (int j = 0; j < QK_K/32; j++) {
+            uint8_t q[QK_K/32];
+            quantize_1(&x[32*j], 32, 4, q, &mins[j], &scales[j]);
+            mins[j] = -mins[j];
+            if ((!all_positive) && (mins[j] < 0)) {
+                mins[j] = 0.0f;
+                quantize_1_0min(&x[32*j], 32, 4, q, &scales[j]);
+            }
+
+            if (j == 0 || scales[j] > max_scale) {
+                max_scale = scales[j];
+            }
+            if (j == 0) {
+                max_min = mins[j];
+            }
+            if (!all_positive && mins[j] > max_min) {
+                max_min = mins[j];
+            } else if (all_positive && mins[j] < max_min) {
+                max_min = mins[j];
             }
         }
 
 #if QK_K == 256
-        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
-        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
+        float inv_scale = max_scale == 0.0f ? 0.0f : 63.f/max_scale;
+        float inv_min   = max_min  == 0.0f ? 0.0f : 63.f/max_min;
+        int all_zero_lm = 1;
         for (int j = 0; j < QK_K/32; ++j) {
             uint8_t ls = nearest_int(inv_scale*scales[j]);
             uint8_t lm = nearest_int(inv_min*mins[j]);
             ls = MIN(63, ls);
             lm = MIN(63, lm);
+            if (lm != 0) all_zero_lm = 0;
             if (j < 4) {
                 y[i].scales[j] = ls;
                 y[i].scales[j+4] = lm;
@@ -1855,21 +1902,43 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
                 y[i].scales[j-0] |= ((lm >> 4) << 6);
             }
         }
+
+        if (all_zero_lm && !all_positive) {
+            all_positive = 1;
+            //printf("**********red_pos4\n");
+            goto redo_pos4;
+        } else if (all_zero_lm) {
+            //printf("all_zero_lm, all_pos %d, max_scale %f, max_min %f\n", all_positive, max_scale, max_min);
+        }
+
         y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
         y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
+        float q_fit[QK_K];
+        float q_m[QK_K/32];
 
         uint8_t sc, m;
         for (int j = 0; j < QK_K/32; ++j) {
             get_scale_min_k4(j, y[i].scales, &sc, &m);
             const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) continue;
             const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
+            q_m[j] = m;
             for (int ii = 0; ii < 32; ++ii) {
-                int l = nearest_int((x[32*j + ii] + dm)/d);
-                l = MAX(0, MIN(15, l));
+                int l = 0;
+                if (d) {
+                    l = nearest_int((x[32*j + ii] + dm)/d);
+                    l = MAX(0, MIN(15, l));
+                }
                 L[32*j + ii] = l;
+                q_fit[32*j + ii] = sc * l;
             }
         }
+
+        float min;
+        float d;
+        lstsq_q_k(q_fit, x, q_m, 32, &min, &d);
+        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].dmin = GGML_FP32_TO_FP16(min);
+
 #else
         const float s_factor = 15.f;
         float inv_scale = max_scale > 0 ? s_factor/max_scale : 0.f;
@@ -1980,8 +2049,6 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
     uint8_t L[QK_K];
     float mins[QK_K/32];
     float scales[QK_K/32];
-    float weights[32];
-    uint8_t Laux[32];
 #else
     int8_t L[QK_K];
     float scales[QK_K/16];
@@ -1993,30 +2060,50 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
 
         float max_scale = 0; // as we are deducting the min, scales are always positive
         float max_min = 0;
-        for (int j = 0; j < QK_K/32; ++j) {
-            //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
-            float sum_x2 = 0;
-            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
-            float av_x = sqrtf(sum_x2/32);
-            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
-            scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
-            }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
+
+        int all_positive = 1;
+        for (int j = 0; j < QK_K; j++) {
+            if (x[j] < 0.0f) {
+                all_positive = 0;
+                break;
             }
         }
 
-        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
-        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
+redo_pos:
+
+        for (int j = 0; j < QK_K/32; j++) {
+            uint8_t q[QK_K/32];
+            quantize_1(&x[32*j], 32, 5, q, &mins[j], &scales[j]);
+            mins[j] = -mins[j];
+            if ((!all_positive) && (mins[j] < 0)) {
+                mins[j] = 0.0f;
+                quantize_1_0min(&x[32*j], 32, 5, q, &scales[j]);
+            }
+
+            if (j == 0 || scales[j] > max_scale) {
+                max_scale = scales[j];
+            }
+
+            if (j == 0) {
+                max_min = mins[j];
+            }
+            if (!all_positive && mins[j] > max_min) {
+                max_min = mins[j];
+            } else if (all_positive && mins[j] < max_min) {
+                max_min = mins[j];
+            }
+        }
+
+
+        float inv_scale = max_scale == 0.0f ? 0.f : 63.f/max_scale;
+        float inv_min   = max_min == 0.0f ? 0.f : 63.f/max_min;
+        int all_zero_lm = 1;
         for (int j = 0; j < QK_K/32; ++j) {
             uint8_t ls = nearest_int(inv_scale*scales[j]);
             uint8_t lm = nearest_int(inv_min*mins[j]);
             ls = MIN(63, ls);
             lm = MIN(63, lm);
+            if (lm != 0) all_zero_lm = 0;
             if (j < 4) {
                 y[i].scales[j] = ls;
                 y[i].scales[j+4] = lm;
@@ -2026,22 +2113,45 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
                 y[i].scales[j-0] |= ((lm >> 4) << 6);
             }
         }
+
+        if (all_positive) {
+            //printf("all_pos: %f %f %d\n", max_scale, max_min, all_zero_lm);
+        }
+
+        if (all_zero_lm && !all_positive) {
+            all_positive = 1;
+            //printf("**********red_pos\n");
+            goto redo_pos;
+        }
+
         y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
         y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
+        float q_fit[QK_K];
+        float q_m[QK_K/32];
 
         uint8_t sc, m;
         for (int j = 0; j < QK_K/32; ++j) {
             get_scale_min_k4(j, y[i].scales, &sc, &m);
             const float d = GGML_FP16_TO_FP32(y[i].d) * sc;
-            if (!d) continue;
             const float dm = GGML_FP16_TO_FP32(y[i].dmin) * m;
+            q_m[j] = m;
             for (int ii = 0; ii < 32; ++ii) {
-                int l = nearest_int((x[32*j + ii] + dm)/d);
-                l = MAX(0, MIN(31, l));
+                int l = 0;
+                if (d) {
+                    l = nearest_int((x[32*j + ii] + dm)/d);
+                    l = MAX(0, MIN(31, l));
+                }
                 L[32*j + ii] = l;
+                q_fit[32*j + ii] = sc * l;
             }
         }
 
+        float min;
+        float d;
+        lstsq_q_k(q_fit, x, q_m, 32, &min, &d);
+        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].dmin = GGML_FP32_TO_FP16(min);
+
         uint8_t * restrict qh = y[i].qh;
         uint8_t * restrict ql = y[i].qs;
         memset(qh, 0, QK_K/8);
@@ -2216,18 +2326,22 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
             y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib]));
         }
 
+        float q_fit[QK_K];
         for (int j = 0; j < QK_K/16; ++j) {
             float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j];
-            if (!d) {
-                continue;
-            }
             for (int ii = 0; ii < 16; ++ii) {
-                int l = nearest_int(x[16*j + ii]/d);
-                l = MAX(-32, MIN(31, l));
+                int l = 0;
+                if (d) {
+                    l = nearest_int(x[16*j + ii]/d);
+                    l = MAX(-32, MIN(31, l));
+                }
                 L[16*j + ii] = l + 32;
+                q_fit[16*j + ii] = l * y[i].scales[j];
             }
         }
 
+        y[i].d = GGML_FP32_TO_FP16(lstsq_q_0(q_fit, x, QK_K));
+
         uint8_t * restrict ql = y[i].ql;
         uint8_t * restrict qh = y[i].qh;
 #if QK_K == 256