Q4_K: slightly better quantization

2024-02-06 07:53:10 +02:00 · 2024-02-06 07:53:10 +02:00 · f58d49e5ce
commit f58d49e5ce
parent c6b395535a
1 changed file with 15 additions and 12 deletions
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -2381,7 +2381,10 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
    uint8_t L[QK_K];
    uint8_t Laux[32];
    uint8_t Ls[QK_K/32];
    uint8_t Lm[QK_K/32];
    float   weights[32];
    float   sw[QK_K/32];
    float   mins[QK_K/32];
    float   scales[QK_K/32];
@@ -2389,7 +2392,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
        float sum_x2 = 0;
        for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
-        float sigma2 = sum_x2/QK_K;
+        float sigma2 = 2*sum_x2/QK_K;
        float av_x = sqrtf(sigma2);
        float max_scale = 0; // as we are deducting the min, scales are always positive
@@ -2401,8 +2404,10 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
            } else {
                for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
            }
            float sumw = 0;
            for (int l = 0; l < 32; ++l) sumw += weights[l];
            sw[j] = sumw;
            scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
          //scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
            float scale = scales[j];
            if (scale > max_scale) {
                max_scale = scale;
@@ -2413,13 +2418,11 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
            }
        }
-        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
+        float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
-        float inv_min   = max_min   > 0 ? 63.f/max_min   : 0.f;
+        float m_block = make_qp_quants(QK_K/32, 63, mins,   Lm, sw);
        for (int j = 0; j < QK_K/32; ++j) {
-            uint8_t ls = nearest_int(inv_scale*scales[j]);
+            uint8_t ls = Ls[j];
-            uint8_t lm = nearest_int(inv_min*mins[j]);
+            uint8_t lm = Lm[j];
            ls = MIN(63, ls);
            lm = MIN(63, lm);
            if (j < 4) {
                y[i].scales[j] = ls;
                y[i].scales[j+4] = lm;
@@ -2429,8 +2432,8 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
                y[i].scales[j-0] |= ((lm >> 4) << 6);
            }
        }
-        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
+        y[i].d = GGML_FP32_TO_FP16(d_block);
-        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
+        y[i].dmin = GGML_FP32_TO_FP16(m_block);
        uint8_t sc, m;
        for (int j = 0; j < QK_K/32; ++j) {