Improve Q4_0 MSE

Somehow I had it hard-wired in my brain that quants need to be in -7...7 to be comparable to the original Q4_0. But this is clearly not the case, and if we relax this requirement this simple change brings the rmse down to 0.001966 at the expense of a somewhat longer computation (~67 seconds vs 49 seconds for the 7B model on M2 Max). Perplexity test is still running but it looks like the improvement compared to the previous version will be quite modest ~0.03) despite the significant improvement in MSE. The change does not affect Q4_1 as there we already use the full range of 16 possible int values.
2023-04-11 22:08:47 +02:00 · 2023-04-11 22:08:47 +02:00 · 931ae36050
commit 931ae36050
parent b6df974577
1 changed files with 15 additions and 5 deletions
--- a/ggml_extra.cpp
+++ b/ggml_extra.cpp
@ -23,7 +23,7 @@ inline int toNearestInt(float fval) {
    return (i & 0x007fffff) - 0x00400000;
 }

-float kQuantize0(int n, const float* X, int8_t* L, std::vector<std::pair<float,int>>& work, int nmin, int nmax) {
+std::pair<float, float> kQuantize0(int n, const float* X, int8_t* L, std::vector<std::pair<float,int>>& work, int nmin, int nmax) {
    work.clear();
    work.reserve(n*(nmax+2));
    float max = 0; int imax = -1;
@ -33,7 +33,7 @@ float kQuantize0(int n, const float* X, int8_t* L, std::vector<std::pair<float,i
    }
    if (imax < 0) {  // all X are zero
        for (int i=0; i<n; ++i) L[i] = 0;
-        return 1.f;
+        return {1.f, 0.f};
    }
    float maxi = 1/max;
    int kmin, kmax;
@ -97,7 +97,7 @@ float kQuantize0(int n, const float* X, int8_t* L, std::vector<std::pair<float,i
        lasts = s;
    }
    for (int i=0; i<n; ++i) L[i] = std::max(nmin, std::min(nmax, toNearestInt(bests*X[i])));
-    return bestSumlx/bestSuml2;
+    return {bestSumlx/bestSuml2, bestSumlx*bestSumlx/bestSuml2};
 }

 std::pair<float, float> kQuantize1(int n, const float* X, int8_t* L, std::vector<float>& tmpX,
@ -137,7 +137,17 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
    auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector<std::pair<float, int>>& work, std::vector<float>& tmpX) {
        auto q = (uint8_t*)y;
        if (type == 0) {
-            float scale = kQuantize0(QK, X, L, work, -7, 7);
+            if (int(tmpX.size()) < QK) tmpX.resize(QK);
+            auto r1 = kQuantize0(QK, X, L, work, -8, 7);
+            for (int i=0; i<QK; ++i) tmpX[i] = -X[i];
+            int8_t L2[QK];
+            auto r2 = kQuantize0(QK, tmpX.data(), L2, work, -8, 7);
+            float scale = r1.first;
+            if (r2.second > r1.first) {
+                scale = -r2.first;
+                std::memcpy(L, L2, QK);
+            }
+            //float scale = kQuantize0(QK, X, L, work, -7, 7);
            std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale);
            for (int k=0; k<QK/2; ++k) q[k] = (L[2*k] + 8) | ((L[2*k+1] + 8) << 4);
        } else {