From 931ae360500ed8dd9d7421fe2257654db7f96c0d Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 22:08:47 +0200 Subject: [PATCH] Improve Q4_0 MSE Somehow I had it hard-wired in my brain that quants need to be in -7...7 to be comparable to the original Q4_0. But this is clearly not the case, and if we relax this requirement this simple change brings the rmse down to 0.001966 at the expense of a somewhat longer computation (~67 seconds vs 49 seconds for the 7B model on M2 Max). Perplexity test is still running but it looks like the improvement compared to the previous version will be quite modest (~0.03) despite the significant improvement in MSE. The change does not affect Q4_1 as there we already use the full range of 16 possible int values. --- ggml_extra.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/ggml_extra.cpp b/ggml_extra.cpp index e2ae005df..9eb9bbe83 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -23,7 +23,7 @@ inline int toNearestInt(float fval) { return (i & 0x007fffff) - 0x00400000; } -float kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { +std::pair kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { work.clear(); work.reserve(n*(nmax+2)); float max = 0; int imax = -1; @@ -33,7 +33,7 @@ float kQuantize0(int n, const float* X, int8_t* L, std::vector 0) { - kmin = nmax-2; kmax = nmax + 1; + kmin = nmax-2; kmax = nmax+1; } else { kmin = nmax/2; kmax = nmax+1; } @@ -97,7 +97,7 @@ float kQuantize0(int n, const float* X, int8_t* L, std::vector kQuantize1(int n, const float* X, int8_t* L, std::vector& tmpX, @@ -137,7 +137,17 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) { auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector>& work, std::vector& tmpX) { auto q = (uint8_t*)y; if (type == 0) { - float scale = kQuantize0(QK, X, L, work, -7, 7); + if (int(tmpX.size()) < QK) 
tmpX.resize(QK); + auto r1 = kQuantize0(QK, X, L, work, -8, 7); + for (int i=0; i r1.first) { + scale = -r2.first; + std::memcpy(L, L2, QK); + } + //float scale = kQuantize0(QK, X, L, work, -7, 7); std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale); for (int k=0; k