diff --git a/ggml_extra.cpp b/ggml_extra.cpp index 9eb9bbe83..3a996d56d 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -1,5 +1,6 @@ #include "ggml_extra.h" +#include #include #include #include @@ -23,6 +24,94 @@ inline int toNearestInt(float fval) { return (i & 0x007fffff) - 0x00400000; } +// Adapted from PR #835, function quantize_row_q4_0_rmse() +// +// I absolutely cannot reproduce the rmse = 0.00185915 reported in #835. +// Instead, I get rmse = 0.00197 with the original and rmse = 0.00192 +// with the modification that determines the scale actually minimizing +// the rmse. +// +// Do I have a bug? I don't see it. +// The only difference is that I'm using toNearestInt() +// instead of round(), but what are the odds for getting scaled weights at +// exactly 2.5, 4.5, and 6.5, where toNearestInt() and round() differ. +// (with toNearestInt() behaving as expected and rounding towards the even integer, +// while round() always rounding up.) +float quanizeRmse(int n, const float* X, int8_t* L) { +#define Q4_0_SCALE_CANDIDATE_COUNT 8 + static const float candidates[Q4_0_SCALE_CANDIDATE_COUNT] = { -8.7f, -8.5f, -8.3f, -8.1f, -7.9f, -7.7f, -7.2f, +7.0f }; + float max = 0, amax = 0; + for (int i=0; i amax) { amax = ax; max = X[i]; } + } + if (!amax) { // all zero + for (int i=0; i::max(), bestScale = 0; + for (int si=0; si best*suml2) { + best = sumlx*sumlx/suml2; bestScale = iscale; + } + } + float sumlx = 0; int suml2 = 0; + for (int i=0; i kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { work.clear(); work.reserve(n*(nmax+2)); @@ -137,17 +226,19 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) { auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector>& work, std::vector& tmpX) { auto q = (uint8_t*)y; if (type == 0) { - if (int(tmpX.size()) < QK) tmpX.resize(QK); - auto r1 = kQuantize0(QK, X, L, work, -8, 7); - for (int i=0; i r1.first) { - scale = -r2.first; - std::memcpy(L, L2, QK); - 
//float scale = kQuantize0(QK, X, L, work, -7, 7); + auto scale = quanizeRmseK(QK, X, L); + // The following is not quite as good as quanizeRmseK() and it is slower too. + //if (int(tmpX.size()) < QK) tmpX.resize(QK); + //auto r1 = kQuantize0(QK, X, L, work, -8, 7); + //for (int i=0; i r1.first) { + // scale = -r2.first; + // std::memcpy(L, L2, QK); + //} + ////float scale = kQuantize0(QK, X, L, work, -7, 7); std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale); for (int k=0; k