From 6eec06081b71f6ef55c3deb2114df39ec52064b0 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 19 Apr 2023 17:10:58 +0200 Subject: [PATCH] Q4_2 quantization with rmse-optimized scale and quants For quantize-stats we get q4_2: rmse 0.00159301, maxerr 0.17480469, 95pct<0.0030, median<0.0012 For 7B perplexity with BLAS enabled we get 6.2038 after 655 chunks. Quantization is slow (~90 seconds on my Mac for 7B) as not multi-threaded as in PR #896. --- ggml.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 3 deletions(-) diff --git a/ggml.c b/ggml.c index 13c1548fe..8792dd638 100644 --- a/ggml.c +++ b/ggml.c @@ -19,6 +19,7 @@ #include #include #include +#include // if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 @@ -1123,12 +1124,94 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r } } +inline int nearestInt(float fval) { + assert(fval <= 4194303.f); + float val = fval + 12582912.f; + int i; memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + +static float kQuantizeWithBounds(int n, int nmin, int nmax, const float * restrict X, int nCandidates, + const float * restrict candidates, int8_t * restrict L) { + assert (nmin >= INT8_MIN); + assert (nmax <= INT8_MAX); + float amax = 0; + for (int i=0; i sumlxM2*suml2P) { + if (sumlxP2 > best*suml2P) { + best = sumlxP2/suml2P; bestScale = iscale; + } + } else { + if (sumlxM2 > best*suml2M) { + best = sumlxM2/suml2M; bestScale = -iscale; + } + } + } + float sumlx = 0; int suml2 = 0; + for (int i=0; i