From 29b83e5fd665931226d152f29cc2ee21d7215523 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Apr 2023 16:25:19 +0200 Subject: [PATCH] Various experiments, including 5-bit qunatization --- examples/quantize-stats/quantize-stats.cpp | 8 +- ggml_extra.cpp | 211 +++++++++++++++++++-- ggml_extra.h | 6 + 3 files changed, 211 insertions(+), 14 deletions(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index cfee120c3..8ab1d02b0 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -306,13 +306,17 @@ int main(int argc, char ** argv) { std::vector output_scratch(SCRATCH_ELEMENTS); // loop throught quantization types - for (int i = 0; i < GGML_TYPE_COUNT; i++) { + //for (int i = 0; i < GGML_TYPE_COUNT; i++) { + for (int i = 1; i < 2; i++) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } quantize_fns_t qfns = ggml_internal_get_quantize_fn(i); if (i < 2 && checkNewQuantization) { - qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1; + //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1; + //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1; + qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast; + if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1; } if (qfns.quantize_row_q && qfns.dequantize_row_q) { if (params.verbose) { diff --git a/ggml_extra.cpp b/ggml_extra.cpp index 3a996d56d..ed6330a3e 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -1,4 +1,5 @@ #include "ggml_extra.h" +#include "ggml.h" #include #include @@ -27,8 +28,7 @@ inline int toNearestInt(float fval) { // Adapted from PR #835, function quantize_row_q4_0_rmse() // // I absolutely cannot reproduce the rmse = 0.00185915 reported in #835. -// Instead, I get rmse = 0.00197 with the original and rmse = 0.00192 -// with the modification that determines the scale actually minimizing +// Instead, I get rmse = 0.00197 with the original and rmse = 0.00192 // with the modification that determines the scale actually minimizing // the rmse. // // Do I have a bug? iI don't see it. @@ -79,12 +79,58 @@ float quanizeRmse(int n, const float* X, int8_t* L) { //return 1/bestScale; } +float quanizeRmseK(int n, const float* X, int8_t* L, + int nCandidates, const float* candidates, int nmin, int nmax) { + float max = 0; + for (int i=0; i best*suml2) { + best = sumlx*sumlx/suml2; bestScale = iscale; + } + } + float sumlx = 0; int suml2 = 0; + for (int i=0; i best*suml2) { - best = sumlx*sumlx/suml2; bestScale = iscale; + if (sumxlp*sumxlp*suml2m >= sumxlm*sumxlm*suml2p) { + if (sumxlp*sumxlp > best*suml2p) { + best = sumxlp*sumxlp/suml2p; bestScale = iscale; + } + } else { + if (sumxlm*sumxlm > best*suml2m) { + best = sumxlm*sumxlm/suml2m; bestScale = -iscale; + } } } float sumlx = 0; int suml2 = 0; @@ -112,6 +170,40 @@ float quanizeRmseK(int n, const float* X, int8_t* L) { return sumlx/suml2; } +float quanizeRmseOpt(int n, const float* X, int8_t* L, std::vector>& work) { + work.clear(); + work.reserve(n*17); + for (int l=-8; l<=8; ++l) { + float scale = l - 0.4999f; + for (int i=0; i 0 && sumlx*sumlx > best*suml2) { + best = sumlx*sumlx/suml2; bestScale = s; + } + } + } + sumlx = 0; suml2 = 0; + for (int i=0; i kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { work.clear(); work.reserve(n*(nmax+2)); @@ -200,9 +292,10 @@ std::pair kQuantize1(int n, const float* X, int8_t* L, std::vector return {min, 1.f}; } if (int(tmpX.size()) < n) tmpX.resize(n); - double a = min, b; - for (int itry=0; itry<3; ++itry) { + double a = min, b = 0; + for (int itry=0; itry<5; ++itry) { for (int i=0; i kQuantize1(int n, const float* X, int8_t* L, std::vector sumx += X[i]; } int64_t D = suml2*n - suml*suml; + auto aold = a, bold = b; a = (sumx*suml2 - sumlx*suml)/D; b = (sumlx*n - sumx*suml)/D; + if (itry > 0 && std::abs(a - aold) < 1e-6*std::abs(aold) && std::abs(b - bold) < 1e-6*std::abs(bold)) break; } return {a, b}; } +std::pair kQuantize1Fast(int n, const float* X, int8_t* L, int nmax) { + float min = X[0], max = X[1]; + for (int i=1; i>& work, std::vector& tmpX) { auto q = (uint8_t*)y; if (type == 0) { - auto scale = quanizeRmseK(QK, X, L); + auto scale = quanizeRmseK7(QK, X, L); + //auto scale = quanizeRmseFast(QK, X, L); + //auto scale = quanizeRmseOpt(QK, X, L, work); // The following is not quite as good as quanizeRmseK() and it is slower too. //if (int(tmpX.size()) < QK) tmpX.resize(QK); //auto r1 = kQuantize0(QK, X, L, work, -8, 7); @@ -241,11 +364,29 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) { ////float scale = kQuantize0(QK, X, L, work, -7, 7); std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale); for (int k=0; k 15) { l1 -= 16; *u |= m; } + m <<= 1; + if (l2 > 15) { l2 -= 16; *u |= m; } + m <<= 1; + q[k] = l1 | (l2 << 4); + } } }; @@ -318,6 +459,14 @@ void kQuantizeQ4_1(const float* x, void* buffer, int k) { kQuantizeQ4(x, buffer, k, 1); } +void kQuantizeQ5_1(const float* x, void* buffer, int k) { + kQuantizeQ4(x, buffer, k, 2); +} + +void kQuantizeQ5_1_Fast(const float* x, void* buffer, int k) { + kQuantizeQ4(x, buffer, k, 3); +} + size_t kQuantizeQ4_0H(const float* x, void* buffer, int k, int64_t* hist) { kQuantizeQ4(x, buffer, k, 0); collectHisto(k, buffer, hist, 0); @@ -330,4 +479,42 @@ size_t kQuantizeQ4_1H(const float* x, void* buffer, int k, int64_t* hist) { return (k / QK) * kBucketSize1; } +size_t kQuantizeQ5_1H(const float* x, void* buffer, int k, int64_t* hist) { + kQuantizeQ4(x, buffer, k, 2); + collectHisto(k, buffer, hist, 1); + return (k / QK) * kBucketSize1; +} + +size_t kQuantizeQ5_1H_Fast(const float* x, void* buffer, int k, int64_t* hist) { + kQuantizeQ4(x, buffer, k, 3); + collectHisto(k, buffer, hist, 1); + return (k / QK) * kBucketSize1; +} + +void kDequantizeQ5_1(const void* x, float* y, int k) { + assert(k % QK == 0); + int n = k / QK; + auto data = (const uint8_t*)x; + for (int i=0; i> 4; + if (u & m) l1 += 16; + m <<= 1; + if (u & m) l2 += 16; + m <<= 1; + *y++ = a + b*l1; + *y++ = a + b*l2; + } + data += 16; + } +} + } diff --git a/ggml_extra.h b/ggml_extra.h index 788fcd0ea..7faa43801 100644 --- a/ggml_extra.h +++ b/ggml_extra.h @@ -22,6 +22,12 @@ size_t kQuantizeQ4_0H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); size_t kQuantizeQ4_1H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); +void kQuantizeQ5_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); +size_t kQuantizeQ5_1H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); +void kQuantizeQ5_1_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); +size_t kQuantizeQ5_1H_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); +void kDequantizeQ5_1(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + #ifdef __cplusplus } #endif