diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index ae807f493..5789bd9ea 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -307,7 +307,7 @@ int main(int argc, char ** argv) { // loop throught quantization types //for (int i = 0; i < GGML_TYPE_COUNT; i++) { - for (int i = 1; i < 2; i++) { + for (int i = 0; i < 1; i++) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } @@ -315,12 +315,14 @@ int main(int argc, char ** argv) { if (i < 2 && checkNewQuantization) { //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1; //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1; - //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast; + ////qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast; //if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1; //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1; //qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1; - qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K; - qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ4_1K; + //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K; + //qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ4_1K; + qfns.quantize_row_q = i == 0 ? kQuantizeQ8Simple: kQuantizeQ4_1K; + qfns.dequantize_row_q = i == 0 ? kDequantizeQ8: kDequantizeQ4_1K; } if (qfns.quantize_row_q && qfns.dequantize_row_q) { if (params.verbose) { diff --git a/ggml_extra.cpp b/ggml_extra.cpp index 787d62edb..fa2591e68 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -105,7 +105,30 @@ float quanizeRmseK(int n, const float* X, int8_t* L, sumlx += X[i]*l; suml2 += l*l; L[i] = l; } - return sumlx/suml2; + float scale = sumlx/suml2; + best = scale*sumlx; + for (int itry=0; itry<3; ++itry) { + bool haveChanges = false; + for (int i=0; i 0 && L[i] < nmax) { + auto s1 = sumlx + X[i]; + auto s2 = suml2 + 2*L[i] + 1; + if (s2 > 0 && s1*s1 > best*s2) { + scale = s1/s2; best = scale*s1; ++L[i]; sumlx = s1; suml2 = s2; haveChanges = true; + } + } + else if (g < 0 && L[i] > nmin) { + auto s1 = sumlx - X[i]; + auto s2 = suml2 - 2*L[i] + 1; + if (s2 > 0 && s1*s1 > best*s2) { + scale = s1/s2; best = scale*s1; --L[i]; sumlx = s1; suml2 = s2; haveChanges = true; + } + } + } + if (!haveChanges) break; + } + return scale; } // The following improves the above. // It gives RMSE = 0.00185228 for the 7B model. @@ -125,6 +148,19 @@ float quanizeRmseK15(int n, const float* X, int8_t* L) { return quanizeRmseK(n, X, L, kCandiateCount, candidates, 0, 15); } +float quanizeRmseK31(int n, const float* X, int8_t* L) { + constexpr int kCandiateCount = 24; + static const float candidates[kCandiateCount] = { + +35.25, +34.25f, +33.25f, +32.75f, +32.25f, +31.75f, +31.25f, +30.75f, +30.25f, +29.75f, +29.25f, +28.25f, +27.25f, +26.25f, + +25.25f, +24.25f, +23.25, +22.25f, +21.25f, +20.25f, +19.25f, +18.25f, +17.25f, +16.25f + }; + //static const float candidates[kCandiateCount] = { + // +33.25f, +32.25f, +31.75f, +31.25f, +30.75f, +30.25f, +30.25f, +29.25f, +28.75f, +27.25f, +26.25f, +25.25f, +24.25f, +23.25, +22.25f, + // +21.25f + //}; + return quanizeRmseK(n, X, L, kCandiateCount, candidates, 0, 31); +} + // Fast (as much faster than doing the optimization), but not very good. float quanizeRmseFast(int n, const float* X, int8_t* L) { //constexpr int kCandiateCount = 3; @@ -295,8 +331,9 @@ std::pair kQuantize1(int n, const float* X, int8_t* L, std::vector double a = min, b = 0; for (int itry=0; itry<5; ++itry) { for (int i=0; i 0) { + float iscale = 127.f/max; + float scale = max/127.f; + std::memcpy(data, &scale, sizeof(scale)); data += sizeof(scale); + for (int k=0; k<16; ++k) data[k] = toNearestInt(iscale * *x++); + data += 16; + } else { + float scale = 1; + std::memcpy(data, &scale, sizeof(scale)); data += sizeof(scale); + auto aux = (uint32_t*)data; + aux[0] = aux[1] = aux[2] = aux[3] = 0; + data += 16; + } + } +} + +void kDequantizeQ8(const void* x, float* y, int k) { + assert(k % QK == 0); + auto data = (const int8_t*)x; + int n = k / (QK/2); + for (int i=0; i