POC: Measure rmse of 8 bit quantization

q8_0 : rmse 0.00010729, maxerr 0.01030385, 95pct<0.0002, median<0.0002
Iwan Kawrakow 2023-04-13 12:00:24 +02:00
parent 6f34961559
commit 97d7ac7565
3 changed files with 84 additions and 7 deletions
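
A note on where the rmse / maxerr figures in the commit message come from: they are aggregates over a quantize→dequantize round trip of real model weights, measured by the quantize-stats tool patched below (the 95pct and median figures presumably come from its error histogram, hence the "<" bounds). The stand-alone sketch below is my illustration, not part of the commit: it applies the same scheme the new kQuantizeQ8Simple uses, blocks of 16 values with a float scale of max/127, to random data and prints rmse and maxerr. The absolute numbers will differ from those above, which are measured on actual weights.

// Stand-alone sketch (not the quantize-stats harness): round-trip random data
// through a per-16-block int8 scheme with scale = max/127 and report the two
// aggregates quoted in the commit message, rmse and maxerr.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

int main() {
    const int n = 1 << 20;                       // sample size, arbitrary
    std::mt19937 rng(1234);
    std::normal_distribution<float> dist(0.f, 1.f);
    std::vector<float> x(n);
    for (auto & v : x) v = dist(rng);

    double sumSq = 0, maxErr = 0;
    for (int i = 0; i < n; i += 16) {            // blocks of 16, as in kQuantizeQ8Simple
        float amax = 0;
        for (int k = 0; k < 16; ++k) amax = std::max(amax, std::abs(x[i+k]));
        const float scale  = amax > 0 ? amax/127.f : 1.f;
        const float iscale = amax > 0 ? 127.f/amax : 0.f;
        for (int k = 0; k < 16; ++k) {
            const int q = (int)std::lround(iscale * x[i+k]);   // round to nearest, like toNearestInt
            const double err = std::abs(x[i+k] - scale*q);
            sumSq += err*err;
            maxErr = std::max(maxErr, err);
        }
    }
    printf("rmse %.8f, maxerr %.8f\n", std::sqrt(sumSq/n), maxErr);
    return 0;
}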

View file

@@ -307,7 +307,7 @@ int main(int argc, char ** argv) {
     // loop throught quantization types
     //for (int i = 0; i < GGML_TYPE_COUNT; i++) {
-    for (int i = 1; i < 2; i++) {
+    for (int i = 0; i < 1; i++) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
@@ -315,12 +315,14 @@ int main(int argc, char ** argv) {
         if (i < 2 && checkNewQuantization) {
             //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1;
             //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1;
-            //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast;
+            ////qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast;
             //if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1;
             //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1;
             //qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1;
-            qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K;
-            qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ4_1K;
+            //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K;
+            //qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ4_1K;
+            qfns.quantize_row_q = i == 0 ? kQuantizeQ8Simple: kQuantizeQ4_1K;
+            qfns.dequantize_row_q = i == 0 ? kDequantizeQ8: kDequantizeQ4_1K;
         }
         if (qfns.quantize_row_q && qfns.dequantize_row_q) {
             if (params.verbose) {

View file

@@ -105,7 +105,30 @@ float quanizeRmseK(int n, const float* X, int8_t* L,
         sumlx += X[i]*l; suml2 += l*l;
         L[i] = l;
     }
-    return sumlx/suml2;
+    float scale = sumlx/suml2;
+    best = scale*sumlx;
+    for (int itry=0; itry<3; ++itry) {
+        bool haveChanges = false;
+        for (int i=0; i<n; ++i) {
+            auto g = X[i] - scale*L[i];
+            if (g > 0 && L[i] < nmax) {
+                auto s1 = sumlx + X[i];
+                auto s2 = suml2 + 2*L[i] + 1;
+                if (s2 > 0 && s1*s1 > best*s2) {
+                    scale = s1/s2; best = scale*s1; ++L[i]; sumlx = s1; suml2 = s2; haveChanges = true;
+                }
+            }
+            else if (g < 0 && L[i] > nmin) {
+                auto s1 = sumlx - X[i];
+                auto s2 = suml2 - 2*L[i] + 1;
+                if (s2 > 0 && s1*s1 > best*s2) {
+                    scale = s1/s2; best = scale*s1; --L[i]; sumlx = s1; suml2 = s2; haveChanges = true;
+                }
+            }
+        }
+        if (!haveChanges) break;
+    }
+    return scale;
 }
 // The following improves the above.
 // It gives RMSE = 0.00185228 for the 7B model.
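
A note on the refinement loop added to quanizeRmseK above (my reading of the patch, not text from the commit): for fixed integer levels L[i], the scale that minimizes sum((X[i] - scale*L[i])^2) is scale = sumlx/suml2, and the remaining error is sum(X[i]^2) - sumlx^2/suml2. Bumping one L[i] by +/-1 therefore lowers the RMSE exactly when sumlx^2/suml2 grows, which is what the s1*s1 > best*s2 test checks, with s1 and s2 the sums after the candidate move and best = sumlx^2/suml2 before it. The toy check below illustrates the acceptance rule on a made-up four-value block.

// Toy illustration (not from the commit) of the acceptance test in the new loop.
#include <cstddef>
#include <cstdio>
#include <vector>

// squared error for fixed levels l at the optimal scale sumlx/suml2
static double sqErr(const std::vector<float>& x, const std::vector<int>& l) {
    double sumlx = 0, suml2 = 0, sumx2 = 0;
    for (std::size_t i = 0; i < x.size(); ++i) {
        sumlx += x[i]*l[i]; suml2 += l[i]*l[i]; sumx2 += x[i]*x[i];
    }
    return suml2 > 0 ? sumx2 - sumlx*sumlx/suml2 : sumx2;
}

int main() {
    std::vector<float> x = {0.9f, 2.1f, -0.2f, 3.7f};    // made-up block
    std::vector<int>   l = {1, 2, 0, 3};                 // initial integer levels
    double sumlx = 0, suml2 = 0;
    for (std::size_t i = 0; i < x.size(); ++i) { sumlx += x[i]*l[i]; suml2 += l[i]*l[i]; }
    double best = sumlx*sumlx/suml2;                     // equals scale*sumlx in the patch
    // try bumping the last level by +1, as the patch would when the residual is positive
    double s1 = sumlx + x[3], s2 = suml2 + 2*l[3] + 1;
    printf("error before: %.6f\n", sqErr(x, l));
    if (s2 > 0 && s1*s1 > best*s2) {                     // the acceptance test from the diff
        ++l[3];
        printf("move accepted, error after: %.6f\n", sqErr(x, l));
    }
    return 0;
}
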
@@ -125,6 +148,19 @@ float quanizeRmseK15(int n, const float* X, int8_t* L) {
     return quanizeRmseK(n, X, L, kCandiateCount, candidates, 0, 15);
 }
+float quanizeRmseK31(int n, const float* X, int8_t* L) {
+    constexpr int kCandiateCount = 24;
+    static const float candidates[kCandiateCount] = {
+        +35.25, +34.25f, +33.25f, +32.75f, +32.25f, +31.75f, +31.25f, +30.75f, +30.25f, +29.75f, +29.25f, +28.25f, +27.25f, +26.25f,
+        +25.25f, +24.25f, +23.25, +22.25f, +21.25f, +20.25f, +19.25f, +18.25f, +17.25f, +16.25f
+    };
+    //static const float candidates[kCandiateCount] = {
+    //    +33.25f, +32.25f, +31.75f, +31.25f, +30.75f, +30.25f, +30.25f, +29.25f, +28.75f, +27.25f, +26.25f, +25.25f, +24.25f, +23.25, +22.25f,
+    //    +21.25f
+    //};
+    return quanizeRmseK(n, X, L, kCandiateCount, candidates, 0, 31);
+}
 // Fast (as much faster than doing the optimization), but not very good.
 float quanizeRmseFast(int n, const float* X, int8_t* L) {
     //constexpr int kCandiateCount = 3;
//constexpr int kCandiateCount = 3;
@@ -295,8 +331,9 @@ std::pair<float, float> kQuantize1(int n, const float* X, int8_t* L, std::vector
     double a = min, b = 0;
     for (int itry=0; itry<5; ++itry) {
         for (int i=0; i<n; ++i) tmpX[i] = X[i] - a;
-        //quanizeRmseK15(n, tmpX.data(), L);
-        kQuantize0(n, tmpX.data(), L, work, 0, 2*nmax+1);
+        if (nmax == 7) quanizeRmseK15(n, tmpX.data(), L);
+        else if (nmax == 15) quanizeRmseK31(n, tmpX.data(), L);
+        else kQuantize0(n, tmpX.data(), L, work, 0, 2*nmax+1);
         double sumlx = 0, sumx = 0;
         int suml2 = 0, suml = 0;
         for (int i=0; i<n; ++i) {
@@ -598,4 +635,39 @@ void kDequantizeQ4_1K(const void* x, float* y, int k) {
     }
 }
+void kQuantizeQ8Simple(const float* x, void* y, int k) {
+    assert(k % QK == 0);
+    auto data = (int8_t*)y;
+    int n = k / (QK/2);
+    for (int i=0; i<n; ++i) {
+        float max = 0;
+        for (int k=0; k<16; ++k) max = std::max(max, std::abs(x[k]));
+        if (max > 0) {
+            float iscale = 127.f/max;
+            float scale = max/127.f;
+            std::memcpy(data, &scale, sizeof(scale)); data += sizeof(scale);
+            for (int k=0; k<16; ++k) data[k] = toNearestInt(iscale * *x++);
+            data += 16;
+        } else {
+            float scale = 1;
+            std::memcpy(data, &scale, sizeof(scale)); data += sizeof(scale);
+            auto aux = (uint32_t*)data;
+            aux[0] = aux[1] = aux[2] = aux[3] = 0;
+            data += 16;
+        }
+    }
+}
+void kDequantizeQ8(const void* x, float* y, int k) {
+    assert(k % QK == 0);
+    auto data = (const int8_t*)x;
+    int n = k / (QK/2);
+    for (int i=0; i<n; ++i) {
+        float scale;
+        std::memcpy(&scale, data, sizeof(scale)); data += sizeof(scale);
+        for (int k=0; k<16; ++k) *y++ = scale*data[k];
+        data += 16;
+    }
+}
 }
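
The storage format implied by kQuantizeQ8Simple / kDequantizeQ8 above: the input is processed in blocks of 16 values (the loops hardcode 16, so QK is evidently 32 here); each block is stored as a 4-byte float scale, max|x|/127 over the block (or 1 for an all-zero block), followed by 16 signed 8-bit quants, i.e. 20 bytes per 16 weights. Dequantization is simply scale*q, so the per-value round-trip error is at most about half the block scale. A hypothetical round trip through the two new entry points could look like the sketch below; the include is a guess, since the header patched next is not named in this view.

// Hypothetical usage sketch (not from the commit) of the new entry points.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>
#include "ggml_extra.h"   // assumed header name; use whatever header declares the k* functions

int main() {
    const int k = 64;                          // must be a multiple of QK
    std::vector<float> x(k), y(k);
    for (int i = 0; i < k; ++i) x[i] = std::sin(0.1f*i);

    // 16 values per block, 4-byte scale + 16 int8 quants = 20 bytes per block
    std::vector<std::uint8_t> q((k/16) * 20);
    kQuantizeQ8Simple(x.data(), q.data(), k);
    kDequantizeQ8(q.data(), y.data(), k);

    float maxerr = 0;
    for (int i = 0; i < k; ++i) maxerr = std::max(maxerr, std::abs(x[i] - y[i]));
    printf("max round-trip error: %g\n", maxerr);
    return 0;
}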

View file

@@ -34,6 +34,9 @@ void kDequantizeQ4_0K(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int
 void kQuantizeQ4_1K(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
 void kDequantizeQ4_1K(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void kQuantizeQ8Simple(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
+void kDequantizeQ8(const void* GGML_RESTRICT x, float* GGML_RESTRICT y, int k);
 #ifdef __cplusplus
 }
 #endif