POC: Measure RMSE of 8-bit quantization
q8_0 : rmse 0.00010729, maxerr 0.01030385, 95pct<0.0002, median<0.0002
This commit is contained in:
parent
6f34961559
commit
97d7ac7565
3 changed files with 84 additions and 7 deletions
|
@ -307,7 +307,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// loop through quantization types
|
||||
//for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
for (int i = 1; i < 2; i++) {
|
||||
for (int i = 0; i < 1; i++) {
|
||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
||||
continue;
|
||||
}
|
||||
|
@ -315,12 +315,14 @@ int main(int argc, char ** argv) {
|
|||
if (i < 2 && checkNewQuantization) {
|
||||
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1;
|
||||
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1;
|
||||
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast;
|
||||
////qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast;
|
||||
//if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1;
|
||||
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1;
|
||||
//qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1;
|
||||
qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K;
|
||||
qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ4_1K;
|
||||
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K;
|
||||
//qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ4_1K;
|
||||
qfns.quantize_row_q = i == 0 ? kQuantizeQ8Simple: kQuantizeQ4_1K;
|
||||
qfns.dequantize_row_q = i == 0 ? kDequantizeQ8: kDequantizeQ4_1K;
|
||||
}
|
||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
||||
if (params.verbose) {
|
||||
|
|
|
@ -105,7 +105,30 @@ float quanizeRmseK(int n, const float* X, int8_t* L,
|
|||
sumlx += X[i]*l; suml2 += l*l;
|
||||
L[i] = l;
|
||||
}
|
||||
return sumlx/suml2;
|
||||
float scale = sumlx/suml2;
|
||||
best = scale*sumlx;
|
||||
for (int itry=0; itry<3; ++itry) {
|
||||
bool haveChanges = false;
|
||||
for (int i=0; i<n; ++i) {
|
||||
auto g = X[i] - scale*L[i];
|
||||
if (g > 0 && L[i] < nmax) {
|
||||
auto s1 = sumlx + X[i];
|
||||
auto s2 = suml2 + 2*L[i] + 1;
|
||||
if (s2 > 0 && s1*s1 > best*s2) {
|
||||
scale = s1/s2; best = scale*s1; ++L[i]; sumlx = s1; suml2 = s2; haveChanges = true;
|
||||
}
|
||||
}
|
||||
else if (g < 0 && L[i] > nmin) {
|
||||
auto s1 = sumlx - X[i];
|
||||
auto s2 = suml2 - 2*L[i] + 1;
|
||||
if (s2 > 0 && s1*s1 > best*s2) {
|
||||
scale = s1/s2; best = scale*s1; --L[i]; sumlx = s1; suml2 = s2; haveChanges = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!haveChanges) break;
|
||||
}
|
||||
return scale;
|
||||
}
|
||||
// The following improves the above.
|
||||
// It gives RMSE = 0.00185228 for the 7B model.
|
||||
|
@ -125,6 +148,19 @@ float quanizeRmseK15(int n, const float* X, int8_t* L) {
|
|||
return quanizeRmseK(n, X, L, kCandiateCount, candidates, 0, 15);
|
||||
}
|
||||
|
||||
float quanizeRmseK31(int n, const float* X, int8_t* L) {
|
||||
constexpr int kCandiateCount = 24;
|
||||
static const float candidates[kCandiateCount] = {
|
||||
+35.25, +34.25f, +33.25f, +32.75f, +32.25f, +31.75f, +31.25f, +30.75f, +30.25f, +29.75f, +29.25f, +28.25f, +27.25f, +26.25f,
|
||||
+25.25f, +24.25f, +23.25, +22.25f, +21.25f, +20.25f, +19.25f, +18.25f, +17.25f, +16.25f
|
||||
};
|
||||
//static const float candidates[kCandiateCount] = {
|
||||
// +33.25f, +32.25f, +31.75f, +31.25f, +30.75f, +30.25f, +30.25f, +29.25f, +28.75f, +27.25f, +26.25f, +25.25f, +24.25f, +23.25, +22.25f,
|
||||
// +21.25f
|
||||
//};
|
||||
return quanizeRmseK(n, X, L, kCandiateCount, candidates, 0, 31);
|
||||
}
|
||||
|
||||
// Fast (much faster than doing the optimization), but not very good.
|
||||
float quanizeRmseFast(int n, const float* X, int8_t* L) {
|
||||
//constexpr int kCandiateCount = 3;
|
||||
|
@ -295,8 +331,9 @@ std::pair<float, float> kQuantize1(int n, const float* X, int8_t* L, std::vector
|
|||
double a = min, b = 0;
|
||||
for (int itry=0; itry<5; ++itry) {
|
||||
for (int i=0; i<n; ++i) tmpX[i] = X[i] - a;
|
||||
//quanizeRmseK15(n, tmpX.data(), L);
|
||||
kQuantize0(n, tmpX.data(), L, work, 0, 2*nmax+1);
|
||||
if (nmax == 7) quanizeRmseK15(n, tmpX.data(), L);
|
||||
else if (nmax == 15) quanizeRmseK31(n, tmpX.data(), L);
|
||||
else kQuantize0(n, tmpX.data(), L, work, 0, 2*nmax+1);
|
||||
double sumlx = 0, sumx = 0;
|
||||
int suml2 = 0, suml = 0;
|
||||
for (int i=0; i<n; ++i) {
|
||||
|
@ -598,4 +635,39 @@ void kDequantizeQ4_1K(const void* x, float* y, int k) {
|
|||
}
|
||||
}
|
||||
|
||||
void kQuantizeQ8Simple(const float* x, void* y, int k) {
|
||||
assert(k % QK == 0);
|
||||
auto data = (int8_t*)y;
|
||||
int n = k / (QK/2);
|
||||
for (int i=0; i<n; ++i) {
|
||||
float max = 0;
|
||||
for (int k=0; k<16; ++k) max = std::max(max, std::abs(x[k]));
|
||||
if (max > 0) {
|
||||
float iscale = 127.f/max;
|
||||
float scale = max/127.f;
|
||||
std::memcpy(data, &scale, sizeof(scale)); data += sizeof(scale);
|
||||
for (int k=0; k<16; ++k) data[k] = toNearestInt(iscale * *x++);
|
||||
data += 16;
|
||||
} else {
|
||||
float scale = 1;
|
||||
std::memcpy(data, &scale, sizeof(scale)); data += sizeof(scale);
|
||||
auto aux = (uint32_t*)data;
|
||||
aux[0] = aux[1] = aux[2] = aux[3] = 0;
|
||||
data += 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Expands 8-bit quantized data back into floats.
//
// The input x is read as k/(QK/2) consecutive groups, each made of one float
// scale followed by 16 int8_t quants; every quant is multiplied by its group's
// scale and written to y.
//
//   x - quantized input (groups of float scale + 16 int8_t quants)
//   y - output buffer; receives 16 floats per group
//   k - number of values to produce; must be a multiple of QK
void kDequantizeQ8(const void* x, float* y, int k) {
    assert(k % QK == 0);
    const int8_t* src = static_cast<const int8_t*>(x);
    const int groupCount = k / (QK/2);
    int group = 0;
    while (group < groupCount) {
        // The scale is copied out bytewise rather than read through a float pointer.
        float d;
        std::memcpy(&d, src, sizeof(d));
        src += sizeof(d);

        for (int j = 0; j < 16; ++j) {
            y[j] = d*src[j];
        }
        y += 16;
        src += 16;
        ++group;
    }
}
|
||||
|
||||
}
|
||||
|
|
|
@ -34,6 +34,9 @@ void kDequantizeQ4_0K(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int
|
|||
void kQuantizeQ4_1K(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
|
||||
void kDequantizeQ4_1K(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
|
||||
// Quantizes k floats from x to 8 bits. The output y is written as consecutive
// groups of one float scale followed by 16 int8 quants. k must be a multiple of QK.
void kQuantizeQ8Simple(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
|
||||
// Converts 8-bit quantized data in x (groups of one float scale followed by
// 16 int8 quants) back to floats in y. k must be a multiple of QK.
void kDequantizeQ8(const void* GGML_RESTRICT x, float* GGML_RESTRICT y, int k);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue