POC: Even lower rmse 4-bit Q4_0 quantization
Basically, we use two Q4_0 quantizations, each having 16 weights, to quantize a set of 32 weights. We get two separate scaling factors, which we store as fp16, ending up using the exact same 5 bits per weight as the current Q4_0. We end up with an rmse of ~0.00159, so basically the same as the improved Q4_1. But this should run faster than `Q4_1` (unless fp16 -> fp32 conversion is somehow very slow).
This commit is contained in:
parent
29b83e5fd6
commit
679e1cb6c0
3 changed files with 47 additions and 4 deletions
|
@ -307,7 +307,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// loop throught quantization types
|
// loop throught quantization types
|
||||||
//for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
//for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||||
for (int i = 1; i < 2; i++) {
|
for (int i = 0; i < 1; i++) {
|
||||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -315,8 +315,10 @@ int main(int argc, char ** argv) {
|
||||||
if (i < 2 && checkNewQuantization) {
|
if (i < 2 && checkNewQuantization) {
|
||||||
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1;
|
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1;
|
||||||
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1;
|
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1;
|
||||||
qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast;
|
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast;
|
||||||
if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1;
|
//if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1;
|
||||||
|
qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1;
|
||||||
|
qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1;
|
||||||
}
|
}
|
||||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
||||||
if (params.verbose) {
|
if (params.verbose) {
|
||||||
|
|
|
@ -369,6 +369,15 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
|
||||||
std::memcpy(q, &result.second, sizeof(result.second)); q += sizeof(result.second);
|
std::memcpy(q, &result.second, sizeof(result.second)); q += sizeof(result.second);
|
||||||
std::memcpy(q, &result.first, sizeof(result.first)); q += sizeof(result.first);
|
std::memcpy(q, &result.first, sizeof(result.first)); q += sizeof(result.first);
|
||||||
for (int k=0; k<QK/2; ++k) q[k] = L[2*k] | (L[2*k+1] << 4);
|
for (int k=0; k<QK/2; ++k) q[k] = L[2*k] | (L[2*k+1] << 4);
|
||||||
|
} else if (type == 4) {
|
||||||
|
auto scale1 = quanizeRmseK7(QK/2, X, L);
|
||||||
|
auto scale2 = quanizeRmseK7(QK/2, X+QK/2, L+QK/2);
|
||||||
|
//printf("scale1 = %g, scale2 = %g\n",scale1,scale2);
|
||||||
|
auto scale1fp16 = ggml_fp32_to_fp16(scale1);
|
||||||
|
auto scale2fp16 = ggml_fp32_to_fp16(scale2);
|
||||||
|
std::memcpy(q, &scale1fp16, sizeof(scale1fp16)); q += sizeof(scale1fp16);
|
||||||
|
std::memcpy(q, &scale2fp16, sizeof(scale2fp16)); q += sizeof(scale2fp16);
|
||||||
|
for (int k=0; k<QK/2; ++k) q[k] = (L[2*k] + 8) | ((L[2*k+1] + 8) << 4);
|
||||||
} else {
|
} else {
|
||||||
auto result = type == 2 ? kQuantize1(QK, X, L, tmpX, work, 15) : kQuantize1Fast(QK, X, L, 31);
|
auto result = type == 2 ? kQuantize1(QK, X, L, tmpX, work, 15) : kQuantize1Fast(QK, X, L, 31);
|
||||||
auto afp16 = ggml_fp32_to_fp16(result.first);
|
auto afp16 = ggml_fp32_to_fp16(result.first);
|
||||||
|
@ -390,7 +399,7 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
auto bucketSize = type == 0 ? kBucketSize0 : kBucketSize1;
|
auto bucketSize = type == 0 || type == 4 ? kBucketSize0 : kBucketSize1;
|
||||||
auto y = (char*)buffer;
|
auto y = (char*)buffer;
|
||||||
int nchunk = (k + kChunkSize-1)/kChunkSize;
|
int nchunk = (k + kChunkSize-1)/kChunkSize;
|
||||||
if (nchunk < 2) {
|
if (nchunk < 2) {
|
||||||
|
@ -517,4 +526,33 @@ void kDequantizeQ5_1(const void* x, float* y, int k) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void kQuantizeQ4_0K(const float* x, void* buffer, int k) {
|
||||||
|
kQuantizeQ4(x, buffer, k, 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
void kDequantizeQ4_0K(const void* x, float* y, int k) {
|
||||||
|
assert(k % QK == 0);
|
||||||
|
int n = k / QK;
|
||||||
|
auto data = (const uint8_t*)x;
|
||||||
|
for (int i=0; i<n; ++i) {
|
||||||
|
ggml_fp16_t afp16, bfp16;
|
||||||
|
std::memcpy(&afp16, data, sizeof(afp16)); data += sizeof(afp16);
|
||||||
|
std::memcpy(&bfp16, data, sizeof(bfp16)); data += sizeof(bfp16);
|
||||||
|
auto a = ggml_fp16_to_fp32(afp16);
|
||||||
|
auto b = ggml_fp16_to_fp32(bfp16);
|
||||||
|
for (int k=0; k<8; ++k) {
|
||||||
|
int8_t l1 = data[k] & 15, l2 = data[k] >> 4;
|
||||||
|
l1 -= 8; l2 -= 8;
|
||||||
|
*y++ = a*l1; *y++ = a*l2;
|
||||||
|
}
|
||||||
|
data += 8;
|
||||||
|
for (int k=0; k<8; ++k) {
|
||||||
|
int8_t l1 = data[k] & 15, l2 = data[k] >> 4;
|
||||||
|
l1 -= 8; l2 -= 8;
|
||||||
|
*y++ = b*l1; *y++ = b*l2;
|
||||||
|
}
|
||||||
|
data += 8;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,6 +28,9 @@ void kQuantizeQ5_1_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int
|
||||||
size_t kQuantizeQ5_1H_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
|
size_t kQuantizeQ5_1H_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
|
||||||
void kDequantizeQ5_1(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
void kDequantizeQ5_1(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||||
|
|
||||||
|
void kQuantizeQ4_0K(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
|
||||||
|
void kDequantizeQ4_0K(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue