POC: Q4_1 for groups of 16 weights
Same as the last commit, but for the Q4_1 type, using the same amount of memory as the existing Q4_1 by storing the per-group parameters as fp16. After a quantize-dequantize round trip we end up with rmse 0.00125125, maxerr 0.11657715, 95pct<0.0024, median<0.0010. This is quite a bit better than Q4_1 with groups of 32 weights, but nowhere near as good as 5-bit quantization, which uses the same amount of memory and gave rmse 0.00076131, maxerr 0.05273438, 95pct<0.0016, median<0.0006.
This commit is contained in:
parent
679e1cb6c0
commit
6f34961559
3 changed files with 51 additions and 3 deletions
|
@ -307,7 +307,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// loop throught quantization types
|
||||
//for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
for (int i = 0; i < 1; i++) {
|
||||
for (int i = 1; i < 2; i++) {
|
||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
||||
continue;
|
||||
}
|
||||
|
@ -317,8 +317,10 @@ int main(int argc, char ** argv) {
|
|||
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1;
|
||||
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast;
|
||||
//if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1;
|
||||
qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1;
|
||||
qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1;
|
||||
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1;
|
||||
//qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1;
|
||||
qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K;
|
||||
qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ4_1K;
|
||||
}
|
||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
||||
if (params.verbose) {
|
||||
|
|
|
@ -378,6 +378,18 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
|
|||
std::memcpy(q, &scale1fp16, sizeof(scale1fp16)); q += sizeof(scale1fp16);
|
||||
std::memcpy(q, &scale2fp16, sizeof(scale2fp16)); q += sizeof(scale2fp16);
|
||||
for (int k=0; k<QK/2; ++k) q[k] = (L[2*k] + 8) | ((L[2*k+1] + 8) << 4);
|
||||
} else if (type == 5) {
|
||||
auto result1 = kQuantize1(QK/2, X, L, tmpX, work, 7);
|
||||
auto result2 = kQuantize1(QK/2, X + QK/2, L + QK/2, tmpX, work, 7);
|
||||
auto a1fp16 = ggml_fp32_to_fp16(result1.first);
|
||||
auto b1fp16 = ggml_fp32_to_fp16(result1.second);
|
||||
auto a2fp16 = ggml_fp32_to_fp16(result2.first);
|
||||
auto b2fp16 = ggml_fp32_to_fp16(result2.second);
|
||||
std::memcpy(q, &a1fp16, sizeof(a1fp16)); q += sizeof(a1fp16);
|
||||
std::memcpy(q, &b1fp16, sizeof(b1fp16)); q += sizeof(b1fp16);
|
||||
std::memcpy(q, &a2fp16, sizeof(a2fp16)); q += sizeof(a2fp16);
|
||||
std::memcpy(q, &b2fp16, sizeof(b2fp16)); q += sizeof(b2fp16);
|
||||
for (int k=0; k<QK/2; ++k) q[k] = L[2*k] | (L[2*k+1] << 4);
|
||||
} else {
|
||||
auto result = type == 2 ? kQuantize1(QK, X, L, tmpX, work, 15) : kQuantize1Fast(QK, X, L, 31);
|
||||
auto afp16 = ggml_fp32_to_fp16(result.first);
|
||||
|
@ -555,4 +567,35 @@ void kDequantizeQ4_0K(const void* x, float* y, int k) {
|
|||
}
|
||||
}
|
||||
|
||||
// Quantizes the k floats in x into buffer using the Q4_1K scheme.
//
// This is a thin wrapper that forwards to kQuantizeQ4 with type code 5,
// the branch that quantizes each half of a block separately and stores
// four fp16 parameters followed by the packed 4-bit values.
void kQuantizeQ4_1K(const float* x, void* buffer, int k) {
    // Type selector understood by kQuantizeQ4 for this format.
    constexpr int kTypeQ4_1K = 5;
    kQuantizeQ4(x, buffer, k, kTypeQ4_1K);
}
|
||||
|
||||
void kDequantizeQ4_1K(const void* x, float* y, int k) {
|
||||
assert(k % QK == 0);
|
||||
int n = k / QK;
|
||||
auto data = (const uint8_t*)x;
|
||||
for (int i=0; i<n; ++i) {
|
||||
ggml_fp16_t a1fp16, b1fp16, a2fp16, b2fp16;
|
||||
std::memcpy(&a1fp16, data, sizeof(a1fp16)); data += sizeof(a1fp16);
|
||||
std::memcpy(&b1fp16, data, sizeof(b1fp16)); data += sizeof(b1fp16);
|
||||
std::memcpy(&a2fp16, data, sizeof(a2fp16)); data += sizeof(a2fp16);
|
||||
std::memcpy(&b2fp16, data, sizeof(b2fp16)); data += sizeof(b2fp16);
|
||||
auto a1 = ggml_fp16_to_fp32(a1fp16);
|
||||
auto b1 = ggml_fp16_to_fp32(b1fp16);
|
||||
auto a2 = ggml_fp16_to_fp32(a2fp16);
|
||||
auto b2 = ggml_fp16_to_fp32(b2fp16);
|
||||
for (int k=0; k<8; ++k) {
|
||||
int8_t l1 = data[k] & 15, l2 = data[k] >> 4;
|
||||
*y++ = a1 + b1*l1; *y++ = a1 + b1*l2;
|
||||
}
|
||||
data += 8;
|
||||
for (int k=0; k<8; ++k) {
|
||||
int8_t l1 = data[k] & 15, l2 = data[k] >> 4;
|
||||
*y++ = a2 + b2*l1; *y++ = a2 + b2*l2;
|
||||
}
|
||||
data += 8;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -31,6 +31,9 @@ void kDequantizeQ5_1(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int
|
|||
void kQuantizeQ4_0K(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
|
||||
void kDequantizeQ4_0K(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
|
||||
void kQuantizeQ4_1K(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
|
||||
void kDequantizeQ4_1K(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue