imatrix: Add Q2_K quantization
This commit is contained in:
parent
8da2b25b0b
commit
75f4cbf232
4 changed files with 253 additions and 8 deletions
240
ggml-quants.c
240
ggml-quants.c
|
@ -1610,6 +1610,241 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n
|
||||||
return (n/QK_K*sizeof(block_q2_K));
|
return (n/QK_K*sizeof(block_q2_K));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Weighted min/max quantization of n values onto the integer grid {0,...,nmax}:
// finds scale and min such that x[i] ~= scale*L[i] + min, writing the quant
// indices to L and the (negated) minimum to *the_min, and returning the scale.
// The error minimized is sum_i w_i * |err_i|  when use_mad is true, and
// sum_i w_i * err_i^2 otherwise, with w_i = weights[i] (or x[i]^2 if weights
// is NULL). After an initial rounding pass, nstep+1 candidate scales around
// nmax/(max-min) are tried (offsets rmin + rdelta*is), each followed by a
// weighted least-squares fit of (scale, min); the best candidate wins.
// Laux is caller-provided scratch of at least n bytes.
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
        float rmin, float rdelta, int nstep, bool use_mad) {
    float min = x[0];
    float max = x[0];
    // Weighted first moments of x, reused by the least-squares fit below.
    float sum_w = weights ? weights[0] : x[0]*x[0];
    float sum_x = sum_w * x[0];
    for (int i = 1; i < n; ++i) {
        if (x[i] < min) min = x[i];
        if (x[i] > max) max = x[i];
        float w = weights ? weights[i] : x[i]*x[i];
        sum_w += w;
        sum_x += w * x[i];
    }
    // The representable range is [min, min + scale*nmax]; force min <= 0 so
    // that 0 is always exactly representable.
    if (min > 0) {
        min = 0;
    }
    // Degenerate case: all values equal (and <= 0) - everything maps to level 0.
    if (max <= min) {
        for (int i = 0; i < n; ++i) L[i] = 0;
        *the_min = -min;
        return 0.f;
    }
    // Initial guess: map [min, max] linearly onto [0, nmax].
    float iscale = nmax/(max - min);
    float scale = 1/iscale;
    float best_mad = 0; // best weighted error so far (MAD or MSE per use_mad)
    for (int i = 0; i < n; ++i) {
        int l = nearest_int(iscale*(x[i] - min));
        L[i] = MAX(0, MIN(nmax, l));
        float diff = scale * L[i] + min - x[i];
        diff = use_mad ? fabsf(diff) : diff*diff;
        float w = weights ? weights[i] : x[i]*x[i];
        best_mad += w * diff;
    }
    // nstep < 1 disables the scale search: keep the initial rounding result.
    if (nstep < 1) {
        *the_min = -min;
        return scale;
    }
    for (int is = 0; is <= nstep; ++is) {
        // Candidate inverse scale: perturb nmax by rmin + rdelta*is.
        iscale = (rmin + rdelta*is + nmax)/(max - min);
        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
        for (int i = 0; i < n; ++i) {
            int l = nearest_int(iscale*(x[i] - min));
            l = MAX(0, MIN(nmax, l));
            Laux[i] = l;
            float w = weights ? weights[i] : x[i]*x[i];
            sum_l += w*l;
            sum_l2 += w*l*l;
            sum_xl += w*l*x[i];
        }
        // Weighted least squares for (scale, min) given the candidate levels:
        // D is the determinant of the 2x2 normal-equations matrix.
        float D = sum_w * sum_l2 - sum_l * sum_l;
        if (D > 0) {
            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
            float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D;
            // Keep min <= 0; when the fit wants min > 0, clamp it and refit
            // the scale alone (1D least squares through the origin).
            if (this_min > 0) {
                this_min = 0;
                this_scale = sum_xl / sum_l2;
            }
            float mad = 0;
            for (int i = 0; i < n; ++i) {
                float diff = this_scale * Laux[i] + this_min - x[i];
                diff = use_mad ? fabsf(diff) : diff*diff;
                float w = weights ? weights[i] : x[i]*x[i];
                mad += w * diff;
            }
            // Keep the candidate only if it strictly improves the error.
            if (mad < best_mad) {
                for (int i = 0; i < n; ++i) {
                    L[i] = Laux[i];
                }
                best_mad = mad;
                scale = this_scale;
                min = this_min;
            }
        }
    }
    *the_min = -min; // stored negated: dequantization uses scale*L[i] - the_min
    return scale;
}
|
||||||
|
|
||||||
|
// Quantize n values x[] to integer levels 0..nmax (written to L), minimizing
// the weighted squared error with per-element weights quant_weights[].
// Returns the scale d such that x[j] ~= d * L[j].
// NOTE(review): no lower clamp is applied to the rounded levels, so this
// presumably expects non-negative inputs (the Q2_K path feeds it block
// scales/mins) - confirm before reusing elsewhere.
static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
    // The largest input fixes the initial scale.
    float amax = 0;
    for (int j = 0; j < n; ++j) {
        amax = MAX(amax, x[j]);
    }
    if (!amax) { // every input is zero
        for (int j = 0; j < n; ++j) {
            L[j] = 0;
        }
        return 0.f;
    }

    // Start from the inverse scale that maps amax exactly onto nmax.
    float id = nmax / amax;
    for (int j = 0; j < n; ++j) {
        L[j] = nearest_int(id * x[j]);
    }
    float d = 1/id;
    float best_mse = 0;
    for (int j = 0; j < n; ++j) {
        float e = x[j] - d*L[j];
        float w = quant_weights[j];
        best_mse += w*e*e;
    }

    // Probe 8 nearby scales (is = -4..4, skipping 0) and keep the best one.
    for (int is = -4; is <= 4; ++is) {
        if (is == 0) {
            continue;
        }
        float id_try = (0.1f*is + nmax)/amax;
        float d_try = 1/id_try;
        float mse = 0;
        for (int j = 0; j < n; ++j) {
            int q = nearest_int(id_try*x[j]);
            q = MIN(nmax, q);
            float e = x[j] - d_try*q;
            float w = quant_weights[j];
            mse += w*e*e;
        }
        if (mse < best_mse) {
            best_mse = mse;
            id = id_try;
        }
    }

    // Requantize with the winning scale, accumulating the weighted sums that
    // define the least-squares scale sumlx/suml2.
    float sumlx = 0;
    float suml2 = 0;
    for (int j = 0; j < n; ++j) {
        int q = nearest_int(id * x[j]);
        q = MIN(nmax, q);
        L[j] = q;
        float w = quant_weights[j];
        sumlx += w*x[j]*q;
        suml2 += w*q*q;
    }

    // Greedy coordinate refinement (up to 5 sweeps): try moving one level at
    // a time; accept when it improves the fit quality sumlx^2/suml2, which is
    // compared cross-multiplied to avoid the division.
    for (int itry = 0; itry < 5; ++itry) {
        int n_changed = 0;
        for (int j = 0; j < n; ++j) {
            float w = quant_weights[j];
            // Sums with element j removed.
            float slx = sumlx - w*x[j]*L[j];
            float sl2 = suml2 - w*L[j]*L[j];
            if (slx > 0 && sl2 > 0) {
                int new_l = nearest_int(x[j] * sl2 / slx);
                new_l = MIN(nmax, new_l);
                if (new_l != L[j]) {
                    // Sums with element j re-added at the candidate level.
                    slx += w*x[j]*new_l;
                    sl2 += w*new_l*new_l;
                    if (slx*slx*suml2 > sumlx*sumlx*sl2) {
                        L[j] = new_l;
                        sumlx = slx;
                        suml2 = sl2;
                        ++n_changed;
                    }
                }
            }
        }
        if (!n_changed) {
            break; // converged
        }
    }
    return sumlx / suml2;
}
|
||||||
|
|
||||||
|
// Importance-matrix-aware Q2_K quantization: quantizes k floats from x into
// the block_q2_K array y. quant_weights supplies one importance weight per
// element and is required (asserted). k must be a multiple of QK_K.
static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
    GGML_ASSERT(quant_weights);
    assert(k % QK_K == 0);
    const int nb = k / QK_K; // number of QK_K-sized super-blocks
    const bool requantize = true;

    uint8_t L[QK_K];      // 2-bit quant index per element
    uint8_t Laux[16];     // scratch for make_qkx3_quants candidate levels
    float mins[QK_K/16];  // per-16-element-group minima (negated by make_qkx3_quants)
    float scales[QK_K/16];// per-group scales
    float sw[QK_K/16];    // per-group sum of element weights
    float weight[QK_K/16];// per-element weights for the current group
    uint8_t Ls[QK_K/16], Lm[QK_K/16]; // 4-bit quantized scales and mins

    for (int i = 0; i < nb; i++) {
        memset(sw, 0, QK_K/16*sizeof(float));
        // sigma2 = mean square of the super-block, used to soften the weights.
        float sumx2 = 0;
        for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
        float sigma2 = sumx2/QK_K;
        for (int j = 0; j < QK_K/16; ++j) {
            const float * restrict qw = quant_weights + QK_K * i + 16*j;
            // Element weight = imatrix weight * magnitude term.
            for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
            for (int l = 0; l < 16; ++l) sw[j] += weight[l];
            // 2-bit (nmax=3) weighted min/max quantization of the 16-element group.
            scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
        }

        // Quantize the 16 group scales and mins to 4 bits (nmax=15), weighted
        // by each group's total weight; dm/mm are the super-block scales.
        float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
        float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
        y[i].d = GGML_FP32_TO_FP16(dm);
        y[i].dmin = GGML_FP32_TO_FP16(mm);
        // Round-trip through fp16 so the requantization below uses exactly
        // the values a dequantizer will see.
        dm = GGML_FP16_TO_FP32(y[i].d);
        mm = GGML_FP16_TO_FP32(y[i].dmin);

        // Pack scale (low nibble) and min (high nibble) per group.
        for (int j = 0; j < QK_K/16; ++j) {
            y[i].scales[j] = Ls[j] | (Lm[j] << 4);
        }

        if (requantize) {
            // Recompute the 2-bit levels against the fp16-rounded scales/mins.
            for (int j = 0; j < QK_K/16; ++j) {
                const float d = dm * (y[i].scales[j] & 0xF);
                if (!d) continue; // zero scale: keep the levels from make_qkx3_quants
                const float m = mm * (y[i].scales[j] >> 4);
                for (int ii = 0; ii < 16; ++ii) {
                    int l = nearest_int((x[16*j + ii] + m)/d);
                    l = MAX(0, MIN(3, l));
                    L[16*j + ii] = l;
                }
            }
        }

#if QK_K == 256
        // Pack four 2-bit levels per byte, interleaved in 32-element strides.
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
            }
        }
#else
        for (int l = 0; l < 16; ++l) {
            y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
        }
#endif

        x += QK_K; // advance to the next super-block

    }
}
|
||||||
|
|
||||||
|
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||||
|
(void)hist;
|
||||||
|
int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
||||||
|
if (!quant_weights) {
|
||||||
|
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
char * qrow = (char *)dst;
|
||||||
|
for (int row = 0; row < nrow; ++row) {
|
||||||
|
quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
|
||||||
|
src += n_per_row;
|
||||||
|
qrow += row_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nrow * row_size;
|
||||||
|
}
|
||||||
|
|
||||||
//========================= 3-bit (de)-quantization
|
//========================= 3-bit (de)-quantization
|
||||||
|
|
||||||
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
|
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
|
||||||
|
@ -8094,6 +8329,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
||||||
}
|
}
|
||||||
u |= (best_index << 8*k);
|
u |= (best_index << 8*k);
|
||||||
grid = (const uint8_t *)(kgrid_q2xs + best_index);
|
grid = (const uint8_t *)(kgrid_q2xs + best_index);
|
||||||
|
//grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
|
||||||
for (int j = 0; j < 8; ++j) {
|
for (int j = 0; j < 8; ++j) {
|
||||||
float q = db * grid[j] * signs[j];
|
float q = db * grid[j] * signs[j];
|
||||||
sumqx += wk[j] * q * xk[j];
|
sumqx += wk[j] * q * xk[j];
|
||||||
|
@ -8287,7 +8523,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||||
(void)hist;
|
(void)hist;
|
||||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||||
int nblock = n_per_row/QK_K;
|
int nblock = n_per_row/QK_K;
|
||||||
|
@ -8300,7 +8536,7 @@ size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_
|
||||||
return nrow * nblock * sizeof(block_iq2_xxs);
|
return nrow * nblock * sizeof(block_iq2_xxs);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t ggml_quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||||
(void)hist;
|
(void)hist;
|
||||||
GGML_ASSERT(n_per_row%QK_K == 0);
|
GGML_ASSERT(n_per_row%QK_K == 0);
|
||||||
int nblock = n_per_row/QK_K;
|
int nblock = n_per_row/QK_K;
|
||||||
|
|
|
@ -242,3 +242,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx,
|
||||||
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||||
|
//
|
||||||
|
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
|
||||||
|
|
11
ggml.c
11
ggml.c
|
@ -18664,8 +18664,11 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(start % QK_K == 0);
|
GGML_ASSERT(start % QK_K == 0);
|
||||||
block_q2_K * block = (block_q2_K*)dst + start / QK_K;
|
GGML_ASSERT(start % n_per_row == 0);
|
||||||
result = ggml_quantize_q2_K(src + start, block, n, n, hist);
|
size_t start_row = start / n_per_row;
|
||||||
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
|
result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
{
|
{
|
||||||
|
@ -18698,7 +18701,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(imatrix);
|
GGML_ASSERT(imatrix);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = ggml_quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_IQ2_XS:
|
case GGML_TYPE_IQ2_XS:
|
||||||
|
@ -18708,7 +18711,7 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
||||||
GGML_ASSERT(imatrix);
|
GGML_ASSERT(imatrix);
|
||||||
size_t start_row = start / n_per_row;
|
size_t start_row = start / n_per_row;
|
||||||
size_t row_size = ggml_row_size(type, n_per_row);
|
size_t row_size = ggml_row_size(type, n_per_row);
|
||||||
result = ggml_quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
||||||
GGML_ASSERT(result == row_size * nrows);
|
GGML_ASSERT(result == row_size * nrows);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
|
|
2
ggml.h
2
ggml.h
|
@ -2062,8 +2062,6 @@ extern "C" {
|
||||||
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||||
GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
||||||
GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
|
||||||
|
|
||||||
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
|
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
|
||||||
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue