Minor 4-bit quantization improvement

For the same model size as the previous commit, we get
PPL = 5.9069 vs 5.9138.
Iwan Kawrakow 2023-08-13 13:06:07 +03:00
parent f26f9ef42c
commit 77aea7214f
2 changed files with 8 additions and 7 deletions

k_quants.c

@@ -221,7 +221,8 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
     return 1/iscale;
 }
 
-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, int ntry) {
+static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+        int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
     for (int i = 1; i < n; ++i) {
@@ -254,7 +255,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
         for (int i = 0; i < n; ++i) {
             sum += x[i] - scale*L[i];
         }
-        min = sum/n;
+        min = alpha*min + (1 - alpha)*sum/n;
         if (min > 0) min = 0;
         iscale = 1/scale;
         if (!did_change) break;
@@ -291,7 +292,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
     float max_scale = 0; // as we are deducting the min, scales are always positive
     float max_min = 0;
     for (int j = 0; j < QK_K/16; ++j) {
-        scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5);
+        scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5, 0.f);
         float scale = scales[j];
         if (scale > max_scale) {
             max_scale = scale;
@@ -645,7 +646,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
     float max_scale = 0; // as we are deducting the min, scales are always positive
     float max_min = 0;
     for (int j = 0; j < QK_K/32; ++j) {
-        scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 5);
+        scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
         float scale = scales[j];
         if (scale > max_scale) {
             max_scale = scale;
@@ -810,7 +811,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
     float max_scale = 0; // as we are deducting the min, scales are always positive
     float max_min = 0;
     for (int j = 0; j < QK_K/32; ++j) {
-        scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5);
+        scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5, 0.f);
         float scale = scales[j];
         if (scale > max_scale) {
             max_scale = scale;
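
The only functional change on this side is the damped minimum update: with alpha = 0.f (the Q2_K and Q5_K call sites) the new rule reduces to the old min = sum/n, so only the 4-bit Q4_K path, now called with ntry = 9 and alpha = 0.5f, behaves differently. Below is a minimal standalone sketch of the update step, not the real make_qkx1_quants; the name update_min and the toy numbers are illustrative only.

#include <cstdio>

// One refinement step of the block-minimum estimate: blend the previous
// value with the new residual-based estimate sum/n. alpha = 0 reproduces
// the old behaviour (min = sum/n); alpha = 0.5 halves each correction,
// damping oscillation between refinement iterations.
static float update_min(float prev_min, float sum, int n, float alpha) {
    float min = alpha*prev_min + (1 - alpha)*sum/n;
    if (min > 0) min = 0;  // the subtracted minimum must stay non-positive
    return min;
}

int main() {
    float min = -1.0f;
    // Toy residual sums that would make the undamped estimate jump around.
    const float sums[] = {-12.0f, -4.0f, -10.0f, -6.0f};
    for (float s : sums) {
        min = update_min(min, s, 16, 0.5f);
        printf("min = %.4f\n", min);
    }
    return 0;
}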

llama.cpp

@@ -3725,7 +3725,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                     use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 2) new_type = GGML_TYPE_Q6_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_attention_wv < 2) new_type = GGML_TYPE_Q5_K;
             else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
                     (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
@@ -3739,7 +3739,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                     use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 2) new_type = GGML_TYPE_Q6_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_feed_forward_w2 < 2) new_type = GGML_TYPE_Q5_K;
             ++i_feed_forward_w2;
         } else if (name.find("attn_output.weight") != std::string::npos) {
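
For LLAMA_FTYPE_MOSTLY_Q4_K_S the override now promotes the first 4 attention-wv (and feed-forward-w2) tensors to Q5_K instead of the first 2 to Q6_K. Since Q6_K costs roughly two extra bits per weight over Q4_K while Q5_K costs roughly one, this spends about the same number of extra bits spread over twice as many layers, consistent with the "same model size" note in the commit message. A minimal sketch of the pattern, assuming a stand-in enum and function name (pick_type_q4_k_s is not in the source):

#include <cstdio>

enum class TensorType { Q4_K, Q5_K };

// Stand-in for the Q4_K_S branch of the override chain above: i counts
// how many tensors of this kind (attention wv / feed-forward w2) have
// been quantized so far. After this commit the first 4 are stored as Q5_K.
static TensorType pick_type_q4_k_s(int i) {
    return i < 4 ? TensorType::Q5_K : TensorType::Q4_K;
}

int main() {
    for (int i = 0; i < 6; ++i) {
        printf("wv tensor %d -> %s\n", i,
               pick_type_q4_k_s(i) == TensorType::Q5_K ? "Q5_K" : "Q4_K");
    }
    return 0;
}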