Minor 4-bit quantization improvement
For the same model size as the previous commit, we get PPL = 5.9069 vs 5.9138.
parent f26f9ef42c
commit 77aea7214f

2 changed files with 8 additions and 7 deletions
 k_quants.c | 11 changes (6 additions, 5 deletions)
 llama.cpp  |  4 changes (2 additions, 2 deletions)
diff --git a/k_quants.c b/k_quants.c
@@ -221,7 +221,8 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
     return 1/iscale;
 }
 
-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, int ntry) {
+static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+        int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
     for (int i = 1; i < n; ++i) {
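For context: make_qkx1_quants fits a per-block scale and minimum so that each weight x[i] is approximated by scale*L[i] + min, with L[i] an integer in [0, nmax], iterating the fit up to ntry times. The new alpha parameter damps the update of min between iterations. Below is a minimal, self-contained sketch of that loop under the new signature; the name fit_scale_min, the lroundf-based rounding, and the clamping are stand-ins for illustration, not the verbatim k_quants.c code:

#include <math.h>
#include <stdbool.h>
#include <stdint.h>

// Sketch of the damped scale/min fit (assumed structure, not the exact
// k_quants.c implementation). Each x[i] is modeled as scale*L[i] + min,
// with L[i] in [0, nmax]. alpha = 0 reproduces the old update min = sum/n;
// alpha > 0 blends in the previous min, damping oscillation between
// successive iterations.
static float fit_scale_min(int n, int nmax, const float * x, uint8_t * L,
                           float * the_min, int ntry, float alpha) {
    float min = x[0], max = x[0];
    for (int i = 1; i < n; ++i) {
        if (x[i] < min) min = x[i];
        if (x[i] > max) max = x[i];
    }
    if (max == min) {
        for (int i = 0; i < n; ++i) L[i] = 0;
        *the_min = 0;
        return 0.f;
    }
    if (min > 0) min = 0;
    float iscale = nmax/(max - min);
    float scale = 1/iscale;
    for (int itry = 0; itry < ntry; ++itry) {
        float sumlx = 0; int suml2 = 0;
        bool did_change = false;
        for (int i = 0; i < n; ++i) {
            int l = (int)lroundf(iscale*(x[i] - min)); // quantize against current (scale, min)
            l = l < 0 ? 0 : l > nmax ? nmax : l;
            if (l != L[i]) { L[i] = (uint8_t)l; did_change = true; }
            sumlx += (x[i] - min)*l;
            suml2 += l*l;
        }
        scale = sumlx/suml2;                   // least-squares scale for the current L
        float sum = 0;
        for (int i = 0; i < n; ++i) sum += x[i] - scale*L[i];
        min = alpha*min + (1 - alpha)*sum/n;   // the damped update introduced by this commit
        if (min > 0) min = 0;                  // the stored offset is -min, kept non-negative
        iscale = 1/scale;
        if (!did_change) break;                // quantized values stabilized
    }
    *the_min = -min;
    return scale;
}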
@@ -254,7 +255,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
         for (int i = 0; i < n; ++i) {
             sum += x[i] - scale*L[i];
         }
-        min = sum/n;
+        min = alpha*min + (1 - alpha)*sum/n;
         if (min > 0) min = 0;
         iscale = 1/scale;
         if (!did_change) break;
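The new min is thus an exponential blend of the previous estimate and the per-block average residual sum/n. For example, with alpha = 0.5, a previous min of -0.30 and an average residual of -0.10 give 0.5*(-0.30) + 0.5*(-0.10) = -0.20, rather than jumping straight to -0.10; with alpha = 0 the update reduces exactly to the old min = sum/n.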
@@ -291,7 +292,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
     float max_scale = 0; // as we are deducting the min, scales are always positive
     float max_min = 0;
     for (int j = 0; j < QK_K/16; ++j) {
-        scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5);
+        scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5, 0.f);
         float scale = scales[j];
         if (scale > max_scale) {
             max_scale = scale;
@@ -645,7 +646,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
     float max_scale = 0; // as we are deducting the min, scales are always positive
     float max_min = 0;
     for (int j = 0; j < QK_K/32; ++j) {
-        scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 5);
+        scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
         float scale = scales[j];
         if (scale > max_scale) {
             max_scale = scale;
@@ -810,7 +811,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
     float max_scale = 0; // as we are deducting the min, scales are always positive
     float max_min = 0;
    for (int j = 0; j < QK_K/32; ++j) {
-        scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5);
+        scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5, 0.f);
         float scale = scales[j];
         if (scale > max_scale) {
             max_scale = scale;
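Note how the three call sites differ: only the 4-bit path (quantize_row_q4_K_reference) enables the damping, with alpha = 0.5f and ntry raised from 5 to 9, while the 2-bit and 5-bit paths pass alpha = 0.f and keep ntry = 5, leaving their behavior unchanged. This is consistent with the commit title: the improvement targets 4-bit quantization only.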
diff --git a/llama.cpp b/llama.cpp
@@ -3725,7 +3725,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 2) new_type = GGML_TYPE_Q6_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_attention_wv < 2) new_type = GGML_TYPE_Q5_K;
             else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
                 (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
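For attention.wv under Q4_K_S, this trades promoting the first 2 layers to Q6_K for promoting the first 4 layers to Q5_K. Using the approximate k-quants sizes of 4.5, 5.5, and 6.5625 bits per weight for Q4_K, Q5_K, and Q6_K, the old choice spends about 2*(6.5625 - 4.5) = 4.125 extra bits per weight slot summed over layers, while the new one spends 4*(5.5 - 4.5) = 4.0: essentially the same budget, spread over more of the early layers, which is presumably why model size stays the same while PPL improves.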
@@ -3739,7 +3739,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 2) new_type = GGML_TYPE_Q6_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < 4) new_type = GGML_TYPE_Q5_K;
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S && i_feed_forward_w2 < 2) new_type = GGML_TYPE_Q5_K;
             ++i_feed_forward_w2;
         } else if (name.find("attn_output.weight") != std::string::npos) {
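The same 2 x Q6_K to 4 x Q5_K trade is applied to the feed_forward.w2 tensors, keeping the attention.wv and feed_forward.w2 selection paths consistent.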