Fix ffn_down quantization mix for MoE models

In #4872 I did not consider the part where every third
tensor is quantized with more bits. Fir MoE this leads to tensors
of the same layer being quantized with different number of bits,
which is not considered as a possibility in the inference implementation
(it is assumed all experts use the same quantization).
This commit is contained in:
Iwan Kawrakow 2024-01-14 08:39:10 +02:00
parent 76484fbfd3
commit 998b635a17

View file

@ -8462,13 +8462,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
new_type = GGML_TYPE_Q8_0;
}
} else if (name.find("ffn_down") != std::string::npos) {
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
const int i_layer = qs.i_feed_forward_w2 / n_expert;
const int n_layer = qs.i_feed_forward_w2 / n_expert;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
: arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
: GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@ -8476,14 +8479,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
if (arch == LLM_ARCH_FALCON) {
new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
} else {
if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
}
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
new_type = GGML_TYPE_Q5_K;
}
++qs.i_feed_forward_w2;