imatrix: guard Q4_0/Q5_0 against ffn_down craziness
parent 6f9ec42a27
commit bb9abb5cd8

1 changed file with 6 additions and 2 deletions
@@ -8548,8 +8548,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_0 && qs.has_imatrix) {
-            new_type = GGML_TYPE_Q4_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
+                && qs.has_imatrix && i_layer < n_layer/8) {
+            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
         }
         ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
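To make the effect of the new condition concrete, here is a minimal standalone sketch that replays the guarded branch for every ffn_down layer of a hypothetical 32-layer model. This is not llama.cpp code: the trimmed-down enums, the helper name pick_ffn_down_type, and the layer count are stand-ins for illustration only; the real decision logic in get_k_quant_type has many more branches.

#include <cstdio>

// Stand-ins for the llama.cpp types involved in the branch above; the real
// definitions live in llama.cpp and carry many more enumerators.
enum ggml_type   { GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1 };
enum llama_ftype { LLAMA_FTYPE_MOSTLY_Q4_0, LLAMA_FTYPE_MOSTLY_Q5_0 };

// Replays the guarded branch: only the first n_layer/8 ffn_down layers are
// bumped from Q4_0/Q5_0 to Q4_1/Q5_1, and only when an imatrix is present.
static ggml_type pick_ffn_down_type(llama_ftype ftype, bool has_imatrix,
                                    int i_layer, int n_layer) {
    ggml_type new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_0
                                                          : GGML_TYPE_Q5_0;
    if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
            && has_imatrix && i_layer < n_layer/8) {
        new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
    }
    return new_type;
}

int main() {
    const int n_layer = 32; // hypothetical model depth, n_layer/8 == 4
    for (int i_layer = 0; i_layer < n_layer; ++i_layer) {
        ggml_type t = pick_ffn_down_type(LLAMA_FTYPE_MOSTLY_Q4_0,
                                         /*has_imatrix=*/true, i_layer, n_layer);
        printf("ffn_down layer %2d -> %s\n", i_layer,
               t == GGML_TYPE_Q4_1 ? "Q4_1 (guarded)" : "Q4_0");
    }
    return 0;
}

With 32 layers, only layers 0 through 3 satisfy i_layer < n_layer/8 and come out as Q4_1; passing has_imatrix = false leaves every layer at Q4_0, preserving the pre-imatrix quantization exactly as the comment in the diff calls out.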