Q3_K_XS: quantize first 1/8 of ffn_down layers with Q4_K
Together with an importance matrix, this brings the perplexity of LLaMA-v2-70B below that of the former Q2_K, with a quantized model that is 800 MB smaller.
This commit is contained in:
parent ec4b80104f
commit 29c41d49fe
1 changed file with 1 addition and 1 deletion
@@ -8934,7 +8934,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             }
         }
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
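In effect, the change routes Q3_K_XS through the per-layer rule that Q2_K_S already applied to ffn_down tensors. Below is a minimal standalone sketch of that rule; the enum and function names are hypothetical stand-ins, not llama.cpp's actual API, and the Q3_K fallback is an assumption about the default low-bit type:

#include <cassert>

// Hypothetical stand-ins for the ggml quantization types involved.
enum class qtype { Q3_K, Q4_K };

// Sketch of the ffn_down rule shared by Q2_K_S and (after this commit)
// Q3_K_XS: the first 1/8 of the layers are bumped to Q4_K; the remaining
// layers keep the lower-bit default (assumed Q3_K here for illustration).
static qtype ffn_down_type(int i_layer, int n_layer) {
    return i_layer < n_layer/8 ? qtype::Q4_K : qtype::Q3_K;
}

int main() {
    // LLaMA-v2-70B has 80 layers, so n_layer/8 = 10: layers 0..9 get Q4_K.
    assert(ffn_down_type(9,  80) == qtype::Q4_K);
    assert(ffn_down_type(10, 80) == qtype::Q3_K);
    return 0;
}

Only the first ten of eighty ffn_down tensors pay for the extra Q4_K bits on LLaMA-v2-70B, which keeps the size cost of the perplexity improvement small.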