From 29c41d49fe024d3791cd7b683c6c85fcf80a370e Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Sun, 21 Jan 2024 09:22:52 +0200 Subject: [PATCH] Q3_K_XS: quantize first 1/8 of ffn_down layers with Q4_K Together with an importance matrix, this brings perplexity for LLaMA-v2-70B below the perplexity of the former Q2_K with an 800 MB smaller quantized model size. --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index cb6d02013..7062adb43 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8934,7 +8934,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty } } if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {