From 6b5cebfb2b5b521bfc6081e94fa57afd0c1ce601 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Fri, 23 Aug 2024 16:40:40 +0200 Subject: [PATCH] Revamp a bit output weight for more granularity in low quants. --- src/llama.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index d8726bc4a..817e4cc4c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16426,23 +16426,35 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = GGML_TYPE_Q8_0; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; + else new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; + else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K; + else new_type = GGML_TYPE_Q6_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ4_XS; + else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K; + else new_type = GGML_TYPE_IQ4_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || - ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K; else new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K; else new_type = GGML_TYPE_Q6_K;