From ed4be6bb0d7f6645aca23f2a87b01a634f0a23cd Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Fri, 29 Mar 2024 13:12:27 +0100
Subject: [PATCH] Update llama.cpp

- IQ4_XS output for models lesser than 8 experts or GQA 8
- granularity for QKV tensor when existing
- also, drop for Mistral & Yi attn.k.weight from IQ2_XS to IQ2_XXS
---
 llama.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 30dae82ab..6798995ed 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12773,7 +12773,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
         if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
-        else new_type = GGML_TYPE_Q4_K;
+        else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_Q4_K;
+        else new_type = GGML_TYPE_IQ4_XS;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
              ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
@@ -12814,7 +12815,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
             else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS;
             else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_IQ2_S;
-            else if (qs.model.hparams.n_gqa() >= 4) new_type = GGML_TYPE_IQ2_XS;
             else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XXS;
         }
         else if (name.find("attn_v.weight") != std::string::npos) {
@@ -12832,7 +12832,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = GGML_TYPE_IQ2_XXS;
         }
         else if (name.find("attn_qkv.weight") != std::string::npos) {
-            if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q4_K;
+            else if (qs.model.hparams.n_gqa() > 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS;
             else new_type = GGML_TYPE_Q2_K;
         }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||