From 8006b15fd1b4863940612ff1cba98ee01d69c334 Mon Sep 17 00:00:00 2001
From: Nexes the Old <124105151+Nexesenex@users.noreply.github.com>
Date: Thu, 8 Aug 2024 18:50:48 +0200
Subject: [PATCH] Avoid shrinking attn.k.weight for IQ3_XS and XXS when GQA or
 MoE

---
 src/llama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 87657737a..304a113bf 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15414,10 +15414,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (qs.model.hparams.n_gqa() < 2 && qs.model.hparams.n_expert < 2)) {
             new_type = GGML_TYPE_IQ3_XXS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (qs.model.hparams.n_gqa() < 2 && qs.model.hparams.n_expert < 2)) {
             new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("attn_q.weight") != std::string::npos) {
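
Context for the guard: in llama.cpp, n_gqa() is n_head / n_head_kv, so a value >= 2
means grouped-query attention, and n_expert >= 2 means a mixture-of-experts model.
"Avoid shrinking when GQA or MoE" therefore means "shrink only when neither holds",
i.e. n_gqa() < 2 && n_expert < 2 by De Morgan. Below is a minimal, self-contained
C++ sketch of that decision; hparams_sketch and allow_attn_k_shrink are illustrative
names for this note, not llama.cpp API.

// Standalone sketch (not llama.cpp itself) of the guarded downgrade above.
#include <cstdint>
#include <cstdio>

struct hparams_sketch {
    uint32_t n_head    = 32;
    uint32_t n_head_kv = 32;  // < n_head means grouped-query attention (GQA)
    uint32_t n_expert  = 0;   // >= 2 means a mixture-of-experts (MoE) model

    // mirrors llama.cpp: n_gqa() = n_head / n_head_kv
    uint32_t n_gqa() const { return n_head / n_head_kv; }
};

// attn_k.weight may be quantized one step below the target ftype only for
// plain dense models: no GQA and no MoE.
static bool allow_attn_k_shrink(const hparams_sketch & hp) {
    return hp.n_gqa() < 2 && hp.n_expert < 2;
}

int main() {
    hparams_sketch dense;                                      // MHA, single expert
    hparams_sketch gqa;  gqa.n_head = 32; gqa.n_head_kv = 8;   // GQA-style model
    hparams_sketch moe;  moe.n_expert = 8;                     // MoE-style model

    printf("dense: shrink attn_k? %d\n", allow_attn_k_shrink(dense)); // 1
    printf("gqa:   shrink attn_k? %d\n", allow_attn_k_shrink(gqa));   // 0
    printf("moe:   shrink attn_k? %d\n", allow_attn_k_shrink(moe));   // 0
    return 0;
}

With this condition, a GQA-only model (e.g. n_head = 32, n_head_kv = 8) keeps the
full IQ3_XS / IQ3_XXS type for attn_k.weight, matching the subject line: the K
projection is shared across fewer KV heads under GQA, so an extra quantization
step there is disproportionately costly.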