From 599a4b2cc6fd7ff0b676b64a5951eebb063c2a66 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Tue, 26 Mar 2024 13:41:16 +0100
Subject: [PATCH] Update llama.cpp

- Switch from IQ4_XS to Q4_K in the affected cases.
- Q4_K brings a slight quality bonus over IQ4_XS at a very cheap size cost, one worth not missing, especially on the K and V attention tensors.
- Size matters little for the smallest models: they are small anyway and logically need a bias toward quality, while the bigger models, the ones actually usable, barely grow in size yet still gain the slight quality bump of Q4_K over IQ4_XS.
---
 llama.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index b940ea3d2..c8fa46496 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12449,7 +12449,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
         if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
-        else new_type = GGML_TYPE_IQ4_XS;
+        else new_type = GGML_TYPE_Q4_K;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
              ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
@@ -12484,7 +12484,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_XXS;
     }
     if (name.find("attn_k.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_IQ4_XS;
+        if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q4_K;
         else if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
         else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS;
         else if (qs.model.hparams.n_gqa() >= 8) new_type = GGML_TYPE_IQ2_S;
@@ -12493,14 +12493,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     }
     else if (name.find("attn_v.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q5_K;
-        else if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS;
+        else if (qs.model.hparams.n_gqa() >= 8 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
         else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S;
         else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_XXS;
         else new_type = GGML_TYPE_Q2_K;
         ++qs.i_attention_wv;
     }
     else if (name.find("attn_output.weight") != std::string::npos) {
-        if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_IQ4_XS;
+        if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q4_K;
         else if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S;
         else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS;
         else new_type = GGML_TYPE_IQ2_XXS;
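
Note: for readers skimming the hunks, here is a minimal standalone sketch of the attn_v.weight cascade as it reads after this patch. The helper name pick_attn_v_type and its free-standing parameters are hypothetical, invented for illustration; in llama.cpp the logic sits inline in llama_tensor_get_type and reads n_expert and n_gqa() from qs.model.hparams.

    #include <cstdint>
    #include "ggml.h"

    // Hypothetical helper mirroring the post-patch attn_v.weight cascade;
    // the real logic is inline in llama_tensor_get_type.
    static ggml_type pick_attn_v_type(uint32_t n_expert, uint32_t n_gqa) {
        if (n_expert >= 8)               return GGML_TYPE_Q5_K;
        if (n_gqa >= 8 || n_expert >= 4) return GGML_TYPE_Q4_K;   // was GGML_TYPE_IQ4_XS before this patch
        if (n_gqa >= 4 || n_expert >= 2) return GGML_TYPE_IQ3_S;
        if (n_gqa >= 2)                  return GGML_TYPE_IQ3_XXS;
        return GGML_TYPE_Q2_K;
    }

For example, a dense GQA-8 model (pick_attn_v_type(0, 8)) now lands on Q4_K instead of IQ4_XS, while an 8-expert MoE (pick_attn_v_type(8, 1)) keeps Q5_K, unchanged by this patch.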