From 5ae59714d21023ba8cc654332703e8d0afdb5ebd Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Sat, 24 Aug 2024 20:50:07 +0200
Subject: [PATCH] Revamp Q2_K and Q3_K quants

Q3_K_XL takes the place of Q3_K_L.
Q3_K_L becomes an intermediary between Q3_K_M and XL.
---
 examples/quantize/quantize.cpp |   3 +-
 gguf-py/gguf/constants.py      |   3 +-
 include/llama.h                |   3 +-
 src/llama.cpp                  | 138 ++++++++++++++++++++++-----------
 4 files changed, 97 insertions(+), 50 deletions(-)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index c4bed4e62..619b8e6b4 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -41,7 +41,8 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
     { "IQ3_XS",  LLAMA_FTYPE_MOSTLY_IQ3_XS,  " 3.3 bpw quantization", },
     { "Q3_K_S",  LLAMA_FTYPE_MOSTLY_Q3_K_S,  " 3.41G, +1.6321 ppl @ Llama-3-8B", },
     { "Q3_K_M",  LLAMA_FTYPE_MOSTLY_Q3_K_M,  " 3.74G, +0.6569 ppl @ Llama-3-8B", },
-    { "Q3_K_L",  LLAMA_FTYPE_MOSTLY_Q3_K_L,  " 4.03G, +0.5562 ppl @ Llama-3-8B", },
+    { "Q3_K_L",  LLAMA_FTYPE_MOSTLY_Q3_K_L,  " 4.10 bpw quantization mix", },
+    { "Q3_K_XL", LLAMA_FTYPE_MOSTLY_Q3_K_XL, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
     { "IQ4_NL",  LLAMA_FTYPE_MOSTLY_IQ4_NL,  " 4.50 bpw non-linear quantization", },
     { "IQ4_XS",  LLAMA_FTYPE_MOSTLY_IQ4_XS,  " 4.25 bpw non-linear quantization", },
     { "IQ4_XSR", LLAMA_FTYPE_MOSTLY_IQ4_XSR, " 4.xx bpw non-linear quantization", },
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 9f425c19e..3dffab0a7 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -1227,7 +1227,7 @@ class LlamaFileType(IntEnum):
     MOSTLY_Q2_K          = 10  # except 1d tensors
     MOSTLY_Q3_K_S        = 11  # except 1d tensors
     MOSTLY_Q3_K_M        = 12  # except 1d tensors
-    MOSTLY_Q3_K_L        = 13  # except 1d tensors
+    MOSTLY_Q3_K_XL       = 13  # except 1d tensors
     MOSTLY_Q4_K_S        = 14  # except 1d tensors
     MOSTLY_Q4_K_M        = 15  # except 1d tensors
     MOSTLY_Q5_K_S        = 16  # except 1d tensors
@@ -1257,6 +1257,7 @@ class LlamaFileType(IntEnum):
     MOSTLY_IQ1_XL        = 42  # except 1d tensors
     MOSTLY_IQ4_XSR       = 43  # except 1d tensors
     MOSTLY_IQ3_XXL       = 44  # except 1d tensors
+    MOSTLY_Q3_K_L        = 45  # except 1d tensors
 
     GUESSED              = 1024  # not specified in the model file
 
diff --git a/include/llama.h b/include/llama.h
index dae99bfc5..a5d09f824 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -143,7 +143,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q2_K          = 10, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12, // except 1d tensors
-       LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_Q3_K_XL       = 13, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16, // except 1d tensors
@@ -173,6 +173,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_IQ1_XL        = 42, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ4_XSR       = 43, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ3_XXL       = 44, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_Q3_K_L        = 45, // except 1d tensors
 
        LLAMA_FTYPE_CQS                  = 99,   // except 1d tensors
        LLAMA_FTYPE_GUESSED              = 1024, // not specified in the model file
diff --git a/src/llama.cpp b/src/llama.cpp
index f6dd9f3f1..0d1b9dadd 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5000,6 +5000,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_XL: return "Q3_K - Xtra Large";
         case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
@@ -16428,8 +16429,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
            if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
+           else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
            else new_type = GGML_TYPE_Q4_K;
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
+           if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K;
+           else new_type = GGML_TYPE_Q5_K;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
            else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K;
@@ -16470,7 +16476,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
            new_type = qs.params->token_embedding_type;
        } else {
            if (qs.model.hparams.n_expert >= 4) {
-               if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+               if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L ||
+                   ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ||
+                   ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
+                   new_type = GGML_TYPE_Q4_K;
+               }
+               else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
                    ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
                    new_type = GGML_TYPE_IQ3_S;
                }
@@ -16479,10 +16490,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
                    new_type = GGML_TYPE_IQ4_XS;
                }
-               else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L ||
-                   ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                   new_type = GGML_TYPE_Q4_K;
-               }
+           }
+           else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
+               if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q2_K;
+               else new_type = GGML_TYPE_Q3_K;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
@@ -16537,10 +16548,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q4_K;
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
+           new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
-       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) new_type = GGML_TYPE_Q5_K;
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
            (difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv))) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
@@ -16598,12 +16612,16 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
            else new_type = GGML_TYPE_Q3_K;
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
+           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+           else new_type = GGML_TYPE_Q3_K;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
            else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_Q4_K;
            else new_type = GGML_TYPE_Q3_K;
        }
-       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
            else new_type = GGML_TYPE_Q4_K;
        }
@@ -16694,10 +16712,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        }
        ++qs.i_attention_wk;
    } else if (name.find("attn_q.weight") != std::string::npos) {
-       if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
+       if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
            if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q3_K;
        }
-       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
+           if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q3_K;
+       }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
+           if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
+           else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q3_K;
+           else new_type = GGML_TYPE_Q2_K;
+       }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
            if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
@@ -16723,18 +16749,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        ++qs.i_attention_wq;
    } else if (name.find("attn_output.weight") != std::string::npos) {
        if (qs.model.hparams.n_expert >= 4) {
-           if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
+           if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
                new_type = GGML_TYPE_Q4_K;
            }
-           else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-               ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
-               ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR ||
-               ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
-               ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-               ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
+           else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S ||
+               ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL ||
+               ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
+               ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS ||
+               ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
+               ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
                new_type = GGML_TYPE_Q5_K;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
@@ -16750,29 +16776,30 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) {
            if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XS;
-           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
-               ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
-           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS;
+           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+               ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
+           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS;
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_S;
-           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL)
+           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL)
                new_type = GGML_TYPE_IQ4_XS;
        }
        else {
-           if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K;
-           else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)
+           if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K;
+           else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL)
+               new_type = GGML_TYPE_Q4_K;
+           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)
                new_type = GGML_TYPE_IQ2_XXS;
-           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS;
-           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
-           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS;
+           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS;
+           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
+           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS;
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_S;
-           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_IQ4_XS;
+           else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_IQ4_XS;
        }
        ++qs.i_attention_wo;
    } else if (name.find("attn_qkv.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
            new_type = GGML_TYPE_Q3_K;
        }
-       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
@@ -16801,11 +16828,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
    } else if (name.find("ffn_gate") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
        int i_layer = info.first, n_layer = info.second;
-       if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-               new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-       }
-       else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+       if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
            else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
@@ -16815,6 +16838,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
+           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+               new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -16822,9 +16849,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-               new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+               new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
+           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+               new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+           else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
@@ -16991,10 +17023,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
            }
            else new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
        }
-       else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-               new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
@@ -17005,6 +17033,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
            else new_type = GGML_TYPE_Q3_K;
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
+           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+               new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+           else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -17012,7 +17045,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-               new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+               new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+           else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+       }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
+           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+               new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
            else new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
@@ -17038,11 +17076,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
    } else if (name.find("ffn_up") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
        int i_layer = info.first, n_layer = info.second;
-       if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
-           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-               new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-       }
-       else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+       if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
            else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
@@ -17052,6 +17086,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
+           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+               new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
@@ -17059,9 +17097,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-               new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+               new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        }
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
+           if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+               new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+           else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
@@ -17270,7 +17313,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_XXS; break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
-       case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break;
+       case LLAMA_FTYPE_MOSTLY_Q3_K_L:
+       case LLAMA_FTYPE_MOSTLY_Q3_K_XL: default_type = GGML_TYPE_Q3_K; break;
        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
        case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break;
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
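
Editor's note, not part of the patch: a minimal sketch of how the revamped file type is requested through llama.cpp's public quantization API after this change. Q3_K_XL reuses ftype id 13 (previously Q3_K_L), while Q3_K_L moves to id 45. The GGUF file names and the main() harness below are illustrative only; the API calls (llama_model_quantize_default_params, llama_model_quantize, llama_backend_init/free) are the standard ones declared in include/llama.h.

// Sketch: quantize an F16 GGUF to the new Q3_K_XL mix. Paths are placeholders.
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype = LLAMA_FTYPE_MOSTLY_Q3_K_XL; // reuses id 13; Q3_K_L now maps to id 45

    // llama_model_quantize returns 0 on success.
    const uint32_t rc = llama_model_quantize("model-F16.gguf", "model-Q3_K_XL.gguf", &params);

    llama_backend_free();
    return rc == 0 ? 0 : 1;
}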