From b77cdd83ff08a3aa760b193fd77b58a758aed9a9 Mon Sep 17 00:00:00 2001 From: Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Date: Fri, 2 Aug 2024 20:40:04 +0200 Subject: [PATCH 01/59] Small changes for IQ2 quant strategies (notably IQ2_S and IQ2_M) Here are a few edits I consider useful to improve the IQ2 model quant strategies a bit for some models: - The tensor attn.v.weight is passed in Q4_K for models like Gemma (GQA 2) and for the various franken MoEs having 2 experts, so as not to sabotage them with too small a value-head quant (Q2_K is mediocre for such an important head), while the size of that head is low relative to the total size of the affected models. - The tensor attn.k.weight is passed in Q4_K for models with 8 experts or more, rather than for exactly 8 experts. - The tensor attn.output.weight is passed in IQ3_XXS (instead of IQ3_S) for the quant strategies IQ2_S and IQ2_M, to provide progressiveness between the IQ2_XS quant strategies (which use IQ2_XS for attn.output.weight) and the IQ3_XXS quant strategies (which use IQ3_S for attn.output.weight). The benefit of an IQ3_S quant instead of an IQ3_XXS for that tensor is quasi-nonexistent on the IQ2_S and IQ2_M quant strategies, especially compared to the size bump it provokes. More broadly, I think that the whole IQ2 bunch of quant strategies should be harmonized/refactored the way the rest of the quant strategies are established (tensor by tensor), rather than under a different kind of tree mixing these 5 quant strategies. I have been using these settings (and many more edits) for a long time, with benefit, and I think they could become standard. --- src/llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index e6f303d31..f798dcdfe 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15348,11 +15348,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (name.find("attn_v.weight") != std::string::npos) { - if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K; else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ?
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; ++qs.i_attention_wv; } - else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) { + else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) { new_type = GGML_TYPE_Q4_K; } else if (name.find("ffn_down") != std::string::npos) { @@ -15366,7 +15366,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_Q5_K; } else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; } } } else if (name.find("attn_v.weight") != std::string::npos) { From 63986631a3e440094a7e7f15e4de6cd269b6eb8a Mon Sep 17 00:00:00 2001 From: Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Date: Fri, 2 Aug 2024 23:49:03 +0200 Subject: [PATCH 02/59] Apply the GQA2/Expert2 conditionality to the IQ3 quants Consistent with the modifications proposed for the IQ2 quant strategies; this conditionality makes even more sense for the IQ3 quant strategies. --- src/llama.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index f798dcdfe..ac8b006e7 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15371,15 +15371,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { - new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) { + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { From 7d337d0f893e3eb0ac66db5115232f06df6471a5 Mon Sep 17 00:00:00 2001 From: Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 3 Aug 2024 01:35:08 +0200 Subject: [PATCH 03/59] Slight reorder of the attn.weight tree This also applies the attn.v.weight logic I used for IQ2 and IQ3, but only where such logic is already implied by the existing quant strategies, as a compromise to avoid disturbing Ikawrakow's quant strategies too much.
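For illustration, the GQA2/Expert2 conditionality used across these patches boils down to a tiny predicate. Here is a minimal, self-contained C++ sketch; hparams_sketch, bump_attn_v and the example head counts are illustrative stand-ins, not the actual llama.cpp definitions:

#include <cstdio>

// Illustrative stand-in for the relevant hyper-parameters (not the real
// llama.cpp llama_hparams struct).
struct hparams_sketch {
    int n_head;
    int n_head_kv;
    int n_expert;
    // grouped-query attention factor: query heads per KV head
    int n_gqa() const { return n_head_kv > 0 ? n_head / n_head_kv : 1; }
};

// attn_v.weight shrinks with GQA (fewer KV heads) and is shared across
// experts in a MoE, so bumping it to Q4_K costs little and protects an
// important head.
static bool bump_attn_v(const hparams_sketch & hp) {
    return hp.n_gqa() >= 2 || hp.n_expert >= 2;
}

int main() {
    const hparams_sketch gemma   = { 16, 8, 0 };   // GQA 2, dense  -> bump
    const hparams_sketch mha     = { 32, 32, 0 };  // GQA 1, dense  -> keep
    const hparams_sketch franken = { 32, 32, 2 };  // 2-expert MoE  -> bump
    std::printf("gemma: %d  mha: %d  franken-moe: %d\n",
                bump_attn_v(gemma), bump_attn_v(mha), bump_attn_v(franken));
    return 0;
}

The point is that whenever KV heads are shared (GQA >= 2) or one attention block serves several experts, attn_v.weight is small relative to the whole model, so the Q4_K bump is nearly free.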
--- src/llama.cpp | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index ac8b006e7..a29a44d8f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15370,7 +15370,19 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } } } else if (name.find("attn_v.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + if (qs.model.hparams.n_expert >= 8) { + // for the 8-expert model, bumping this to Q8_0 trades just ~128MB + // TODO: explore better strategies + new_type = GGML_TYPE_Q8_0; + } + else if (qs.model.hparams.n_gqa() >= 7) { + // The Llama 70B models have 8 heads sharing the same attn_v weights (-> GQA 8). As a result, the attn_v.weight tensor is + // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with + // nearly negligible increase in model size by quantizing this tensor with more bits. + // That logic applies also to models like Yi 34B (-> GQA 7) and Mistral Large 123B (-> GQA 12). + if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { @@ -15389,26 +15401,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) { + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q5_K; } else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; - if (qs.model.type == MODEL_70B) { - // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is - // 8x smaller compared to attn_q.weight. 
Hence, we can get a nice boost in quantization accuracy with - // nearly negligible increase in model size by quantizing this tensor with more bits: - if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; - } - if (qs.model.hparams.n_expert == 8) { - // for the 8-expert model, bumping this to Q8_0 trades just ~128MB - // TODO: explore better strategies - new_type = GGML_TYPE_Q8_0; - } ++qs.i_attention_wv; } else if (name.find("attn_k.weight") != std::string::npos) { - if (qs.model.hparams.n_expert == 8) { + if (qs.model.hparams.n_expert >= 8) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; From d5779c27bad222e42aa286bc3683ee5343e83f0e Mon Sep 17 00:00:00 2001 From: Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 3 Aug 2024 03:04:25 +0200 Subject: [PATCH 04/59] More occurrences of n_expert == 8 changed to >= in quant strategies --- src/llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index a29a44d8f..e6f11c2ac 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15362,7 +15362,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { - if (qs.model.hparams.n_expert == 8) { + if (qs.model.hparams.n_expert >= 8) { new_type = GGML_TYPE_Q5_K; } else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; @@ -15443,7 +15443,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 || - (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) { + (qs.model.hparams.n_expert >= 8 && use_more_bits(i_layer, n_layer)))) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { @@ -15474,7 +15474,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { - if (qs.model.hparams.n_expert == 8) { + if (qs.model.hparams.n_expert >= 8) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || From 93c35f86a9619499b9cd9a879500c47986d9e331 Mon Sep 17 00:00:00 2001 From: Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Date: Sun, 4 Aug 2024 11:59:52 +0200 Subject: [PATCH 05/59] attn.output.tensor of FTYPE IQ3_M in IQ4_XS If FTYPE IQ4_XS has attn.output.tensor in IQ4_XS (4.25BPW), there's no reason for FTYPE IQ3_M to have attn.output.tensor in Q4_K (4.5BPW). On a Llama 3.1 70B model, the proposed change reduces the size by 1% and increases the perplexity by 0.25%.
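As a sanity check on the trade-off, the size arithmetic is straightforward. A small standalone sketch; the 8192 x 8192 tensor shape is hypothetical, and the 4.5/4.25 BPW figures are the nominal Q4_K/IQ4_XS rates quoted elsewhere in this series:

#include <cstdio>

// Size of one tensor in MiB given its element count and bits per weight.
static double tensor_mib(long long n_weights, double bpw) {
    return (double) n_weights * bpw / 8.0 / (1024.0 * 1024.0);
}

int main() {
    // hypothetical 8192 x 8192 attn_output.weight, one per layer
    const long long n = 8192LL * 8192LL;
    const double q4_k   = tensor_mib(n, 4.50);  // Q4_K
    const double iq4_xs = tensor_mib(n, 4.25);  // IQ4_XS
    std::printf("Q4_K: %.1f MiB  IQ4_XS: %.1f MiB  saved: %.1f MiB/layer\n",
                q4_k, iq4_xs, q4_k - iq4_xs);
    return 0;
}

Multiplied over the attention layers of a 70B-class model, those few MiB per layer account for the ~1% size reduction mentioned above.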
--- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index e6f11c2ac..931a36a86 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15486,7 +15486,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_IQ4_XS; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; From 59c5d479de38a95b206727a1050d619be6357298 Mon Sep 17 00:00:00 2001 From: Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Date: Sun, 4 Aug 2024 12:06:06 +0200 Subject: [PATCH 06/59] attn_qkv.weight in IQ4_XS for FTYPE IQ3_M If FTYPE IQ4_XS has attn_qkv.weight in IQ4_XS, then FTYPE IQ3_M should not have it in Q4_K (4.5BPW), but rather in IQ4_XS (4.25BPW) as well. --- src/llama.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 931a36a86..87657737a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15493,9 +15493,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } } else if (name.find("attn_qkv.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = GGML_TYPE_Q4_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } From 8006b15fd1b4863940612ff1cba98ee01d69c334 Mon Sep 17 00:00:00 2001 From: Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Date: Thu, 8 Aug 2024 18:50:48 +0200 Subject: [PATCH 07/59] Avoid shrinking attn.k.weight for IQ3_XS and XXS when GQA or MOE --- src/llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 87657737a..304a113bf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15414,10 +15414,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (qs.model.hparams.n_gqa() < 2 || qs.model.hparams.n_expert < 2)) { new_type = GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (qs.model.hparams.n_gqa() < 2 || qs.model.hparams.n_expert < 2)) { new_type = GGML_TYPE_IQ2_S; } } else if (name.find("attn_q.weight") != std::string::npos) { From 1118c046df26377486d7583671a1fc25673542a8 Mon Sep 17 00:00:00 2001 From: Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Date: Thu, 8 Aug 2024 18:56:20 +0200 Subject: [PATCH 08/59] Correct mistake in conditionality for attn.k --- src/llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 304a113bf..50a19ac35 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15414,10 +15414,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n //
TODO: explore better strategies new_type = GGML_TYPE_Q8_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (qs.model.hparams.n_gqa() < 2 || qs.model.hparams.n_expert < 2)) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && qs.model.hparams.n_gqa() < 2 && qs.model.hparams.n_expert < 2) { new_type = GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (qs.model.hparams.n_gqa() < 2 || qs.model.hparams.n_expert < 2)) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() < 2 && qs.model.hparams.n_expert < 2) { new_type = GGML_TYPE_IQ2_S; } } else if (name.find("attn_q.weight") != std::string::npos) { From 1bc4dc5c15a729401ca2b6842b8fbb84f7bc2a54 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Fri, 9 Aug 2024 22:49:42 +0200 Subject: [PATCH 09/59] Bump IQ3_M attn.v in Q5_K attn.k in IQ4_XS --- src/llama.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 50a19ac35..7140a5580 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15391,11 +15391,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { - new_type = GGML_TYPE_Q4_K; + new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = qs.i_attention_wv < 2 ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; @@ -15414,6 +15417,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + new_type = GGML_TYPE_IQ4_XS; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && qs.model.hparams.n_gqa() < 2 && qs.model.hparams.n_expert < 2) { new_type = GGML_TYPE_IQ3_XXS; } From 7212098755469f7da0261ed92b1440ca02a700ca Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 10 Aug 2024 12:52:57 +0200 Subject: [PATCH 10/59] IQ1 and IQ2 refactor Attn_q in Q3_K for experts >= 8 Attn_k in Q5_K for experts >= 8 Attn_v in Q6_K for experts >= 8, in IQ3_XXS for IQ2_XXS and IQ2_XS Attn_output in Q4_K for experts >= 8 --- src/llama.cpp | 66 +++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 7140a5580..56a175ddc 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15345,32 +15345,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_Q4_0; } } - } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (name.find("attn_v.weight") != std::string::npos) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K; - else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + } else if (name.find("attn_v.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q6_K; + else { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S; + } ++qs.i_attention_wv; } - else if (qs.model.hparams.n_expert >= 8 && name.find("attn_k.weight") != std::string::npos) { - new_type = GGML_TYPE_Q4_K; - } - else if (name.find("ffn_down") != std::string::npos) { - if (qs.i_ffn_down < qs.n_ffn_down/8) { - new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? 
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; - } - ++qs.i_ffn_down; - } - else if (name.find("attn_output.weight") != std::string::npos) { - if (qs.model.hparams.n_expert >= 8) { - new_type = GGML_TYPE_Q5_K; - } else { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; - } - } - } else if (name.find("attn_v.weight") != std::string::npos) { - if (qs.model.hparams.n_expert >= 8) { + else if (qs.model.hparams.n_expert >= 8) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; @@ -15380,7 +15366,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with // nearly negligible increase in model size by quantizing this tensor with more bits. // That logic applies also to models like Yi 34B (-> GQA 7) and Mistral Large 123B (-> GQA 12). - if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K; + if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S || + new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; @@ -15412,7 +15399,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; ++qs.i_attention_wv; } else if (name.find("attn_k.weight") != std::string::npos) { - if (qs.model.hparams.n_expert >= 8) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q5_K; + } + else if (qs.model.hparams.n_expert >= 8) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; @@ -15430,6 +15421,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { new_type = GGML_TYPE_IQ3_XXS; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q3_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ2_S; } @@ -15440,6 +15435,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.i_ffn_down < qs.n_ffn_down/8) { + new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? 
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; + } + ++qs.i_ffn_down; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) { new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } @@ -15482,10 +15484,16 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert >= 8) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || - ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) { - new_type = GGML_TYPE_Q5_K; + ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q4_K; + else { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; From 8f1b99fee8cd0c4e15b458cdf491c63bc732e150 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 10 Aug 2024 13:09:11 +0200 Subject: [PATCH 11/59] Shortening formatting --- src/llama.cpp | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 56a175ddc..2a7d72fda 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15334,14 +15334,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { new_type = GGML_TYPE_Q2_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = GGML_TYPE_IQ3_S; - } - else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || - new_type == GGML_TYPE_Q4_0_8_8) { + else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { new_type = GGML_TYPE_Q4_0; } } @@ -15381,12 +15377,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { - new_type = GGML_TYPE_Q4_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { - new_type = GGML_TYPE_Q5_K; - } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = qs.i_attention_wv < 2 ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } @@ -15418,16 +15410,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_IQ2_S; } } else if (name.find("attn_q.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { - new_type = GGML_TYPE_IQ3_XXS; - } + if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q3_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = GGML_TYPE_IQ2_S; - } } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; From aa4eb594ef5db3dd7becfb665f7f40fa0d5ae209 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 10 Aug 2024 16:33:55 +0200 Subject: [PATCH 12/59] Further refactor attn_k With attn_k set for all quants below 3bpw except Q2_K_S. --- src/llama.cpp | 42 ++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 2a7d72fda..36673cbcc 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15391,23 +15391,45 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; ++qs.i_attention_wv; } else if (name.find("attn_k.weight") != std::string::npos) { - if (qs.model.hparams.n_expert >= 8) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB // TODO: explore better strategies - new_type = GGML_TYPE_Q8_0; + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || + ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q6_K; + else new_type = GGML_TYPE_Q8_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_IQ4_XS; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + new_type = GGML_TYPE_IQ1_M; + } + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + new_type = GGML_TYPE_IQ2_XXS; + } + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + new_type = GGML_TYPE_IQ2_XS; + } + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + new_type = GGML_TYPE_IQ2_S; + } + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) && + (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
+ new_type = GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + new_type = GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && qs.model.hparams.n_gqa() < 2 && qs.model.hparams.n_expert < 2) { new_type = GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() < 2 && qs.model.hparams.n_expert < 2) { - new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + new_type = GGML_TYPE_IQ4_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + new_type = GGML_TYPE_Q4_K; + } + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || + ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + new_type = GGML_TYPE_Q5_K; } } else if (name.find("attn_q.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_XXS; From 8c8e43ce20e885f05bb2816d10c36a7ce7b39624 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 10 Aug 2024 16:38:11 +0200 Subject: [PATCH 13/59] Settings for MOE >= 8 experts applied to >= 4 experts --- src/llama.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 36673cbcc..d2dbe02fa 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15344,7 +15344,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q6_K; + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ3_XXS; @@ -15352,7 +15352,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } ++qs.i_attention_wv; } - else if (qs.model.hparams.n_expert >= 8) { + else if (qs.model.hparams.n_expert >= 4) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; @@ -15391,7 +15391,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; ++qs.i_attention_wv; } else if (name.find("attn_k.weight") != std::string::npos) { - if (qs.model.hparams.n_expert >= 8) { + if (qs.model.hparams.n_expert >= 4) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB // TODO: explore better strategies if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || @@ -15436,7 +15436,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == 
LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q3_K; + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q3_K; } } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); @@ -15461,7 +15461,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 || - (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) { + (qs.model.hparams.n_expert >= 4 && use_more_bits(i_layer, n_layer)))) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { @@ -15492,7 +15492,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { - if (qs.model.hparams.n_expert >= 8) { + if (qs.model.hparams.n_expert >= 4) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || @@ -15500,7 +15500,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (qs.model.hparams.n_expert >= 8) new_type = GGML_TYPE_Q4_K; + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; From 415d5e40e10cedcaf887a2d1232c87d934b4b082 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 10 Aug 2024 17:32:29 +0200 Subject: [PATCH 14/59] Further refactor attn.v This also lowers attn_q for IQ2_XS, in order to separate it more from the quite misnamed IQ2_S --- src/llama.cpp | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index d2dbe02fa..d84341b00 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15342,20 +15342,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } } } else if (name.find("attn_v.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; - else { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S; -
} - ++qs.i_attention_wv; - } - else if (qs.model.hparams.n_expert >= 4) { + if (qs.model.hparams.n_expert >= 4) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB // TODO: explore better strategies - new_type = GGML_TYPE_Q8_0; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ) new_type = GGML_TYPE_Q6_K; + else new_type = GGML_TYPE_Q8_0; } else if (qs.model.hparams.n_gqa() >= 7) { // The Llama 70B models have 8 heads sharing the same attn_v weights (-> GQA 8). As a result, the attn_v.weight tensor is @@ -15365,20 +15358,28 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S || new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_Q4_K; - } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = qs.i_attention_wv < 2 ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } @@ -15394,9 +15395,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.model.hparams.n_expert >= 4) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB // TODO: explore better strategies - if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || - ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q6_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ) new_type = GGML_TYPE_Q6_K; else new_type = GGML_TYPE_Q8_0; } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { @@ -15427,7 +15428,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q4_K; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q5_K; } @@ -15435,8 +15436,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS; } } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); @@ -15461,7 +15463,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 || - (qs.model.hparams.n_expert >= 4 && use_more_bits(i_layer, n_layer)))) { + (qs.model.hparams.n_expert >= 4 && use_more_bits(i_layer, n_layer)))) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { From 49617b19606882edbc8868c9ff292ea5136f7318 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 10 Aug 2024 18:37:29 +0200 Subject: [PATCH 15/59] Advancing on several tensors - Progressivity for token embeddings and attn_qkv - FFN down for IQ1 and IQ2 quants - FFN gate and up for IQ2_S and IQ2_M, for progressivity in the IQ2 range. 
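To make the progressivity idea concrete: in the favoured layers, each low-bit ftype bumps a tensor by roughly one step on the quant ladder, so neighbouring ftypes stay strictly ordered in size and quality. A minimal C++ sketch; the enum and table are illustrative stand-ins for the GGML types, and the real per-tensor rules in the diff below differ in details:

#include <cstdio>

// Illustrative quant ladder for the low-bit range (names mirror the GGML
// types, but this enum is a stand-in, not the real definition).
enum qtype { IQ1_S, IQ1_M, IQ2_XXS, IQ2_XS, IQ2_S, IQ2_M, IQ3_XXS, N_QTYPES };

static const char * qnames[N_QTYPES] = {
    "IQ1_S", "IQ1_M", "IQ2_XXS", "IQ2_XS", "IQ2_S", "IQ2_M", "IQ3_XXS"
};

// one step up the ladder, saturating at the top
static qtype one_step_up(qtype t) {
    return t + 1 < N_QTYPES ? (qtype) (t + 1) : t;
}

int main() {
    // a favoured layer (e.g. the first n_layer/8 ffn_down tensors) gets
    // roughly one step more than the ftype's baseline, keeping the ftypes
    // strictly ordered
    for (int t = IQ1_S; t <= IQ2_M; ++t) {
        std::printf("%-7s -> %s in favoured layers\n",
                    qnames[t], qnames[one_step_up((qtype) t)]);
    }
    return 0;
}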
--- src/llama.cpp | 63 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index d84341b00..5533ba2cf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15330,13 +15330,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { new_type = qs.params->token_embedding_type; } else { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - new_type = GGML_TYPE_Q2_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = GGML_TYPE_IQ3_S; - } + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_Q2_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { new_type = GGML_TYPE_Q4_0; } @@ -15384,7 +15381,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && + (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q5_K; } else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && @@ -15445,20 +15443,28 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { - if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; + if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q3_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (qs.i_ffn_down < qs.n_ffn_down/8) { - new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? 
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; - } - ++qs.i_ffn_down; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { + if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ2_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { + if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) { new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K + new_type = i_layer < n_layer/8 ? GGML_TYPE_Q5_K : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } @@ -15505,7 +15511,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; @@ -15522,7 +15529,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = GGML_TYPE_Q4_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ1_M; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } @@ -15532,6 +15547,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { new_type = GGML_TYPE_IQ3_XXS; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { + if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ2_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ3_XXS; + } ++qs.i_ffn_gate; } else if (name.find("ffn_up") != std::string::npos) { @@ -15540,6 +15561,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { new_type = GGML_TYPE_IQ3_XXS; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { + if (i_layer < n_layer/8) new_type = 
GGML_TYPE_IQ2_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ3_XXS; + } ++qs.i_ffn_up; } @@ -15976,8 +16003,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS || - new_type == GGML_TYPE_IQ2_S || new_type == GGML_TYPE_IQ1_S || + (new_type == GGML_TYPE_IQ2_S && strcmp(tensor->name, "token_embd.weight")) || (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) || (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { LLAMA_LOG_ERROR("\n\n============================================================\n"); From f0806ac943b1afd482a3e7778b0b5447dcc95dd9 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 10 Aug 2024 20:34:17 +0200 Subject: [PATCH 16/59] IQ2_XL , IQ3_XL , Q2_K_L Plus some adjustments on the FFNs --- src/llama.cpp | 115 +++++++++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 47 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 5533ba2cf..252e45dd6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4456,6 +4456,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; + case LLAMA_FTYPE_MOSTLY_Q2_K_L: return "Q2_K - Large"; case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; @@ -4468,6 +4469,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ2_XL: return "IQ2_XL - 2.9 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; @@ -4476,6 +4478,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_XL: return "IQ3_S mix - 3.96 bpw"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; @@ -15317,9 +15320,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = GGML_TYPE_Q8_0; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + new_type = GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || + ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == 
LLAMA_FTYPE_MOSTLY_Q2_K_L) { new_type = GGML_TYPE_Q5_K; } else if (new_type != GGML_TYPE_Q8_0) { @@ -15344,7 +15350,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // TODO: explore better strategies if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ) new_type = GGML_TYPE_Q6_K; + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q6_K; else new_type = GGML_TYPE_Q8_0; } else if (qs.model.hparams.n_gqa() >= 7) { @@ -15361,10 +15368,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { @@ -15374,7 +15381,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { @@ -15395,7 +15402,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // TODO: explore better strategies if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ) new_type = GGML_TYPE_Q6_K; + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q6_K; else new_type = GGML_TYPE_Q8_0; } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { @@ -15410,17 +15418,19 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_IQ2_S; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) && + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && + (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && qs.model.hparams.n_gqa() < 2 && qs.model.hparams.n_expert < 2) { new_type = GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && + (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { @@ -15434,30 +15444,25 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q3_K; + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS; } } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { - if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q3_K; - } + if (ftype == 
LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { - if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XS; - } + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ2_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { - if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) { @@ -15472,6 +15477,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n (qs.model.hparams.n_expert >= 4 && use_more_bits(i_layer, n_layer)))) { new_type = GGML_TYPE_Q4_K; } + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; } @@ -15502,12 +15508,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert >= 4) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || + ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; @@ -15515,11 +15522,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; } } else { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; else if (ftype == 
LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; @@ -15529,14 +15536,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ1_M; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; @@ -15544,29 +15554,37 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (name.find("ffn_gate") != std::string::npos) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { - new_type = GGML_TYPE_IQ3_XXS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { - if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ2_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { - if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ3_XXS; - } + if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K) && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q3_K; + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q4_K; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_S; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) && (i_layer < n_layer/8)) new_type = 
GGML_TYPE_IQ4_XS; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_gate; } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { - new_type = GGML_TYPE_IQ3_XXS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { - if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ2_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { - if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ3_XXS; - } + if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K) && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q3_K; + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q4_K; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_S; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ4_XS; + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_up; } @@ -15691,7 +15709,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // K-quants case LLAMA_FTYPE_MOSTLY_Q2_K_S: case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break; - case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_Q2_K_L: default_type = GGML_TYPE_Q2_K; break; + case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_XXS; break; case LLAMA_FTYPE_MOSTLY_Q3_K_S: case LLAMA_FTYPE_MOSTLY_Q3_K_M: case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break; @@ -15704,6 +15723,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break; case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break; case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break; + case LLAMA_FTYPE_MOSTLY_IQ2_XL: default_type = GGML_TYPE_IQ2_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break; case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break; case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break; @@ -15711,6 +15731,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; + case 
LLAMA_FTYPE_MOSTLY_IQ3_XL: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;

From 8bc7a9849e531f2fff3880ac7c4309b1279f6134 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Sat, 10 Aug 2024 20:40:27 +0200
Subject: [PATCH 17/59] 2 forgotten files

---
 examples/quantize/quantize.cpp | 3 +++
 include/llama.h                | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 8d7647258..19b6e0fea 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -24,13 +24,16 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
     { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
     { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
     { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
+    { "IQ2_XL", LLAMA_FTYPE_MOSTLY_IQ2_XL, " 2.85 bpw quantization mix", },
     { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
     { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
     { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
     { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
+    { "Q2_K_L", LLAMA_FTYPE_MOSTLY_Q2_K_L, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
     { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
     { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
     { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
+    { "IQ3_XL", LLAMA_FTYPE_MOSTLY_IQ3_XL, " 3.85 bpw quantization mix", },
     { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
     { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
     { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
diff --git a/include/llama.h b/include/llama.h
index f23355a6b..38924b312 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -166,6 +166,9 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XL = 36, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_XL = 37, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K_L = 38, // except 1d tensors

         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 };

From 8ad71f4469e6e1b10c74cda3f10713cce7f8eb81 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Sun, 11 Aug 2024 01:11:24 +0200
Subject: [PATCH 18/59] IQ1_XS and small adjustments.
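
IQ1_XS is a new ~1.6-1.7 bpw quant strategy mix. It keeps GGML_TYPE_IQ1_S as
the default tensor type, and gets its margin over IQ1_S by bumping the most
damage-prone tensors one notch: output.weight to Q4_K, token_embd.weight to
IQ2_S, attn_v.weight to IQ4_XS on GQA/MoE models (IQ3_XXS otherwise), and the
first eighth of the ffn_down layers to IQ2_XXS.

For illustration, a quantization run with the new ftype would look like the
line below (file names are placeholders; as with the other sub-2 bpw quant
strategies, an importance matrix is strongly recommended):

./llama-quantize --imatrix imatrix.dat ggml-model-f16.gguf ggml-model-IQ1_XS.gguf IQ1_XS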
---
 include/llama.h      |   2 +-
 include/quantize.cpp | 457 +++++++++++++++++++++++++++++++++++++++++++
 src/llama.cpp        |  32 +--
 3 files changed, 477 insertions(+), 14 deletions(-)
 create mode 100644 include/quantize.cpp

diff --git a/include/llama.h b/include/llama.h
index 20375bf04..c9d7cb8a1 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -169,7 +169,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_XL = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ3_XL = 37, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q2_K_L = 38, // except 1d tensors
-
+        LLAMA_FTYPE_MOSTLY_IQ1_XS = 39, // except 1d tensors
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 };
diff --git a/include/quantize.cpp b/include/quantize.cpp
new file mode 100644
index 000000000..db04bf64e
--- /dev/null
+++ b/include/quantize.cpp
@@ -0,0 +1,457 @@
+#include "common.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <cstring>
+#include <vector>
+#include <string>
+#include <unordered_map>
+#include <fstream>
+#include <cmath>
+
+struct quant_option {
+    std::string name;
+    llama_ftype ftype;
+    std::string desc;
+};
+
+static const std::vector<quant_option> QUANT_OPTIONS = {
+    { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
+    { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
+    { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
+    { "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
+    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
+    { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
+    { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
+    { "IQ2_XL", LLAMA_FTYPE_MOSTLY_IQ2_XL, " 2.85 bpw quantization mix", },
+    { "IQ1_XS", LLAMA_FTYPE_MOSTLY_IQ1_XS, " 1.6-1.7 bpw quantization mix", },
+    { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
+    { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
+    { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
+    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
+    { "Q2_K_L", LLAMA_FTYPE_MOSTLY_Q2_K_L, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
+    { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
+    { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
+    { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
+    { "IQ3_XL", LLAMA_FTYPE_MOSTLY_IQ3_XL, " 3.85 bpw quantization mix", },
+    { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
+    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
+    { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
+    { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
+    { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },
+    { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
+    { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
+    { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
+    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
+    { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
+    { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
+    { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
+    // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
+    { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
+};
+
+static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";
+
+static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
+    std::string ftype_str;
+
+    for (auto ch : ftype_str_in) {
+        ftype_str.push_back(std::toupper(ch));
+    }
+    for (auto & it : QUANT_OPTIONS) {
+        if (it.name == ftype_str) {
+            ftype = it.ftype;
+            ftype_str_out = it.name;
+            return true;
+        }
+    }
+    try {
+        int ftype_int = std::stoi(ftype_str);
+        for (auto & it : QUANT_OPTIONS) {
+            if (it.ftype == ftype_int) {
+                ftype = it.ftype;
+                ftype_str_out = it.name;
+                return true;
+            }
+        }
+    }
+    catch (...) {
+        // stoi failed
+    }
+    return false;
+}
+
+// usage:
+//  ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
+//
+[[noreturn]]
+static void usage(const char * executable) {
+    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+    printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
+    printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+    printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
+    printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
+    printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+    printf(" --exclude-weights tensor_name: don't use importance matrix for this/these tensor(s)\n");
+    printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
+    printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf(" --keep-split: will generate quantized model in the same shards as input");
+    printf(" --override-kv KEY=TYPE:VALUE\n");
+    printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
+    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
+    printf("\nAllowed quantization types:\n");
+    for (auto & it : QUANT_OPTIONS) {
+        if (it.name != "COPY") {
+            printf(" %2d or ", it.ftype);
+        } else {
+            printf(" ");
+        }
+        printf("%-7s : %s\n", it.name.c_str(), it.desc.c_str());
+    }
+    exit(1);
+}
+
+static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
+    if (!in) {
+        printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
+        exit(1);
+    }
+    int n_entries;
+    in.read((char *)&n_entries, sizeof(n_entries));
+    if (in.fail() || n_entries < 1) {
+        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
+        exit(1);
+    }
+    for (int i = 0; i < n_entries; ++i) {
+        int len; in.read((char *)&len, sizeof(len));
+        std::vector<char> name_as_vec(len+1);
+        in.read((char *)name_as_vec.data(), len);
+        if (in.fail()) {
+            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
+            exit(1);
+        }
+        name_as_vec[len] = 0;
+        std::string name{name_as_vec.data()};
+        auto & e = imatrix_data[name];
+        int ncall;
+        in.read((char *)&ncall, sizeof(ncall));
+        int nval;
+        in.read((char *)&nval, sizeof(nval));
+        if (in.fail() || nval < 1) {
+            printf("%s: failed reading number of values for entry %d\n", __func__, i);
+            imatrix_data = {};
+            exit(1);
+        }
+        e.resize(nval);
+        in.read((char *)e.data(), nval*sizeof(float));
+        if (in.fail()) {
+            printf("%s: failed reading data for entry %d\n", __func__, i);
+            imatrix_data = {};
+            exit(1);
+        }
+        if (ncall > 0) {
+            for (auto& v : e) v /= ncall;
+        }
+
+        if (getenv("LLAMA_TRACE")) {
+            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
+        }
+    }
+
+    // latest imatrix version contains the dataset filename at the end of the file
+    int m_last_call = 0;
+    if (in.peek() != EOF) {
+        in.read((char *)&m_last_call, sizeof(m_last_call));
+        int dataset_len;
+        in.read((char *)&dataset_len, sizeof(dataset_len));
+        std::vector<char> dataset_as_vec(dataset_len);
+        in.read(dataset_as_vec.data(), dataset_len);
+        imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
+        printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
+    }
+    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
+    return m_last_call;
+}
+
+static int prepare_imatrix(const std::string & imatrix_file,
+                           std::string & imatrix_dataset,
+                           const std::vector<std::string> & included_weights,
+                           const std::vector<std::string> & excluded_weights,
+                           std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+    int m_last_call = -1;
+    if (!imatrix_file.empty()) {
+        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
+    }
+    if (imatrix_data.empty()) {
+        return m_last_call;
+    }
+    if (!excluded_weights.empty()) {
+        for (auto& name : excluded_weights) {
+            for (auto it = imatrix_data.begin(); it != imatrix_data.end(); ) {
+                auto pos = it->first.find(name);
+                if (pos != std::string::npos) it = imatrix_data.erase(it);
+                else ++it;
+            }
+        }
+    }
+    if (!included_weights.empty()) {
+        std::unordered_map<std::string, std::vector<float>> tmp;
+        for (auto& name : included_weights) {
+            for (auto& e : imatrix_data) {
+                auto pos = e.first.find(name);
+                if (pos != std::string::npos) {
+                    tmp.emplace(std::move(e));
+                }
+            }
+        }
+        imatrix_data = std::move(tmp);
+    }
+    if (!imatrix_data.empty()) {
+        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
+    }
+    return m_last_call;
+}
+
+static ggml_type parse_ggml_type(const char * arg) {
+    ggml_type result = GGML_TYPE_COUNT;
+    for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
+        auto type = ggml_type(j);
+        const auto * name = ggml_type_name(type);
+        if (name && strcmp(arg, name) == 0) {
+            result = type; break;
+        }
+    }
+    return result;
+}
+
+int main(int argc, char ** argv) {
+    if (argc < 3) {
+        usage(argv[0]);
+    }
+
+    llama_model_quantize_params params = llama_model_quantize_default_params();
+
+    int arg_idx = 1;
+    std::string imatrix_file;
+    std::vector<std::string> included_weights, excluded_weights;
+    std::vector<llama_model_kv_override> kv_overrides;
+
+    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
+        if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
+            params.quantize_output_tensor = false;
+        } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
+            if (arg_idx < argc-1) {
+                params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
+            if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
+            params.allow_requantize = true;
+        } else if (strcmp(argv[arg_idx], "--pure") == 0) {
+            params.pure = true;
+        } else if (strcmp(argv[arg_idx], "--imatrix") == 0) {
+            if (arg_idx < argc-1) {
+                imatrix_file = argv[++arg_idx];
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
+            if (arg_idx < argc-1) {
+                included_weights.emplace_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
+            if (arg_idx < argc-1) {
+                excluded_weights.emplace_back(argv[++arg_idx]);
+            } else {
+                usage(argv[0]);
+            }
+        } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
+            params.keep_split = true;
+        } else {
+            usage(argv[0]);
+        }
+    }
+
+    if (argc - arg_idx < 2) {
+        printf("%s: bad arguments\n", argv[0]);
+        usage(argv[0]);
+    }
+    if (!included_weights.empty() && !excluded_weights.empty()) {
+        usage(argv[0]);
+    }
+
+    std::string imatrix_dataset;
+    std::unordered_map<std::string, std::vector<float>> imatrix_data;
+    int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
+    if (!imatrix_data.empty()) {
+        params.imatrix = &imatrix_data;
+        {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+            strncpy(kvo.val_str, imatrix_file.c_str(), 127);
+            kvo.val_str[127] = '\0';
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+        if (!imatrix_dataset.empty()) {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+            strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
+            kvo.val_str[127] = '\0';
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+
+        {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.val_i64 = imatrix_data.size();
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+
+        if (m_last_call > 0) {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.val_i64 = m_last_call;
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+    }
+    if (!kv_overrides.empty()) {
+        kv_overrides.emplace_back();
+        kv_overrides.back().key[0] = 0;
+        params.kv_overrides = &kv_overrides;
+    }
+
+    llama_backend_init();
+
+    // parse command line arguments
+    const std::string fname_inp = argv[arg_idx];
+    arg_idx++;
+    std::string fname_out;
+
+    std::string ftype_str;
+    std::string suffix = ".gguf";
+    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
+        std::string fpath;
+        const size_t pos = fname_inp.find_last_of("/\\");
+        if (pos != std::string::npos) {
+            fpath = fname_inp.substr(0, pos + 1);
+        }
+
+        // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+        fname_out = fpath + "ggml-model-" + ftype_str;
+        if (!params.keep_split) {
+            fname_out += suffix;
+        }
+        arg_idx++;
+        if (ftype_str == "COPY") {
+            params.only_copy = true;
+        }
+    } else {
+        fname_out = argv[arg_idx];
+        if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
+            fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
+        }
+        arg_idx++;
+
+        if (argc <= arg_idx) {
+            fprintf(stderr, "%s: missing ftype\n", __func__);
+            return 1;
+        }
+        if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
+            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
+            return 1;
+        }
+        if (ftype_str == "COPY") {
+            params.only_copy = true;
+        }
+        arg_idx++;
+    }
+
+    // parse nthreads
+    if (argc > arg_idx) {
+        try {
+            params.nthread = std::stoi(argv[arg_idx]);
+        }
+        catch (const std::exception & e) {
+            fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
+            return 1;
+        }
+    }
+
+    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
+        fprintf(stderr, "\n==========================================================================================================\n");
+        fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
+        fprintf(stderr, "==========================================================================================================\n\n\n");
+        return 1;
+    }
+
+    print_build_info();
+
+    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
+    if (params.nthread > 0) {
+        fprintf(stderr, " using %d threads", params.nthread);
+    }
+    fprintf(stderr, "\n");
+
+    const int64_t t_main_start_us = llama_time_us();
+
+    int64_t t_quantize_us = 0;
+
+    // load the model
+    {
+        const int64_t t_start_us = llama_time_us();
+
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), &params)) {
+            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
+            return 1;
+        }
+
+        t_quantize_us = llama_time_us() - t_start_us;
+    }
+
+    // report timing
+    {
+        const int64_t t_main_end_us = llama_time_us();
+
+        printf("\n");
+        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
+        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
+    }
+
+    llama_backend_free();
+
+    return 0;
+}
diff --git a/src/llama.cpp b/src/llama.cpp
index f5347a97d..27ebbd05c 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4480,6 +4480,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ2_XL: return "IQ2_XL - 2.9 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_XS: return "IQ1_S mix - 1.6-1.7 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
@@ -15474,7 +15475,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
         new_type = GGML_TYPE_Q8_0;
     }
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         new_type = GGML_TYPE_Q4_K;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -15490,7 +15491,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
         new_type = qs.params->token_embedding_type;
     } else {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_S;
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+            new_type = GGML_TYPE_IQ2_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_Q2_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
@@ -15504,7 +15507,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         // TODO: explore better strategies
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
             ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
-            ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
+            ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS ||
             ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q6_K;
         else new_type = GGML_TYPE_Q8_0;
     }
@@ -15516,7 +15519,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
             new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
     }
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
        new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
@@ -15560,7 +15563,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q6_K;
         else new_type = GGML_TYPE_Q8_0;
     }
-    else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
+    else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) &&
+        (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
        new_type = GGML_TYPE_IQ1_M;
     }
     else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
@@ -15608,6 +15612,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     int i_layer = info.first, n_layer = info.second;
     if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K;
     else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K;
+    else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS;
     }
@@ -15668,10 +15673,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-            ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+            ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
+            ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
             else {
-                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
+                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M)
+                new_type = GGML_TYPE_IQ2_XXS;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
             }
@@ -15691,10 +15698,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         new_type = GGML_TYPE_Q4_K;
     }
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ1_M;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS;
-    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S;
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+        new_type = GGML_TYPE_IQ2_XS;
+    }
+    else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ3_XXS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_S;
@@ -15717,7 +15724,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S;
     else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS;
     else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
-    else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_S;
     else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
     else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ4_XS;
     else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
@@ -15735,7 +15741,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S;
     else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS;
     else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
-    else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_S;
     else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
     else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ4_XS;
     else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
@@ -15879,6 +15884,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break;
     case LLAMA_FTYPE_MOSTLY_IQ2_XL: default_type = GGML_TYPE_IQ2_S; break;
     case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
+    case LLAMA_FTYPE_MOSTLY_IQ1_XS: default_type = GGML_TYPE_IQ1_S; break;
     case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break;
     case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break;
     case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break;

From e2e2d77e8e89218a5acf77f8ae40cc76a3a0a4d7 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Sun, 11 Aug 2024 01:13:12 +0200
Subject: [PATCH 19/59] misplaced file lol

---
 examples/quantize/quantize.cpp |   1 +
 include/quantize.cpp           | 457 ---------------------------------
 2 files changed, 1 insertion(+), 457 deletions(-)
 delete mode 100644 include/quantize.cpp

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 8471aeb91..db04bf64e 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -25,6 +25,7 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
     { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
     { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
     { "IQ2_XL", LLAMA_FTYPE_MOSTLY_IQ2_XL, " 2.85 bpw quantization mix", },
+    { "IQ1_XS", LLAMA_FTYPE_MOSTLY_IQ1_XS, " 1.6-1.7 bpw quantization mix", },
     { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
     { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
     { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
diff --git a/include/quantize.cpp b/include/quantize.cpp
deleted file mode 100644
index db04bf64e..000000000
--- a/include/quantize.cpp
+++ /dev/null
@@ -1,457 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <cstdio>
-#include <cstring>
-#include <vector>
-#include <string>
-#include <unordered_map>
-#include <fstream>
-#include <cmath>
-
-struct quant_option {
-    std::string name;
-    llama_ftype ftype;
-    std::string desc;
-};
-
-static const std::vector<quant_option> QUANT_OPTIONS = {
-    { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
-    { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
-    { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
-    { "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
-    { "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
-    { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
-    { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
-    { "IQ2_XL", LLAMA_FTYPE_MOSTLY_IQ2_XL, " 2.85 bpw quantization mix", },
-    { "IQ1_XS", LLAMA_FTYPE_MOSTLY_IQ1_XS, " 1.6-1.7 bpw quantization mix", },
-    { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
-    { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
-    { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
-    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
-    { "Q2_K_L", LLAMA_FTYPE_MOSTLY_Q2_K_L, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
-    { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", },
-    { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
-    { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
-    { "IQ3_XL", LLAMA_FTYPE_MOSTLY_IQ3_XL, " 3.85 bpw quantization mix", },
-    { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-    { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
-    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
-    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
-    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
-    { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
-    { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
-    { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
-    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
-    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },
-    { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
-    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
-    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
-    { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
-    { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
-    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
-    { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
-    { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
-    // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
-    { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
-};
-
-static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
-static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset";
-static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
-static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";
-
-static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
-    std::string ftype_str;
-
-    for (auto ch : ftype_str_in) {
-        ftype_str.push_back(std::toupper(ch));
-    }
-    for (auto & it : QUANT_OPTIONS) {
-        if (it.name == ftype_str) {
-            ftype = it.ftype;
-            ftype_str_out = it.name;
-            return true;
-        }
-    }
-    try {
-        int ftype_int = std::stoi(ftype_str);
-        for (auto & it : QUANT_OPTIONS) {
-            if (it.ftype == ftype_int) {
-                ftype = it.ftype;
-                ftype_str_out = it.name;
-                return true;
-            }
-        }
-    }
-    catch (...) {
-        // stoi failed
-    }
-    return false;
-}
-
-// usage:
-//  ./llama-quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
-//
-[[noreturn]]
-static void usage(const char * executable) {
-    printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
-    printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
-    printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
-    printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
-    printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
-    printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
-    printf(" --exclude-weights tensor_name: don't use importance matrix for this/these tensor(s)\n");
-    printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
-    printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
-    printf(" --keep-split: will generate quantized model in the same shards as input");
-    printf(" --override-kv KEY=TYPE:VALUE\n");
-    printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
-    printf("Note: --include-weights and --exclude-weights cannot be used together\n");
-    printf("\nAllowed quantization types:\n");
-    for (auto & it : QUANT_OPTIONS) {
-        if (it.name != "COPY") {
-            printf(" %2d or ", it.ftype);
-        } else {
-            printf(" ");
-        }
-        printf("%-7s : %s\n", it.name.c_str(), it.desc.c_str());
-    }
-    exit(1);
-}
-
-static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
-    std::ifstream in(imatrix_file.c_str(), std::ios::binary);
-    if (!in) {
-        printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
-        exit(1);
-    }
-    int n_entries;
-    in.read((char *)&n_entries, sizeof(n_entries));
-    if (in.fail() || n_entries < 1) {
-        printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
-        exit(1);
-    }
-    for (int i = 0; i < n_entries; ++i) {
-        int len; in.read((char *)&len, sizeof(len));
-        std::vector<char> name_as_vec(len+1);
-        in.read((char *)name_as_vec.data(), len);
-        if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
-            exit(1);
-        }
-        name_as_vec[len] = 0;
-        std::string name{name_as_vec.data()};
-        auto & e = imatrix_data[name];
-        int ncall;
-        in.read((char *)&ncall, sizeof(ncall));
-        int nval;
-        in.read((char *)&nval, sizeof(nval));
-        if (in.fail() || nval < 1) {
-            printf("%s: failed reading number of values for entry %d\n", __func__, i);
-            imatrix_data = {};
-            exit(1);
-        }
-        e.resize(nval);
-        in.read((char *)e.data(), nval*sizeof(float));
-        if (in.fail()) {
-            printf("%s: failed reading data for entry %d\n", __func__, i);
-            imatrix_data = {};
-            exit(1);
-        }
-        if (ncall > 0) {
-            for (auto& v : e) v /= ncall;
-        }
-
-        if (getenv("LLAMA_TRACE")) {
-            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
-        }
-    }
-
-    // latest imatrix version contains the dataset filename at the end of the file
-    int m_last_call = 0;
-    if (in.peek() != EOF) {
-        in.read((char *)&m_last_call, sizeof(m_last_call));
-        int dataset_len;
-        in.read((char *)&dataset_len, sizeof(dataset_len));
-        std::vector<char> dataset_as_vec(dataset_len);
-        in.read(dataset_as_vec.data(), dataset_len);
-        imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
-        printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
-    }
-    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
-    return m_last_call;
-}
-
-static int prepare_imatrix(const std::string & imatrix_file,
-                           std::string & imatrix_dataset,
-                           const std::vector<std::string> & included_weights,
-                           const std::vector<std::string> & excluded_weights,
-                           std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
-    int m_last_call = -1;
-    if (!imatrix_file.empty()) {
-        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
-    }
-    if (imatrix_data.empty()) {
-        return m_last_call;
-    }
-    if (!excluded_weights.empty()) {
-        for (auto& name : excluded_weights) {
-            for (auto it = imatrix_data.begin(); it != imatrix_data.end(); ) {
-                auto pos = it->first.find(name);
-                if (pos != std::string::npos) it = imatrix_data.erase(it);
-                else ++it;
-            }
-        }
-    }
-    if (!included_weights.empty()) {
-        std::unordered_map<std::string, std::vector<float>> tmp;
-        for (auto& name : included_weights) {
-            for (auto& e : imatrix_data) {
-                auto pos = e.first.find(name);
-                if (pos != std::string::npos) {
-                    tmp.emplace(std::move(e));
-                }
-            }
-        }
-        imatrix_data = std::move(tmp);
-    }
-    if (!imatrix_data.empty()) {
-        printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
-    }
-    return m_last_call;
-}
-
-static ggml_type parse_ggml_type(const char * arg) {
-    ggml_type result = GGML_TYPE_COUNT;
-    for (int j = 0; j < GGML_TYPE_COUNT; ++j) {
-        auto type = ggml_type(j);
-        const auto * name = ggml_type_name(type);
-        if (name && strcmp(arg, name) == 0) {
-            result = type; break;
-        }
-    }
-    return result;
-}
-
-int main(int argc, char ** argv) {
-    if (argc < 3) {
-        usage(argv[0]);
-    }
-
-    llama_model_quantize_params params = llama_model_quantize_default_params();
-
-    int arg_idx = 1;
-    std::string imatrix_file;
-    std::vector<std::string> included_weights, excluded_weights;
-    std::vector<llama_model_kv_override> kv_overrides;
-
-    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
-        if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
-            params.quantize_output_tensor = false;
-        } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) {
-            if (arg_idx < argc-1) {
-                params.output_tensor_type = parse_ggml_type(argv[++arg_idx]);
-            } else {
-                usage(argv[0]);
-            }
-        } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) {
-            if (arg_idx < argc-1) {
-                params.token_embedding_type = parse_ggml_type(argv[++arg_idx]);
-            } else {
-                usage(argv[0]);
-            }
-        } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
-            if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
-                usage(argv[0]);
-            }
-        } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
-            params.allow_requantize = true;
-        } else if (strcmp(argv[arg_idx], "--pure") == 0) {
-            params.pure = true;
-        } else if (strcmp(argv[arg_idx], "--imatrix") == 0) {
-            if (arg_idx < argc-1) {
-                imatrix_file = argv[++arg_idx];
-            } else {
-                usage(argv[0]);
-            }
-        } else if (strcmp(argv[arg_idx], "--include-weights") == 0) {
-            if (arg_idx < argc-1) {
-                included_weights.emplace_back(argv[++arg_idx]);
-            } else {
-                usage(argv[0]);
-            }
-        } else if (strcmp(argv[arg_idx], "--exclude-weights") == 0) {
-            if (arg_idx < argc-1) {
-                excluded_weights.emplace_back(argv[++arg_idx]);
-            } else {
-                usage(argv[0]);
-            }
-        } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
-            params.keep_split = true;
-        } else {
-            usage(argv[0]);
-        }
-    }
-
-    if (argc - arg_idx < 2) {
-        printf("%s: bad arguments\n", argv[0]);
-        usage(argv[0]);
-    }
-    if (!included_weights.empty() && !excluded_weights.empty()) {
-        usage(argv[0]);
-    }
-
-    std::string imatrix_dataset;
-    std::unordered_map<std::string, std::vector<float>> imatrix_data;
-    int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
-    if (!imatrix_data.empty()) {
-        params.imatrix = &imatrix_data;
-        {
-            llama_model_kv_override kvo;
-            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
-            strncpy(kvo.val_str, imatrix_file.c_str(), 127);
-            kvo.val_str[127] = '\0';
-            kv_overrides.emplace_back(std::move(kvo));
-        }
-        if (!imatrix_dataset.empty()) {
-            llama_model_kv_override kvo;
-            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
-            strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
-            kvo.val_str[127] = '\0';
-            kv_overrides.emplace_back(std::move(kvo));
-        }
-
-        {
-            llama_model_kv_override kvo;
-            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.val_i64 = imatrix_data.size();
-            kv_overrides.emplace_back(std::move(kvo));
-        }
-
-        if (m_last_call > 0) {
-            llama_model_kv_override kvo;
-            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.val_i64 = m_last_call;
-            kv_overrides.emplace_back(std::move(kvo));
-        }
-    }
-    if (!kv_overrides.empty()) {
-        kv_overrides.emplace_back();
-        kv_overrides.back().key[0] = 0;
-        params.kv_overrides = &kv_overrides;
-    }
-
-    llama_backend_init();
-
-    // parse command line arguments
-    const std::string fname_inp = argv[arg_idx];
-    arg_idx++;
-    std::string fname_out;
-
-    std::string ftype_str;
-    std::string suffix = ".gguf";
-    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
-        std::string fpath;
-        const size_t pos = fname_inp.find_last_of("/\\");
-        if (pos != std::string::npos) {
-            fpath = fname_inp.substr(0, pos + 1);
-        }
-
-        // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
-        fname_out = fpath + "ggml-model-" + ftype_str;
-        if (!params.keep_split) {
-            fname_out += suffix;
-        }
-        arg_idx++;
-        if (ftype_str == "COPY") {
-            params.only_copy = true;
-        }
-    } else {
-        fname_out = argv[arg_idx];
-        if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
-            fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
-        }
-        arg_idx++;
-
-        if (argc <= arg_idx) {
-            fprintf(stderr, "%s: missing ftype\n", __func__);
-            return 1;
-        }
-        if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
-            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
-            return 1;
-        }
-        if (ftype_str == "COPY") {
-            params.only_copy = true;
-        }
-        arg_idx++;
-    }
-
-    // parse nthreads
-    if (argc > arg_idx) {
-        try {
-            params.nthread = std::stoi(argv[arg_idx]);
-        }
-        catch (const std::exception & e) {
-            fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
-            return 1;
-        }
-    }
-
-    if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
-         params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
-        fprintf(stderr, "\n==========================================================================================================\n");
-        fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
-        fprintf(stderr, "==========================================================================================================\n\n\n");
-        return 1;
-    }
-
-    print_build_info();
-
-    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
-    if (params.nthread > 0) {
-        fprintf(stderr, " using %d threads", params.nthread);
-    }
-    fprintf(stderr, "\n");
-
-    const int64_t t_main_start_us = llama_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = llama_time_us();
-
-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), &params)) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = llama_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = llama_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
-        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
-    }
-
-    llama_backend_free();
-
-    return 0;
-}

From ef83a87cfe8eaa6f9e274d418298fc5be334dfe0 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date:
Sun, 11 Aug 2024 01:30:18 +0200 Subject: [PATCH 20/59] Revert of ffn gate and up on IQ3_M and indent --- src/llama.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 27ebbd05c..eda547369 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15674,7 +15674,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { + ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) @@ -15725,7 +15725,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS; else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ4_XS; else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_gate; } @@ -15742,7 +15741,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS; else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ4_XS; else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_up; } From 1268d58ca87fc1bdaa0c6ab9d8f473a6b31d7ac1 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sun, 11 Aug 2024 02:13:08 +0200 Subject: [PATCH 21/59] More adjustments --- src/llama.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index eda547369..7bfde358c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15599,8 +15599,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_Q5_K; } } else if (name.find("attn_q.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S; + if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { @@ -15715,9 +15715,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (name.find("ffn_gate") != 
std::string::npos) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); int i_layer = info.first, n_layer = info.second; - if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K) && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q3_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q4_K; + if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS; @@ -15731,9 +15729,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; - if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K) && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q3_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q4_K; + if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS; @@ -16212,7 +16208,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4; } - LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); + LLAMA_LOG_INFO("converts to %s .. 
", ggml_type_name(new_type)); fflush(stdout); if (work.size() < (size_t)nelements * 4) { From 91db53b64555753f6445eef675cfdbf93370322b Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sun, 11 Aug 2024 16:41:23 +0200 Subject: [PATCH 22/59] IQ1_XL and some corrections notably on attn_q and parenthesis --- examples/quantize/quantize.cpp | 7 ++- include/llama.h | 1 + src/llama.cpp | 101 +++++++++++++++++++-------------- 3 files changed, 64 insertions(+), 45 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index db04bf64e..9e2e40071 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -28,13 +28,14 @@ static const std::vector QUANT_OPTIONS = { { "IQ1_XS", LLAMA_FTYPE_MOSTLY_IQ1_XS, " 1.6-1.7 bpw quantization mix", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, + { "IQ1_XL", LLAMA_FTYPE_MOSTLY_IQ1_XL, " 1.90 bpw quantization", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, - { "Q2_K_L", LLAMA_FTYPE_MOSTLY_Q2_K_L, " 2.96G, +3.1836 ppl @ Llama-3-8B", }, + { "Q2_K_L", LLAMA_FTYPE_MOSTLY_Q2_K_L, " 3.20G, +3.1836 ppl @ Llama-3-8B", }, { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", }, { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, - { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, - { "IQ3_XL", LLAMA_FTYPE_MOSTLY_IQ3_XL, " 3.85 bpw quantization mix", }, + { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.70 bpw quantization mix", }, + { "IQ3_XL", LLAMA_FTYPE_MOSTLY_IQ3_XL, " 3.95 bpw quantization mix", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", }, diff --git a/include/llama.h b/include/llama.h index c9d7cb8a1..4f5f2fc19 100644 --- a/include/llama.h +++ b/include/llama.h @@ -170,6 +170,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ3_XL = 37, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q2_K_L = 38, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_XS = 39, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ1_XL = 40, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama.cpp b/src/llama.cpp index 7bfde358c..e0f251a0f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4483,11 +4483,12 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ1_XS: return "IQ1_S mix - 1.6-1.7 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_XL: return "IQ1_XL - 1.90 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ3_XL: return "IQ3_S mix - 3.96 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.70 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_XL: return "IQ3_S mix - 3.95 bpw"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; @@ -15475,7 +15476,8 @@ static ggml_type 
llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = GGML_TYPE_Q8_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS ||ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS ||ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || @@ -15491,7 +15493,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { new_type = qs.params->token_embedding_type; } else { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS ||ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS ||ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { new_type = GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_Q2_K; @@ -15508,7 +15511,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q6_K; + ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + new_type = GGML_TYPE_Q6_K; + } else new_type = GGML_TYPE_Q8_0; } else if (qs.model.hparams.n_gqa() >= 7) { @@ -15519,10 +15524,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S || new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS ||ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { @@ -15534,7 +15539,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? 
GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_Q4_K; @@ -15559,21 +15564,24 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // TODO: explore better strategies if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q6_K; + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + new_type = GGML_TYPE_Q6_K; + } else new_type = GGML_TYPE_Q8_0; } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_IQ1_M; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_IQ2_XXS; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) new_type = GGML_TYPE_IQ2_XXS; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_IQ2_S; } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) && @@ -15603,24 +15611,28 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q3_K; } } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer =
info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ2_S; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ3_XXS; } @@ -15636,7 +15648,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n (qs.model.hparams.n_expert >= 4 && use_more_bits(i_layer, n_layer)))) { new_type = GGML_TYPE_Q4_K; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = arch == LLM_ARCH_FALCON ? 
GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; } @@ -15674,11 +15686,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { + ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) - new_type = GGML_TYPE_IQ2_XXS; + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; } @@ -15701,7 +15713,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { new_type = GGML_TYPE_IQ2_XS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + new_type = GGML_TYPE_IQ2_S; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_S; @@ -15715,29 +15729,31 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (name.find("ffn_gate") != std::string::npos) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); int i_layer = info.first, n_layer = info.second; - if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (i_layer < n_layer/8)) new_type =
GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_gate; } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; - if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_up; } @@ -15881,6 +15897,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ1_XS: default_type = GGML_TYPE_IQ1_S; break; case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break; case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break; + case LLAMA_FTYPE_MOSTLY_IQ1_XL: default_type = GGML_TYPE_IQ1_M; break; case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break; case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; From 1ad18f80e98f68c9c056423cf43838f5fb779d69 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sun, 11 Aug 2024 
21:44:29 +0200 Subject: [PATCH 23/59] Adjustments on attn_k --- src/llama.cpp | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index f9982d7d4..ec9f393c3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15531,7 +15531,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { - new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_XXS; + new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; @@ -15573,23 +15573,21 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else new_type = GGML_TYPE_Q8_0; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && - (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_IQ1_M; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_XS; + else new_type = GGML_TYPE_IQ2_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) new_type = GGML_TYPE_IQ2_XXS; + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_S; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XS; + else new_type = GGML_TYPE_IQ2_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_S; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) && - (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS; + else new_type = GGML_TYPE_IQ2_S; } else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { From df9e6fda50a64c260ab7aa4825e0f309ba65af34 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sun, 11 Aug 2024 21:49:23 +0200 Subject: [PATCH 24/59] Adjustments on output and embeddings --- src/llama.cpp | 11 +++++------ 1 file changed, 5 
insertions(+), 6 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index ec9f393c3..b76b2edfe 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15479,11 +15479,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = GGML_TYPE_Q8_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS ||ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { new_type = GGML_TYPE_Q5_K; @@ -15496,11 +15496,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { new_type = qs.params->token_embedding_type; } else { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS ||ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { new_type = GGML_TYPE_IQ2_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_Q2_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { new_type = GGML_TYPE_Q4_0; } From cd92ba612f3b7d525ccada9e3071e11f477618f6 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Mon, 12 Aug 2024 19:45:46 +0200 Subject: [PATCH 25/59] IQ4_XSR (test FTYPE) and attention_wv logic for all attn_*.weights Also, advise the use of an imatrix for the IQ2_M and Q2_K ftypes. --- examples/quantize/quantize.cpp | 11 +++--- include/llama.h | 1 + src/llama.cpp | 61 ++++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 9e2e40071..99930e892 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -43,6 +43,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = { { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", }, { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", }, { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", }, + { "IQ4_XSR", LLAMA_FTYPE_MOSTLY_IQ4_XSR, " 4.xx bpw non-linear quantization", }, { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", }, { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", }, @@ -409,13 +410,13 @@ int main(int argc, char **
argv) { } if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || - params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || + params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || + params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) { - fprintf(stderr, "\n==========================================================================================================\n"); - fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); - fprintf(stderr, "==========================================================================================================\n\n\n"); + fprintf(stderr, "\n==========================================================================================\n"); + fprintf(stderr, "Please do not use IQ1_*, IQ2_*, Q2_K_S, or Q2_K quantization without an importance matrix!\n"); + fprintf(stderr, "==========================================================================================\n\n\n"); return 1; } diff --git a/include/llama.h b/include/llama.h index 4f5f2fc19..01f859634 100644 --- a/include/llama.h +++ b/include/llama.h @@ -171,6 +171,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q2_K_L = 38, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_XS = 39, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_XL = 40, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ4_XSR = 41, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama.cpp b/src/llama.cpp index b76b2edfe..63b3c6072 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4489,6 +4489,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.70 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_XL: return "IQ3_S mix - 3.95 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ4_XSR: return "IQ4_XS mix - 4.xx bpw"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8"; @@ -15347,10 +15348,17 @@ struct quantize_state_internal { const llama_model_quantize_params * params; int n_attention_wv = 0; + int n_attention_wk = 0; + int n_attention_wq = 0; + int n_attention_wo = 0; int n_ffn_down = 0; int n_ffn_gate = 0; int n_ffn_up = 0; + int i_attention_wv = 0; + int i_attention_wk = 0; + int i_attention_wq = 0; + int i_attention_wo = 0; int i_ffn_down = 0; int i_ffn_gate = 0; int i_ffn_up = 0; @@ -15505,6 +15513,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { new_type = GGML_TYPE_Q4_0; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q8_0; } } else if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_expert >= 4) { @@ -15556,9 +15565,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q5_K; } + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) { + new_type = GGML_TYPE_Q5_K; + } else if ((ftype == 
LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { + new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K : + use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K; + } + } ++qs.i_attention_wv; } else if (name.find("attn_k.weight") != std::string::npos) { if (qs.model.hparams.n_expert >= 4) { @@ -15606,6 +15624,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q5_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { + new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K : + use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS; + } + } + ++qs.i_attention_wk; } else if (name.find("attn_q.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS; @@ -15618,6 +15643,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q3_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { + new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS : + use_more_bits(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS; + } + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_IQ3_S; + ++qs.i_attention_wq; } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; @@ -15674,6 +15707,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { + new_type = i_layer < n_layer/8 ? GGML_TYPE_Q5_K : + use_more_bits(i_layer, n_layer) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS; + } + } ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { @@ -15682,6 +15721,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || @@ -15700,10 +15740,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { + new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_Q5_K : + use_more_bits(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS; + } + } } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; } + ++qs.i_attention_wo; } else if (name.find("attn_qkv.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { @@ -15723,8 +15770,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + ++qs.i_attention_wv; } else if (name.find("ffn_gate") != std::string::npos) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); @@ -15739,6 +15788,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q5_K; ++qs.i_ffn_gate; } else if (name.find("ffn_up") != std::string::npos) { @@ -15754,6 +15804,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == 
LLAMA_FTYPE_MOSTLY_IQ4_XSR && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q5_K; ++qs.i_ffn_up; } @@ -15900,6 +15951,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ1_XL: default_type = GGML_TYPE_IQ1_M; break; case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break; case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; + case LLAMA_FTYPE_MOSTLY_IQ4_XSR: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_XL: default_type = GGML_TYPE_IQ3_S; break; @@ -15998,6 +16050,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { ++qs.n_attention_wv; + } else if (name.find("attn_k.weight") != std::string::npos) { + ++qs.n_attention_wk; + } else if (name.find("attn_q.weight") != std::string::npos) { + ++qs.n_attention_wq; + } else if (name.find("attn_output.weight") != std::string::npos) { + ++qs.n_attention_wo; } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { qs.has_output = true; } @@ -16012,6 +16070,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // - qs.n_attention_wv == 3 * model.hparams.n_layer for Encoder-Decoder models // GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer) && "n_attention_wv is unexpected"); + GGML_ASSERT((qs.n_attention_wk == 0 || qs.n_attention_wk == (int)model.hparams.n_layer || qs.n_attention_wk == 3 * (int)model.hparams.n_layer) && "n_attention_wk is unexpected"); + GGML_ASSERT((qs.n_attention_wq == 0 || qs.n_attention_wq == (int)model.hparams.n_layer || qs.n_attention_wq == 3 * (int)model.hparams.n_layer) && "n_attention_wq is unexpected"); + GGML_ASSERT((qs.n_attention_wo == 0 || qs.n_attention_wo == (int)model.hparams.n_layer || qs.n_attention_wo == 3 * (int)model.hparams.n_layer) && "n_attention_wo is unexpected"); size_t total_size_org = 0; size_t total_size_new = 0; From 8c9017bfbe45f165cc1a712e76dd730a4ea54b2d Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Mon, 12 Aug 2024 22:20:02 +0200 Subject: [PATCH 26/59] Simplify IQ4_XSR But leave in place, as a "demo", the more complex template set by Ikawrakow to customize the per-layer quants, now extended to the added attn_q, attn_k, and attn_output tensors.
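To make that "template" concrete: for each tensor family, the IQ4_XSR branches pick a type from the layer's position, the first eighth of the layers versus the layers where use_more_bits() fires versus the rest. A standalone sketch of the attn_v case as rewritten in the hunks below (stand-in enum instead of the real ggml types, so it compiles on its own):

    #include <cstdio>

    // Stand-ins for the two ggml types the attn_v branch of IQ4_XSR chooses from.
    enum sketch_type { TYPE_Q5_K, TYPE_Q6_K };

    // use_more_bits() as defined before [PATCH 29] reworks it: first eighth,
    // last eighth, and every third layer in between get the bump.
    static bool use_more_bits(int i_layer, int n_layers) {
        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
    }

    // Shape of the kept "demo" template: the layer index alone selects the quant.
    static sketch_type pick_attn_v(int i_layer, int n_layers) {
        return i_layer < n_layers/8              ? TYPE_Q5_K
             : use_more_bits(i_layer, n_layers)  ? TYPE_Q6_K
             :                                     TYPE_Q5_K;
    }

    int main() {
        for (int i = 0; i < 32; ++i) {
            printf("layer %2d -> %s\n", i, pick_attn_v(i, 32) == TYPE_Q6_K ? "Q6_K" : "Q5_K");
        }
        return 0;
    }

The two Q5_K arms are deliberately identical here, which is why collapsing the ternary (as this patch does for several tensors) changes nothing functionally while keeping the slots available for later tuning.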
--- src/llama.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 7683abb95..5a5aee8cf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15508,7 +15508,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { new_type = GGML_TYPE_Q4_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q8_0; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_IQ4_XS; } } else if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_expert >= 4) { @@ -15568,8 +15568,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { - new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K : - use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K; + new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K : + use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } } ++qs.i_attention_wv; @@ -15622,7 +15622,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K : - use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS; + use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K; } } ++qs.i_attention_wk; @@ -15704,7 +15704,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { - new_type = i_layer < n_layer/8 ? GGML_TYPE_Q5_K : + new_type = i_layer < n_layer/8 ? GGML_TYPE_IQ4_XS : use_more_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS; } } @@ -15737,7 +15737,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { - new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_Q5_K : + new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_IQ4_XS : use_more_bits(qs.i_attention_wo, qs.n_attention_wo) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS; } } @@ -15783,7 +15783,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q5_K; ++qs.i_ffn_gate; } else if (name.find("ffn_up") != std::string::npos) { @@ -15799,7 +15798,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR && (i_layer < n_layer/8)) new_type = GGML_TYPE_Q5_K; ++qs.i_ffn_up; } From 17b71512a6f3214bb886af627fca0dc25b1f8c09 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 17 Aug 2024 00:17:41 +0200 Subject: [PATCH 27/59] Update IQ3_M attn_k and IQ3_XL token_embd --- src/llama.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 5e934dcd3..e54ae697d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15899,11 +15899,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { new_type = GGML_TYPE_Q4_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS; } } else if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_expert >= 4) { @@ -16003,9 +16004,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && qs.model.hparams.n_gqa() < 2 && qs.model.hparams.n_expert < 2) { new_type = GGML_TYPE_IQ3_XXS; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) && - (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q4_K; From 4ba561808d04dd1d122ca5af1a010099fdc95fd4 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 17 Aug 2024 12:31:36 +0200 Subject: [PATCH 28/59] Adapt token embeddings and output.weight to vocab size due to the huge increase of the 
embeddings and output weights' size for models with a huge vocab: these tensors seem to quantize with less loss. --- src/llama.cpp | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index e54ae697d..f6afc6ebf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15877,15 +15877,23 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = GGML_TYPE_Q8_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { - new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ4_XS; + else new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K; + else new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { new_type = GGML_TYPE_Q5_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K; + else new_type = GGML_TYPE_Q6_K; + } else if (new_type != GGML_TYPE_Q8_0) { new_type = GGML_TYPE_Q6_K; } @@ -15894,10 +15902,25 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { new_type = qs.params->token_embedding_type; } else { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_XS; + else new_type = GGML_TYPE_IQ2_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { new_type = GGML_TYPE_IQ2_S; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_S; + else new_type = GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS; + else new_type = GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_S; + else new_type = GGML_TYPE_IQ4_XS; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
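One sanity check is worth recording here: the patch that follows claims 25/37.5/50/75% bumps for its four layer-selection helpers. The standalone tally below (a sketch reimplementing the four lambdas as free functions, nothing taken from the patch itself beyond the predicates) confirms those fractions for a typical 32-layer model:

    #include <cstdio>

    // The four predicates from the next patch, as free functions.
    static bool use_few_bits (int i, int n) { return i <= n/8 || i > 7*n/8; }
    static bool use_some_bits(int i, int n) { return i <= n/8 || i > 7*n/8 || (i >= 2*n/8 && i < 3*n/8); }
    static bool use_more_bits(int i, int n) { return i <= n/8 || i > 6*n/8 || (i >= 2*n/8 && i < 3*n/8); }
    static bool use_many_bits(int i, int n) { return i <= n/8 || i > 5*n/8 || (i >= 2*n/8 && i < 4*n/8); }

    int main() {
        const int n = 32;
        int few = 0, some = 0, more = 0, many = 0;
        for (int i = 0; i < n; ++i) {
            few  += use_few_bits (i, n);  // bool promotes to 0/1
            some += use_some_bits(i, n);
            more += use_more_bits(i, n);
            many += use_many_bits(i, n);
        }
        printf("few %d/32, some %d/32, more %d/32, many %d/32\n", few, some, more, many);
        // prints: few 8/32, some 12/32, more 16/32, many 24/32, i.e. 25/37.5/50/75%
        return 0;
    }

From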
b02eaf68031f21a9b9897a7f0713efdc28fc5bde Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 17 Aug 2024 14:58:25 +0200 Subject: [PATCH 29/59] Mass use of the few/some/more/many bits bump logic Add the few-bits logic, and rework the four settings so that, when used, they bump 25/37.5/50/75% of the layers to the upper quant. --- src/llama.cpp | 78 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index f6afc6ebf..8e3745ef3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15847,9 +15847,23 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n const llm_arch arch = qs.model.arch; const auto tn = LLM_TN(arch); - auto use_more_bits = [](int i_layer, int n_layers) -> bool { - return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2; + auto use_few_bits = [](int i_layer, int n_layers) -> bool { + return i_layer <= n_layers/8 || i_layer > 7*n_layers/8; }; + // few_bits has a broad 25% bump to the upper quant. + auto use_some_bits = [](int i_layer, int n_layers) -> bool { + return i_layer <= n_layers/8 || i_layer > 7*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8); + }; + // return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2; + // The intervals of 3 are replaced by a broad bump in the central layers. some_bits has a broad 37.5% bump to the upper quant. + auto use_more_bits = [](int i_layer, int n_layers) -> bool { + return i_layer <= n_layers/8 || i_layer > 6*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8); + }; + // more_bits has a broad 50% bump to the upper quant. + auto use_many_bits = [](int i_layer, int n_layers) -> bool { + return i_layer <= n_layers/8 || i_layer > 5*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 4*n_layers/8); + }; + // many_bits has a broad 75% bump to the upper quant. const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { if (n_expert > 1) { @@ -15917,10 +15931,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS; else new_type = GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { - if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_S; - else new_type = GGML_TYPE_IQ4_XS; - } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; @@ -15969,7 +15980,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { - new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K; + else new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = qs.i_attention_wv < 2 ?
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; @@ -15988,7 +16000,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K : - use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; + use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K; } } ++qs.i_attention_wv; @@ -16027,9 +16039,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && qs.model.hparams.n_gqa() < 2 && qs.model.hparams.n_expert < 2) { new_type = GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { - if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K; - else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q4_K; @@ -16059,8 +16077,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { - new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS : - use_more_bits(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS; + new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ3_S : + use_more_bits(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_S; } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_IQ3_S; @@ -16091,11 +16109,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 || + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (use_some_bits(i_layer, n_layer) || (qs.model.hparams.n_expert >= 4 && use_more_bits(i_layer, n_layer)))) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + new_type = use_many_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = arch == LLM_ARCH_FALCON ? 
GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; } @@ -16193,30 +16213,34 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_gate; } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (use_some_bits(i_layer, n_layer))) new_type = 
GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_up; } From ddb13732c444e3e1a44e7308bf5f39f2591d1e44 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sun, 18 Aug 2024 16:56:55 +0200 Subject: [PATCH 30/59] IQ3_XXL and IQ3_XXXL We now have a full range of quants between IQ3_M and IQ4_XS --- examples/quantize/quantize.cpp | 4 ++- gguf-py/gguf/constants.py | 8 ++++++ include/llama.h | 16 +++++++----- src/llama.cpp | 47 ++++++++++++++++++++++++++-------- 4 files changed, 58 insertions(+), 17 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 99930e892..fa7bc7685 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -35,7 +35,9 @@ static const std::vector QUANT_OPTIONS = { { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", }, { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.70 bpw quantization mix", }, - { "IQ3_XL", LLAMA_FTYPE_MOSTLY_IQ3_XL, " 3.95 bpw quantization mix", }, + { "IQ3_XL", LLAMA_FTYPE_MOSTLY_IQ3_XL, " 3.80 bpw quantization mix", }, + { "IQ3_XXL", LLAMA_FTYPE_MOSTLY_IQ3_XXL, " 3.95 bpw quantization mix", }, + { "IQ3_XXXL", LLAMA_FTYPE_MOSTLY_IQ3_XXXL, " 4.10 bpw quantization mix", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", }, diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 5541972ce..63e7e2d3b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1249,6 +1249,14 @@ class LlamaFileType(IntEnum): MOSTLY_Q4_0_4_4 = 33 # except 1d tensors MOSTLY_Q4_0_4_8 = 34 # except 1d tensors MOSTLY_Q4_0_8_8 = 35 # except 1d tensors + MOSTLY_IQ2_XL = 38 # except 1d tensors + MOSTLY_IQ3_XL = 39 # except 1d tensors + MOSTLY_Q2_K_L = 40 # except 1d tensors + MOSTLY_IQ1_XS = 41 # except 1d tensors + MOSTLY_IQ1_XL = 42 # except 1d tensors + MOSTLY_IQ4_XSR = 43 # except 1d tensors + MOSTLY_IQ3_XXL = 44 # except 1d tensors + MOSTLY_IQ3_XXXL = 45 # except 1d tensors GUESSED = 1024 # not specified in the model file diff --git a/include/llama.h b/include/llama.h index b426dc0c5..aaee535e6 100644 --- a/include/llama.h +++ b/include/llama.h @@ -166,12 +166,16 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors - LLAMA_FTYPE_MOSTLY_IQ2_XL = 36, // except 1d tensors - LLAMA_FTYPE_MOSTLY_IQ3_XL = 37, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q2_K_L = 38, // except 1d tensors - LLAMA_FTYPE_MOSTLY_IQ1_XS = 39, // except 1d tensors - 
LLAMA_FTYPE_MOSTLY_IQ1_XL = 40, // except 1d tensors - LLAMA_FTYPE_MOSTLY_IQ4_XSR = 41, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ2_XL = 38, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ3_XL = 39, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_K_L = 40, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ1_XS = 41, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ1_XL = 42, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ4_XSR = 43, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ3_XXL = 44, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ3_XXXL = 45, // except 1d tensors + LLAMA_FTYPE_CQS = 99, // except 1d tensors + LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama.cpp b/src/llama.cpp index 8e3745ef3..7c5252d13 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4524,7 +4524,9 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.70 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ3_XL: return "IQ3_S mix - 3.95 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_XL: return "IQ3_S mix - 3.80 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_XXL: return "IQ3_S mix - 3.95 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: return "IQ3_S mix - 4.10 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_XSR: return "IQ4_XS mix - 4.xx bpw"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; @@ -15931,14 +15933,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS; else new_type = GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { + new_type = GGML_TYPE_IQ4_XS; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { new_type = GGML_TYPE_Q4_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS; } } else if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_expert >= 4) { @@ -15979,7 +15982,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K; else new_type = GGML_TYPE_IQ4_XS; } @@ -16045,6 +16049,16 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? 
GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; else new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; @@ -16114,6 +16128,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + new_type = use_some_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { + new_type = use_more_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { new_type = use_many_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { @@ -16153,6 +16173,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.model.hparams.n_expert >= 4) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || @@ -16173,7 +16194,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? 
GGML_TYPE_IQ4_XS : @@ -16202,9 +16224,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; ++qs.i_attention_wv; @@ -16223,7 +16245,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_gate; } else if (name.find("ffn_up") != std::string::npos) { @@ -16239,8 +16263,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_up; } @@ -16391,6 +16416,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_XL: default_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_IQ3_XXL: default_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break; case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; 
break;
        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;

From 503048a19771d0325cb6e6d7f358fe7c799233f2 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Sun, 18 Aug 2024 17:44:11 +0200
Subject: [PATCH 31/59] Correct IQ3_M

---
 src/llama.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 7c5252d13..0a51d34a9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16045,8 +16045,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
-            else new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+                new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16123,9 +16123,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                 : GGML_TYPE_Q3_K;
             }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (use_some_bits(i_layer, n_layer) ||
-                (qs.model.hparams.n_expert >= 4 && use_more_bits(i_layer, n_layer)))) {
-                new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+                new_type = use_few_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
                 new_type = use_some_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;

From caeb839ae3c9ee8f1ee8b895519ad9140b0eed68 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Sun, 18 Aug 2024 17:58:17 +0200
Subject: [PATCH 32/59] Boost embeddings and output weights for MOEs.

These tensors are single and non-repeating, so the boost is reasonable
compared to the total size of a model with 4 or more experts.
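As a rough sketch of the rule this patch applies (simplified for
illustration; moe_output_type is a hypothetical helper, not a function that
exists in llama.cpp):

    #include "ggml.h" // ggml_type, GGML_TYPE_Q6_K

    // output.weight and token_embd.weight occur once per model rather than
    // once per expert, so on a 4+ expert MoE a higher-bit quant of these two
    // tensors costs little relative to total model size.
    static ggml_type moe_output_type(int n_expert, ggml_type dense_choice) {
        return n_expert >= 4 ? GGML_TYPE_Q6_K : dense_choice;
    }

The diff below applies the same idea per-ftype rather than through a single
helper.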
--- src/llama.cpp | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 0a51d34a9..07f4e9c2b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15894,17 +15894,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_Q8_0; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ4_XS; + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; + else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ4_XS; else new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { - if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K; + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; + else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K; else new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { - new_type = GGML_TYPE_Q5_K; + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; + else new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K; @@ -15918,11 +15921,26 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { new_type = qs.params->token_embedding_type; } else { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_expert >= 4) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { + new_type = GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + new_type = GGML_TYPE_IQ4_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + new_type = GGML_TYPE_Q4_K; + } + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_XS; else new_type = GGML_TYPE_IQ2_S; } - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { new_type = GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { @@ -16047,7 
+16065,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; else new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - } + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; From a7f91643bb7aece32fd77e96fc7657b3721ee214 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Mon, 19 Aug 2024 16:02:00 +0200 Subject: [PATCH 33/59] Fix mistake --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 07f4e9c2b..0ad44a3f5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16202,7 +16202,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || LLAMA_FTYPE_MOSTLY_IQ1_XL) new_type = GGML_TYPE_IQ2_XXS; + ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; } From 207ffe681ff5914b18df73f2ce04cc7c87694465 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sun, 18 Aug 2024 23:28:13 +0200 Subject: [PATCH 34/59] Reorder, corrections, settling lower IQ3 quants --- src/llama.cpp | 176 ++++++++++++++++++++++++++++---------------------- 1 file changed, 100 insertions(+), 76 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 0ad44a3f5..8ede3c461 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15981,6 +15981,16 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S || new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && + use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; } @@ -15990,35 +16000,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { - new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_Q4_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K; else new_type = GGML_TYPE_IQ4_XS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q5_K; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) { - new_type = GGML_TYPE_Q5_K; - } - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? 
GGML_TYPE_Q5_K : @@ -16038,6 +16031,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else new_type = GGML_TYPE_Q8_0; } + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && + (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + new_type = GGML_TYPE_Q3_K; + } + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) && + (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + new_type = GGML_TYPE_Q4_K; + } + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || + ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { + new_type = GGML_TYPE_Q5_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_XS; else new_type = GGML_TYPE_IQ2_XXS; @@ -16054,12 +16059,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS; else new_type = GGML_TYPE_IQ2_S; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && - (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && qs.model.hparams.n_gqa() < 2 && qs.model.hparams.n_expert < 2) { - new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { + new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) @@ -16081,13 +16092,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; else new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_Q4_K; - } - else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || - ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_Q5_K; - } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? 
GGML_TYPE_Q5_K : @@ -16096,16 +16100,24 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } ++qs.i_attention_wk; } else if (name.find("attn_q.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { + ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) new_type = GGML_TYPE_IQ2_XXS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { - if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { @@ -16113,13 +16125,39 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n use_more_bits(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_S; } } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) new_type = GGML_TYPE_IQ3_S; ++qs.i_attention_wq; } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + new_type = i_layer < n_layer/8 ? GGML_TYPE_Q5_K + : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K + : GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { + if (arch == LLM_ARCH_FALCON) { + new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K : + use_more_bits(i_layer, n_layer) ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } else { + if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; + } + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { + new_type = GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) + && qs.has_imatrix && i_layer < n_layer/8) { + // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0. + // We only do it when an imatrix is provided because a) we want to make sure that one can always get the + // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. + new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS; @@ -16133,13 +16171,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) { - new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + new_type = use_few_bits(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = i_layer < n_layer/8 ? GGML_TYPE_Q5_K - : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K - : GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + new_type = use_more_bits(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { + new_type = use_few_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { new_type = use_few_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; @@ -16153,31 +16192,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { new_type = use_many_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { - new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { - if (arch == LLM_ARCH_FALCON) { - new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K : - use_more_bits(i_layer, n_layer) ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; - } else { - if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; - } - } else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) { new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { - new_type = GGML_TYPE_Q5_K; - } - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) - && qs.has_imatrix && i_layer < n_layer/8) { - // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0. - // We only do it when an imatrix is provided because a) we want to make sure that one can always get the - // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. - new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; - } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = i_layer < n_layer/8 ? GGML_TYPE_IQ4_XS : @@ -16198,7 +16215,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { + ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || @@ -16208,9 +16225,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + new_type = GGML_TYPE_IQ3_S; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { @@ -16226,10 +16245,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ++qs.i_attention_wo; } else if (name.find("attn_qkv.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + new_type = GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; + else if (ftype == 
LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { new_type = GGML_TYPE_IQ2_XS; } @@ -16240,12 +16263,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; ++qs.i_attention_wv; } else if (name.find("ffn_gate") != std::string::npos) { @@ -16261,6 +16282,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; @@ -16280,6 +16302,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; From fddff029151c82fdbef4b5dbe4b71314efab3b27 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Mon, 19 Aug 2024 01:43:31 +0200 Subject: [PATCH 35/59] Rework IQ3_XXS and IQ3_XS and fix parenthesis mistake on IQ3_S --- src/llama.cpp | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 
8ede3c461..7dad7b43b 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16000,11 +16000,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { - new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + else new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K; else new_type = GGML_TYPE_IQ4_XS; } @@ -16061,15 +16063,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { @@ -16172,10 +16174,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = use_few_bits(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = use_few_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { - new_type = use_more_bits(i_layer, n_layer) ? 
GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = use_many_bits(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { new_type = use_few_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; @@ -16221,13 +16223,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { new_type = GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || @@ -16282,8 +16284,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; @@ -16302,8 +16303,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && 
(use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;

From ce860197704436c4ed688ab6dc08cfc1a6d57dc9 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Wed, 21 Aug 2024 12:25:38 +0200
Subject: [PATCH 36/59] Change the use_*_bits functions into difquant_*_tensors

This clarifies what they do, especially with the 5 additional levels of
difquant.
---
 src/llama.cpp | 159 +++++++++++++++++++++++++++-----------------------
 1 file changed, 86 insertions(+), 73 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 34cf84ba5..ab7e39cbd 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15866,23 +15866,33 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     const llm_arch arch = qs.model.arch;
     const auto tn = LLM_TN(arch);

-    auto use_few_bits = [](int i_layer, int n_layers) -> bool {
+    // difquant_init_tensors has a broad 12.5% bump to the upper quant.
+    auto difquant_init_tensors = [](int i_layer, int n_layers) -> bool {
+        return i_layer <= n_layers/8;
+    };
+    // difquant_init_end_tensors has a broad 25% bump to the upper quant.
+    auto difquant_init_end_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= n_layers/8 || i_layer > 7*n_layers/8;
     };
-    //few_bits has a broad 25% bump to the upper quant.
-    auto use_some_bits = [](int i_layer, int n_layers) -> bool {
+    // difquant_three_eights_tensors has a broad 37.5% bump to the upper quant.
+    auto difquant_three_eights_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= n_layers/8 || i_layer > 7*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
     };
+    // original formula use_more_bits :
     // return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
-    // The intervals of 3 are replaced by a broad bump in the central layers. some_bits has a broad 37.5% bump to the upper quant.
-    auto use_more_bits = [](int i_layer, int n_layers) -> bool {
+    // The intervals of 3 are replaced by a broad bump in the central layers.
+    // difquant_half_tensors replaces it and keeps the broad 50% bump to the upper quant.
+    auto difquant_half_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= n_layers/8 || i_layer > 6*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
     };
-    //more_bits has a broad 50% bump to the upper quant.
-    auto use_many_bits = [](int i_layer, int n_layers) -> bool {
+    // difquant_five_eights_tensors has a broad 62.5% bump to the upper quant.
+    auto difquant_five_eights_tensors = [](int i_layer, int n_layers) -> bool {
+        return i_layer <= n_layers/8 || i_layer > 5*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
+    };
+    // difquant_six_eights_tensors has a broad 75% bump to the upper quant.
+    auto difquant_six_eights_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= n_layers/8 || i_layer > 5*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 4*n_layers/8);
     };
-    //many_bits has a broad 75% bump to the upper quant.
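// Illustration (not part of the original patch): a worked example of the
// bands these lambdas select, assuming a hypothetical model with
// n_layers = 32, so n_layers/8 == 4:
//   difquant_init_tensors         -> layers 0-4                (5/32)
//   difquant_init_end_tensors     -> layers 0-4, 29-31         (8/32  = 25%)
//   difquant_three_eights_tensors -> layers 0-4, 8-11, 29-31   (12/32 = 37.5%)
//   difquant_half_tensors         -> layers 0-4, 8-11, 25-31   (16/32 = 50%)
//   difquant_five_eights_tensors  -> layers 0-4, 8-11, 21-31   (20/32 = 62.5%)
//   difquant_six_eights_tensors   -> layers 0-4, 8-15, 21-31   (24/32 = 75%)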
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert); auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) { if (n_expert > 1) { @@ -16006,7 +16016,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; + difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; @@ -16019,8 +16029,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; - else new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_init_end_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + else new_type = difquant_init_end_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { @@ -16034,7 +16044,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K : - use_more_bits(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K; + difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K; } } ++qs.i_attention_wv; @@ -16080,41 +16090,41 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; + new_type = difquant_init_end_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = difquant_init_end_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; + new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - else new_type = use_few_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_init_end_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = difquant_init_end_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - else new_type = use_some_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - else new_type = use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - else new_type = use_many_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K : - use_more_bits(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K; + difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K; } } ++qs.i_attention_wk; @@ -16141,18 +16151,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ3_S : - use_more_bits(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_S; + difquant_half_tensors(qs.i_attention_wq, qs.n_attention_wq) ? 
GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_S; } } ++qs.i_attention_wq; } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = i_layer < n_layer/8 ? GGML_TYPE_Q5_K - : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K + : arch != LLM_ARCH_FALCON || difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { @@ -16161,15 +16171,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { if (arch == LLM_ARCH_FALCON) { new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K : - use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else { - if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; + if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; } } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) && qs.has_imatrix && i_layer < n_layer/8) { // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0. @@ -16179,37 +16189,40 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS; + if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ2_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { - if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + new_type = difquant_init_end_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + new_type = difquant_half_tensors(i_layer, n_layer) ? 
GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = use_few_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; + new_type = difquant_init_end_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { - new_type = use_many_bits(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = difquant_five_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { - new_type = use_few_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_init_end_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { - new_type = use_few_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_init_end_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { - new_type = use_some_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { - new_type = use_more_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { - new_type = use_many_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_six_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) { new_type = GGML_TYPE_Q5_K; @@ -16217,7 +16230,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = i_layer < n_layer/8 ? GGML_TYPE_IQ4_XS : - use_more_bits(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS; + difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS; } } ++qs.i_ffn_down; @@ -16254,7 +16267,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_IQ4_XS : - use_more_bits(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS; + difquant_half_tensors(qs.i_attention_wo, qs.n_attention_wo) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS; } } } @@ -16291,39 +16304,39 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (name.find("ffn_gate") != std::string::npos) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (difquant_five_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == 
LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_gate; } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (use_few_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (use_some_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (use_more_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (use_many_bits(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (difquant_five_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && 
(difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_up; } From dbadcdd5cfd30f77a4dd2ee6c89a663b3fb49a19 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Tue, 20 Aug 2024 11:59:41 +0200 Subject: [PATCH 37/59] harmonize formatting of tensor type conditions --- src/llama.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index ab7e39cbd..407108b3a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16275,8 +16275,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; } ++qs.i_attention_wo; - } - else if (name.find("attn_qkv.weight") != std::string::npos) { + } else if (name.find("attn_qkv.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { new_type = GGML_TYPE_Q3_K; } @@ -16300,8 +16299,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_Q5_K; ++qs.i_attention_wv; - } - else if (name.find("ffn_gate") != std::string::npos) { + } else if (name.find("ffn_gate") != std::string::npos) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; @@ -16319,8 +16317,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_gate; - } - else if (name.find("ffn_up") != std::string::npos) { + } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; From d7b9d214fbb4ae9e49ece4a28d553ff64d2b9056 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Tue, 20 Aug 2024 12:45:30 +0200 Subject: [PATCH 38/59] Shrink a bit IQ3_XXS, bump a bit IQ3_M --- src/llama.cpp | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 407108b3a..dd73dfb49 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16029,8 +16029,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = difquant_init_end_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; - else new_type = difquant_init_end_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_init_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + else new_type = difquant_init_tensors(qs.i_attention_wk, qs.n_attention_wk) ? 
GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { @@ -16078,20 +16078,37 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_S; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XXS; + else new_type = GGML_TYPE_IQ1_M; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS; else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XS; else new_type = GGML_TYPE_IQ2_XXS; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XS; + else new_type = GGML_TYPE_IQ2_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_S; + else new_type = GGML_TYPE_IQ2_XS; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_S; + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; + else new_type = GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS; + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_XXS; else new_type = GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = difquant_init_end_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = difquant_init_end_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) + new_type = difquant_init_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = difquant_init_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) @@ -16204,7 +16221,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = difquant_init_end_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; + new_type = difquant_init_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { new_type = difquant_five_eights_tensors(i_layer, n_layer) ? 
GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; @@ -16311,8 +16328,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_init_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (difquant_five_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; @@ -16329,8 +16348,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_init_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (difquant_five_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; From 32f6ead0d91ec3bf6760fb2c98ce6f459093d89a Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Mon, 19 Aug 2024 17:58:12 +0200 Subject: [PATCH 39/59] Improve IQ1 and IQ2 quants And fix mistakes for the attn.output of IQ2_XL and the ffn gate and up of IQ2_XS Reformat attn_output mess and split GQA4/GQA2 --- src/llama.cpp | 100 ++++++++++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 43 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index dd73dfb49..1e90efb4f 100644 --- a/src/llama.cpp +++ b/src/llama.cpp 
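[Editor's note] The "split GQA4/GQA2" in the message above can be read as a three-tier ladder for attn_output.weight. The following condensed sketch is an illustration distilled from the attn_output.weight hunk later in this patch, not code from the tree; the enum is a mock stand-in for the real llama.cpp/ggml identifiers and covers only a subset of the ftypes:

    #include <cstdio>

    // Mock subset of the low-bit ftypes handled by the hunk.
    enum Ftype { IQ1_S, IQ1_XL, IQ2_XXS, IQ2_XS, IQ2_S, IQ2_M, IQ2_XL };

    // Distilled tiering: the stronger the GQA factor (or an MoE), the more
    // bits the attention output projection receives.
    const char * attn_output_type(Ftype ft, int n_gqa, int n_expert) {
        if (n_gqa >= 4 || n_expert >= 2) {
            if (ft == IQ1_S)                   return "IQ2_XS";
            if (ft == IQ1_XL || ft == IQ2_XXS ||
                ft == IQ2_XS || ft == IQ2_S)   return "IQ2_S";
            return "IQ3_XXS";                  // IQ2_M, IQ2_XL
        }
        if (n_gqa >= 2) {
            if (ft == IQ1_S)                   return "IQ2_XXS";
            if (ft == IQ1_XL || ft == IQ2_XXS) return "IQ2_XS";
            if (ft == IQ2_XS || ft == IQ2_S)   return "IQ2_S";
            return "IQ3_XXS";                  // IQ2_M, IQ2_XL
        }
        return "(base ladder)"; // MHA models fall through to the generic rules
    }

    int main() {
        printf("IQ2_XXS on a GQA-4 model: %s\n", attn_output_type(IQ2_XXS, 4, 1)); // IQ2_S
        printf("IQ2_XXS on a GQA-2 model: %s\n", attn_output_type(IQ2_XXS, 2, 1)); // IQ2_XS
        printf("IQ2_M   on a GQA-2 model: %s\n", attn_output_type(IQ2_M,   2, 1)); // IQ3_XXS
        return 0;
    }

Read this way the gradation is explicit: for the same ftype, the output projection gains roughly one sub-tier per GQA tier.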
@@ -15925,19 +15925,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ4_XS; else new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || + ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K; else new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || - ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { - if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; + else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K; else new_type = GGML_TYPE_Q6_K; } else if (new_type != GGML_TYPE_Q8_0) { @@ -15970,10 +15971,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { new_type = GGML_TYPE_IQ2_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_S; else new_type = GGML_TYPE_IQ3_XXS; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + new_type = GGML_TYPE_IQ3_XXS; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS; else new_type = GGML_TYPE_IQ3_S; @@ -16018,10 +16022,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { @@ -16035,7 +16037,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K; - else new_type = GGML_TYPE_IQ4_XS; + else new_type = GGML_TYPE_Q4_K; } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { @@ -16072,11 +16074,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_XS; - else new_type = GGML_TYPE_IQ2_XXS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_S; else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XXS; else new_type = GGML_TYPE_IQ1_M; @@ -16204,21 +16202,24 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? 
GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (i_layer < n_layer/8)) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { + if (difquant_three_eights_tensors(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (difquant_three_eights_tensors(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { - if (i_layer < n_layer/8) new_type = GGML_TYPE_IQ2_S; + if (difquant_three_eights_tensors(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { new_type = difquant_init_end_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { - new_type = difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + new_type = difquant_six_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = difquant_init_tensors(i_layer, n_layer) ? 
GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; @@ -16254,24 +16255,37 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert >= 4) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q5_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { - if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; - else { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; + ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) { + new_type = GGML_TYPE_Q5_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q5_K; + else new_type = GGML_TYPE_Q8_0; + } + else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; + } + else if (qs.model.hparams.n_gqa() >= 2) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; + else if (ftype == 
LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; @@ -16323,10 +16337,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_init_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; @@ -16343,10 +16357,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && 
(difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_init_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; From 644aa9fd41bb8c67cde95d015561406a1b59b5f4 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Wed, 21 Aug 2024 13:07:32 +0200 Subject: [PATCH 40/59] Correction too small tensor embeddings to quantize IQ2_XS doesn't seem to work as such, back to IQ2_S --- src/llama.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 1e90efb4f..c06447c8c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15964,11 +15964,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_Q4_K; } } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_XS; - else new_type = GGML_TYPE_IQ2_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { new_type = GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { From 179ad0fad4aec3716266f1c7eb6a309df7df467f Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Wed, 21 Aug 2024 13:10:54 +0200 Subject: [PATCH 41/59] Little rework of the difquant formulas --- src/llama.cpp | 52 +++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index c06447c8c..8574c53ed 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15866,17 +15866,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n const llm_arch arch = qs.model.arch; const auto tn = LLM_TN(arch); - // difquant_init_tensors has a broad 12.5% bump to the upper quant. - auto difquant_init_tensors = [](int i_layer, int n_layers) -> bool { - return i_layer <= n_layers/8; + // difquant_first_last_tensors has a broad 13.75-16.66% bump to the upper quant. + auto difquant_first_last_tensors = [](int i_layer, int n_layers) -> bool { + return i_layer < n_layers/8 || i_layer >= n_layers-2; }; - // difquant_init_end_tensors has a broad 25% bump to the upper quant. - auto difquant_init_end_tensors = [](int i_layer, int n_layers) -> bool { - return i_layer <= n_layers/8 || i_layer > 7*n_layers/8; + // difquant_more_fl_tensors has a broad 26-29% bump to the upper quant. + auto difquant_more_fl_tensors = [](int i_layer, int n_layers) -> bool { + return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8; }; // difquant_three_eights_tensors has a broad 37.5% bump to the upper quant. 
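// [Editor's note, not part of the patch] Worked example of the reworked selectors above for n_layers = 32: difquant_first_last_tensors: i < 4 || i >= 30 -> layers 0-3 and 30-31, 6/32; difquant_more_fl_tensors: i <= 4 || i >= 28 -> layers 0-4 and 28-31, 9/32. These are the counts that the "Ex : N/32" annotations of patch 42 make explicit.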
auto difquant_three_eights_tensors = [](int i_layer, int n_layers) -> bool { - return i_layer <= n_layers/8 || i_layer > 7*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8); + return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer > 2*n_layers/8 && i_layer < 3*n_layers/8); }; // original formula use_more_bits : // return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2; @@ -16028,8 +16028,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = difquant_init_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; - else new_type = difquant_init_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; + new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + else new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { @@ -16102,8 +16102,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) - new_type = difquant_init_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = difquant_init_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) @@ -16115,8 +16115,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = difquant_init_end_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - else new_type = difquant_init_end_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_more_fl_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = difquant_more_fl_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) @@ -16199,7 +16199,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? 
GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { if (difquant_three_eights_tensors(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS; } @@ -16213,22 +16213,22 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { - new_type = difquant_init_end_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { new_type = difquant_six_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = difquant_init_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = difquant_first_last_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { new_type = difquant_five_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { - new_type = difquant_init_end_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { - new_type = difquant_init_end_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { new_type = difquant_three_eights_tensors(i_layer, n_layer) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; @@ -16331,18 +16331,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_init_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_first_last_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (difquant_five_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; @@ -16351,18 +16351,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (difquant_three_eights_tensors(i_layer, n_layer))) 
new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_init_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_first_last_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (difquant_five_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_init_end_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;

From 1607a02bdd4d2267666208b5b913826dc47e662f Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Fri, 23 Aug 2024 12:38:45 +0200
Subject: [PATCH 42/59] Further adjustments to the difquant formulas
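
Each difquant lambda gates, by layer index, which tensors of a given class
get bumped to the upper quant, and the "Ex: n/32" comments give the count
for a 32-layer model. For reference, here is a minimal standalone sketch
(not part of the patch) of how a gate's coverage can be checked; the lambda
is copied from the code below, while the driver around it is illustrative
only:

    #include <cstdio>

    int main() {
        // Copied from llama_tensor_get_type: broad 37.5% bump to the upper quant.
        auto difquant_three_eights_tensors = [](int i_layer, int n_layers) -> bool {
            return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer > 2*n_layers/8 && i_layer < 3*n_layers/8);
        };
        // Count how many layers of a 32-layer model pass the gate.
        int bumped = 0;
        for (int i_layer = 0; i_layer < 32; ++i_layer) {
            if (difquant_three_eights_tensors(i_layer, 32)) ++bumped;
        }
        printf("%d/32 layers bumped\n", bumped); // prints 12/32
        return 0;
    }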
---
 src/llama.cpp | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 8574c53ed..dd0655d95 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -15866,32 +15866,34 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     const llm_arch arch = qs.model.arch;
     const auto tn = LLM_TN(arch);
 
-    // difquant_first_last_tensors has a broad 13.75-16.66% bump to the upper quant.
+    // difquant_first_last_tensors has a broad 13.75-16.66% bump to the upper quant. Ex: 6/32
     auto difquant_first_last_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer < n_layers/8 || i_layer >= n_layers-2;
     };
-    // difquant_more_fl_tensors has a broad 26-29% bump to the upper quant.
+    // difquant_more_fl_tensors has a broad 26-29% bump to the upper quant. Ex: 9/32
     auto difquant_more_fl_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8;
     };
-    // difquant_three_eights_tensors has a broad 37.5% bump to the upper quant.
+    // difquant_three_eights_tensors has a broad 37.5% bump to the upper quant. Ex: 12/32
     auto difquant_three_eights_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer > 2*n_layers/8 && i_layer < 3*n_layers/8);
     };
     // original formula use_more_bits :
     // return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
     // The intervals of 3 are replaced by a broad bump in the central layers.
-    // difquant_half_tensors replaces it and keeps the broad 50% bump to the upper quant.
+    // In the case of a 32-layer model, layers 5-7 and layers 12-16 are always skipped.
+    // In the case of a 40-layer model, layers 6-9 and layers 15-20 are always skipped.
+    // difquant_half_tensors replaces it and keeps the broad 50% bump to the upper quant. Ex: 16/32
     auto difquant_half_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= n_layers/8 || i_layer > 6*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
     };
-    // difquant_five_eights_tensors has a broad 62.5% bump to the upper quant.
+    // difquant_five_eights_tensors has a broad 62.5% bump to the upper quant. Ex: 20/32
     auto difquant_five_eights_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= n_layers/8 || i_layer > 5*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
     };
-    // difquant_six_eights_tensors has a broad 75% bump to the upper quant.
+    // difquant_six_eights_tensors has a broad 75% bump to the upper quant. Ex: 24/32
     auto difquant_six_eights_tensors = [](int i_layer, int n_layers) -> bool {
-        return i_layer <= n_layers/8 || i_layer > 5*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 4*n_layers/8);
+        return i_layer <= n_layers/8 || i_layer > 4*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
     };
     const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
     auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {

From e05da54eff069a4922a0b3fcd2f913d9486fe39e Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Thu, 22 Aug 2024 19:12:13 +0200
Subject: [PATCH 43/59] Overhaul of FFN, with and without GQA

---
 src/llama.cpp | 237 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 199 insertions(+), 38 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index dd0655d95..0179e68e9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16011,15 +16011,19 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_IQ3_S ||
                 new_type == GGML_TYPE_IQ4_XS) new_type = GGML_TYPE_Q5_K;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
             new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && - difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K; + (difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv))) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { @@ -16061,13 +16065,22 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else new_type = GGML_TYPE_Q8_0; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) && + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q3_K; } - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) && - (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K; + else new_type = GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_Q4_K; + else new_type = GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K; + else new_type = GGML_TYPE_Q4_K; } else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { @@ -16110,7 +16123,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_XXS; + else new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? 
GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_IQ4_XS; @@ -16332,41 +16345,189 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (name.find("ffn_gate") != std::string::npos) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_first_last_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (difquant_five_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; + else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_gate; } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ1_M; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && (difquant_first_last_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (difquant_five_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? 
GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; + else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS; ++qs.i_ffn_up; } From 3a027b878b69284240242607966cb6134045c013 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Fri, 23 Aug 2024 00:08:42 +0200 Subject: [PATCH 44/59] Revamp IQ4_XSR, remove IQ3_XXXL --- examples/quantize/quantize.cpp | 5 +- gguf-py/gguf/constants.py | 1 - include/llama.h | 1 - src/llama.cpp | 100 ++++++++++++++++++++------------- 4 files changed, 63 insertions(+), 44 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index fa7bc7685..a311e9cad 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -35,9 +35,8 @@ static const std::vector QUANT_OPTIONS = { { "IQ3_XXS", LLAMA_FTYPE_MOSTLY_IQ3_XXS, " 3.06 bpw quantization", }, { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.70 bpw quantization mix", }, - { "IQ3_XL", LLAMA_FTYPE_MOSTLY_IQ3_XL, " 3.80 bpw quantization mix", }, - { "IQ3_XXL", LLAMA_FTYPE_MOSTLY_IQ3_XXL, " 3.95 bpw quantization mix", }, - { "IQ3_XXXL", LLAMA_FTYPE_MOSTLY_IQ3_XXXL, " 4.10 bpw quantization mix", }, + { "IQ3_XL", LLAMA_FTYPE_MOSTLY_IQ3_XL, " 3.90 bpw quantization mix", }, + { "IQ3_XXL", LLAMA_FTYPE_MOSTLY_IQ3_XXL, " 4.10 bpw quantization mix", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", }, diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c74b85729..9f425c19e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1257,7 +1257,6 @@ class LlamaFileType(IntEnum): MOSTLY_IQ1_XL = 42 # except 1d tensors MOSTLY_IQ4_XSR = 43 # except 1d tensors MOSTLY_IQ3_XXL = 44 # except 1d tensors - MOSTLY_IQ3_XXXL = 45 # except 1d tensors GUESSED = 1024 # not specified in the model file diff --git a/include/llama.h b/include/llama.h index aaee535e6..697740dc2 100644 --- a/include/llama.h +++ b/include/llama.h @@ -173,7 +173,6 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ1_XL = 42, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XSR = 43, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_XXL = 44, // except 1d tensors - LLAMA_FTYPE_MOSTLY_IQ3_XXXL = 45, // except 1d tensors LLAMA_FTYPE_CQS = 99, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file diff --git a/src/llama.cpp b/src/llama.cpp index 0179e68e9..21a4c63a6 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4528,9 +4528,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.70 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ3_XL: return "IQ3_S mix - 3.80 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ3_XXL: return "IQ3_S mix - 3.95 bpw"; - case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: return "IQ3_S mix - 4.10 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_XL: return "IQ3_S mix - 3.90 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_XXL: return "IQ3_S mix - 4.10 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_XSR: return "IQ4_XS mix - 4.xx bpw"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4"; case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8"; @@ -15981,7 +15980,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if 
(qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS; else new_type = GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; @@ -16038,7 +16037,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K; else new_type = GGML_TYPE_Q4_K; } @@ -16048,9 +16047,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { - new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K : - difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K; + new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K : + difquant_more_fl_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } + else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } ++qs.i_attention_wv; } else if (name.find("attn_k.weight") != std::string::npos) { @@ -16143,16 +16143,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - else new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { - new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q5_K : - difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q5_K; + new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q6_K : + difquant_more_fl_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } + else difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? 
GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } ++qs.i_attention_wk; } else if (name.find("attn_q.weight") != std::string::npos) { @@ -16170,16 +16166,17 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ2_S; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) new_type = GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { - new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ3_S : - difquant_half_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_S; + new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS : + difquant_more_fl_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } + else difquant_three_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } ++qs.i_attention_wq; } else if (name.find("ffn_down") != std::string::npos) { @@ -16231,37 +16228,49 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { - new_type = difquant_six_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + else new_type = GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = difquant_first_last_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { - new_type = difquant_five_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { - new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { - new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { - new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { - new_type = difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) { - new_type = difquant_six_eights_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = GGML_TYPE_IQ4_XS; } else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) { new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { - new_type = i_layer < n_layer/8 ? GGML_TYPE_IQ4_XS : - difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS; + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : + difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; } + else difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; } ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { @@ -16275,10 +16284,9 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) { new_type = GGML_TYPE_Q5_K; } @@ -16305,13 +16313,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { new_type = GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) + new_type = GGML_TYPE_IQ4_XS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { - new_type = qs.i_attention_wo < qs.n_attention_wo/8 ? GGML_TYPE_IQ4_XS : - difquant_half_tensors(qs.i_attention_wo, qs.n_attention_wo) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ4_XS;
+                new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_Q6_K :
+                    difquant_more_fl_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
             }
+            else difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K;
         }
     } else {
@@ -16340,7 +16349,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) new_type = GGML_TYPE_IQ4_XS;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_Q4_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) new_type = GGML_TYPE_Q4_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K :
+                difquant_more_fl_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            }
+            else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+        }
         ++qs.i_attention_wv;
     } else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
@@ -16434,7 +16449,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+        }
         ++qs.i_ffn_gate;
     } else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
@@ -16528,7 +16547,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
             else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXXL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ4_XS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+        }
         ++qs.i_ffn_up;
     }
@@ -16683,7 +16706,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XL: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XXL: default_type = GGML_TYPE_IQ3_S; break;
-        case LLAMA_FTYPE_MOSTLY_IQ3_XXXL: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
         case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;

From 596a4aec86b0db7c397ac9c8e407a7ff0a9fdd4f Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Thu, 22 Aug 2024 19:12:25 +0200
Subject: [PATCH 45/59] Re-add variables attn_k, attn_q, attn_o after merge

---
 src/llama.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 6191741ff..a91ed5c60 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17328,6 +17328,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             n_attn_layer *= 3;
         }
         GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
+        GGML_ASSERT((qs.n_attention_wk == n_attn_layer) && "n_attention_wk is unexpected");
+        GGML_ASSERT((qs.n_attention_wq == n_attn_layer) && "n_attention_wq is unexpected");
+        GGML_ASSERT((qs.n_attention_wo == n_attn_layer) && "n_attention_wo is unexpected");
     }

     size_t total_size_org = 0;

From f79695487294acb084aefa5472880d6b0f324b44 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Fri, 23 Aug 2024 14:17:19 +0200
Subject: [PATCH 46/59] Revamp FFN down and attn_k

Complete FFN up as well, and shrink non-GQA models a bit more.
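
The recurring pattern here: when n_gqa() >= 2 or n_expert >= 2, the
attention and FFN tensors carry more information relative to model size,
so a given ftype maps to a broader difquant gate (more layers bumped) than
on a plain MHA model. A minimal standalone sketch of that selection
pattern (illustrative only; sketch_type, pick_type and the driver are
hypothetical, the two lambdas are copied from the code):

    #include <cstdio>

    enum sketch_type { TYPE_IQ3_XXS, TYPE_IQ3_S };

    // Mirrors the shape of the ffn branches: GQA/MoE models get the broad
    // 26-29% "fl_more" gate, plain MHA models the narrow "first_last" gate.
    static sketch_type pick_type(int n_gqa, int n_expert, int i_layer, int n_layers) {
        auto difquant_fl_more_tensors = [](int i_layer, int n_layers) -> bool {
            return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8;
        };
        auto difquant_first_last_tensors = [](int i_layer, int n_layers) -> bool {
            return i_layer < n_layers/8 || i_layer >= n_layers-2;
        };
        if (n_gqa >= 2 || n_expert >= 2)
            return difquant_fl_more_tensors(i_layer, n_layers) ? TYPE_IQ3_S : TYPE_IQ3_XXS;
        return difquant_first_last_tensors(i_layer, n_layers) ? TYPE_IQ3_S : TYPE_IQ3_XXS;
    }

    int main() {
        int gqa = 0, mha = 0;
        for (int i = 0; i < 32; ++i) {
            if (pick_type(4, 1, i, 32) == TYPE_IQ3_S) ++gqa;
            if (pick_type(1, 1, i, 32) == TYPE_IQ3_S) ++mha;
        }
        printf("bumped layers, GQA-4: %d/32, MHA: %d/32\n", gqa, mha); // 9/32 vs 6/32
        return 0;
    }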
---
 src/llama.cpp | 389 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 231 insertions(+), 158 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index a91ed5c60..d8726bc4a 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16374,8 +16374,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     auto difquant_first_last_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer < n_layers/8 || i_layer >= n_layers-2;
     };
-    // difquant_more_fl_tensors has a broad 26-29% bump to the upper quant. Ex: 9/32
-    auto difquant_more_fl_tensors = [](int i_layer, int n_layers) -> bool {
+    // difquant_fl_more_tensors has a broad 26-29% bump to the upper quant. Ex: 9/32
+    auto difquant_fl_more_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8;
     };
     // difquant_three_eights_tensors has a broad 37.5% bump to the upper quant. Ex: 12/32
@@ -16539,7 +16539,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-            else new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S;
+            else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? 
GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { @@ -16553,7 +16553,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K : - difquant_more_fl_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; + difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } @@ -16591,20 +16591,25 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { - if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_S; - else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XXS; - else new_type = GGML_TYPE_IQ1_M; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_XXS; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XS; + else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XS; + else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS; else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XS; - else new_type = GGML_TYPE_IQ2_XXS; + else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; - else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XS; - else new_type = GGML_TYPE_IQ2_XXS; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_S; + else new_type = GGML_TYPE_IQ2_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; @@ -16613,47 +16618,59 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; - else new_type = GGML_TYPE_IQ2_S; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_XXS; + else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? 
GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_XXS; else new_type = GGML_TYPE_IQ2_S; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_XXS; + else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; + else new_type = GGML_TYPE_IQ3_XXS; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_IQ4_XS; + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = difquant_more_fl_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - else new_type = difquant_more_fl_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q6_K : - difquant_more_fl_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; + difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } - else difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; + else new_type = GGML_TYPE_Q5_K; } ++qs.i_attention_wk; } else if (name.find("attn_q.weight") != std::string::npos) { @@ -16679,105 +16696,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS : - difquant_more_fl_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } - else difquant_three_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } ++qs.i_attention_wq; - } else if (name.find("ffn_down") != std::string::npos) { - auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); - int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = i_layer < n_layer/8 ? GGML_TYPE_Q5_K - : arch != LLM_ARCH_FALCON || difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q4_K - : GGML_TYPE_Q3_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { - new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { - if (arch == LLM_ARCH_FALCON) { - new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K : - difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; - } else { - if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; - } - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { - new_type = GGML_TYPE_Q5_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) - && qs.has_imatrix && i_layer < n_layer/8) { - // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0. 
- // We only do it when an imatrix is provided because a) we want to make sure that one can always get the - // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. - new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { - if (difquant_three_eights_tensors(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (difquant_three_eights_tensors(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { - if (difquant_three_eights_tensors(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { - new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; - else new_type = GGML_TYPE_IQ3_XXS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; - else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; - else new_type = GGML_TYPE_IQ3_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = GGML_TYPE_IQ4_XS; - } - else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) { - new_type = GGML_TYPE_Q5_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { - new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : - difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - } - else difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - } - ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert >= 4) { @@ -16823,7 +16746,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_Q6_K : - difquant_more_fl_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; + difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } else difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } @@ -16857,7 +16780,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K : - difquant_more_fl_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } @@ -16877,89 +16800,230 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; - else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; - else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; } ++qs.i_ffn_gate; + } else if (name.find("ffn_down") != std::string::npos) { + auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); + int i_layer = info.first, n_layer = info.second; + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = GGML_TYPE_IQ2_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = GGML_TYPE_IQ2_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + else new_type = GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = GGML_TYPE_IQ4_XS; + } + else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) { + new_type = GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : + difquant_fl_more_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + } + else difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + else new_type = GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { + if (arch == LLM_ARCH_FALCON) { + new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K : + difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } else { + if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; + } + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { + new_type = GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) + && qs.has_imatrix && i_layer < n_layer/8) { + // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0. 
+ // We only do it when an imatrix is provided because a) we want to make sure that one can always get the + // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. + new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; + } + ++qs.i_ffn_down; } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; @@ -16975,86 +17039,95 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; - else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; - else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; } ++qs.i_ffn_up; From 6b5cebfb2b5b521bfc6081e94fa57afd0c1ce601 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Fri, 23 Aug 2024 16:40:40 +0200 Subject: [PATCH 47/59] Revamp a bit the output weight for more granularity in low quants. --- src/llama.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index d8726bc4a..817e4cc4c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16426,23 +16426,35 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = GGML_TYPE_Q8_0; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; + else new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; + else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K; + else new_type = GGML_TYPE_Q6_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ4_XS; + else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K; + else new_type = GGML_TYPE_IQ4_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || - ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype ==
LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K; else new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K; else new_type = GGML_TYPE_Q6_K; } From 608108597c230a054ac225f7802d7ae75fbe11f4 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Fri, 23 Aug 2024 17:48:31 +0200 Subject: [PATCH 48/59] Revamp attn_output --- src/llama.cpp | 89 ++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 48 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 817e4cc4c..6f60622f4 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16714,57 +16714,50 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } ++qs.i_attention_wq; } else if (name.find("attn_output.weight") != std::string::npos) { - if (arch != LLM_ARCH_FALCON) { - if (qs.model.hparams.n_expert >= 4) { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { - new_type = GGML_TYPE_Q4_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || - ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || - ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || - ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) { - new_type = GGML_TYPE_Q5_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q5_K; - else new_type = GGML_TYPE_Q8_0; } - else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype ==
LLAMA_FTYPE_MOSTLY_IQ2_XXS || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; - } - else if (qs.model.hparams.n_gqa() >= 2) { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; - } else { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { - new_type = GGML_TYPE_IQ3_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) - new_type = GGML_TYPE_IQ4_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { - new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_Q6_K : - difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; - } - else difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; - } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || + ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || + ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) { + new_type = GGML_TYPE_Q5_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; + else new_type = GGML_TYPE_Q8_0; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { + new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_Q5_K : + difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + } + else difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? 
GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + } + else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) + new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) + new_type = GGML_TYPE_IQ4_XS; } else { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) + new_type = GGML_TYPE_IQ2_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_IQ4_XS; } ++qs.i_attention_wo; } else if (name.find("attn_qkv.weight") != std::string::npos) { From 380b53d0617ed3b3d3836fcd5f4d43389cd3339e Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Fri, 23 Aug 2024 21:59:34 +0200 Subject: [PATCH 49/59] Fix IQ4_XSR --- src/llama.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 6f60622f4..72a07e749 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16567,7 +16567,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K : difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } - else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; + else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } ++qs.i_attention_wv; } else if (name.find("attn_k.weight") != std::string::npos) { @@ -16710,7 +16710,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS : difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } - else difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } ++qs.i_attention_wq; } else if (name.find("attn_output.weight") != std::string::npos) { @@ -16737,7 +16737,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_Q5_K : difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; } - else difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; } else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) @@ -16787,7 +16787,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K : difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } - else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + else new_type = difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } ++qs.i_attention_wv; } else if (name.find("ffn_gate") != std::string::npos) { @@ -16983,7 +16983,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : difquant_fl_more_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; } - else difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = difquant_three_eights_tensors(i_layer, n_layer) ? 
GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) From 16e9c3771aa3f5140f5ea32d0d65862e50fc98d2 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Fri, 23 Aug 2024 23:18:59 +0200 Subject: [PATCH 50/59] various corrections on IQ2_S+ and IQ3 quants --- src/llama.cpp | 53 +++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 72a07e749..d6a822a74 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16443,11 +16443,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; + else if (qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_IQ4_XS; else new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; - else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K; + else if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_Q4_K; else new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { @@ -16456,7 +16457,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; - else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K; + else if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_Q5_K; else new_type = GGML_TYPE_Q6_K; } else if (new_type != GGML_TYPE_Q8_0) { @@ -16487,17 +16488,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { - if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_S; + if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_IQ2_S; else new_type = GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { - if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS; + else if ( || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_IQ3_XXS; else new_type = GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { + if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_IQ3_S; new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; @@ -16550,10 +16552,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 2 
|| qs.model.hparams.n_expert >= 2) - new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; - else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; + new_type = difquant_first_last_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + else new_type = difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = difquant_five_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + else new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K; else new_type = GGML_TYPE_Q4_K; @@ -16650,7 +16657,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) @@ -16659,8 +16666,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) @@ -16863,8 +16870,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; - else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) @@ -16873,8 +16879,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) @@ -16947,8 +16952,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; - else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) @@ -16957,8 +16962,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) @@ -17102,8 +17107,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; - else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) @@ -17112,8 +17116,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) From 1bde168c077073566b0638a403d5de4336d88b98 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Fri, 23 Aug 2024 23:27:26 +0200 Subject: [PATCH 51/59] Usage of n_head to discriminate very small models, whose size is more sensitive to the non-repeating tensors --- src/llama.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index d6a822a74..f6dd9f3f1 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16441,14 +16441,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K; else new_type = GGML_TYPE_IQ4_XS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; - else if (qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_IQ4_XS; + else if (qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_IQ4_XS; else new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; - else if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_Q4_K; + else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K; else new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { @@ -16457,7 +16458,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; - else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K; + else if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q5_K; else new_type = GGML_TYPE_Q6_K; } else if (new_type != GGML_TYPE_Q8_0) { @@ -16488,18 +16489,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { - if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_IQ2_S; + if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_IQ2_S; else new_type = GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_XXS; } - else if ( || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { - if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_IQ3_XXS; else new_type = GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype ==
LLAMA_FTYPE_MOSTLY_IQ3_XXL) { - if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head <= 20) new_type = GGML_TYPE_IQ3_S; + if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_IQ3_S; new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; From 5ae59714d21023ba8cc654332703e8d0afdb5ebd Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 24 Aug 2024 20:50:07 +0200 Subject: [PATCH 52/59] Revamp Q2_K and Q3_K quants. Q3_K_XL takes the place of Q3_K_L. Q3_K_L becomes an intermediary between Q3_K_M and XL. --- examples/quantize/quantize.cpp | 3 +- gguf-py/gguf/constants.py | 3 +- include/llama.h | 3 +- src/llama.cpp | 138 ++++++++++++++++++++++----------- 4 files changed, 97 insertions(+), 50 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index c4bed4e62..619b8e6b4 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -41,7 +41,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = { { "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", }, - { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", }, + { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.10 bpw quantization mix", }, + { "Q3_K_XL", LLAMA_FTYPE_MOSTLY_Q3_K_XL, " 4.03G, +0.5562 ppl @ Llama-3-8B", }, { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", }, { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", }, { "IQ4_XSR", LLAMA_FTYPE_MOSTLY_IQ4_XSR, " 4.xx bpw non-linear quantization", }, diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 9f425c19e..3dffab0a7 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1227,7 +1227,7 @@ class LlamaFileType(IntEnum): MOSTLY_Q2_K = 10 # except 1d tensors MOSTLY_Q3_K_S = 11 # except 1d tensors MOSTLY_Q3_K_M = 12 # except 1d tensors - MOSTLY_Q3_K_L = 13 # except 1d tensors + MOSTLY_Q3_K_XL = 13 # except 1d tensors MOSTLY_Q4_K_S = 14 # except 1d tensors MOSTLY_Q4_K_M = 15 # except 1d tensors MOSTLY_Q5_K_S = 16 # except 1d tensors @@ -1257,6 +1257,7 @@ class LlamaFileType(IntEnum): MOSTLY_IQ1_XL = 42 # except 1d tensors MOSTLY_IQ4_XSR = 43 # except 1d tensors MOSTLY_IQ3_XXL = 44 # except 1d tensors + MOSTLY_Q3_K_L = 45 # except 1d tensors GUESSED = 1024 # not specified in the model file diff --git a/include/llama.h b/include/llama.h index dae99bfc5..a5d09f824 100644 --- a/include/llama.h +++ b/include/llama.h @@ -143,7 +143,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors - LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_XL = 13, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors @@ -173,6 +173,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ1_XL = 42, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XSR = 43, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_XXL = 44, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q3_K_L = 45, // except 1d tensors LLAMA_FTYPE_CQS = 99, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in
the model file diff --git a/src/llama.cpp b/src/llama.cpp index f6dd9f3f1..0d1b9dadd 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -5000,6 +5000,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; + case LLAMA_FTYPE_MOSTLY_Q3_K_XL: return "Q3_K - Xtra Large"; case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small"; case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small"; @@ -16428,8 +16429,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; + else if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K; else new_type = GGML_TYPE_Q4_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) { + if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q6_K; + else new_type = GGML_TYPE_Q5_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K; @@ -16470,7 +16476,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = qs.params->token_embedding_type; } else { if (qs.model.hparams.n_expert >= 4) { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) { + new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { new_type = GGML_TYPE_IQ3_S; } @@ -16479,10 +16490,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { new_type = GGML_TYPE_IQ4_XS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || - ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { - new_type = GGML_TYPE_Q4_K; - } + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q2_K; + else new_type = GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { @@ -16537,10 +16548,13 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) { + new_type = (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) new_type = GGML_TYPE_Q5_K; else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) && (difquant_half_tensors(qs.i_attention_wv, qs.n_attention_wv))) new_type = GGML_TYPE_Q6_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; @@ -16598,12 +16612,16 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K; else new_type = GGML_TYPE_Q3_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K; + else new_type = GGML_TYPE_Q3_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K; else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_Q4_K; else new_type = GGML_TYPE_Q3_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K; else new_type = GGML_TYPE_Q4_K; } @@ -16694,10 +16712,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } ++qs.i_attention_wk; } else if (name.find("attn_q.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q3_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) { + if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; + else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q3_K; + else new_type = GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || @@ -16723,18 +16749,18 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ++qs.i_attention_wq; } else if (name.find("attn_output.weight") != std::string::npos) { if (qs.model.hparams.n_expert >= 4) { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS 
|| ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || - ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || - ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || - ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || + ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL || + ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) { new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; @@ -16750,29 +16776,30 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) { if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || + ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_IQ4_XS; } else { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) + new_type = GGML_TYPE_Q4_K; + else 
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) new_type = GGML_TYPE_IQ2_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) new_type = GGML_TYPE_IQ2_S; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) new_type = GGML_TYPE_IQ3_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_IQ4_XS; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) new_type = GGML_TYPE_IQ4_XS; } ++qs.i_attention_wo; } else if (name.find("attn_qkv.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { new_type = GGML_TYPE_Q3_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; @@ -16801,11 +16828,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (name.find("ffn_gate") != std::string::npos) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; @@ -16815,6 +16838,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? 
GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; @@ -16822,9 +16849,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; @@ -16991,10 +17023,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; - } else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; @@ -17005,6 +17033,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; else new_type = GGML_TYPE_Q3_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; @@ -17012,7 +17045,12 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? 
GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; else new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { @@ -17038,11 +17076,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; @@ -17052,6 +17086,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; @@ -17059,9 +17097,14 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? 
GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; @@ -17270,7 +17313,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_XXS; break; case LLAMA_FTYPE_MOSTLY_Q3_K_S: case LLAMA_FTYPE_MOSTLY_Q3_K_M: - case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break; + case LLAMA_FTYPE_MOSTLY_Q3_K_L: + case LLAMA_FTYPE_MOSTLY_Q3_K_XL: default_type = GGML_TYPE_Q3_K; break; case LLAMA_FTYPE_MOSTLY_Q4_K_S: case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break; case LLAMA_FTYPE_MOSTLY_Q5_K_S: From 844d11b8f33feac88047c9175b8501e0cef32c21 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 24 Aug 2024 21:02:51 +0200 Subject: [PATCH 53/59] bad indent --- src/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama.cpp b/src/llama.cpp index 0d1b9dadd..fbae392f2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16444,7 +16444,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ4_XS; - else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K; + else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K; else new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || From 53b8eaa316e80aafe09388e76360c0064146af16 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 24 Aug 2024 21:57:07 +0200 Subject: [PATCH 54/59] Remove deprecated rules for token embeddings --- src/llama.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index fbae392f2..e2ea5e071 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16514,9 +16514,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_IQ3_S; new_type = GGML_TYPE_IQ4_XS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S; else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { new_type = GGML_TYPE_Q4_0; } From 8fc46df134bb080b95eaf6479253a58bfe01fa04 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 24 Aug 2024 22:30:45 +0200 Subject: [PATCH 55/59] Bump a bit ffn_gate and down for some GQA<2 models --- src/llama.cpp | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index e2ea5e071..157edde25 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16495,6 +16495,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q2_K; else new_type = GGML_TYPE_Q3_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) { + if 
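Usage note for the Q3_K_XL revamp above: once PATCH 52 is applied, the new mix is selected by name like any other entry of QUANT_OPTIONS, which maps the string to LLAMA_FTYPE_MOSTLY_Q3_K_XL before llama_model_quantize_internal picks the default Q3_K tensor type. A typical invocation, assuming the usual llama-quantize command line and a pre-computed imatrix (the binary name, file names and thread count here are illustrative, not taken from these patches):

    ./llama-quantize --imatrix imatrix.dat model-f16.gguf model-q3_k_xl.gguf Q3_K_XL 8
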
From 8fc46df134bb080b95eaf6479253a58bfe01fa04 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Sat, 24 Aug 2024 22:30:45 +0200
Subject: [PATCH 55/59] Bump a bit ffn_gate and down for some GQA<2 models

---
 src/llama.cpp | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index e2ea5e071..157edde25 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16495,6 +16495,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q2_K;
             else new_type = GGML_TYPE_Q3_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
+            if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q3_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
             ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
             new_type = GGML_TYPE_IQ2_S;
@@ -16512,7 +16516,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_IQ3_S;
-            new_type = GGML_TYPE_IQ4_XS;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ4_XS;
         }
         else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
             new_type = GGML_TYPE_Q4_0;
@@ -16833,7 +16837,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16852,7 +16856,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16871,7 +16875,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16886,7 +16890,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16896,7 +16900,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16905,7 +16909,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -16924,7 +16928,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17055,7 +17059,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
                     difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
-                if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+                if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q5_K;
             }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
@@ -17081,7 +17085,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17100,7 +17104,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17119,7 +17123,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17134,7 +17138,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17144,7 +17148,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17153,7 +17157,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@@ -17172,7 +17176,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
                 new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
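Side note on the difquant_* helpers swapped around in PATCH 55: they are plain per-layer predicates over (i_layer, n_layers), each bumping a different fraction of the layers to the upper quant of a pair. A minimal self-contained sketch; the predicate body is copied from the difquant_half_tensors lambda quoted in PATCH 57 below, while the driver around it is illustrative only:

    #include <cstdio>

    // Bumps roughly half of the layers: the first eighth, a central band
    // (from 2/8 to 3/8), and the last quarter.
    static bool difquant_half_tensors(int i_layer, int n_layers) {
        return i_layer <= n_layers/8 || i_layer > 6*n_layers/8 ||
               (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
    }

    int main() {
        // For a 32 layers model this selects 16/32 layers (0-4, 8-11, 25-31).
        int bumped = 0;
        for (int i = 0; i < 32; ++i) bumped += difquant_half_tensors(i, 32);
        printf("bumped %d/32 layers\n", bumped);
        return 0;
    }

The other helpers (difquant_first_last_tensors, difquant_fl_more_tensors, difquant_three_eights_tensors, difquant_five_eights_tensors, difquant_six_eights_tensors) share the same signature with different coverage, which is why PATCH 55 can trade one for another per ftype and move a mix up or down by a fraction of a bit per weight.
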
From f63860eaac96f1614678c362b5d37c34c7e183de Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Sun, 25 Aug 2024 03:17:21 +0200
Subject: [PATCH 56/59] Put back ffn_down tree where it was before.

---
 src/llama.cpp | 276 +++++++++++++++++++++++++-------------------------
 1 file changed, 138 insertions(+), 138 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 157edde25..d4403dea1 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16748,6 +16748,144 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
         }
         ++qs.i_attention_wq;
+    } else if (name.find("ffn_down") != std::string::npos) {
+        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
+        int i_layer = info.first, n_layer = info.second;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+            else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
+            else new_type = GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+            else new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (arch == LLM_ARCH_FALCON) {
+                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
+                    difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q5_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
+                && qs.has_imatrix && i_layer < n_layer/8) {
+            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
+            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
+            else new_type = GGML_TYPE_IQ2_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
+            else new_type = GGML_TYPE_IQ2_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
+            else new_type = GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+            else new_type = GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
+                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
+            else new_type = GGML_TYPE_IQ4_XS;
+        }
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
+            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
+                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K :
+                    difquant_fl_more_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+            }
+            else new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
+        }
+        ++qs.i_ffn_down;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (qs.model.hparams.n_expert >= 4) {
             if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
@@ -16936,144 +17074,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
         }
         ++qs.i_ffn_gate;
-    } else if (name.find("ffn_down") != std::string::npos) {
-        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
-        int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
-            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
-            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
-            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
-            else new_type = GGML_TYPE_IQ2_XXS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
-            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
-            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
-            else new_type = GGML_TYPE_IQ2_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
-            else new_type = GGML_TYPE_IQ3_XXS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
-            else new_type = GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-            else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-            else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
-            else new_type = GGML_TYPE_IQ4_XS;
-        }
-        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
-            new_type = GGML_TYPE_Q5_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) {
-                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K :
-                    difquant_fl_more_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
-            }
-            else new_type = difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-            else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
-            else new_type = GGML_TYPE_Q3_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
-            if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
-                new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
-            else new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-            if (arch == LLM_ARCH_FALCON) {
-                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
-                    difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-            } else {
-                if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q5_K;
-            }
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
-            new_type = GGML_TYPE_Q5_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
-        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
-                && qs.has_imatrix && i_layer < n_layer/8) {
-            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
-            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
-            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
-            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
-        }
-        ++qs.i_ffn_down;
     } else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
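One more side note, since the gate recurs on nearly every branch above: (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) is the "cheap to bump" test — with grouped-query attention or several experts, attn_v.weight and friends weigh little relative to the whole model, so a higher quant there costs almost nothing. A toy illustration with simplified stand-ins (these structs are not llama.cpp's real types; in llama.cpp, n_gqa() is likewise the ratio of attention heads to KV heads):

    #include <cstdio>

    struct toy_hparams { int n_head; int n_head_kv; int n_expert; };

    static int n_gqa(const toy_hparams & h) { return h.n_head / h.n_head_kv; }

    // Mirrors the recurring gate; within a Q2_K ftype this matches the
    // Q3_K/Q4_K pair used for attn_v.weight in the patches above.
    static const char * attn_v_quant(const toy_hparams & h) {
        return (n_gqa(h) >= 2 || h.n_expert >= 2) ? "Q4_K" : "Q3_K";
    }

    int main() {
        toy_hparams gqa2  = { 16,  8, 0 }; // GQA 2, e.g. a Gemma-like model
        toy_hparams dense = { 32, 32, 0 }; // GQA 1, dense attention
        printf("GQA2 model  -> %s\n", attn_v_quant(gqa2));
        printf("dense model -> %s\n", attn_v_quant(dense));
        return 0;
    }

Other ftypes pair different types on either side of the gate, but the test itself is the same everywhere.
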
From dd3df754b27d0ad7fb2f6852573c55076668a0e6 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Sun, 25 Aug 2024 03:30:36 +0200
Subject: [PATCH 57/59] Bad indents and trailing whitespaces

---
 src/llama.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index d4403dea1..b6f0c3cc2 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16386,8 +16386,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     // original formula use_more_bits :
     // return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
     // The intervals of 3 are replaced by a broad bump in the central layers.
-    // In the case of a 32 layers model, layers 5-7 and layers 12-16 are always skipped.
-    // In the case of a 40 layers model, layers 6-9 and layers 15-20 are always skipped.
+    // In the case of a 32 layers model, layers 5-7 and layers 12-16 are always skipped.
+    // In the case of a 40 layers model, layers 6-9 and layers 15-20 are always skipped.
     // difquant_half_tensors replaces it and keeps the broad 50% bump to the upper quant. Ex : 16/32
     auto difquant_half_tensors = [](int i_layer, int n_layers) -> bool {
         return i_layer <= n_layers/8 || i_layer > 6*n_layers/8 || (i_layer >= 2*n_layers/8 && i_layer < 3*n_layers/8);
@@ -16448,7 +16448,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
             else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ4_XS;
-            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
+            else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
             else new_type = GGML_TYPE_IQ4_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
-            ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
+            ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
             else if (qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_IQ4_XS;
             else new_type = GGML_TYPE_Q4_K;
@@ -16478,7 +16478,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             if (qs.model.hparams.n_expert >= 4) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
+                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
                     new_type = GGML_TYPE_Q4_K;
                 }
                 else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
@@ -16611,20 +16611,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
-            else new_type = GGML_TYPE_Q3_K;
+            else new_type = GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
-            else new_type = GGML_TYPE_Q3_K;
+            else new_type = GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
             else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_Q4_K;
-            else new_type = GGML_TYPE_Q3_K;
+            else new_type = GGML_TYPE_Q3_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
             if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
-            else new_type = GGML_TYPE_Q4_K;
+            else new_type = GGML_TYPE_Q4_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
                  ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) &&
                  (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) {
@@ -16722,7 +16722,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
             else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q3_K;
-            else new_type = GGML_TYPE_Q2_K;
+            else new_type = GGML_TYPE_Q2_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
             if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;

From 16aee45179f29887c2c5c3c56d778437c4cb275c Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Sun, 25 Aug 2024 14:25:46 +0200
Subject: [PATCH 58/59] correction

---
 src/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index b6f0c3cc2..7df933b27 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16943,7 +16943,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
             new_type = GGML_TYPE_IQ2_XS;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {

From 26aac8e2892a4509b26a4eed1863ff3945bf3b2b Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Sun, 25 Aug 2024 14:42:33 +0200
Subject: [PATCH 59/59] Soften the token embeddings bump for experts >= 4

---
 src/llama.cpp | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index c96be5b4e..ce3034e28 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17624,20 +17624,16 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
             new_type = qs.params->token_embedding_type;
         } else {
             if (qs.model.hparams.n_expert >= 4) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
-                    new_type = GGML_TYPE_Q4_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
-                    ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
-                    new_type = GGML_TYPE_IQ3_S;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
-                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
-                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
-                    new_type = GGML_TYPE_IQ4_XS;
-                }
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) new_type = GGML_TYPE_IQ3_XXS;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS ||
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL ||
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ4_XS;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
                 if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q2_K;