diff --git a/src/llama.cpp b/src/llama.cpp index a91ed5c60..d8726bc4a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16374,8 +16374,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n auto difquant_first_last_tensors = [](int i_layer, int n_layers) -> bool { return i_layer < n_layers/8 || i_layer >= n_layers-2; }; - // difquant_more_fl_tensors has a broad 26-29% bump to the upper quant. Ex : 9/32 - auto difquant_more_fl_tensors = [](int i_layer, int n_layers) -> bool { + // difquant_fl_more_tensors has a broad 26-29% bump to the upper quant. Ex : 9/32 + auto difquant_fl_more_tensors = [](int i_layer, int n_layers) -> bool { return i_layer <= n_layers/8 || i_layer >= 7*n_layers/8; }; // difquant_three_eights_tensors has a broad 37.5% bump to the upper quant. Ex : 12/32 @@ -16539,7 +16539,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; - else new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; + else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q4_K : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { @@ -16553,7 +16553,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q6_K : - difquant_more_fl_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; + difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } @@ -16591,20 +16591,25 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { - if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_S; - else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XXS; - else new_type = GGML_TYPE_IQ1_M; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ2_XXS; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XS; + else new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XS; + else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_XXS; else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XS; - else new_type = GGML_TYPE_IQ2_XXS; + else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; - else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_XS; - else new_type = GGML_TYPE_IQ2_XXS; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ2_S; + else new_type = GGML_TYPE_IQ2_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; @@ -16613,47 +16618,59 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; - else new_type = GGML_TYPE_IQ2_S; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_XXS; + else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_XXS; else new_type = GGML_TYPE_IQ2_S; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; + else if (qs.model.hparams.n_gqa() >= 2) new_type = GGML_TYPE_IQ3_XXS; + else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ3_S; + else new_type = GGML_TYPE_IQ3_XXS; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 2) new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = difquant_first_last_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = difquant_five_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)) { - new_type = GGML_TYPE_IQ4_XS; + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = difquant_more_fl_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - else new_type = difquant_more_fl_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - else new_type = difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - else new_type = difquant_half_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = difquant_six_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + else new_type = GGML_TYPE_IQ4_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wk < qs.n_attention_wk/8 ? GGML_TYPE_Q6_K : - difquant_more_fl_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; + difquant_fl_more_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } - else difquant_three_eights_tensors(qs.i_attention_wk, qs.n_attention_wk) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; + else new_type = GGML_TYPE_Q5_K; } ++qs.i_attention_wk; } else if (name.find("attn_q.weight") != std::string::npos) { @@ -16679,105 +16696,11 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_IQ4_XS : - difquant_more_fl_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + difquant_first_last_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } - else difquant_three_eights_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else difquant_fl_more_tensors(qs.i_attention_wq, qs.n_attention_wq) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } ++qs.i_attention_wq; - } else if (name.find("ffn_down") != std::string::npos) { - auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); - int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && (difquant_half_tensors(i_layer, n_layer))) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = i_layer < n_layer/8 ? GGML_TYPE_Q5_K - : arch != LLM_ARCH_FALCON || difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q4_K - : GGML_TYPE_Q3_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { - new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { - if (arch == LLM_ARCH_FALCON) { - new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K : - difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; - } else { - if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; - } - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { - new_type = GGML_TYPE_Q5_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; - else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) - && qs.has_imatrix && i_layer < n_layer/8) { - // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0. - // We only do it when an imatrix is provided because a) we want to make sure that one can always get the - // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. - new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS && (difquant_more_fl_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { - if (difquant_three_eights_tensors(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (difquant_three_eights_tensors(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_XXS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XXS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS && (difquant_three_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_XS; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { - if (difquant_three_eights_tensors(i_layer, n_layer)) new_type = GGML_TYPE_IQ2_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S && (difquant_six_eights_tensors(i_layer, n_layer))) new_type = GGML_TYPE_IQ2_S; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { - new_type = difquant_more_fl_tensors(i_layer, n_layer) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; - else new_type = GGML_TYPE_IQ3_XXS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; - else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; - else new_type = GGML_TYPE_IQ3_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = GGML_TYPE_IQ4_XS; - } - else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) { - new_type = GGML_TYPE_Q5_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { - if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { - new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : - difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - } - else difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; - } - ++qs.i_ffn_down; } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert >= 4) { @@ -16823,7 +16746,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wq < qs.n_attention_wq/8 ? GGML_TYPE_Q6_K : - difquant_more_fl_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; + difquant_first_last_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } else difquant_three_eights_tensors(qs.i_attention_wo, qs.n_attention_wo) ? GGML_TYPE_Q6_K : GGML_TYPE_Q5_K; } @@ -16857,7 +16780,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { new_type = qs.i_attention_wv < qs.n_attention_wv/8 ? GGML_TYPE_Q5_K : - difquant_more_fl_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + difquant_fl_more_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else difquant_three_eights_tensors(qs.i_attention_wv, qs.n_attention_wv) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } @@ -16877,89 +16800,230 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; - else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; - else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; } ++qs.i_ffn_gate; + } else if (name.find("ffn_down") != std::string::npos) { + auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); + int i_layer = info.first, n_layer = info.second; + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = GGML_TYPE_IQ2_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = GGML_TYPE_IQ2_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + else new_type = GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = GGML_TYPE_IQ4_XS; + } + else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) { + new_type = GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) { + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : + difquant_fl_more_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + } + else difquant_three_eights_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + else new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + else new_type = GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { + if (arch == LLM_ARCH_FALCON) { + new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K : + difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; + } else { + if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; + } + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { + new_type = GGML_TYPE_Q5_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; + else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0) + && qs.has_imatrix && i_layer < n_layer/8) { + // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0. + // We only do it when an imatrix is provided because a) we want to make sure that one can always get the + // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix. + new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; + } + ++qs.i_ffn_down; } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; @@ -16975,86 +17039,95 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { + if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) + new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; - else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; - else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; - else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; + else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) - new_type = (difquant_more_fl_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; + new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; else new_type = (difquant_first_last_tensors(i_layer, n_layer)) ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; } ++qs.i_ffn_up;