Slightly bump ffn_gate and ffn_down for some GQA<2 models

This commit is contained in:
Nexesenex 2024-08-24 22:30:45 +02:00
parent 53b8eaa316
commit 8fc46df134

View file

@ -16495,6 +16495,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q2_K; if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q2_K;
else new_type = GGML_TYPE_Q3_K; else new_type = GGML_TYPE_Q3_K;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_Q3_K;
else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
new_type = GGML_TYPE_IQ2_S; new_type = GGML_TYPE_IQ2_S;
@ -16512,7 +16516,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_IQ3_S; if (qs.model.hparams.n_vocab >= 127999 || qs.model.hparams.n_head() <= 20) new_type = GGML_TYPE_IQ3_S;
new_type = GGML_TYPE_IQ4_XS; else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_IQ4_XS;
} }
else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) { else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
new_type = GGML_TYPE_Q4_0; new_type = GGML_TYPE_Q4_0;
@ -16833,7 +16837,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) { else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@ -16852,7 +16856,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) { else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@ -16871,7 +16875,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@ -16886,7 +16890,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@ -16896,7 +16900,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@ -16905,7 +16909,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@ -16924,7 +16928,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@ -17055,7 +17059,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K : new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; difquant_half_tensors(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
} else { } else {
if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K; if (difquant_half_tensors(i_layer, n_layer)) new_type = GGML_TYPE_Q5_K;
} }
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) { else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
@ -17081,7 +17085,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K; else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q3_K : GGML_TYPE_Q2_K;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) { else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@ -17100,7 +17104,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) { else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XL) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@ -17119,7 +17123,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M; else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_XXS : GGML_TYPE_IQ1_M;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@ -17134,7 +17138,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS; else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ2_XS;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@ -17144,7 +17148,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S; else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ2_S;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@ -17153,7 +17157,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; new_type = (difquant_five_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
else new_type = (difquant_fl_more_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
@ -17172,7 +17176,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXL) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)
new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; new_type = (difquant_six_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
else new_type = (difquant_three_eights_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S; else new_type = (difquant_half_tensors(i_layer, n_layer)) ? GGML_TYPE_IQ4_XS : GGML_TYPE_IQ3_S;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_XSR) {
if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2)