Revamp the output weight quantization a bit

for more granularity in low quants.
This commit is contained in:
Nexesenex 2024-08-23 16:40:40 +02:00
parent f796954872
commit 6b5cebfb2b

View file

@ -16426,23 +16426,35 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
new_type = GGML_TYPE_Q8_0; new_type = GGML_TYPE_Q8_0;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
else new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K;
else new_type = GGML_TYPE_Q6_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ4_XS; else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ4_XS;
else if (qs.model.hparams.n_gqa() >= 2 || qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q4_K;
else new_type = GGML_TYPE_IQ4_XS;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
else new_type = GGML_TYPE_Q4_K; else new_type = GGML_TYPE_Q4_K;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL ||
ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K; else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K;
else new_type = GGML_TYPE_Q5_K; else new_type = GGML_TYPE_Q5_K;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
else new_type = GGML_TYPE_Q5_K; else new_type = GGML_TYPE_Q5_K;
} }
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K; if (qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q6_K;
else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K; else if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K;
else new_type = GGML_TYPE_Q6_K; else new_type = GGML_TYPE_Q6_K;