From 4ba561808d04dd1d122ca5af1a010099fdc95fd4 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Sat, 17 Aug 2024 12:31:36 +0200 Subject: [PATCH] Adapt token embeddings and output.weight to vocab size due to the huge increase of the embeddings and output weight size for models with huge vocab, they seem to quantize with less loss. --- src/llama.cpp | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index e54ae697d..f6afc6ebf 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -15877,15 +15877,23 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = GGML_TYPE_Q8_0; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { - new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ4_XS; + else new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K; + else new_type = GGML_TYPE_Q5_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) { new_type = GGML_TYPE_Q5_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K; + else new_type = GGML_TYPE_Q6_K; + } else if (new_type != GGML_TYPE_Q8_0) { new_type = GGML_TYPE_Q6_K; } @@ -15894,10 +15902,25 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { new_type = qs.params->token_embedding_type; } else { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_XS; + else new_type = GGML_TYPE_IQ2_S; + } + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { new_type = GGML_TYPE_IQ2_S; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) { + if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_S; + else new_type = GGML_TYPE_IQ3_XXS; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { + if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS; + else new_type = GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) { + if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_S; + else new_type = GGML_TYPE_IQ4_XS; + } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;