Adapt token embeddings and output.weight to vocab size

Because token_embd.weight and output.weight grow substantially in models with huge vocabularies, these tensors seem to tolerate more aggressive quantization with relatively less loss.
Nexesenex 2024-08-17 12:31:36 +02:00
parent 17b71512a6
commit 4ba561808d

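As a back-of-envelope check of the rationale (assumed shapes, not taken from the diff): with n_embd = 4096, growing the vocab from 32k to ~128k roughly quadruples token_embd.weight and output.weight, which is why they are worth shrinking first. The bits-per-weight figures below are the nominal ggml rates for Q6_K (210 bytes per 256 weights) and Q5_K (176 bytes per 256 weights).

// Worked size example; shapes are illustrative assumptions.
#include <cstdint>
#include <cstdio>

// Size in MiB of an n_vocab x n_embd tensor at a given bits-per-weight rate.
static double tensor_mib(uint64_t n_vocab, uint64_t n_embd, double bpw) {
    return n_vocab * n_embd * bpw / 8.0 / (1024.0 * 1024.0);
}

int main() {
    printf("32k vocab,  Q6_K (6.5625 bpw): %7.1f MiB\n", tensor_mib(32000,  4096, 6.5625));
    printf("128k vocab, Q6_K (6.5625 bpw): %7.1f MiB\n", tensor_mib(128256, 4096, 6.5625));
    printf("128k vocab, Q5_K (5.5 bpw):    %7.1f MiB\n", tensor_mib(128256, 4096, 5.5));
}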

@@ -15877,15 +15877,23 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
-            new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+            if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ4_XS;
+            else new_type = GGML_TYPE_Q4_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+            if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q4_K;
+            else new_type = GGML_TYPE_Q5_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ||
                  ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q2_K_L) {
             new_type = GGML_TYPE_Q5_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_Q5_K;
+            else new_type = GGML_TYPE_Q6_K;
+        }
         else if (new_type != GGML_TYPE_Q8_0) {
             new_type = GGML_TYPE_Q6_K;
         }
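Read as a table, the hunk above makes the following output.weight choices for the low-bit ftypes. A hedged restatement as a pure function; the helper name is hypothetical, and the Q8_0 and Q5_K context branches that the diff leaves unchanged are only mirrored here, not part of the change. It assumes this fork's extra LLAMA_FTYPE_MOSTLY_* enum values (IQ1_XS, IQ1_XL, IQ3_M, ...) are in scope.

// Hypothetical summary helper, not code from the commit.
static ggml_type sketch_output_weight_type(llama_ftype ftype, uint32_t n_vocab) {
    const bool huge_vocab = n_vocab >= 127999;
    switch (ftype) {
        case LLAMA_FTYPE_MOSTLY_IQ1_XS: case LLAMA_FTYPE_MOSTLY_IQ1_S: case LLAMA_FTYPE_MOSTLY_IQ1_M:
            return huge_vocab ? GGML_TYPE_IQ4_XS : GGML_TYPE_Q4_K;
        case LLAMA_FTYPE_MOSTLY_IQ1_XL: case LLAMA_FTYPE_MOSTLY_IQ2_XXS: case LLAMA_FTYPE_MOSTLY_IQ2_XS:
            return huge_vocab ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
        case LLAMA_FTYPE_MOSTLY_IQ3_XS: case LLAMA_FTYPE_MOSTLY_IQ3_S: case LLAMA_FTYPE_MOSTLY_IQ3_M:
            return huge_vocab ? GGML_TYPE_Q5_K : GGML_TYPE_Q6_K;
        default:
            return GGML_TYPE_Q6_K; // mirrors the trailing "else" in the hunk
    }
}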
@@ -15894,10 +15902,25 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
             new_type = qs.params->token_embedding_type;
         } else {
-            if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M ||
-                ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
+                if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_XS;
+                else new_type = GGML_TYPE_IQ2_S;
+            }
+            if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XL || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
                 new_type = GGML_TYPE_IQ2_S;
             }
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_XXS;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) new_type = GGML_TYPE_IQ3_XXS;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XL) {
+                if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ2_S;
+                else new_type = GGML_TYPE_IQ3_XXS;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+                if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_XXS;
+                else new_type = GGML_TYPE_IQ3_S;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XL) {
+                if (qs.model.hparams.n_vocab >= 127999) new_type = GGML_TYPE_IQ3_S;
+                else new_type = GGML_TYPE_IQ4_XS;
+            }
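The visible part of the token_embd.weight hunk restated the same way. This is a hedged sketch: the hunk header counts more added lines than are shown here, so branches past IQ3_XL are omitted, the helper name is hypothetical, and unmatched ftypes simply keep whatever type was already chosen.

// Hypothetical summary helper covering only the branches visible above.
static ggml_type sketch_token_embd_type(llama_ftype ftype, uint32_t n_vocab, ggml_type current) {
    const bool huge_vocab = n_vocab >= 127999;
    switch (ftype) {
        case LLAMA_FTYPE_MOSTLY_IQ1_XS: case LLAMA_FTYPE_MOSTLY_IQ1_S: case LLAMA_FTYPE_MOSTLY_IQ1_M:
            return huge_vocab ? GGML_TYPE_IQ2_XS : GGML_TYPE_IQ2_S;
        case LLAMA_FTYPE_MOSTLY_IQ1_XL: case LLAMA_FTYPE_MOSTLY_IQ2_XXS: case LLAMA_FTYPE_MOSTLY_IQ2_XS:
            return GGML_TYPE_IQ2_S; // unchanged by this commit
        case LLAMA_FTYPE_MOSTLY_IQ2_S: case LLAMA_FTYPE_MOSTLY_IQ2_M: case LLAMA_FTYPE_MOSTLY_IQ2_XL:
            return huge_vocab ? GGML_TYPE_IQ2_S : GGML_TYPE_IQ3_XXS;
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: case LLAMA_FTYPE_MOSTLY_IQ3_XS:
            return huge_vocab ? GGML_TYPE_IQ3_XXS : GGML_TYPE_IQ3_S;
        case LLAMA_FTYPE_MOSTLY_IQ3_XL:
            return huge_vocab ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ4_XS;
        default:
            return current; // remaining branches are truncated in this diff view
    }
}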