From 1c4da5ddac23de5e29209a05a7e13bccdd8d59e4 Mon Sep 17 00:00:00 2001
From: Nexesenex <124105151+Nexesenex@users.noreply.github.com>
Date: Mon, 25 Mar 2024 20:37:11 +0100
Subject: [PATCH] Update llama.cpp - Embeddings and output tensors strategy.

---
 llama.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 7f1a54e21..6427fbe4a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12447,6 +12447,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
+            if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q5_K;
+            else new_type = GGML_TYPE_IQ4_XS;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             new_type = GGML_TYPE_Q5_K;
@@ -12462,6 +12466,10 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
             new_type = GGML_TYPE_Q2_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_XS) {
+            if (qs.model.hparams.n_expert >= 2) new_type = GGML_TYPE_Q2_K;
+            else new_type = GGML_TYPE_IQ2_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             new_type = GGML_TYPE_IQ3_S;
         }
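
Reviewer note (not part of the patch): the rule the two hunks encode can be read
as one small standalone helper. The sketch below is a hypothetical
re-expression, not actual llama.cpp code: the enums are local stand-ins whose
names mirror the real ggml values, pick_iq1_xs_type is an invented name, and
the output-vs-embedding split follows the subject line's description of which
tensor each hunk targets.

    #include <cstdint>
    #include <cstdio>

    // Local stand-ins mirroring the relevant ggml type names (not the real headers).
    enum ggml_type { GGML_TYPE_IQ2_S, GGML_TYPE_IQ4_XS, GGML_TYPE_Q2_K, GGML_TYPE_Q5_K };

    // The two tensors the patch touches.
    enum tensor_kind { TENSOR_OUTPUT, TENSOR_TOKEN_EMBD };

    // For LLAMA_FTYPE_MOSTLY_IQ1_XS, MoE models (n_expert >= 2) get a larger
    // quantization type for the output and token-embedding tensors than dense
    // models do, since these tensors are more damage-sensitive in MoE setups.
    static ggml_type pick_iq1_xs_type(tensor_kind kind, uint32_t n_expert) {
        const bool is_moe = n_expert >= 2;
        if (kind == TENSOR_OUTPUT) {
            return is_moe ? GGML_TYPE_Q5_K : GGML_TYPE_IQ4_XS; // first hunk
        }
        return is_moe ? GGML_TYPE_Q2_K : GGML_TYPE_IQ2_S;      // second hunk
    }

    int main() {
        // Dense model: output -> IQ4_XS, embeddings -> IQ2_S.
        std::printf("dense output: %d\n", pick_iq1_xs_type(TENSOR_OUTPUT, 0));
        // MoE model: output -> Q5_K, embeddings -> Q2_K.
        std::printf("moe embd:     %d\n", pick_iq1_xs_type(TENSOR_TOKEN_EMBD, 8));
        return 0;
    }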