From 614c6e654438f7217eee9840415d975a80faf1d8 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Fri, 17 Jan 2025 11:23:29 -0700 Subject: [PATCH] fix: Use Q8_0 for all embedding quantizations for granite and granitemoe At lower precision levels, the models can manifest numerical instability, especially with batch size > 1. This shows up as nondeterministic stopping when index 0 (the EOG token) has a seemingly uninitialized large value in the logits. Branch: GraniteEmbedQuant Signed-off-by: Gabe Goodhart --- src/llama-quant.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index fb7982655..c66e6347a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -155,7 +155,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t const int64_t nx = tensor->ne[0]; const int64_t qk_k = ggml_blck_size(new_type); - if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) { + if (arch == LLM_ARCH_FALCON || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE || nx % qk_k != 0) { new_type = GGML_TYPE_Q8_0; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || @@ -171,7 +171,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { new_type = qs.params->token_embedding_type; } else { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || + if (arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) { + new_type = GGML_TYPE_Q8_0; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { new_type = GGML_TYPE_Q2_K; }