From 3898c111258a15cca4e54be584997856ef7979bf Mon Sep 17 00:00:00 2001
From: jukofyork <69222624+jukofyork@users.noreply.github.com>
Date: Fri, 28 Jun 2024 16:03:47 +0100
Subject: [PATCH] Fix for Deepseek-v2's low-rank attention weights

---
 src/llama.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 3edaa98e8..e65b2ebf9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16754,6 +16754,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name.find("ssm_x.weight") == std::string::npos;
         quantize &= name.find("ssm_dt.weight") == std::string::npos;
 
+        // do not quantize Deepseek-v2's low-rank attention weights
+        // NOTE: these would otherwise incur O(((w-q)^2)^2) rate-distortion (4th-power quantization error!)
+        quantize &= name.find("attn_q_a.weight") == std::string::npos;
+        quantize &= name.find("attn_q_b.weight") == std::string::npos;
+        quantize &= name.find("attn_kv_a_mqa.weight") == std::string::npos;
+        quantize &= name.find("attn_kv_b.weight") == std::string::npos;
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
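
Not part of the patch, just a reviewer's sketch: the same skip logic factored into a helper, in case the exemption list grows. The helper name is hypothetical; the substring-match idiom (name.find(...) == std::string::npos) is the one llama_model_quantize_internal already uses for the ssm_* tensors above. As I read the NOTE, the 4th-power claim follows from these weights being low-rank factors: the reconstructed product W ~= A*B picks up a cross term dA*dB from the two per-factor quantization errors, so its squared-error distortion carries a term on the order of the per-weight squared error, squared.

    #include <string>
    #include <vector>

    // Hypothetical helper (sketch only, not in the patch): returns true for
    // tensors that should stay at full precision. Matches by substring,
    // consistent with the existing checks in llama_model_quantize_internal.
    static bool is_quantization_exempt(const std::string & name) {
        static const std::vector<std::string> skip = {
            // Deepseek-v2's low-rank attention factors
            "attn_q_a.weight",
            "attn_q_b.weight",
            "attn_kv_a_mqa.weight",
            "attn_kv_b.weight",
        };
        for (const auto & pat : skip) {
            if (name.find(pat) != std::string::npos) {
                return true;
            }
        }
        return false;
    }

    // usage at the patch site:
    //     quantize &= !is_quantization_exempt(name);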