From 3898c111258a15cca4e54be584997856ef7979bf Mon Sep 17 00:00:00 2001
From: jukofyork <69222624+jukofyork@users.noreply.github.com>
Date: Fri, 28 Jun 2024 16:03:47 +0100
Subject: [PATCH] Fix for Deepseek-v2's low-rank attention weights

---
 src/llama.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index 3edaa98e8..e65b2ebf9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -16754,6 +16754,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name.find("ssm_x.weight") == std::string::npos;
         quantize &= name.find("ssm_dt.weight") == std::string::npos;
 
+        // do not quantize Deepseek-v2's low-rank attention weights
+        // NOTE: these would otherwise incur O(((w-q)^2)^2) rate-distortion (4th-power quantization error!)
+        quantize &= name.find("attn_q_a.weight") == std::string::npos;
+        quantize &= name.find("attn_q_b.weight") == std::string::npos;
+        quantize &= name.find("attn_kv_a_mqa.weight") == std::string::npos;
+        quantize &= name.find("attn_kv_b.weight") == std::string::npos;
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
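
Not part of the patch, just a reviewer's sketch: the same skip logic factored into a helper, in case the exemption list grows. The helper name is hypothetical; the substring-match idiom (name.find(...) == std::string::npos) is the one llama_model_quantize_internal already uses for the ssm_* tensors above. As I read the NOTE, the 4th-power claim follows from these weights being low-rank factors: the reconstructed product W ~= A*B picks up a cross term dA*dB from the two per-factor quantization errors, so its squared-error distortion carries a term on the order of the per-weight squared error, squared.

    #include <string>
    #include <vector>

    // Hypothetical helper (sketch only, not in the patch): returns true for
    // tensors that should stay at full precision. Matches by substring,
    // consistent with the existing checks in llama_model_quantize_internal.
    static bool is_quantization_exempt(const std::string & name) {
        static const std::vector<std::string> skip = {
            // Deepseek-v2's low-rank attention factors
            "attn_q_a.weight",
            "attn_q_b.weight",
            "attn_kv_a_mqa.weight",
            "attn_kv_b.weight",
        };
        for (const auto & pat : skip) {
            if (name.find(pat) != std::string::npos) {
                return true;
            }
        }
        return false;
    }

    // usage at the patch site:
    //     quantize &= !is_quantization_exempt(name);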