From f99df46f982dba25cb250a54f55e5565a108694a Mon Sep 17 00:00:00 2001
From: Stanisław Szymczyk
Date: Sun, 19 May 2024 19:59:03 +0200
Subject: [PATCH] Replaced hardcoded mscale value with rescaling attn_factor
 that results in the final mscale value equal to 1.0.

---
 ggml.c    |  2 --
 llama.cpp | 12 ++++++++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/ggml.c b/ggml.c
index 2618edf59..55152bce4 100644
--- a/ggml.c
+++ b/ggml.c
@@ -14073,8 +14073,6 @@ static void rope_yarn(
         // Get n-d magnitude scaling corrected for interpolation
         mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
     }
-    // TODO ugly hack for DeepSeek-V2 until we find a solution
-    mscale = 1.0;
     *cos_theta = cosf(theta) * mscale;
     *sin_theta = sinf(theta) * mscale;
 }
diff --git a/llama.cpp b/llama.cpp
index ac76d4c58..19b30ba56 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10949,6 +10949,14 @@ struct llm_build_context {
         const float mscale = 1.0f + 0.1f * hparams.mscale_all_dim * logf(1.0f / freq_scale);
         const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
 
+        // DeepSeek-V2 uses a non-standard YaRN mscale calculation based on the mscale and
+        // mscale_all_dim config.json parameters. However, both of these are equal to 0.707
+        // in the released models, which results in a final mscale value of 1.0. To get the
+        // same value here we pre-scale the attn_factor.
+        // TODO: get rid of this once other models adopt the DeepSeek-V2 variant of the
+        // mscale calculation, which will require an API change.
+        const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+
         // kept original names of these parameters from HF transformers code for clarity
         const uint32_t qk_rope_head_dim = hparams.n_rot;
         const uint32_t qk_nope_head_dim = hparams.n_embd_head_k - hparams.n_rot;
@@ -11040,7 +11048,7 @@ struct llm_build_context {
 
                 q_pe = ggml_rope_custom(
                     ctx0, q_pe, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
+                    ext_factor, attn_factor_scaled, beta_fast, beta_slow
                 );
                 cb(q_pe, "q_pe", il);
 
@@ -11048,7 +11056,7 @@ struct llm_build_context {
                 k_pe = ggml_rope_custom(
                     ctx0, ggml_view_3d(ctx0, k_pe, qk_rope_head_dim, 1, n_tokens, k_pe->nb[0], k_pe->nb[1], 0),
                     inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow
+                    ext_factor, attn_factor_scaled, beta_fast, beta_slow
                 );
                 cb(k_pe, "k_pe", il);
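
For reference, here is a minimal standalone sketch (not part of the patch) of why the pre-scaling gives the intended result: rope_yarn() multiplies attn_factor by the YaRN magnitude correction 1 + 0.1 * ln(1 / freq_scale), so dividing attn_factor by that same factor up front makes the final mscale come out to exactly 1.0, which is the value the removed hardcoded hack forced. The freq_scale of 1/40 below is an assumption corresponding to a YaRN scaling factor of 40 and is only there to make the numbers concrete.

// Standalone check (hypothetical example, not part of the patch): reproduces the
// mscale arithmetic from ggml's rope_yarn() to show that pre-scaling attn_factor
// by 1 / (1 + 0.1 * ln(1/freq_scale)) yields a final mscale of exactly 1.0.
#include <math.h>
#include <stdio.h>

int main(void) {
    // Assumed values: freq_scale = 1/40 corresponds to a YaRN scaling factor of 40,
    // attn_factor = 1.0f is the default value passed to the rope op.
    const float freq_scale  = 1.0f / 40.0f;
    const float attn_factor = 1.0f;
    const float yarn_corr   = 1.0f + 0.1f * logf(1.0f / freq_scale);

    // Without the patch: rope_yarn() scales attn_factor by the YaRN correction,
    // giving ~1.369 instead of the 1.0 that DeepSeek-V2 expects.
    const float mscale_plain = attn_factor * yarn_corr;

    // With the patch: attn_factor is pre-divided by the same correction,
    // so the product inside rope_yarn() collapses back to 1.0.
    const float attn_factor_scaled = attn_factor / yarn_corr;
    const float mscale_patched     = attn_factor_scaled * yarn_corr;

    printf("mscale without pre-scaling: %f\n", mscale_plain);
    printf("mscale with    pre-scaling: %f\n", mscale_patched);
    return 0;
}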