Added a YaRN log multiplier model header parameter corresponding to the multiplier of ln(s) in the equation sqrt(1/t) = 0.1 ln(s) + 1.

Stanisław Szymczyk 2024-05-20 18:51:23 +02:00
parent 68a5103026
commit 7be56da99a
4 changed files with 11 additions and 3 deletions
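
For context on the equation in the commit message: s is the YaRN context-extension factor and t is the attention temperature, related by sqrt(1/t) = 0.1 ln(s) + 1. The sketch below (illustrative numbers only, not part of the commit) evaluates that relation in Python; the new header parameter stores the coefficient in front of ln(s), which for models like DeepSeek-V2 is 0.1 scaled by the model's mscale_all_dim rather than the plain 0.1.

    import math

    # Illustrative YaRN numbers, not taken from any model file:
    # s is the context-extension factor, t the attention temperature.
    s = 40.0
    sqrt_inv_t = 0.1 * math.log(s) + 1.0   # sqrt(1/t), computed from 0.1 ln(s) + 1
    t = 1.0 / sqrt_inv_t ** 2
    print(sqrt_inv_t, t)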

convert-hf-to-gguf.py View file

@@ -2420,6 +2420,7 @@ class DeepseekV2Model(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1*hparams["rope_scaling"]["mscale_all_dim"])
_experts: list[dict[str, Tensor]] | None = None
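
The value written here is 0.1 times the model's mscale_all_dim, so the GGUF header carries the complete ln(s) multiplier instead of llama.cpp hard-coding 0.707 (the constant removed further down in this commit). A minimal sketch of that derivation, using an illustrative DeepSeek-V2-style rope_scaling block from config.json:

    # Illustrative DeepSeek-V2-style rope_scaling section; only mscale_all_dim = 0.707
    # is taken from this commit (the constant removed from llama.cpp below).
    rope_scaling = {
        "type": "yarn",
        "factor": 40,
        "original_max_position_embeddings": 4096,
        "mscale_all_dim": 0.707,
    }

    yarn_log_mul = 0.1 * rope_scaling["mscale_all_dim"]
    print(yarn_log_mul)  # ~0.0707, stored as {arch}.rope.scaling.yarn_log_multiplier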

gguf-py/gguf/constants.py View file

@@ -69,6 +69,7 @@ class Keys:
SCALING_FACTOR = "{arch}.rope.scaling.factor"
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
+SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"
class SSM:
CONV_KERNEL = "{arch}.ssm.conv_kernel"
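
The new constant uses the same {arch} template as the neighbouring rope-scaling keys. A small sketch of how it expands, assuming "deepseek2" is the architecture name llama.cpp uses for DeepSeek-V2:

    # Expanding the templated key for one architecture ("deepseek2" is assumed).
    key = "{arch}.rope.scaling.yarn_log_multiplier".format(arch="deepseek2")
    print(key)  # deepseek2.rope.scaling.yarn_log_multiplier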

gguf-py/gguf/gguf_writer.py View file

@@ -457,6 +457,9 @@ class GGUFWriter:
def add_rope_scaling_finetuned(self, value: bool) -> None:
self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value)
+def add_rope_scaling_yarn_log_mul(self, value: float) -> None:
+self.add_float32(Keys.Rope.SCALING_YARN_LOG_MUL.format(arch=self.arch), value)
def add_ssm_conv_kernel(self, value: int) -> None:
self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value)
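
A hedged usage sketch of the new writer method, producing a metadata-only GGUF file; the file name, architecture string and value are placeholders, and the write_*/close calls follow the existing gguf-py API:

    import gguf

    # Write a GGUF file carrying only the new KV pair (values are placeholders).
    writer = gguf.GGUFWriter("yarn-log-mul-demo.gguf", "deepseek2")
    writer.add_rope_scaling_yarn_log_mul(0.1 * 0.707)
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()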

llama.cpp View file

@@ -319,6 +319,7 @@ enum llm_kv {
LLM_KV_ROPE_SCALING_FACTOR,
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,
+LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
LLM_KV_SPLIT_NO,
LLM_KV_SPLIT_COUNT,
@@ -402,6 +403,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
{ LLM_KV_SPLIT_NO, "split.no" },
{ LLM_KV_SPLIT_COUNT, "split.count" },
@@ -1829,8 +1831,7 @@ struct llama_hparams {
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_yarn_orig_ctx;
-// TODO read from the model file
-float mscale_all_dim = 0.707;
+float rope_yarn_log_mul;
// for State Space Models
uint32_t ssm_d_conv = 0;
@@ -1885,6 +1886,7 @@ struct llama_hparams {
if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
+if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
return false;
}
@@ -4343,6 +4345,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_expert_ff);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
model.type = e_model::MODEL_UNKNOWN;
} break;
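
On the C++ side the key is read into hparams.rope_yarn_log_mul during llm_load_hparams. The same metadata can be checked from Python with gguf-py's reader; a sketch, assuming a converted DeepSeek-V2 file at a placeholder path and the "deepseek2" architecture name:

    import gguf

    # Inspect the new KV in a converted model file (path is a placeholder).
    reader = gguf.GGUFReader("deepseek-v2.gguf")
    field = reader.get_field("deepseek2.rope.scaling.yarn_log_multiplier")
    if field is not None:
        # For a scalar KV the value is stored in the part indexed by field.data[0].
        print(float(field.parts[field.data[0]][0]))  # e.g. ~0.0707 (= 0.1 * 0.707)
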
@@ -10948,7 +10951,7 @@ struct llm_build_context {
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-const float mscale = 1.0f + 0.1f * hparams.mscale_all_dim * logf(1.0f / freq_scale);
+const float mscale = 1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale);
const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
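
Because the convert script stores rope_yarn_log_mul = 0.1 * mscale_all_dim, the new expression reproduces the removed hard-coded one. Below is a quick check of that equivalence and of the resulting pre-scaled kq_scale; the numbers are illustrative (mscale_all_dim = 0.707 matches the constant removed above, while the YaRN factor of 40 and head size of 192 are assumptions chosen to mirror DeepSeek-V2):

    import math

    # Illustrative values: mscale_all_dim from the removed constant, the rest assumed.
    mscale_all_dim = 0.707
    rope_yarn_log_mul = 0.1 * mscale_all_dim   # what the convert script now writes
    freq_scale = 1.0 / 40.0                    # freq_scale = 1/s for a YaRN factor of 40

    old_mscale = 1.0 + 0.1 * mscale_all_dim * math.log(1.0 / freq_scale)
    new_mscale = 1.0 + rope_yarn_log_mul * math.log(1.0 / freq_scale)
    assert abs(old_mscale - new_mscale) < 1e-12   # identical by construction

    # Pre-scaled attention factor, mirroring llm_build_context above.
    n_embd_head_k = 192
    kq_scale = 1.0 * new_mscale * new_mscale / math.sqrt(float(n_embd_head_k))
    print(new_mscale, kq_scale)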