From 7be56da99a903045bf1f29d93e7dfec7ab097f9e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?=
Date: Mon, 20 May 2024 18:51:23 +0200
Subject: [PATCH] Added YaRN log multiplier model header parameter
 corresponding to the multiplier of the ln(s) from the
 sqrt(1/t) = 0.1 ln(s) + 1 equation.

---
 convert-hf-to-gguf.py       | 1 +
 gguf-py/gguf/constants.py   | 1 +
 gguf-py/gguf/gguf_writer.py | 3 +++
 llama.cpp                   | 9 ++++++---
 4 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index d03f4d9e4..b9f893cac 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2420,6 +2420,7 @@ class DeepseekV2Model(Model):
                 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
                 self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
                 self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+                self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1*hparams["rope_scaling"]["mscale_all_dim"])
 
     _experts: list[dict[str, Tensor]] | None = None
 
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 1739f14d6..1a579dc80 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -69,6 +69,7 @@ class Keys:
         SCALING_FACTOR       = "{arch}.rope.scaling.factor"
         SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
         SCALING_FINETUNED    = "{arch}.rope.scaling.finetuned"
+        SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"
 
     class SSM:
         CONV_KERNEL = "{arch}.ssm.conv_kernel"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index da6e686a4..c834efd7f 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -457,6 +457,9 @@ class GGUFWriter:
     def add_rope_scaling_finetuned(self, value: bool) -> None:
         self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value)
 
+    def add_rope_scaling_yarn_log_mul(self, value: float) -> None:
+        self.add_float32(Keys.Rope.SCALING_YARN_LOG_MUL.format(arch=self.arch), value)
+
     def add_ssm_conv_kernel(self, value: int) -> None:
         self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value)
 
diff --git a/llama.cpp b/llama.cpp
index f563c52be..d081f0820 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -319,6 +319,7 @@ enum llm_kv {
     LLM_KV_ROPE_SCALING_FACTOR,
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
+    LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
 
     LLM_KV_SPLIT_NO,
     LLM_KV_SPLIT_COUNT,
@@ -402,6 +403,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALING_FACTOR,       "%s.rope.scaling.factor"                  },
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED,    "%s.rope.scaling.finetuned"               },
+    { LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier"     },
 
     { LLM_KV_SPLIT_NO,                  "split.no"                                },
     { LLM_KV_SPLIT_COUNT,               "split.count"                             },
@@ -1829,8 +1831,7 @@ struct llama_hparams {
     float    rope_freq_base_train;
     float    rope_freq_scale_train;
     uint32_t n_yarn_orig_ctx;
-    // TODO read from the model file
-    float mscale_all_dim = 0.707;
+    float    rope_yarn_log_mul;
 
     // for State Space Models
     uint32_t ssm_d_conv = 0;
@@ -1885,6 +1886,7 @@ struct llama_hparams {
         if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
         if (!is_float_close(this->expert_weights_scale,  other.expert_weights_scale,  EPSILON)) return true;
+        if (!is_float_close(this->rope_yarn_log_mul,     other.rope_yarn_log_mul,     EPSILON)) return true;
 
         return false;
     }
@@ -4343,6 +4345,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_expert_ff);
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,        hparams.n_expert_shared);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,       hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,  hparams.rope_yarn_log_mul);
 
                 model.type = e_model::MODEL_UNKNOWN;
             } break;
@@ -10948,7 +10951,7 @@ struct llm_build_context {
         // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
         // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-        const float mscale = 1.0f + 0.1f * hparams.mscale_all_dim * logf(1.0f / freq_scale);
+        const float mscale = 1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale);
         const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
         const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
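
Note (not part of the patch): a minimal Python sketch of how the new
"{arch}.rope.scaling.yarn_log_multiplier" header value relates to the
formulas above. The patch stores 0.1 * mscale_all_dim at conversion time so
that llama.cpp no longer hardcodes mscale_all_dim = 0.707; at graph-build
time the multiplier scales ln(1/freq_scale) exactly as in the replaced line.
The numbers for mscale_all_dim, the scaling factor, and the head size below
are example assumptions, not values taken from any particular model.

import math

# Conversion side: value written to the GGUF header (mirrors convert-hf-to-gguf.py).
mscale_all_dim = 0.707          # example rope_scaling["mscale_all_dim"] (assumption)
rope_yarn_log_mul = 0.1 * mscale_all_dim

# Load/inference side: mirrors the llama.cpp hunk above.
factor = 40.0                   # example YaRN rope_scaling["factor"] (assumption)
freq_scale = 1.0 / factor       # so 1/freq_scale = factor = s
mscale = 1.0 + rope_yarn_log_mul * math.log(1.0 / freq_scale)

n_embd_head_k = 192             # example attention head size (assumption)
kq_scale = 1.0 * mscale * mscale / math.sqrt(float(n_embd_head_k))
attn_factor_scaled = 1.0 / (1.0 + 0.1 * math.log(1.0 / freq_scale))

print(rope_yarn_log_mul, mscale, kq_scale, attn_factor_scaled)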