Added a YaRN log multiplier model header parameter corresponding to the multiplier of ln(s) in the equation sqrt(1/t) = 0.1 ln(s) + 1.

Stanisław Szymczyk 2024-05-20 18:51:23 +02:00
parent 68a5103026
commit 7be56da99a
4 changed files with 11 additions and 3 deletions
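
For context on the equation in the commit message: s is the YaRN context-extension factor and t is the attention temperature, related by sqrt(1/t) = 0.1 ln(s) + 1. The sketch below (illustrative numbers only, not part of the commit) evaluates that relation in Python; the new header parameter stores the coefficient in front of ln(s), which for models like DeepSeek-V2 is 0.1 scaled by the model's mscale_all_dim rather than the plain 0.1.

    import math

    # Illustrative YaRN numbers, not taken from any model file:
    # s is the context-extension factor, t the attention temperature.
    s = 40.0
    sqrt_inv_t = 0.1 * math.log(s) + 1.0   # sqrt(1/t), computed from 0.1 ln(s) + 1
    t = 1.0 / sqrt_inv_t ** 2
    print(sqrt_inv_t, t)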

convert-hf-to-gguf.py View file

@@ -2420,6 +2420,7 @@ class DeepseekV2Model(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1*hparams["rope_scaling"]["mscale_all_dim"])
_experts: list[dict[str, Tensor]] | None = None
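
The value written here is 0.1 times the model's mscale_all_dim, so the GGUF header carries the complete ln(s) multiplier instead of llama.cpp hard-coding 0.707 (the constant removed further down in this commit). A minimal sketch of that derivation, using an illustrative DeepSeek-V2-style rope_scaling block from config.json:

    # Illustrative DeepSeek-V2-style rope_scaling section; only mscale_all_dim = 0.707
    # is taken from this commit (the constant removed from llama.cpp below).
    rope_scaling = {
        "type": "yarn",
        "factor": 40,
        "original_max_position_embeddings": 4096,
        "mscale_all_dim": 0.707,
    }

    yarn_log_mul = 0.1 * rope_scaling["mscale_all_dim"]
    print(yarn_log_mul)  # ~0.0707, stored as {arch}.rope.scaling.yarn_log_multiplier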

gguf-py/gguf/constants.py View file

@@ -69,6 +69,7 @@ class Keys:
SCALING_FACTOR = "{arch}.rope.scaling.factor"
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
+SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"
class SSM:
CONV_KERNEL = "{arch}.ssm.conv_kernel"
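
The new constant uses the same {arch} template as the neighbouring rope-scaling keys. A small sketch of how it expands, assuming "deepseek2" is the architecture name llama.cpp uses for DeepSeek-V2:

    # Expanding the templated key for one architecture ("deepseek2" is assumed).
    key = "{arch}.rope.scaling.yarn_log_multiplier".format(arch="deepseek2")
    print(key)  # deepseek2.rope.scaling.yarn_log_multiplier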

gguf-py/gguf/gguf_writer.py View file

@@ -457,6 +457,9 @@ class GGUFWriter:
def add_rope_scaling_finetuned(self, value: bool) -> None:
self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value)
+def add_rope_scaling_yarn_log_mul(self, value: float) -> None:
+self.add_float32(Keys.Rope.SCALING_YARN_LOG_MUL.format(arch=self.arch), value)
def add_ssm_conv_kernel(self, value: int) -> None:
self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value)
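
A hedged usage sketch of the new writer method, producing a metadata-only GGUF file; the file name, architecture string and value are placeholders, and the write_*/close calls follow the existing gguf-py API:

    import gguf

    # Write a GGUF file carrying only the new KV pair (values are placeholders).
    writer = gguf.GGUFWriter("yarn-log-mul-demo.gguf", "deepseek2")
    writer.add_rope_scaling_yarn_log_mul(0.1 * 0.707)
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()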

llama.cpp View file

@@ -319,6 +319,7 @@ enum llm_kv {
LLM_KV_ROPE_SCALING_FACTOR,
LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
LLM_KV_ROPE_SCALING_FINETUNED,
+LLM_KV_ROPE_SCALING_YARN_LOG_MUL,
LLM_KV_SPLIT_NO,
LLM_KV_SPLIT_COUNT,
@@ -402,6 +403,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
{ LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
{ LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
+{ LLM_KV_ROPE_SCALING_YARN_LOG_MUL, "%s.rope.scaling.yarn_log_multiplier" },
{ LLM_KV_SPLIT_NO, "split.no" },
{ LLM_KV_SPLIT_COUNT, "split.count" },
@@ -1829,8 +1831,7 @@ struct llama_hparams {
float rope_freq_base_train;
float rope_freq_scale_train;
uint32_t n_yarn_orig_ctx;
-// TODO read from the model file
-float mscale_all_dim = 0.707;
+float rope_yarn_log_mul;
// for State Space Models
uint32_t ssm_d_conv = 0;
@@ -1885,6 +1886,7 @@ struct llama_hparams {
if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
+if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
return false;
}
@@ -4343,6 +4345,7 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_expert_ff);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
model.type = e_model::MODEL_UNKNOWN;
} break;
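
On the C++ side the key is read into hparams.rope_yarn_log_mul during llm_load_hparams. The same metadata can be checked from Python with gguf-py's reader; a sketch, assuming a converted DeepSeek-V2 file at a placeholder path and the "deepseek2" architecture name:

    import gguf

    # Inspect the new KV in a converted model file (path is a placeholder).
    reader = gguf.GGUFReader("deepseek-v2.gguf")
    field = reader.get_field("deepseek2.rope.scaling.yarn_log_multiplier")
    if field is not None:
        # For a scalar KV the value is stored in the part indexed by field.data[0].
        print(float(field.parts[field.data[0]][0]))  # e.g. ~0.0707 (= 0.1 * 0.707)
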
@@ -10948,7 +10951,7 @@ struct llm_build_context {
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
-const float mscale = 1.0f + 0.1f * hparams.mscale_all_dim * logf(1.0f / freq_scale);
+const float mscale = 1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale);
const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
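
Because the convert script stores rope_yarn_log_mul = 0.1 * mscale_all_dim, the new expression reproduces the removed hard-coded one. Below is a quick check of that equivalence and of the resulting pre-scaled kq_scale; the numbers are illustrative (mscale_all_dim = 0.707 matches the constant removed above, while the YaRN factor of 40 and head size of 192 are assumptions chosen to mirror DeepSeek-V2):

    import math

    # Illustrative values: mscale_all_dim from the removed constant, the rest assumed.
    mscale_all_dim = 0.707
    rope_yarn_log_mul = 0.1 * mscale_all_dim   # what the convert script now writes
    freq_scale = 1.0 / 40.0                    # freq_scale = 1/s for a YaRN factor of 40

    old_mscale = 1.0 + 0.1 * mscale_all_dim * math.log(1.0 / freq_scale)
    new_mscale = 1.0 + rope_yarn_log_mul * math.log(1.0 / freq_scale)
    assert abs(old_mscale - new_mscale) < 1e-12   # identical by construction

    # Pre-scaled attention factor, mirroring llm_build_context above.
    n_embd_head_k = 192
    kq_scale = 1.0 * new_mscale * new_mscale / math.sqrt(float(n_embd_head_k))
    print(new_mscale, kq_scale)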