Added expert_weights_scale parameter for scaling MoE gate weights.
commit 7e4786bbfb
parent 6050941653

4 changed files with 22 additions and 1 deletion
convert-hf-to-gguf.py

@@ -2412,6 +2412,7 @@ class DeepseekV2Model(Model):
         self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
         self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
         self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
         if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
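The scale written here is taken verbatim from the checkpoint's HF configuration. For reference, a minimal sketch of pulling the same hyperparameters out of a DeepSeek-V2 config.json (the checkpoint path is hypothetical; the key names match the hparams[...] lookups above):

    import json

    # Hypothetical local checkpoint directory.
    with open("DeepSeek-V2/config.json") as f:
        hparams = json.load(f)

    print(hparams["n_routed_experts"])       # number of routed experts
    print(hparams["n_shared_experts"])       # always-active shared experts
    print(hparams["routed_scaling_factor"])  # becomes expert_weights_scale in GGUF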
gguf-py/gguf/constants.py

@@ -45,6 +45,7 @@ class Keys:
         EXPERT_COUNT         = "{arch}.expert_count"
         EXPERT_USED_COUNT    = "{arch}.expert_used_count"
         EXPERT_SHARED_COUNT  = "{arch}.expert_shared_count"
+        EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
         POOLING_TYPE         = "{arch}.pooling_type"
         LOGIT_SCALE          = "{arch}.logit_scale"
gguf-py/gguf/gguf_writer.py

@@ -418,6 +418,9 @@ class GGUFWriter:
     def add_expert_shared_count(self, count: int) -> None:
         self.add_uint32(Keys.LLM.EXPERT_SHARED_COUNT.format(arch=self.arch), count)
 
+    def add_expert_weights_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
+
     def add_layer_norm_eps(self, value: float) -> None:
         self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
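For context, a minimal sketch of emitting the new key through the gguf-py API on its own, assuming the package from this repo is installed; the file name and scale value are illustrative, not from this commit:

    from gguf import GGUFWriter

    w = GGUFWriter("deepseek2.gguf", arch="deepseek2")
    w.add_expert_weights_scale(16.0)  # e.g. a DeepSeek-V2 routed_scaling_factor
    w.write_header_to_file()
    w.write_kv_data_to_file()
    w.write_tensors_to_file()
    w.close()

Since add_float32 stores the value as a float32 KV pair, the loader below reads it back with the same type.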
llama.cpp (18 changes)
@@ -296,6 +296,7 @@ enum llm_kv {
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_EXPERT_SHARED_COUNT,
+    LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
@@ -378,6 +379,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_COUNT,          "%s.expert_count"          },
     { LLM_KV_EXPERT_USED_COUNT,     "%s.expert_used_count"     },
     { LLM_KV_EXPERT_SHARED_COUNT,   "%s.expert_shared_count"   },
+    { LLM_KV_EXPERT_WEIGHTS_SCALE,  "%s.expert_weights_scale"  },
     { LLM_KV_POOLING_TYPE,          "%s.pooling_type"          },
     { LLM_KV_LOGIT_SCALE,           "%s.logit_scale"           },
@@ -1819,6 +1821,7 @@ struct llama_hparams {
     uint32_t n_lora_kv = 0;
     uint32_t n_expert_ff = 0;
     uint32_t n_expert_shared = 0;
+    float    expert_weights_scale = 0.0;
 
     float f_norm_eps;
     float f_norm_rms_eps;
@@ -1881,6 +1884,7 @@ struct llama_hparams {
         if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
         if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+        if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
 
         return false;
     }
@@ -4338,6 +4342,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_expert_ff);
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
 
                 model.type = e_model::MODEL_UNKNOWN;
             } break;
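Note that no explicit "required" argument is passed here, and ml.get_key treats keys as required by default, so DeepSeek-V2 GGUF files converted before this commit would presumably fail to load at this point and need to be reconverted.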
@@ -6659,6 +6664,8 @@ static struct ggml_tensor * llm_build_moe_ffn(
         int64_t n_expert_used,
         llm_ffn_op_type type_op,
         bool norm_w,
+        bool scale_w,
+        float w_scale,
         const llm_build_cb & cb,
         int il) {
     int64_t n_embd = cur->ne[0];
@@ -6690,6 +6697,10 @@ static struct ggml_tensor * llm_build_moe_ffn(
 
         weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens);
     }
+    if (scale_w) {
+        weights = ggml_scale(ctx, weights, w_scale);
+        cb(weights, "ffn_moe_weights_scaled", il);
+    }
 
     cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
     ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
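To make the routing semantics concrete, here is a minimal NumPy sketch of the weight path through llm_build_moe_ffn (names and shapes are illustrative; the real code builds ggml graph ops rather than computing values directly):

    import numpy as np

    def moe_gate_weights(logits, n_expert_used, norm_w, scale_w, w_scale):
        # logits: [n_tokens, n_expert] raw router scores
        probs = np.exp(logits - logits.max(-1, keepdims=True))
        probs /= probs.sum(-1, keepdims=True)                  # softmax over experts
        top = np.argsort(-probs, axis=-1)[:, :n_expert_used]   # top-k expert indices
        weights = np.take_along_axis(probs, top, axis=-1)
        if norm_w:   # Mixtral-style: renormalize top-k weights to sum to 1
            weights /= weights.sum(-1, keepdims=True)
        if scale_w:  # new in this commit: ggml_scale(ctx, weights, w_scale)
            weights *= w_scale
        return top, weights

The call sites below pass false, 0.0 for the two new arguments, so the existing MoE graphs are unchanged; only the deepseek2 graph at the end enables scaling.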
@@ -7306,6 +7317,7 @@ struct llm_build_context {
                         model.layers[il].ffn_down_exps,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
+                        false, 0.0,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
             }
@@ -7787,6 +7799,7 @@ struct llm_build_context {
                         model.layers[il].ffn_down_exps,
                         n_expert, n_expert_used,
                         LLM_FFN_GELU, true,
+                        false, 0.0,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
@@ -7930,6 +7943,7 @@ struct llm_build_context {
                         model.layers[il].ffn_down_exps,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
+                        false, 0.0,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
@@ -9275,6 +9289,7 @@ struct llm_build_context {
                         model.layers[il].ffn_down_exps,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, false,
+                        false, 0.0,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
@@ -11093,7 +11108,8 @@ struct llm_build_context {
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
                         n_expert, n_expert_used,
-                        LLM_FFN_SILU, true,
+                        LLM_FFN_SILU, false,
+                        true, hparams.expert_weights_scale,
                         cb, il);
                 cb(moe_out, "ffn_moe_out", il);
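The flag flip in this last hunk is deliberate: for deepseek2 the top-k softmax weights are left unnormalized (norm_w goes from true to false) and are instead multiplied by hparams.expert_weights_scale, matching DeepSeek-V2's design of scaling the routed experts' contribution by routed_scaling_factor before it is combined with the shared experts' output.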