From 140eb292644f201aadc042392419dea0da236ecc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stanis=C5=82aw=20Szymczyk?=
Date: Fri, 3 Jan 2025 13:51:14 +0100
Subject: [PATCH] gguf-py, llama : rename expert_weights to exp_probs in
 tensor and variable names

---
 gguf-py/gguf/constants.py      |  6 +++---
 gguf-py/gguf/tensor_mapping.py |  2 +-
 src/llama.cpp                  | 18 +++++++++---------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 1302000ee..ef795c04e 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -314,7 +314,7 @@ class MODEL_TENSOR(IntEnum):
     FFN_GATE_SHEXP       = auto()
     FFN_DOWN_SHEXP       = auto()
     FFN_UP_SHEXP         = auto()
-    FFN_EXPERT_WEIGHTS_B = auto()
+    FFN_EXP_PROBS_B      = auto()
     ATTN_Q_NORM          = auto()
     ATTN_K_NORM          = auto()
     LAYER_OUT_NORM       = auto()
@@ -499,7 +499,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.FFN_GATE_EXP:           "blk.{bid}.ffn_gate_exps",
     MODEL_TENSOR.FFN_DOWN_EXP:           "blk.{bid}.ffn_down_exps",
     MODEL_TENSOR.FFN_UP_EXP:             "blk.{bid}.ffn_up_exps",
-    MODEL_TENSOR.FFN_EXPERT_WEIGHTS_B:   "blk.{bid}.expert_weights_b",
+    MODEL_TENSOR.FFN_EXP_PROBS_B:        "blk.{bid}.exp_probs_b",
     MODEL_TENSOR.LAYER_OUT_NORM:         "blk.{bid}.layer_output_norm",
     MODEL_TENSOR.SSM_IN:                 "blk.{bid}.ssm_in",
     MODEL_TENSOR.SSM_CONV1D:             "blk.{bid}.ssm_conv1d",
@@ -1280,7 +1280,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE_SHEXP,
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
-        MODEL_TENSOR.FFN_EXPERT_WEIGHTS_B,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
     ],
     MODEL_ARCH.CHATGLM : [
         MODEL_TENSOR.TOKEN_EMBD,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index f48769ad1..efe2a4aa4 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -276,7 +276,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
         ),
 
-        MODEL_TENSOR.FFN_EXPERT_WEIGHTS_B: (
+        MODEL_TENSOR.FFN_EXP_PROBS_B: (
             "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
         ),
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 1c8479c96..1ab930e3e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -564,7 +564,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_SHEXP,
     LLM_TENSOR_FFN_GATE_SHEXP,
     LLM_TENSOR_FFN_UP_SHEXP,
-    LLM_TENSOR_FFN_EXPERT_WEIGHTS_B,
+    LLM_TENSOR_FFN_EXP_PROBS_B,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
@@ -1434,7 +1434,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_GATE_SHEXP,       "blk.%d.ffn_gate_shexp" },
             { LLM_TENSOR_FFN_DOWN_SHEXP,       "blk.%d.ffn_down_shexp" },
             { LLM_TENSOR_FFN_UP_SHEXP,         "blk.%d.ffn_up_shexp" },
-            { LLM_TENSOR_FFN_EXPERT_WEIGHTS_B, "blk.%d.expert_weights_b" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B,      "blk.%d.exp_probs_b" },
         },
     },
     {
@@ -2934,7 +2934,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_b = nullptr; // b2
     struct ggml_tensor * ffn_up_b   = nullptr; // b3
     struct ggml_tensor * ffn_act    = nullptr;
-    struct ggml_tensor * ffn_expert_weights_bias = nullptr;
+    struct ggml_tensor * ffn_exp_probs_b = nullptr;
 
     // mamba proj
     struct ggml_tensor * ssm_in = nullptr;
@@ -7480,7 +7480,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
     {LLM_TENSOR_FFN_DOWN_EXPS,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_GATE_EXPS,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_UP_EXPS,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
-    {LLM_TENSOR_FFN_EXPERT_WEIGHTS_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_FFN_EXP_PROBS_B,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
     {LLM_TENSOR_CONV1D,                     {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
@@ -9283,7 +9283,7 @@ static bool llm_load_tensors(
                         layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
                     } else {
                         layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
-                        layer.ffn_expert_weights_bias = create_tensor(tn(LLM_TENSOR_FFN_EXPERT_WEIGHTS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                         if (n_expert == 0) {
                             throw std::runtime_error("n_expert must be > 0");
@@ -10285,22 +10285,22 @@ llm_expert_gating_func_type gating_op,
         case LLM_EXPERT_GATING_FUNC_SOFTMAX:
             {
                 probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
-                cb(probs, "ffn_moe_probs", il);
             } break;
         case LLM_EXPERT_GATING_FUNC_SIGMOID:
             {
                 probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens]
-                cb(probs, "ffn_moe_sigm", il);
             } break;
         default:
             GGML_ABORT("fatal error");
     }
+    cb(probs, "ffn_moe_probs", il);
 
     // add experts selection bias - introduced in DeepSeek V3
+    // leave probs unbiased as it's later used to get expert weights
     ggml_tensor * selection_probs = probs;
     if (expert_weights_b != nullptr) {
         selection_probs = ggml_add(ctx, probs, expert_weights_b);
-        cb(selection_probs, "ffn_moe_sigm_biased", il);
+        cb(selection_probs, "ffn_moe_probs_biased", il);
     }
 
     // select experts
@@ -16241,7 +16241,7 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
-                        model.layers[il].ffn_expert_weights_bias,
+                        model.layers[il].ffn_exp_probs_b,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, hparams.expert_weights_norm,
                         true, hparams.expert_weights_scale,
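Note for reviewers: the renamed exp_probs_b tensor is the DeepSeek V3 expert-selection
bias. As the new comment in llm_build_moe_ffn says, it only steers which experts are
selected; the unbiased probs still provide the expert weights. Below is a minimal NumPy
sketch of that logic, assuming sigmoid gating. The name moe_select and its signature are
illustrative only, not the real implementation, which builds the equivalent ggml graph ops.

    import numpy as np

    def moe_select(logits, exp_probs_b, n_expert_used):
        # unbiased probs: later reused as the expert weights
        probs = 1.0 / (1.0 + np.exp(-logits))    # sigmoid gating, [n_tokens, n_expert]
        # the bias influences selection only (exp_probs_b broadcast over tokens)
        selection_probs = probs + exp_probs_b
        top_k = np.argsort(-selection_probs, axis=-1)[:, :n_expert_used]
        # gather weights from the *unbiased* probs at the selected expert ids
        weights = np.take_along_axis(probs, top_k, axis=-1)
        return top_k, weights

    # e.g. 4 tokens, 8 experts, top-2 routing; zero bias when exp_probs_b is absent
    ids, w = moe_select(np.random.randn(4, 8), np.zeros(8), n_expert_used=2)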