names : for brevity "SHARED_EXP" -> "SHEXP"

commit f88e6844a4
parent 7355ca84b5

3 changed files with 136 additions and 136 deletions
gguf-py/gguf/constants.py

@@ -148,7 +148,7 @@ class MODEL_TENSOR(IntEnum):
     ATTN_OUT_NORM = auto()
     ATTN_ROT_EMBD = auto()
     FFN_GATE_INP = auto()
-    FFN_GATE_INP_SHARED_EXP = auto()
+    FFN_GATE_INP_SHEXP = auto()
     FFN_NORM = auto()
     FFN_GATE = auto()
     FFN_DOWN = auto()
@@ -157,9 +157,9 @@ class MODEL_TENSOR(IntEnum):
     FFN_GATE_EXP = auto()
     FFN_DOWN_EXP = auto()
     FFN_UP_EXP = auto()
-    FFN_GATE_SHARED_EXP = auto()
-    FFN_DOWN_SHARED_EXP = auto()
-    FFN_UP_SHARED_EXP = auto()
+    FFN_GATE_SHEXP = auto()
+    FFN_DOWN_SHEXP = auto()
+    FFN_UP_SHEXP = auto()
     ATTN_Q_NORM = auto()
     ATTN_K_NORM = auto()
     LAYER_OUT_NORM = auto()
@@ -225,14 +225,14 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
     MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
     MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
-    MODEL_TENSOR.FFN_GATE_INP_SHARED_EXP: "blk.{bid}.ffn_gate_inp_shared_exp",
+    MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
     MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
     MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
     MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
     MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
-    MODEL_TENSOR.FFN_GATE_SHARED_EXP: "blk.{bid}.ffn_gate_shared_exp",
-    MODEL_TENSOR.FFN_DOWN_SHARED_EXP: "blk.{bid}.ffn_down_shared_exp",
-    MODEL_TENSOR.FFN_UP_SHARED_EXP: "blk.{bid}.ffn_up_shared_exp",
+    MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
+    MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
+    MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
     MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
     MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
     MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
@@ -493,10 +493,10 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE_EXP,
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
-        MODEL_TENSOR.FFN_GATE_INP_SHARED_EXP,
-        MODEL_TENSOR.FFN_GATE_SHARED_EXP,
-        MODEL_TENSOR.FFN_DOWN_SHARED_EXP,
-        MODEL_TENSOR.FFN_UP_SHARED_EXP,
+        MODEL_TENSOR.FFN_GATE_INP_SHEXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
     ],
     MODEL_ARCH.PLAMO: [
         MODEL_TENSOR.TOKEN_EMBD,
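Side note (not part of the commit): the TENSOR_NAMES values above are per-block templates, which gguf-py appears to instantiate with str.format on the block index. A minimal sketch of how the renamed shexp names expand:

    # Sketch (not from the commit): expanding the renamed per-block
    # name templates for block 0; str.format(bid=...) mirrors how
    # gguf-py appears to instantiate these templates.
    templates = [
        "blk.{bid}.ffn_gate_inp_shexp",
        "blk.{bid}.ffn_gate_shexp",
        "blk.{bid}.ffn_down_shexp",
        "blk.{bid}.ffn_up_shexp",
    ]
    for t in templates:
        print(t.format(bid=0))  # blk.0.ffn_gate_inp_shexp, ...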
gguf-py/gguf/tensor_mapping.py

@@ -213,7 +213,7 @@ class TensorNameMap:
             "transformer.blocks.{bid}.ffn.router.layer", # dbrx
         ),

-        MODEL_TENSOR.FFN_GATE_INP_SHARED_EXP: (
+        MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe
         ),

@@ -247,7 +247,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged)
         ),

-        MODEL_TENSOR.FFN_UP_SHARED_EXP: (
+        MODEL_TENSOR.FFN_UP_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
         ),

@@ -273,7 +273,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged)
         ),

-        MODEL_TENSOR.FFN_GATE_SHARED_EXP: (
+        MODEL_TENSOR.FFN_GATE_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
         ),

@@ -306,7 +306,7 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged)
         ),

-        MODEL_TENSOR.FFN_DOWN_SHARED_EXP: (
+        MODEL_TENSOR.FFN_DOWN_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
         ),
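As an illustration of what these tables are for (a sketch under assumed usage, not the real TensorNameMap API): each GGUF tensor key lists the Hugging Face checkpoint names that map onto it, so a qwen2moe shared-expert weight resolves to its new shexp name roughly like this:

    # Sketch (assumed usage, not the real TensorNameMap API): resolve a
    # Hugging Face tensor name to its GGUF name using the qwen2moe
    # entries shown above.
    gguf_to_hf = {
        "blk.{bid}.ffn_gate_inp_shexp": ("model.layers.{bid}.mlp.shared_expert_gate",),
        "blk.{bid}.ffn_up_shexp":       ("model.layers.{bid}.mlp.shared_expert.up_proj",),
        "blk.{bid}.ffn_gate_shexp":     ("model.layers.{bid}.mlp.shared_expert.gate_proj",),
        "blk.{bid}.ffn_down_shexp":     ("model.layers.{bid}.mlp.shared_expert.down_proj",),
    }

    def map_name(hf_name: str, bid: int) -> str | None:
        for gguf_tmpl, hf_tmpls in gguf_to_hf.items():
            if hf_name in (t.format(bid=bid) for t in hf_tmpls):
                return gguf_tmpl.format(bid=bid)
        return None

    print(map_name("model.layers.3.mlp.shared_expert.up_proj", 3))  # blk.3.ffn_up_shexp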
llama.cpp
@@ -431,7 +431,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_OUT_NORM,
     LLM_TENSOR_ATTN_ROT_EMBD,
     LLM_TENSOR_FFN_GATE_INP,
-    LLM_TENSOR_FFN_GATE_INP_SHARED_EXP,
+    LLM_TENSOR_FFN_GATE_INP_SHEXP,
     LLM_TENSOR_FFN_NORM,
     LLM_TENSOR_FFN_GATE,
     LLM_TENSOR_FFN_DOWN,
@@ -440,12 +440,12 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
     LLM_TENSOR_FFN_GATE_EXP,
     LLM_TENSOR_FFN_UP_EXP,
-    LLM_TENSOR_FFN_DOWN_SHARED_EXP,
-    LLM_TENSOR_FFN_GATE_SHARED_EXP,
-    LLM_TENSOR_FFN_UP_SHARED_EXP,
     LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
     LLM_TENSOR_FFN_GATE_EXPS,
     LLM_TENSOR_FFN_UP_EXPS,
+    LLM_TENSOR_FFN_DOWN_SHEXP,
+    LLM_TENSOR_FFN_GATE_SHEXP,
+    LLM_TENSOR_FFN_UP_SHEXP,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
@@ -759,10 +759,10 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
         { LLM_TENSOR_FFN_GATE_EXPS,           "blk.%d.ffn_gate_exps" },
         { LLM_TENSOR_FFN_DOWN_EXPS,           "blk.%d.ffn_down_exps" },
         { LLM_TENSOR_FFN_UP_EXPS,             "blk.%d.ffn_up_exps" },
-        { LLM_TENSOR_FFN_GATE_INP_SHARED_EXP, "blk.%d.ffn_gate_inp_shared_exp" },
-        { LLM_TENSOR_FFN_GATE_SHARED_EXP,     "blk.%d.ffn_gate_shared_exp" },
-        { LLM_TENSOR_FFN_DOWN_SHARED_EXP,     "blk.%d.ffn_down_shared_exp" },
-        { LLM_TENSOR_FFN_UP_SHARED_EXP,       "blk.%d.ffn_up_shared_exp" },
+        { LLM_TENSOR_FFN_GATE_INP_SHEXP,      "blk.%d.ffn_gate_inp_shexp" },
+        { LLM_TENSOR_FFN_GATE_SHEXP,          "blk.%d.ffn_gate_shexp" },
+        { LLM_TENSOR_FFN_DOWN_SHEXP,          "blk.%d.ffn_down_shexp" },
+        { LLM_TENSOR_FFN_UP_SHEXP,            "blk.%d.ffn_up_shexp" },
     },
 },
 {
@@ -1938,11 +1938,11 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_exps;
     struct ggml_tensor * ffn_up_exps;

-    // ff shared expert
-    struct ggml_tensor * ffn_gate_inp_shared_exp;
-    struct ggml_tensor * ffn_gate_shared_exp;
-    struct ggml_tensor * ffn_down_shared_exp;
-    struct ggml_tensor * ffn_up_shared_exp;
+    // ff shared expert (shexp)
+    struct ggml_tensor * ffn_gate_inp_shexp;
+    struct ggml_tensor * ffn_gate_shexp;
+    struct ggml_tensor * ffn_down_shexp;
+    struct ggml_tensor * ffn_up_shexp;

     // ff bias
     struct ggml_tensor * ffn_down_b; // b2
@@ -5205,10 +5205,10 @@ static bool llm_load_tensors(
                     layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert});

                     // Shared expert branch
-                    layer.ffn_gate_inp_shared_exp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHARED_EXP, "weight", i), {n_embd});
-                    layer.ffn_gate_shared_exp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHARED_EXP, "weight", i), {n_embd, n_ff});
-                    layer.ffn_down_shared_exp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHARED_EXP, "weight", i), { n_ff, n_embd});
-                    layer.ffn_up_shared_exp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHARED_EXP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd});
+                    layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd});
+                    layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff});
                 }
             } break;
         case LLM_ARCH_PHI2:
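Worth noting in the hunk above: the merged routed experts are created as 3D tensors with n_ff_exp per expert, while the shexp branch is an ordinary 2D FFN over the full n_ff, gated by a single {n_embd} vector. A sketch of the resulting shapes, with made-up sizes (the actual hyperparameters depend on the model):

    # Sketch of the shapes created above; the sizes here are invented
    # for illustration, not taken from any specific model.
    n_embd, n_ff, n_ff_exp, n_expert = 2048, 5632, 1408, 60
    shapes = {
        "ffn_up_exps":        (n_embd, n_ff_exp, n_expert),  # merged routed experts (3D)
        "ffn_gate_inp_shexp": (n_embd,),                     # one gate logit per token
        "ffn_gate_shexp":     (n_embd, n_ff),
        "ffn_down_shexp":     (n_ff, n_embd),
        "ffn_up_shexp":       (n_embd, n_ff),
    }
    for name, shape in shapes.items():
        print(f"{name}: {shape}")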
@@ -8622,31 +8622,31 @@ struct llm_build_context {
                 }
             }

-            ggml_tensor * gate_shared_exp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shared_exp, cur);
-            cb(gate_shared_exp, "ffn_moe_gate_inp_shared_exp", il);
-
-            // sigmoid
-            ggml_tensor * logits_shared_exp = ggml_silu(ctx0, gate_shared_exp);
-            cb(logits_shared_exp, "ffn_moe_logits_shared_exp", il);
-
-            ggml_tensor * probs_shared_exp = ggml_div(ctx0, logits_shared_exp, gate_shared_exp);
-            cb(probs_shared_exp, "ffn_moe_probs_shared_exp", il);
-
-            ggml_tensor * ffn_shared_exp = llm_build_ffn(ctx0, cur,
-                model.layers[il].ffn_up_shared_exp,   NULL,
-                model.layers[il].ffn_gate_shared_exp, NULL,
-                model.layers[il].ffn_down_shared_exp, NULL,
-                NULL,
-                LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-            cb(ffn_shared_exp, "ffn_moe_shared_exp", il);
-
-            ggml_tensor * ffn_shared_exp_out = ggml_mul(ctx0, ffn_shared_exp, probs_shared_exp);
-            cb(ffn_shared_exp_out, "ffn_moe_shared_exp_out", il);
-
-            moe_out = ggml_add(ctx0, moe_out, ffn_shared_exp_out);
-            cb(moe_out, "ffn_out", il);
-
-            cur = moe_out;
+            // FFN shared expert
+            {
+                ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur);
+                cb(cur_gate_inp, "ffn_shexp_gate_inp", il);
+
+                // sigmoid
+                ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp);
+                cb(cur_gate, "ffn_shexp_gate", il);
+
+                ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up_shexp,   NULL,
+                    model.layers[il].ffn_gate_shexp, NULL,
+                    model.layers[il].ffn_down_shexp, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur_ffn, "ffn_shexp", il);
+
+                ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate);
+                cb(ffn_shexp_out, "ffn_shexp_out", il);
+
+                moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out);
+                cb(moe_out, "ffn_out", il);
+
+                cur = moe_out;
+            }

             cur = ggml_add(ctx0, cur, ffn_inp);
             cb(cur, "l_out", il);
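A note on the "// sigmoid" comment in the block above: the shared-expert gate probability is computed as silu(x)/x, which equals sigmoid(x) because silu(x) = x * sigmoid(x); presumably ggml had no dedicated sigmoid op here. A quick numeric check of the identity in plain Python (not ggml):

    # Numeric check (plain Python, not ggml) of the identity behind the
    # shared-expert gate: silu(x)/x == sigmoid(x), since silu(x) = x * sigmoid(x).
    # (x == 0 is excluded; the quotient extends continuously to sigmoid(0) = 0.5.)
    import math

    def sigmoid(x: float) -> float:
        return 1.0 / (1.0 + math.exp(-x))

    def silu(x: float) -> float:
        return x * sigmoid(x)

    for x in (-2.0, -0.5, 0.1, 3.0):
        assert abs(silu(x) / x - sigmoid(x)) < 1e-12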