change llm_build_ffn

This commit is contained in:
Eddie-Wang1120 2024-06-25 09:37:25 +08:00
parent 2df373ac40
commit a671d56e22

255
llama.cpp
View file

@ -7082,10 +7082,13 @@ static struct ggml_tensor * llm_build_ffn(
struct ggml_tensor * cur, struct ggml_tensor * cur,
struct ggml_tensor * up, struct ggml_tensor * up,
struct ggml_tensor * up_b, struct ggml_tensor * up_b,
struct ggml_tensor * up_s,
struct ggml_tensor * gate, struct ggml_tensor * gate,
struct ggml_tensor * gate_b, struct ggml_tensor * gate_b,
struct ggml_tensor * gate_s,
struct ggml_tensor * down, struct ggml_tensor * down,
struct ggml_tensor * down_b, struct ggml_tensor * down_b,
struct ggml_tensor * down_s,
struct ggml_tensor * act_scales, struct ggml_tensor * act_scales,
llm_ffn_op_type type_op, llm_ffn_op_type type_op,
llm_ffn_gate_type type_gate, llm_ffn_gate_type type_gate,
@ -7099,6 +7102,11 @@ static struct ggml_tensor * llm_build_ffn(
cb(tmp, "ffn_up_b", il); cb(tmp, "ffn_up_b", il);
} }
if (up_s) {
tmp = ggml_mul(ctx, tmp, up_s);
cb(tmp, "ffn_up_s", il);
}
if (gate) { if (gate) {
switch (type_gate) { switch (type_gate) {
case LLM_FFN_SEQ: case LLM_FFN_SEQ:
@ -7117,6 +7125,12 @@ static struct ggml_tensor * llm_build_ffn(
cur = ggml_add(ctx, cur, gate_b); cur = ggml_add(ctx, cur, gate_b);
cb(cur, "ffn_gate_b", il); cb(cur, "ffn_gate_b", il);
} }
if (gate_s) {
cur = ggml_mul(ctx, cur, gate_s);
cb(cur, "ffn_gate_s", il);
}
} else { } else {
cur = tmp; cur = tmp;
} }
@ -7156,7 +7170,10 @@ static struct ggml_tensor * llm_build_ffn(
cb(cur, "ffn_gate_par", il); cb(cur, "ffn_gate_par", il);
} }
cur = ggml_mul_mat(ctx, down, cur); if (down) {
cur = ggml_mul_mat(ctx, down, cur);
}
if (down_b) { if (down_b) {
cb(cur, "ffn_down", il); cb(cur, "ffn_down", il);
} }
@ -7165,6 +7182,11 @@ static struct ggml_tensor * llm_build_ffn(
cur = ggml_add(ctx, cur, down_b); cur = ggml_add(ctx, cur, down_b);
} }
if (down_s) {
cur = ggml_mul(ctx, cur, down_s);
cb(cur, "ffn_down_s", il);
}
return cur; return cur;
} }
@ -7873,9 +7895,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -8010,9 +8032,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -8114,9 +8136,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -8229,9 +8251,9 @@ struct llm_build_context {
// feed forward // feed forward
{ {
cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -8627,9 +8649,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL, NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -8715,9 +8737,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -8899,23 +8921,23 @@ struct llm_build_context {
// feed-forward network // feed-forward network
if (model.arch == LLM_ARCH_BERT) { if (model.arch == LLM_ARCH_BERT) {
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL, NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) { } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL, NULL,
LLM_FFN_GELU, LLM_FFN_PAR, cb, il); LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
} else { } else {
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
} }
@ -9011,9 +9033,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL, NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -9145,9 +9167,9 @@ struct llm_build_context {
LLM_NORM, cb, il); LLM_NORM, cb, il);
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
model.layers[il].ffn_act, model.layers[il].ffn_act,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -9293,9 +9315,9 @@ struct llm_build_context {
cur = inpSA; cur = inpSA;
} }
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -9405,9 +9427,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -9517,9 +9539,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -9653,9 +9675,9 @@ struct llm_build_context {
cb(cur_gate, "ffn_shexp_gate", il); cb(cur_gate, "ffn_shexp_gate", il);
ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur, ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur_ffn, "ffn_shexp", il); cb(cur_ffn, "ffn_shexp", il);
@ -9781,9 +9803,9 @@ struct llm_build_context {
// FF // FF
{ {
ffn_output = llm_build_ffn(ctx0, attn_norm_output, ffn_output = llm_build_ffn(ctx0, attn_norm_output,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL, NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(ffn_output, "ffn_out", il); cb(ffn_output, "ffn_out", il);
@ -10017,9 +10039,9 @@ struct llm_build_context {
// feed-forward network // feed-forward network
{ {
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -10126,9 +10148,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL, NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -10233,9 +10255,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL, NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -10346,9 +10368,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -10463,9 +10485,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -10599,9 +10621,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -10715,9 +10737,9 @@ struct llm_build_context {
// feed-forward network // feed-forward network
{ {
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_GELU, LLM_FFN_PAR, cb, il); LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -10834,9 +10856,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL, NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -11119,9 +11141,9 @@ struct llm_build_context {
// feed-forward network // feed-forward network
{ {
cur = llm_build_ffn(ctx0, ffn_inp, cur = llm_build_ffn(ctx0, ffn_inp,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -11255,9 +11277,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -11372,9 +11394,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL, NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -11399,9 +11421,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, model.layers[il].ffn_up_b, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
NULL, NULL, NULL, NULL, NULL,
model.layers[il].ffn_down, model.layers[il].ffn_down_b, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
NULL, NULL,
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -11504,9 +11526,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -11729,9 +11751,9 @@ struct llm_build_context {
cb(cur, "ffn_norm", il); cb(cur, "ffn_norm", il);
cur = llm_build_ffn(ctx0, cur, cur = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up, NULL, model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, model.layers[il].ffn_down, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_out", il); cb(cur, "ffn_out", il);
@ -11757,9 +11779,9 @@ struct llm_build_context {
// FFN shared expert // FFN shared expert
{ {
ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur, ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, cur,
model.layers[il].ffn_up_shexp, NULL, model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, model.layers[il].ffn_down_shexp, NULL, NULL,
NULL, NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(ffn_shexp, "ffn_shexp", il); cb(ffn_shexp, "ffn_shexp", il);
@ -11861,7 +11883,7 @@ struct llm_build_context {
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
nullptr, nullptr, NULL, NULL,
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cur = llm_build_norm(ctx0, cur, hparams, cur = llm_build_norm(ctx0, cur, hparams,
@ -11888,35 +11910,28 @@ struct llm_build_context {
cb(ffn_inp, "ffn_inp", il); cb(ffn_inp, "ffn_inp", il);
// feed-forward forward // feed-forward forward
if (model.layers[il].ffn_gate_inp == nullptr) { cur = llm_build_norm(ctx0, ffn_inp, hparams,
cur = llm_build_norm(ctx0, ffn_inp, hparams, model.layers[il].ffn_norm, NULL,
model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, cb, il);
LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il);
cb(cur, "ffn_norm", il);
struct ggml_tensor *tmp = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur); cur = llm_build_ffn(ctx0, cur,
tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_up_scale); model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_scale,
cb(tmp, "ffn_up", il); model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_scale,
NULL, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
cb(cur, "ffn_sub_out", il);
cur = ggml_mul_mat(ctx0, model.layers[il].ffn_gate, cur); cur = llm_build_norm(ctx0, cur, hparams,
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_gate_scale); model.layers[il].ffn_sub_norm, NULL,
cb(cur, "ffn_gate", il); LLM_NORM_RMS, cb, il);
cb(cur, "ffn_sub_norm", il);
cur = ggml_silu(ctx0, cur); cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
cb(cur, "ffn_silu", il); cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
cb(cur, "ffn_down", il);
cur = ggml_mul(ctx0, cur, tmp);
cb(cur, "ffn_gate_par", il);
cur = llm_build_norm(ctx0, cur, hparams,
model.layers[il].ffn_sub_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_sub_norm", il);
cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_down_scale);
cb(cur, "ffn_down", il);
}
cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, ffn_inp);
cb(cur, "l_out", il); cb(cur, "l_out", il);