llama: factor out the MoE graph implementation shared by grok, mixtral and dbrx
This commit is contained in:
parent
21fb24aa45
commit
f20c04f01f
1 changed file with 63 additions and 171 deletions
182
llama.cpp
182
llama.cpp
|
@ -6457,6 +6457,39 @@ struct llm_build_context {
|
||||||
LLM_NORM_RMS, cb, il);
|
LLM_NORM_RMS, cb, il);
|
||||||
cb(cur, "ffn_norm", il);
|
cb(cur, "ffn_norm", il);
|
||||||
|
|
||||||
|
cur = build_moe(cur, n_tokens, il);
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||||
|
cb(cur, "ffn_out", il);
|
||||||
|
|
||||||
|
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
||||||
|
if (layer_dir != nullptr) {
|
||||||
|
cur = ggml_add(ctx0, cur, layer_dir);
|
||||||
|
}
|
||||||
|
cb(cur, "l_out", il);
|
||||||
|
|
||||||
|
// input for next layer
|
||||||
|
inpL = cur;
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = inpL;
|
||||||
|
|
||||||
|
cur = llm_build_norm(ctx0, cur, hparams,
|
||||||
|
model.output_norm, NULL,
|
||||||
|
LLM_NORM_RMS, cb, -1);
|
||||||
|
cb(cur, "result_norm", -1);
|
||||||
|
|
||||||
|
// lm_head
|
||||||
|
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||||
|
cb(cur, "result_output", -1);
|
||||||
|
|
||||||
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_tensor * build_moe(ggml_tensor * cur, int32_t n_tokens, int il) {
|
||||||
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
||||||
cb(logits, "ffn_moe_logits", il);
|
cb(logits, "ffn_moe_logits", il);
|
||||||
|
|
||||||
|
@ -6512,36 +6545,7 @@ struct llm_build_context {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
cur = moe_out;
|
return moe_out;
|
||||||
}
|
|
||||||
|
|
||||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
||||||
cb(cur, "ffn_out", il);
|
|
||||||
|
|
||||||
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
|
||||||
if (layer_dir != nullptr) {
|
|
||||||
cur = ggml_add(ctx0, cur, layer_dir);
|
|
||||||
}
|
|
||||||
cb(cur, "l_out", il);
|
|
||||||
|
|
||||||
// input for next layer
|
|
||||||
inpL = cur;
|
|
||||||
}
|
|
||||||
|
|
||||||
cur = inpL;
|
|
||||||
|
|
||||||
cur = llm_build_norm(ctx0, cur, hparams,
|
|
||||||
model.output_norm, NULL,
|
|
||||||
LLM_NORM_RMS, cb, -1);
|
|
||||||
cb(cur, "result_norm", -1);
|
|
||||||
|
|
||||||
// lm_head
|
|
||||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
|
||||||
cb(cur, "result_output", -1);
|
|
||||||
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
|
||||||
|
|
||||||
return gf;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_baichuan() {
|
struct ggml_cgraph * build_baichuan() {
|
||||||
|
@ -6991,63 +6995,7 @@ struct llm_build_context {
|
||||||
LLM_NORM_RMS, cb, il);
|
LLM_NORM_RMS, cb, il);
|
||||||
cb(cur, "ffn_norm", il);
|
cb(cur, "ffn_norm", il);
|
||||||
|
|
||||||
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
cur = build_moe(cur, n_tokens, il);
|
||||||
cb(logits, "ffn_moe_logits", il);
|
|
||||||
|
|
||||||
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
|
|
||||||
cb(probs, "ffn_moe_probs", il);
|
|
||||||
|
|
||||||
// select experts
|
|
||||||
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
|
|
||||||
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
|
||||||
|
|
||||||
ggml_tensor * weights = ggml_get_rows(ctx0,
|
|
||||||
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
|
||||||
cb(weights, "ffn_moe_weights", il);
|
|
||||||
|
|
||||||
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
|
||||||
|
|
||||||
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
|
||||||
cb(weights_sum, "ffn_moe_weights_sum", il);
|
|
||||||
|
|
||||||
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
|
||||||
cb(weights, "ffn_moe_weights_norm", il);
|
|
||||||
|
|
||||||
// compute expert outputs
|
|
||||||
ggml_tensor * moe_out = nullptr;
|
|
||||||
|
|
||||||
for (int i = 0; i < n_expert_used; ++i) {
|
|
||||||
ggml_tensor * cur_expert;
|
|
||||||
|
|
||||||
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
|
||||||
cb(cur_up, "ffn_moe_up", il);
|
|
||||||
|
|
||||||
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
|
||||||
cb(cur_gate, "ffn_moe_gate", il);
|
|
||||||
|
|
||||||
//GeLU
|
|
||||||
cur_gate = ggml_gelu(ctx0, cur_gate);
|
|
||||||
cb(cur_gate, "ffn_moe_gelu", il);
|
|
||||||
|
|
||||||
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
|
||||||
cb(cur_expert, "ffn_moe_gate_par", il);
|
|
||||||
|
|
||||||
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
|
||||||
cb(cur_expert, "ffn_moe_down", il);
|
|
||||||
|
|
||||||
cur_expert = ggml_mul(ctx0, cur_expert,
|
|
||||||
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
|
||||||
cb(cur_expert, "ffn_moe_weighted", il);
|
|
||||||
|
|
||||||
if (i == 0) {
|
|
||||||
moe_out = cur_expert;
|
|
||||||
} else {
|
|
||||||
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
|
||||||
cb(moe_out, "ffn_moe_out", il);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
cur = moe_out;
|
|
||||||
|
|
||||||
// Grok
|
// Grok
|
||||||
// if layer_out_norm is present then apply it before adding the input
|
// if layer_out_norm is present then apply it before adding the input
|
||||||
|
@ -7163,7 +7111,6 @@ struct llm_build_context {
|
||||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||||
model.layers[il].wo, NULL,
|
model.layers[il].wo, NULL,
|
||||||
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (il == n_layer - 1) {
|
if (il == n_layer - 1) {
|
||||||
|
@ -7179,64 +7126,9 @@ struct llm_build_context {
|
||||||
|
|
||||||
// feed-forward network
|
// feed-forward network
|
||||||
// MoE branch
|
// MoE branch
|
||||||
{
|
cur = build_moe(cur, n_tokens, il);
|
||||||
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
|
||||||
cb(logits, "ffn_moe_logits", il);
|
|
||||||
|
|
||||||
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
|
|
||||||
cb(probs, "ffn_moe_probs", il);
|
|
||||||
|
|
||||||
// select experts
|
|
||||||
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
|
|
||||||
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
|
||||||
|
|
||||||
ggml_tensor * weights = ggml_get_rows(ctx0,
|
|
||||||
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
|
||||||
cb(weights, "ffn_moe_weights", il);
|
|
||||||
|
|
||||||
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
|
||||||
|
|
||||||
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
|
||||||
cb(weights_sum, "ffn_moe_weights_sum", il);
|
|
||||||
|
|
||||||
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
|
||||||
cb(weights, "ffn_moe_weights_norm", il);
|
|
||||||
|
|
||||||
// compute expert outputs
|
|
||||||
ggml_tensor * moe_out = nullptr;
|
|
||||||
for (int i = 0; i < n_expert_used; ++i) {
|
|
||||||
ggml_tensor * cur_expert;
|
|
||||||
|
|
||||||
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exps, selected_experts, i, cur);
|
|
||||||
cb(cur_up, "ffn_moe_up", il);
|
|
||||||
|
|
||||||
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exps, selected_experts, i, cur);
|
|
||||||
cb(cur_gate, "ffn_moe_gate", il);
|
|
||||||
|
|
||||||
//GeLU
|
|
||||||
cur_gate = ggml_gelu(ctx0, cur_gate);
|
|
||||||
cb(cur_gate, "ffn_moe_gelu", il);
|
|
||||||
|
|
||||||
cur_expert = ggml_mul(ctx0, cur_up, cur_gate);
|
|
||||||
cb(cur_expert, "ffn_moe_gate_par", il);
|
|
||||||
|
|
||||||
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exps, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
|
||||||
cb(cur_expert, "ffn_moe_down", il);
|
|
||||||
|
|
||||||
cur_expert = ggml_mul(ctx0, cur_expert,
|
|
||||||
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i * weights->nb[0]));
|
|
||||||
cb(cur_expert, "ffn_moe_weighted", il);
|
|
||||||
|
|
||||||
if (i == 0) {
|
|
||||||
moe_out = cur_expert;
|
|
||||||
} else {
|
|
||||||
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
|
||||||
cb(moe_out, "ffn_moe_out", il);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cur = moe_out;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// DBRX norm2
|
||||||
cur = llm_build_norm(ctx0, cur, hparams,
|
cur = llm_build_norm(ctx0, cur, hparams,
|
||||||
model.layers[il].layer_out_norm, NULL,
|
model.layers[il].layer_out_norm, NULL,
|
||||||
LLM_NORM, cb, il);
|
LLM_NORM, cb, il);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue