Update llama.cpp
This commit is contained in:
parent
52604860f9
commit
b3cf383f24
1 changed files with 105 additions and 14 deletions
119
llama.cpp
119
llama.cpp
|
@ -711,10 +711,14 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
|
||||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||||
|
{ LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
|
||||||
|
{ LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
|
||||||
|
{ LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -4881,10 +4885,26 @@ static bool llm_load_tensors(
|
||||||
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
||||||
|
|
||||||
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
||||||
|
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
|
||||||
|
|
||||||
|
if (layer.ffn_gate_inp == nullptr) {
|
||||||
|
GGML_ASSERT(hparams.n_expert == 0);
|
||||||
|
GGML_ASSERT(hparams.n_expert_used == 0);
|
||||||
|
|
||||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(hparams.n_expert > 0);
|
||||||
|
GGML_ASSERT(hparams.n_expert_used > 0);
|
||||||
|
|
||||||
|
// MoE branch
|
||||||
|
for (uint32_t x = 0; x < hparams.n_expert; ++x) {
|
||||||
|
layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
|
||||||
|
layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
|
||||||
|
layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case LLM_ARCH_PHI2:
|
case LLM_ARCH_PHI2:
|
||||||
|
@ -8009,20 +8029,91 @@ struct llm_build_context {
|
||||||
cb(ffn_inp, "ffn_inp", il);
|
cb(ffn_inp, "ffn_inp", il);
|
||||||
|
|
||||||
// feed-forward network
|
// feed-forward network
|
||||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
if (model.layers[il].ffn_gate_inp == nullptr) {
|
||||||
model.layers[il].ffn_norm, NULL,
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||||
LLM_NORM_RMS, cb, il);
|
model.layers[il].ffn_norm, NULL,
|
||||||
cb(cur, "ffn_norm", il);
|
LLM_NORM_RMS, cb, il);
|
||||||
|
cb(cur, "ffn_norm", il);
|
||||||
|
|
||||||
cur = llm_build_ffn(ctx0, cur,
|
cur = llm_build_ffn(ctx0, cur,
|
||||||
model.layers[il].ffn_up, NULL,
|
model.layers[il].ffn_up, NULL,
|
||||||
model.layers[il].ffn_gate, NULL,
|
model.layers[il].ffn_gate, NULL,
|
||||||
model.layers[il].ffn_down, NULL,
|
model.layers[il].ffn_down, NULL,
|
||||||
NULL,
|
NULL,
|
||||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||||
cb(cur, "ffn_out", il);
|
cb(cur, "ffn_out", il);
|
||||||
|
} else {
|
||||||
|
// MoE branch
|
||||||
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||||
|
model.layers[il].ffn_norm, NULL,
|
||||||
|
LLM_NORM_RMS, cb, il);
|
||||||
|
cb(cur, "ffn_norm", il);
|
||||||
|
|
||||||
|
ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts]
|
||||||
|
cb(logits, "ffn_moe_logits", il);
|
||||||
|
|
||||||
|
ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts]
|
||||||
|
cb(probs, "ffn_moe_probs", il);
|
||||||
|
|
||||||
|
// select experts
|
||||||
|
ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok]
|
||||||
|
cb(selected_experts->src[0], "ffn_moe_argsort", il);
|
||||||
|
|
||||||
|
ggml_tensor * weights = ggml_get_rows(ctx0,
|
||||||
|
ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts);
|
||||||
|
cb(weights, "ffn_moe_weights", il);
|
||||||
|
|
||||||
|
weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok]
|
||||||
|
|
||||||
|
ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights);
|
||||||
|
cb(weights_sum, "ffn_moe_weights_sum", il);
|
||||||
|
|
||||||
|
weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok]
|
||||||
|
cb(weights, "ffn_moe_weights_norm", il);
|
||||||
|
|
||||||
|
// compute expert outputs
|
||||||
|
ggml_tensor * moe_out = nullptr;
|
||||||
|
|
||||||
|
for (int i = 0; i < n_expert_used; ++i) {
|
||||||
|
ggml_tensor * cur_expert;
|
||||||
|
|
||||||
|
ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur);
|
||||||
|
cb(cur_up, "ffn_moe_up", il);
|
||||||
|
|
||||||
|
ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur);
|
||||||
|
cb(cur_gate, "ffn_moe_gate", il);
|
||||||
|
|
||||||
|
cur_gate = ggml_silu(ctx0, cur_gate);
|
||||||
|
cb(cur_gate, "ffn_moe_silu", il);
|
||||||
|
|
||||||
|
cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd]
|
||||||
|
cb(cur_expert, "ffn_moe_gate_par", il);
|
||||||
|
|
||||||
|
cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd]
|
||||||
|
cb(cur_expert, "ffn_moe_down", il);
|
||||||
|
|
||||||
|
cur_expert = ggml_mul(ctx0, cur_expert,
|
||||||
|
ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0]));
|
||||||
|
cb(cur_expert, "ffn_moe_weighted", il);
|
||||||
|
|
||||||
|
if (i == 0) {
|
||||||
|
moe_out = cur_expert;
|
||||||
|
} else {
|
||||||
|
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
||||||
|
cb(moe_out, "ffn_moe_out", il);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cur = moe_out;
|
||||||
|
}
|
||||||
|
|
||||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||||
|
cb(cur, "ffn_out", il);
|
||||||
|
|
||||||
|
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
||||||
|
if (layer_dir != nullptr) {
|
||||||
|
cur = ggml_add(ctx0, cur, layer_dir);
|
||||||
|
}
|
||||||
cb(cur, "l_out", il);
|
cb(cur, "l_out", il);
|
||||||
|
|
||||||
// input for next layer
|
// input for next layer
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue