llama : add support for DeepSeek V3 model.
parent 0061955a06
commit a43d4953ba

3 changed files with 86 additions and 5 deletions
@@ -105,6 +105,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
         LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
     };
 
     enum llama_rope_type {
@@ -396,6 +396,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "\\p{N}+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+                regex_exprs = {
+                    "\\p{N}{1,3}",
+                    "[一-龥぀-ゟ゠-ヿ]+",
+                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                regex_exprs = {
                    "[\r\n]",
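The first pattern above is the notable difference from the older DeepSeek pretokenizers: "\\p{N}{1,3}" caps each number match at three digits, so long digit runs are pre-split into groups of at most three before BPE merges run. A standalone sketch of that splitting behavior (illustration only, ASCII digits, not the actual tokenizer code):

#include <cctype>
#include <iostream>
#include <string>
#include <vector>

// Split runs of ASCII digits into chunks of at most 3 characters,
// mimicking the effect of the "\\p{N}{1,3}" pretokenizer rule.
static std::vector<std::string> split_digit_runs(const std::string & text) {
    std::vector<std::string> out;
    for (size_t i = 0; i < text.size(); ) {
        if (isdigit((unsigned char) text[i])) {
            size_t len = 0;
            while (len < 3 && i + len < text.size() && isdigit((unsigned char) text[i + len])) {
                len++;
            }
            out.push_back(text.substr(i, len));
            i += len;
        } else {
            i++; // non-digit spans are handled by the other regexes
        }
    }
    return out;
}

int main() {
    for (const auto & chunk : split_digit_runs("pi=3141592653")) {
        std::cout << chunk << "\n"; // prints: 314, 159, 265, 3
    }
}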
@@ -78,7 +78,7 @@
 
 // bump if necessary
 #define LLAMA_MAX_LAYERS 512
-#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
+#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
 
 //
 // helpers
@@ -289,6 +289,8 @@ enum llm_kv {
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_EXPERT_SHARED_COUNT,
     LLM_KV_EXPERT_WEIGHTS_SCALE,
+    LLM_KV_EXPERT_WEIGHTS_NORM,
+    LLM_KV_EXPERT_GATING_FUNC,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,
@@ -415,6 +417,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
     { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
+    { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
+    { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
     { LLM_KV_POOLING_TYPE, "%s.pooling_type" },
     { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
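These name entries are printf-style templates; the loader substitutes the model architecture string for %s. DeepSeek V3 loads under the existing deepseek2 architecture (the new hparams are read in the same branch that handles DeepSeek V2, see the llm_load_hparams hunk below), so the resulting GGUF keys come out as deepseek2.expert_weights_norm and deepseek2.expert_gating_func. A trivial sketch of the expansion (standalone code, not the loader itself):

#include <cstdio>

int main() {
    // LLM_KV_NAMES patterns are expanded with the architecture name
    char key[128];
    snprintf(key, sizeof(key), "%s.expert_gating_func", "deepseek2");
    printf("%s\n", key); // prints: deepseek2.expert_gating_func
}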
@@ -560,6 +564,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_SHEXP,
     LLM_TENSOR_FFN_GATE_SHEXP,
     LLM_TENSOR_FFN_UP_SHEXP,
+    LLM_TENSOR_FFN_EXPERT_WEIGHTS_B,
     LLM_TENSOR_ATTN_Q_NORM,
     LLM_TENSOR_ATTN_K_NORM,
     LLM_TENSOR_LAYER_OUT_NORM,
@@ -1429,6 +1434,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
             { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXPERT_WEIGHTS_B, "blk.%d.expert_weights_b" },
         },
     },
     {
@@ -2558,6 +2564,7 @@ enum e_model {
     MODEL_70B,
     MODEL_236B,
     MODEL_314B,
+    MODEL_671B,
     MODEL_SMALL,
     MODEL_MEDIUM,
     MODEL_LARGE,
@@ -2586,6 +2593,19 @@ struct llama_hparams_convnext {
     uint32_t n_layer;
 };
 
+enum llm_expert_gating_func_type {
+    LLM_EXPERT_GATING_FUNC_SOFTMAX = 1,
+    LLM_EXPERT_GATING_FUNC_SIGMOID = 2,
+};
+
+static const char * llama_expert_gating_func_name(llm_expert_gating_func_type type) {
+    switch (type) {
+        case LLM_EXPERT_GATING_FUNC_SOFTMAX: return "softmax";
+        case LLM_EXPERT_GATING_FUNC_SIGMOID: return "sigmoid";
+        default: return "unknown";
+    }
+}
+
 struct llama_hparams {
     bool vocab_only;
     bool rope_finetuned;
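A scalar illustration of the difference this enum encodes (plain C++, not the graph code): softmax normalizes the expert scores jointly, so experts compete for a fixed probability mass, while sigmoid, the function DeepSeek V3 uses, scores each expert independently. Because sigmoid outputs do not sum to 1, the V3 path later needs the explicit weight normalization controlled by hparams.expert_weights_norm.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Scalar versions of the two gating functions the enum selects between;
// the real code builds ggml graph ops instead.
static std::vector<float> softmax(std::vector<float> x) {
    const float mx = *std::max_element(x.begin(), x.end());
    float sum = 0.0f;
    for (float & v : x) { v = expf(v - mx); sum += v; }
    for (float & v : x) { v /= sum; }
    return x;
}

static std::vector<float> sigmoid(std::vector<float> x) {
    for (float & v : x) { v = 1.0f / (1.0f + expf(-v)); }
    return x;
}

int main() {
    const std::vector<float> logits = {1.0f, -0.5f, 2.0f};
    for (float p : softmax(logits)) printf("softmax: %.3f\n", p); // sums to 1
    for (float p : sigmoid(logits)) printf("sigmoid: %.3f\n", p); // does not
}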
@@ -2621,6 +2641,8 @@ struct llama_hparams {
     uint32_t n_ff_shexp = 0;
     uint32_t n_expert_shared = 0;
     float expert_weights_scale = 0.0;
+    bool expert_weights_norm = false;
+    uint32_t expert_gating_func = LLM_EXPERT_GATING_FUNC_SOFTMAX;
 
     float f_norm_eps;
     float f_norm_rms_eps;
@@ -2912,6 +2934,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_b = nullptr; // b2
     struct ggml_tensor * ffn_up_b = nullptr; // b3
     struct ggml_tensor * ffn_act = nullptr;
+    struct ggml_tensor * ffn_expert_weights_bias = nullptr;
 
     // mamba proj
     struct ggml_tensor * ssm_in = nullptr;
@@ -5577,6 +5600,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_70B: return "70B";
         case MODEL_236B: return "236B";
         case MODEL_314B: return "314B";
+        case MODEL_671B: return "671B";
         case MODEL_SMALL: return "0.1B";
         case MODEL_MEDIUM: return "0.4B";
         case MODEL_LARGE: return "0.8B";
@@ -6288,11 +6312,14 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
                 ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
 
                 switch (hparams.n_layer) {
                     case 27: model.type = e_model::MODEL_16B; break;
                     case 60: model.type = e_model::MODEL_236B; break;
+                    case 61: model.type = e_model::MODEL_671B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
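The trailing false on the two new get_key calls marks them optional. A rough mock of that semantic (assumed behavior sketched from usage, not the llama.cpp implementation): a missing key is tolerated and the destination keeps the default declared in llama_hparams, which is how older DeepSeek V2 GGUFs that lack the two new keys still load, with norm off and softmax gating.

#include <cstdint>
#include <map>
#include <stdexcept>
#include <string>

// Mock of a get_key overload with a trailing `required` flag (hypothetical,
// simplified to a string-to-u32 map instead of GGUF metadata).
static bool get_key(const std::map<std::string, uint32_t> & kv,
                    const std::string & key, uint32_t & dst, bool required = true) {
    const auto it = kv.find(key);
    if (it == kv.end()) {
        if (required) {
            throw std::runtime_error("key not found: " + key);
        }
        return false; // dst keeps its default value
    }
    dst = it->second;
    return true;
}

int main() {
    const std::map<std::string, uint32_t> kv; // no expert_gating_func entry
    uint32_t gating = 1; // default: LLM_EXPERT_GATING_FUNC_SOFTMAX
    get_key(kv, "deepseek2.expert_gating_func", gating, /*required=*/false);
    // gating is still 1 here; a V2 model keeps softmax gating
}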
@@ -6616,6 +6643,10 @@ static void llm_load_vocab(
                 tokenizer_pre == "deepseek-coder") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
                 vocab.tokenizer_clean_spaces = false;
+            } else if (
+                tokenizer_pre == "deepseek-v3") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
+                vocab.tokenizer_clean_spaces = false;
             } else if (
                 tokenizer_pre == "falcon") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -7300,6 +7331,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_exp             = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_expert_shared      = %d\n", __func__, hparams.n_expert_shared);
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm  = %d\n", __func__, hparams.expert_weights_norm);
+        LLAMA_LOG_INFO("%s: expert_gating_func   = %s\n", __func__, llama_expert_gating_func_name((enum llm_expert_gating_func_type) hparams.expert_gating_func));
         LLAMA_LOG_INFO("%s: rope_yarn_log_mul    = %.4f\n", __func__, hparams.rope_yarn_log_mul);
     }
 
@@ -7447,6 +7480,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
     {LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
+    {LLM_TENSOR_FFN_EXPERT_WEIGHTS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
     {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
@@ -9249,6 +9283,7 @@ static bool llm_load_tensors(
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     } else {
                         layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+                        layer.ffn_expert_weights_bias = create_tensor(tn(LLM_TENSOR_FFN_EXPERT_WEIGHTS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                         if (n_expert == 0) {
                             throw std::runtime_error("n_expert must be > 0");
@@ -10229,12 +10264,14 @@ static struct ggml_tensor * llm_build_moe_ffn(
         struct ggml_tensor * up_exps,
         struct ggml_tensor * gate_exps,
         struct ggml_tensor * down_exps,
+        struct ggml_tensor * expert_weights_b,
         int64_t n_expert,
         int64_t n_expert_used,
         llm_ffn_op_type type_op,
         bool norm_w,
         bool scale_w,
         float w_scale,
+        llm_expert_gating_func_type gating_op,
         const llm_build_cb & cb,
         int il) {
     int64_t n_embd = cur->ne[0];
@@ -10243,11 +10280,31 @@ static struct ggml_tensor * llm_build_moe_ffn(
     ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
     cb(logits, "ffn_moe_logits", il);
 
-    ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
-    cb(probs, "ffn_moe_probs", il);
+    ggml_tensor * probs = nullptr;
+    switch (gating_op) {
+        case LLM_EXPERT_GATING_FUNC_SOFTMAX:
+            {
+                probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
+                cb(probs, "ffn_moe_probs", il);
+            } break;
+        case LLM_EXPERT_GATING_FUNC_SIGMOID:
+            {
+                probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens]
+                cb(probs, "ffn_moe_sigm", il);
+            } break;
+        default:
+            GGML_ABORT("fatal error");
+    }
+
+    // add experts selection bias - introduced in DeepSeek V3
+    ggml_tensor * selection_probs = probs;
+    if (expert_weights_b != nullptr) {
+        selection_probs = ggml_add(ctx, probs, expert_weights_b);
+        cb(selection_probs, "ffn_moe_sigm_biased", il);
+    }
 
     // select experts
-    ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
+    ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
     cb(selected_experts, "ffn_moe_topk", il);
 
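The key subtlety in this hunk: the bias tensor shifts the scores used to pick experts, but the mixing weights downstream are still taken from the unbiased probs. A scalar sketch of that selection-only bias (illustrative values; the real code operates on ggml tensors):

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// Return the indices of the k largest scores.
static std::vector<int> top_k(const std::vector<float> & scores, int k) {
    std::vector<int> idx(scores.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                      [&](int a, int b) { return scores[a] > scores[b]; });
    idx.resize(k);
    return idx;
}

int main() {
    const std::vector<float> probs = {0.70f, 0.60f, 0.55f, 0.10f}; // gating output
    const std::vector<float> bias  = {-0.5f, 0.0f, 0.0f, 0.0f};    // expert_weights_b
    std::vector<float> selection = probs;
    for (size_t i = 0; i < selection.size(); ++i) selection[i] += bias[i];

    for (int e : top_k(selection, 2)) {
        // expert 0 is skipped despite the highest raw prob; weights stay unbiased
        printf("expert %d, weight %.2f\n", e, probs[e]);
    }
}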
@@ -11368,9 +11425,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
                         false, 0.0,
+                        LLM_EXPERT_GATING_FUNC_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
             }
@@ -12020,9 +12079,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_GELU, true,
                         false, 0.0,
+                        LLM_EXPERT_GATING_FUNC_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
 
@@ -12161,9 +12222,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
                         false, 0.0,
+                        LLM_EXPERT_GATING_FUNC_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
 
@@ -13409,9 +13472,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, false,
                         false, 0.0,
+                        LLM_EXPERT_GATING_FUNC_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
 
@@ -15403,9 +15468,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, false,
                         false, 0.0,
+                        LLM_EXPERT_GATING_FUNC_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
 
@@ -15800,9 +15867,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, true,
                         false, 0.0,
+                        LLM_EXPERT_GATING_FUNC_SOFTMAX,
                         cb, il);
                 cb(cur, "ffn_moe_out", il);
 
@@ -15941,9 +16010,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        nullptr,
                         n_expert, n_expert_used,
                         LLM_FFN_SILU, false,
                         false, hparams.expert_weights_scale,
+                        LLM_EXPERT_GATING_FUNC_SOFTMAX,
                         cb, il);
                 cb(moe_out, "ffn_moe_out", il);
 
@@ -16170,9 +16241,11 @@ struct llm_build_context {
                         model.layers[il].ffn_up_exps,
                         model.layers[il].ffn_gate_exps,
                         model.layers[il].ffn_down_exps,
+                        model.layers[il].ffn_expert_weights_bias,
                         n_expert, n_expert_used,
-                        LLM_FFN_SILU, false,
+                        LLM_FFN_SILU, hparams.expert_weights_norm,
                         true, hparams.expert_weights_scale,
+                        (enum llm_expert_gating_func_type) hparams.expert_gating_func,
                         cb, il);
                 cb(moe_out, "ffn_moe_out", il);
 
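This last caller is the one that actually drives the new machinery: the bias tensor, the norm flag, and the gating function all come from hparams, so a DeepSeek V2 GGUF still gets softmax gating with no bias while a V3 GGUF gets sigmoid gating with biased selection, normalized and scaled weights. A scalar walk-through of the V3 configuration for a single token (made-up values, not the graph code):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const int n_expert_used = 2;
    const std::vector<float> logits = {1.2f, 0.4f, 0.9f, -0.3f};
    const std::vector<float> bias   = {-1.0f, 0.0f, 0.0f, 0.0f}; // expert_weights_b

    // gating_op == LLM_EXPERT_GATING_FUNC_SIGMOID
    std::vector<float> probs(logits.size());
    for (size_t i = 0; i < logits.size(); ++i) probs[i] = 1.0f / (1.0f + expf(-logits[i]));

    // selection uses biased scores; weighting uses the unbiased probs
    std::vector<int> idx(probs.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return probs[a] + bias[a] > probs[b] + bias[b]; });
    idx.resize(n_expert_used);

    // norm_w (expert_weights_norm) then scale_w (expert_weights_scale);
    // the 2.5 here is an illustrative value, not read from a real model
    const float expert_weights_scale = 2.5f;
    float sum = 0.0f;
    for (int e : idx) sum += probs[e];
    for (int e : idx) {
        printf("expert %d, weight %.3f\n", e, probs[e] / sum * expert_weights_scale);
    }
}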