llama : Add support for DeepSeek V3 (#11049)
* convert : extend DEEPSEEK2 model architecture to support DeepseekV3ForCausalLM by adding EXPERT_WEIGHTS_NORM and EXPERT_GATING_FUNC model parameters and FFN_EXP_PROBS_B tensor type * vocab : add DeepSeek V3 pre-tokenizer regexes * unicode : handle ACCENT_MARK and SYMBOL categories in regex * llama : add DeepSeek V3 chat template, handle new model parameters and tensor types --------- Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
This commit is contained in:
parent
f922a9c542
commit
9394bbd484
16 changed files with 162 additions and 5 deletions
|
@ -66,6 +66,7 @@ const char * llm_type_name(llm_type type) {
|
|||
case MODEL_70B: return "70B";
|
||||
case MODEL_236B: return "236B";
|
||||
case MODEL_314B: return "314B";
|
||||
case MODEL_671B: return "671B";
|
||||
case MODEL_SMALL: return "0.1B";
|
||||
case MODEL_MEDIUM: return "0.4B";
|
||||
case MODEL_LARGE: return "0.8B";
|
||||
|
@ -125,6 +126,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|||
}
|
||||
}
|
||||
|
||||
static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
|
||||
switch (type) {
|
||||
case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
|
||||
case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
std::string llama_model_arch_name (const llama_model & model) {
|
||||
return llm_arch_name(model.arch);
|
||||
}
|
||||
|
@ -933,11 +942,19 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
|
|||
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
||||
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
||||
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
||||
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
|
||||
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
|
||||
// for compatibility with existing DeepSeek V2 and V2.5 GGUFs
|
||||
// that have no expert_gating_func model parameter set
|
||||
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
|
||||
}
|
||||
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 27: model.type = e_model::MODEL_16B; break;
|
||||
case 60: model.type = e_model::MODEL_236B; break;
|
||||
case 61: model.type = e_model::MODEL_671B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
|
@ -1259,6 +1276,10 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
|
|||
tokenizer_pre == "deepseek-coder") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
|
||||
vocab.tokenizer_clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "deepseek-v3") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
|
||||
vocab.tokenizer_clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "falcon") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
|
||||
|
@ -1941,6 +1962,8 @@ void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|||
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
||||
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
||||
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
||||
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func));
|
||||
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue