llama : Add support for DeepSeek V3 (#11049)

* convert : extend DEEPSEEK2 model architecture to support DeepseekV3ForCausalLM by adding EXPERT_WEIGHTS_NORM and EXPERT_GATING_FUNC model parameters and FFN_EXP_PROBS_B tensor type

* vocab : add DeepSeek V3 pre-tokenizer regexes

* unicode : handle ACCENT_MARK and SYMBOL categories in regex

* llama : add DeepSeek V3 chat template, handle new model parameters and tensor types

---------

Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
This commit is contained in:
fairydreaming 2025-01-04 21:06:11 +01:00 committed by GitHub
parent f922a9c542
commit 9394bbd484
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 162 additions and 5 deletions

View file

@ -6,7 +6,13 @@
// bump if necessary
#define LLAMA_MAX_LAYERS 512
#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
enum llama_expert_gating_func_type {
LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
};
struct llama_hparams_posnet {
uint32_t n_embd;
@ -54,7 +60,9 @@ struct llama_hparams {
uint32_t n_expert_shared = 0;
uint32_t n_norm_groups = 0;
float expert_weights_scale = 0.0;
float expert_weights_scale = 0.0;
bool expert_weights_norm = false;
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
float f_norm_eps;
float f_norm_rms_eps;