llama : add support for DeepSeek V3 model.

commit a43d4953ba (parent 0061955a06)
Author: Stanisław Szymczyk
Date:   2025-01-02 10:15:53 +01:00

3 changed files with 86 additions and 5 deletions

@@ -105,6 +105,7 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
};
enum llama_rope_type {

@@ -396,6 +396,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
"\\p{N}+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
regex_exprs = {
"\\p{N}{1,3}",
"[一-龥぀-ゟ゠-ヿ]+",
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
};
break;
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
regex_exprs = {
"[\r\n]",

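The deepseek-v3 pre-tokenizer splits runs of digits into groups of at most three ("\\p{N}{1,3}"), keeps CJK ideograph and kana runs together, and otherwise follows the familiar GPT-style split pattern. A minimal sketch of just the digit-grouping rule, assuming ASCII digits; the real tokenizer evaluates the full Unicode regex set above:

#include <cstdio>
#include <string>
#include <vector>

// Greedy left-to-right grouping, mirroring "\\p{N}{1,3}":
// "12345" pre-tokenizes as "123" followed by "45".
static std::vector<std::string> split_digits(const std::string & s) {
    std::vector<std::string> out;
    for (size_t i = 0; i < s.size(); i += 3) {
        out.push_back(s.substr(i, 3));
    }
    return out;
}

int main() {
    for (const auto & piece : split_digits("12345")) {
        printf("'%s' ", piece.c_str()); // prints '123' '45'
    }
    printf("\n");
}
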
@@ -78,7 +78,7 @@
// bump if necessary
#define LLAMA_MAX_LAYERS 512
-#define LLAMA_MAX_EXPERTS 160 // DeepSeekV2
#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
//
// helpers
@@ -289,6 +289,8 @@ enum llm_kv {
LLM_KV_EXPERT_USED_COUNT,
LLM_KV_EXPERT_SHARED_COUNT,
LLM_KV_EXPERT_WEIGHTS_SCALE,
LLM_KV_EXPERT_WEIGHTS_NORM,
LLM_KV_EXPERT_GATING_FUNC,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,
@@ -415,6 +417,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
{ LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
{ LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" },
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
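
With the "%s" architecture prefix substituted, a converted DeepSeek V3 GGUF would carry the two new keys under the deepseek2 architecture name, which V3 reuses; the values below are illustrative assumptions for a V3 conversion, not taken from a real file:

deepseek2.expert_weights_norm = true
deepseek2.expert_gating_func  = 2    // LLM_EXPERT_GATING_FUNC_SIGMOID
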
@@ -560,6 +564,7 @@ enum llm_tensor {
LLM_TENSOR_FFN_DOWN_SHEXP,
LLM_TENSOR_FFN_GATE_SHEXP,
LLM_TENSOR_FFN_UP_SHEXP,
LLM_TENSOR_FFN_EXPERT_WEIGHTS_B,
LLM_TENSOR_ATTN_Q_NORM,
LLM_TENSOR_ATTN_K_NORM,
LLM_TENSOR_LAYER_OUT_NORM,
@@ -1429,6 +1434,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_EXPERT_WEIGHTS_B, "blk.%d.expert_weights_b" },
},
},
{
@@ -2558,6 +2564,7 @@ enum e_model {
MODEL_70B,
MODEL_236B,
MODEL_314B,
MODEL_671B,
MODEL_SMALL,
MODEL_MEDIUM,
MODEL_LARGE,
@@ -2586,6 +2593,19 @@ struct llama_hparams_convnext {
uint32_t n_layer;
};
enum llm_expert_gating_func_type {
LLM_EXPERT_GATING_FUNC_SOFTMAX = 1,
LLM_EXPERT_GATING_FUNC_SIGMOID = 2,
};
static const char * llama_expert_gating_func_name(llm_expert_gating_func_type type) {
switch (type) {
case LLM_EXPERT_GATING_FUNC_SOFTMAX: return "softmax";
case LLM_EXPERT_GATING_FUNC_SIGMOID: return "sigmoid";
default: return "unknown";
}
}
struct llama_hparams {
bool vocab_only;
bool rope_finetuned;
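
The two gating functions behave differently: softmax turns the router logits into a distribution over all experts, while sigmoid (used by DeepSeek V3) scores each expert independently, so the selected experts' scores no longer sum to one, which is what the expert_weights_norm flag later compensates for. A minimal standalone comparison on sample logits, not tied to any model:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> logits = {1.0f, 0.5f, -0.5f}; // sample router logits

    float max_l = *std::max_element(logits.begin(), logits.end());
    float sum = 0.0f;
    for (float l : logits) sum += std::exp(l - max_l);

    for (float l : logits) {
        printf("softmax: %.3f   sigmoid: %.3f\n",
               std::exp(l - max_l) / sum,        // sums to 1 across experts
               1.0f / (1.0f + std::exp(-l)));    // independent per expert
    }
    return 0;
}
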
@@ -2621,6 +2641,8 @@ struct llama_hparams {
uint32_t n_ff_shexp = 0;
uint32_t n_expert_shared = 0;
float expert_weights_scale = 0.0;
bool expert_weights_norm = false;
uint32_t expert_gating_func = LLM_EXPERT_GATING_FUNC_SOFTMAX;
float f_norm_eps;
float f_norm_rms_eps;
@@ -2912,6 +2934,7 @@ struct llama_layer {
struct ggml_tensor * ffn_down_b = nullptr; // b2
struct ggml_tensor * ffn_up_b = nullptr; // b3
struct ggml_tensor * ffn_act = nullptr;
struct ggml_tensor * ffn_expert_weights_bias = nullptr;
// mamba proj
struct ggml_tensor * ssm_in = nullptr;
@@ -5577,6 +5600,7 @@ static const char * llama_model_type_name(e_model type) {
case MODEL_70B: return "70B";
case MODEL_236B: return "236B";
case MODEL_314B: return "314B";
case MODEL_671B: return "671B";
case MODEL_SMALL: return "0.1B";
case MODEL_MEDIUM: return "0.4B";
case MODEL_LARGE: return "0.8B";
@@ -6288,11 +6312,14 @@ static void llm_load_hparams(
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
switch (hparams.n_layer) {
case 27: model.type = e_model::MODEL_16B; break;
case 60: model.type = e_model::MODEL_236B; break;
case 61: model.type = e_model::MODEL_671B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
@@ -6616,6 +6643,10 @@ static void llm_load_vocab(
tokenizer_pre == "deepseek-coder") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "deepseek-v3") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
vocab.tokenizer_clean_spaces = false;
} else if (
tokenizer_pre == "falcon") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -7300,6 +7331,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llm_expert_gating_func_type) hparams.expert_gating_func));
LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
}
@@ -7447,6 +7480,7 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
{LLM_TENSOR_FFN_DOWN_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_GATE_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
{LLM_TENSOR_FFN_EXPERT_WEIGHTS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// this tensor is loaded for T5, but never used
{LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
{LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
@@ -9249,6 +9283,7 @@ static bool llm_load_tensors(
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
} else {
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
layer.ffn_expert_weights_bias = create_tensor(tn(LLM_TENSOR_FFN_EXPERT_WEIGHTS_B, "bias", i), {n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
if (n_expert == 0) {
throw std::runtime_error("n_expert must be > 0");
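
Note that the bias tensor is loaded with llama_model_loader::TENSOR_NOT_REQUIRED, so DeepSeek V2 checkpoints, which do not contain it, still load; layer.ffn_expert_weights_bias then remains nullptr, which is exactly the condition llm_build_moe_ffn checks before adding the bias.
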
@@ -10229,12 +10264,14 @@ static struct ggml_tensor * llm_build_moe_ffn(
struct ggml_tensor * up_exps,
struct ggml_tensor * gate_exps,
struct ggml_tensor * down_exps,
struct ggml_tensor * expert_weights_b,
int64_t n_expert,
int64_t n_expert_used,
llm_ffn_op_type type_op,
bool norm_w,
bool scale_w,
float w_scale,
llm_expert_gating_func_type gating_op,
const llm_build_cb & cb,
int il) {
int64_t n_embd = cur->ne[0];
@@ -10243,11 +10280,31 @@
ggml_tensor * logits = llm_build_lora_mm(lctx, ctx, gate_inp, cur); // [n_expert, n_tokens]
cb(logits, "ffn_moe_logits", il);
-ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
-cb(probs, "ffn_moe_probs", il);
ggml_tensor * probs = nullptr;
switch (gating_op) {
case LLM_EXPERT_GATING_FUNC_SOFTMAX:
{
probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens]
cb(probs, "ffn_moe_probs", il);
} break;
case LLM_EXPERT_GATING_FUNC_SIGMOID:
{
probs = ggml_sigmoid(ctx, logits); // [n_expert, n_tokens]
cb(probs, "ffn_moe_sigm", il);
} break;
default:
GGML_ABORT("fatal error");
}
// add expert selection bias - introduced in DeepSeek V3
ggml_tensor * selection_probs = probs;
if (expert_weights_b != nullptr) {
selection_probs = ggml_add(ctx, probs, expert_weights_b);
cb(selection_probs, "ffn_moe_sigm_biased", il);
}
// select experts
-ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens]
ggml_tensor * selected_experts = ggml_top_k(ctx, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
cb(selected_experts->src[0], "ffn_moe_argsort", il);
cb(selected_experts, "ffn_moe_topk", il);
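
Taken together, the hunk above implements DeepSeek V3 routing: experts are ranked by the biased scores, but the gate weights come from the unbiased sigmoid outputs; renormalization and scaling are driven by the norm_w, scale_w, and w_scale parameters shown in the signature and happen further down in llm_build_moe_ffn. A self-contained one-token sketch of that flow, with sample values and assumed helper names; it mirrors the logic, not the ggml graph:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <utility>
#include <vector>

// Sigmoid gating, a per-expert selection bias, top-k selection on the
// *biased* scores, gate weights taken from the *unbiased* scores. The
// renormalization and scaling mirror expert_weights_norm and
// expert_weights_scale.
static std::vector<std::pair<int, float>> route_experts(
        const std::vector<float> & logits,  // router output, [n_expert]
        const std::vector<float> & bias,    // ffn_expert_weights_bias
        int   n_expert_used,
        bool  norm_w,
        float w_scale) {
    const int n_expert = (int) logits.size();

    std::vector<float> probs(n_expert);
    for (int i = 0; i < n_expert; ++i) {
        probs[i] = 1.0f / (1.0f + std::exp(-logits[i])); // sigmoid gating
    }

    // rank experts by biased score (selection_probs in the hunk above)
    std::vector<int> idx(n_expert);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
        [&](int a, int b) { return probs[a] + bias[a] > probs[b] + bias[b]; });

    std::vector<std::pair<int, float>> selected;
    float sum = 0.0f;
    for (int k = 0; k < n_expert_used; ++k) {
        selected.push_back({idx[k], probs[idx[k]]}); // weight = unbiased prob
        sum += probs[idx[k]];
    }
    for (auto & e : selected) {
        if (norm_w) e.second /= sum;  // expert_weights_norm
        e.second *= w_scale;          // expert_weights_scale
    }
    return selected;
}

int main() {
    // expert 1 has the highest prob, but its negative bias excludes it
    auto sel = route_experts({0.2f, 1.5f, -0.3f, 0.9f},
                             {0.0f, -2.0f, 0.5f, 0.0f},
                             2, /*norm_w=*/true, /*w_scale=*/2.5f);
    for (auto & e : sel) {
        printf("expert %d weight %.3f\n", e.first, e.second);
    }
}
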
@@ -11368,9 +11425,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
LLM_EXPERT_GATING_FUNC_SOFTMAX,
cb, il);
cb(cur, "ffn_moe_out", il);
}
@@ -12020,9 +12079,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_GELU, true,
false, 0.0,
LLM_EXPERT_GATING_FUNC_SOFTMAX,
cb, il);
cb(cur, "ffn_moe_out", il);
@@ -12161,9 +12222,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
LLM_EXPERT_GATING_FUNC_SOFTMAX,
cb, il);
cb(cur, "ffn_moe_out", il);
@@ -13409,9 +13472,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
false, 0.0,
LLM_EXPERT_GATING_FUNC_SOFTMAX,
cb, il);
cb(cur, "ffn_moe_out", il);
@@ -15403,9 +15468,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
false, 0.0,
LLM_EXPERT_GATING_FUNC_SOFTMAX,
cb, il);
cb(cur, "ffn_moe_out", il);
@@ -15800,9 +15867,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, true,
false, 0.0,
LLM_EXPERT_GATING_FUNC_SOFTMAX,
cb, il);
cb(cur, "ffn_moe_out", il);
@@ -15941,9 +16010,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SILU, false,
false, hparams.expert_weights_scale,
LLM_EXPERT_GATING_FUNC_SOFTMAX,
cb, il);
cb(moe_out, "ffn_moe_out", il);
@@ -16170,9 +16241,11 @@ struct llm_build_context {
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_expert_weights_bias,
n_expert, n_expert_used,
-LLM_FFN_SILU, false,
LLM_FFN_SILU, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
(enum llm_expert_gating_func_type) hparams.expert_gating_func,
cb, il);
cb(moe_out, "ffn_moe_out", il);
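
Note that all other MoE build paths keep their previous behavior by passing nullptr for the bias tensor and an explicit LLM_EXPERT_GATING_FUNC_SOFTMAX; only the deepseek2 graph above threads the new bias tensor, normalization flag, gating function, and scale through from the hparams, so DeepSeek V2 models (softmax gating, no bias) and V3 models (sigmoid gating with bias) share one code path.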