llama : add Deepseek MoE v1 & GigaChat models (#10827)

* Add deepseek v1 arch & gigachat template

* improve template code

* add readme

* delete comments

* remove comment

* fix format

* lint llama.cpp

* fix order of deepseek and deepseek2, move gigachat template to the end of the function

* fix order of deepseek and deepseek2 in constants; mark shared expert tensors as needed by the deepseek arch

* remove comments

* move deepseek above deepseek2

* change placement of gigachat chat template
Valentin Mamedov 2024-12-16 00:02:46 +07:00 committed by GitHub
parent 87cf323cef
commit a0974156f3
7 changed files with 423 additions and 3 deletions
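
Beyond the gguf-py changes shown below, the commit adds a GigaChat chat template to the chat-template handling in llama.cpp. As a rough orientation, here is a minimal Python sketch of the prompt layout; the "<|role_sep|>" and "<|message_sep|>" separators and the treatment of the system message are assumptions based on the GigaChat tokenizer, not a verbatim copy of the C++ code.

```python
# Hedged sketch of the GigaChat prompt layout (illustrative, not the C++ code).
def gigachat_prompt(messages: list[dict[str, str]], add_generation_prompt: bool = True) -> str:
    out = "<s>"
    # An optional leading system message is emitted before the turn loop (assumption).
    if messages and messages[0]["role"] == "system":
        out += messages[0]["content"] + "<|message_sep|>"
        messages = messages[1:]
    # Each turn is rendered as "role<|role_sep|>content<|message_sep|>".
    for msg in messages:
        out += msg["role"] + "<|role_sep|>" + msg["content"] + "<|message_sep|>"
    # Open an assistant turn so generation continues from here.
    if add_generation_prompt:
        out += "assistant<|role_sep|>"
    return out

print(gigachat_prompt([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]))
```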

gguf-py/gguf/constants.py

@@ -249,6 +249,7 @@ class MODEL_ARCH(IntEnum):
     OLMOE = auto()
     OPENELM = auto()
     ARCTIC = auto()
+    DEEPSEEK = auto()
     DEEPSEEK2 = auto()
     CHATGLM = auto()
     BITNET = auto()
@@ -412,6 +413,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.OLMOE: "olmoe",
     MODEL_ARCH.OPENELM: "openelm",
     MODEL_ARCH.ARCTIC: "arctic",
+    MODEL_ARCH.DEEPSEEK: "deepseek",
     MODEL_ARCH.DEEPSEEK2: "deepseek2",
     MODEL_ARCH.CHATGLM: "chatglm",
     MODEL_ARCH.BITNET: "bitnet",
@@ -1158,6 +1160,29 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.DEEPSEEK: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+    ],
     MODEL_ARCH.DEEPSEEK2: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -1380,6 +1405,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
     ],
+    MODEL_ARCH.DEEPSEEK: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
     MODEL_ARCH.DEEPSEEK2: [
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_ROT_EMBD,
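
The MODEL_TENSOR_SKIP entry means the new arch declares ROPE_FREQS and ATTN_ROT_EMBD but is not expected to serialize them into the GGUF file. A minimal sketch of how a converter can combine the two tables; the helper name writable_tensors is illustrative, not part of the gguf-py API.

```python
from gguf.constants import MODEL_ARCH, MODEL_TENSORS, MODEL_TENSOR_SKIP

# Illustrative helper (not part of gguf-py): the tensors an arch declares,
# minus those it is expected not to write out.
def writable_tensors(arch: MODEL_ARCH) -> list:
    skip = set(MODEL_TENSOR_SKIP.get(arch, []))
    return [t for t in MODEL_TENSORS[arch] if t not in skip]

# For DEEPSEEK this drops ROPE_FREQS and ATTN_ROT_EMBD, per the hunk above.
print(writable_tensors(MODEL_ARCH.DEEPSEEK))
```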

gguf-py/gguf/tensor_mapping.py

@@ -306,7 +306,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_UP_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.up_proj",  # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek2
+            "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
         ),

         # AWQ-activation gate
@@ -338,7 +338,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek2
+            "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
         ),

         # Feed-forward down
@@ -379,7 +379,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_DOWN_SHEXP: (
             "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
-            "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek2
+            "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
         ),

         MODEL_TENSOR.ATTN_Q_NORM: (
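
The three hunks above are comment-only: the shared_experts.* patterns already existed for deepseek2 and now also document deepseek (v1). For reference, this is how gguf-py resolves a Hugging Face tensor name to its GGUF counterpart through these patterns; the block count of 28 is an arbitrary illustrative value.

```python
from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import get_tensor_name_map

# Build the name map for the new arch; 28 is an illustrative layer count.
name_map = get_tensor_name_map(MODEL_ARCH.DEEPSEEK, n_blocks=28)

# A shared-expert projection from a Hugging Face checkpoint...
hf_name = "model.layers.3.mlp.shared_experts.up_proj.weight"
# ...resolves via the "model.layers.{bid}.mlp.shared_experts.up_proj" pattern.
print(name_map.get_name(hf_name, try_suffixes=(".weight", ".bias")))
# -> "blk.3.ffn_up_shexp.weight"
```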