llama : add Deepseek MoE v1 & GigaChat models (#10827)
* Add deepseek v1 arch & gigachat template * improve template code * add readme * delete comments * remove comment * fix format * lint llama.cpp * fix order of deepseek and deepseek2, move gigachat temlate to the end of func * fix order of deepseek and deepseek2 in constants; mark shared exp as deepseek arch need * remove comments * move deepseek above deepseek2 * change placement of gigachat chat template
This commit is contained in:
parent
87cf323cef
commit
a0974156f3
7 changed files with 423 additions and 3 deletions
|
@ -306,7 +306,7 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.FFN_UP_SHEXP: (
|
||||
"model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe
|
||||
"model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek2
|
||||
"model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
|
||||
),
|
||||
|
||||
# AWQ-activation gate
|
||||
|
@ -338,7 +338,7 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP: (
|
||||
"model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe
|
||||
"model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek2
|
||||
"model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
|
||||
),
|
||||
|
||||
# Feed-forward down
|
||||
|
@ -379,7 +379,7 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: (
|
||||
"model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe
|
||||
"model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek2
|
||||
"model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_Q_NORM: (
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue