model: dbrx: convert fix mixed ffn_gate_exps and ffn_down_exps
parent ea8b58c6cd
commit 55943a281f
2 changed files with 4 additions and 4 deletions
@@ -1527,9 +1527,9 @@ class DbrxModel(Model):
         # But llama.cpp moe graph works differently
         # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
         # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
-        exp_tensor_names = {"ffn.experts.mlp.v1": None,       # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
+        exp_tensor_names = {"ffn.experts.mlp.w1": None,       # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert}
                             "ffn.experts.mlp.w2": (0, 2, 1),  # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert}
-                            "ffn.experts.mlp.w1": None}       # LLM_TENSOR_FFN_UP_EXPS   ggml_tensor->ne{n_embd, n_ff, n_expert}
+                            "ffn.experts.mlp.v1": None}       # LLM_TENSOR_FFN_UP_EXPS   ggml_tensor->ne{n_embd, n_ff, n_expert}
         experts = False
         for exp_tensor_name in exp_tensor_names.keys():
             if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
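The comments in this hunk state the convention the fix relies on: ggml lists dimensions in the reverse order of PyTorch, so a PyTorch tensor of shape (n_expert, n_ff, n_embd) appears as ne{n_embd, n_ff, n_expert} in a ggml_tensor, and the (0, 2, 1) entry for w2 is the permutation that lands on ne{n_ff, n_embd, n_expert}. Below is a minimal sketch of how such a permutation tuple might be applied; the shapes are placeholders and torch is used only for illustration, this is not the converter's actual code path.

# Sketch only: applying a permutation tuple like (0, 2, 1) to a merged
# DBRX w2 expert tensor. Shapes are made-up placeholders.
import torch

n_expert, n_ff, n_embd = 4, 8, 6          # placeholder sizes for the sketch
w2 = torch.randn(n_expert, n_ff, n_embd)  # pytorch layout: (n_expert, n_ff, n_embd)

permute = (0, 2, 1)                       # the "ffn.experts.mlp.w2" entry above
w2_permuted = w2.permute(*permute).contiguous()

print(w2_permuted.shape)  # (n_expert, n_embd, n_ff); read in reverse, that is
                          # ggml's ne{n_ff, n_embd, n_expert} for FFN_DOWN_EXPS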
@@ -238,7 +238,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.w3",         # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1",  # dbrx
         ),

         # AWQ-activation gate
@@ -259,7 +259,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.w1",        # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
         ),

         # Feed-forward down
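Together with the converter change above, the two mapping hunks restore the intended correspondence for DBRX's merged experts: w1 is the gate projection (FFN_GATE_EXP) and v1 is the up projection (FFN_UP_EXP). The sketch below is a small sanity check against gguf-py's TensorNameMap, assuming a tree that already defines MODEL_ARCH.DBRX and the get_name(key, try_suffixes=...) helper used by the convert script; the ".weight" suffix and block count are placeholders for the sketch, not claims about the real checkpoint layout.

# Sketch only: check that the corrected dbrx entries resolve to the intended
# GGUF expert tensors. Assumes gguf-py with MODEL_ARCH.DBRX available.
import gguf

tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.DBRX, 40)  # 40 = placeholder block count

for hf_name in (
    "transformer.blocks.0.ffn.experts.mlp.w1.weight",  # should map to the gate-experts tensor
    "transformer.blocks.0.ffn.experts.mlp.v1.weight",  # should map to the up-experts tensor
    "transformer.blocks.0.ffn.experts.mlp.w2.weight",  # should map to the down-experts tensor
):
    print(hf_name, "->", tensor_map.get_name(hf_name, try_suffixes=(".weight",)))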