From ac75fbd8c515f8317f0d626271942fe7b595d865 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT
Date: Tue, 9 Apr 2024 02:41:39 +0200
Subject: [PATCH] gguf-py: dbrx: reverse again the MOE tensors mapping:

layer.ffn_up_exps   -> Up-projection weights (w1)
layer.ffn_gate_exps -> Gating weights (v1)
layer.ffn_down_exps -> Down-projection weights (w2)
---
 convert-hf-to-gguf.py          | 4 ++--
 gguf-py/gguf/tensor_mapping.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 49a328ff5..228b68e14 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1527,9 +1527,9 @@ class DbrxModel(Model):
         # But llama.cpp moe graph works differently
         # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions
         # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor
-        exp_tensor_names = {"ffn.experts.mlp.w1": None,       # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff,   n_expert}
+        exp_tensor_names = {"ffn.experts.mlp.v1": None,       # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff,   n_expert}
                             "ffn.experts.mlp.w2": (0, 2, 1),  # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff,   n_embd, n_expert}
-                            "ffn.experts.mlp.v1": None}       # LLM_TENSOR_FFN_UP_EXPS   ggml_tensor->ne{n_embd, n_ff,   n_expert}
+                            "ffn.experts.mlp.w1": None}       # LLM_TENSOR_FFN_UP_EXPS   ggml_tensor->ne{n_embd, n_ff,   n_expert}
         experts = False
         for exp_tensor_name in exp_tensor_names.keys():
             if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1:
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 0c9da962e..9d34ce5c1 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -238,7 +238,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.w3",         # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.v1",  # dbrx
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
         ),

         # AWQ-activation gate
@@ -259,7 +259,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.w1",        # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
         ),

         # Feed-forward down
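
Note (reviewer sketch, not part of the patch and not meant to be applied with git am): the following standalone Python snippet illustrates what the dimension comments and the (0, 2, 1) entry in the hunk above imply, assuming the converter applies that tuple as a torch-style permutation to the merged experts tensor; the tensor sizes here are made up for illustration.

# Hypothetical sketch of the axis permutation described in the patch comments.
import torch

n_expert, n_ff, n_embd = 4, 8, 6

# Down-projection experts tensor (w2) as laid out in the HF checkpoint:
# pytorch shape (n_expert, n_ff, n_embd)
w2 = torch.randn(n_expert, n_ff, n_embd)

# ggml lists dimensions in reverse order of pytorch, so the target
# ggml_tensor->ne{n_ff, n_embd, n_expert} corresponds to a pytorch shape of
# (n_expert, n_embd, n_ff); the (0, 2, 1) permutation from the patch yields it.
w2_gguf = w2.permute(0, 2, 1).contiguous()
assert w2_gguf.shape == (n_expert, n_embd, n_ff)

# The gate (v1) and up (w1) experts tensors map to None in the patch: they are
# already (n_expert, n_ff, n_embd), i.e. ggml ne{n_embd, n_ff, n_expert}, so no
# permutation is needed; only the GGUF tensor names are swapped.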