From bdc4efe17fd4c2c9b0cc795e635054122cc54204 Mon Sep 17 00:00:00 2001
From: Pierrick Hymbert
Date: Fri, 12 Apr 2024 21:40:47 +0200
Subject: [PATCH] Is the silu activation function applied to
 MODEL_TENSOR.FFN_GATE_EXP here? If so, we must change this to w1 for DBRX.

Each expert in DBRX has 3 linear layers: w1, v1 and w2. For an input tensor x,
the output of the expert layer is (silu(x.w1_t) * x.v1_t).w2_t. The same math
is used in Mixtral, the only difference being that DBRX uses v1 where Mixtral
uses w3.

Co-authored-by: Megha Agarwal <16129366+megha95@users.noreply.github.com>
---
 gguf-py/gguf/tensor_mapping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index da900f103..6ef5bf311 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -259,7 +259,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.w1",        # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
         ),

         # Feed-forward down
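
For reference, a minimal sketch of the expert math described in the commit message, showing why FFN_GATE_EXP must map to w1 (the branch that the silu gate is applied to) while v1 plays the role of Mixtral's w3 (the up projection). The shapes, the row-vector convention, and applying the weights as transposed matrices are illustrative assumptions, not the exact DBRX storage layout.

```python
import numpy as np


def silu(x: np.ndarray) -> np.ndarray:
    # SiLU (swish): x * sigmoid(x)
    return x * (1.0 / (1.0 + np.exp(-x)))


def dbrx_expert_forward(x: np.ndarray, w1: np.ndarray, v1: np.ndarray, w2: np.ndarray) -> np.ndarray:
    """One DBRX expert: (silu(x @ w1.T) * (x @ v1.T)) @ w2.T.

    w1 -> gated branch  (Mixtral's w1, FFN_GATE_EXP)
    v1 -> linear branch (Mixtral's w3, FFN_UP_EXP)
    w2 -> down proj     (Mixtral's w2, FFN_DOWN_EXP)
    """
    gate = silu(x @ w1.T)  # silu is applied to the w1 branch, hence the mapping change
    up = x @ v1.T
    return (gate * up) @ w2.T


# Toy shapes (hypothetical): hidden size 8, expert FFN size 16.
rng = np.random.default_rng(0)
x = rng.standard_normal((1, 8))
w1 = rng.standard_normal((16, 8))
v1 = rng.standard_normal((16, 8))
w2 = rng.standard_normal((8, 16))
print(dbrx_expert_forward(x, w1, v1, w2).shape)  # (1, 8)
```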