From bdc4efe17fd4c2c9b0cc795e635054122cc54204 Mon Sep 17 00:00:00 2001
From: Pierrick Hymbert
Date: Fri, 12 Apr 2024 21:40:47 +0200
Subject: [PATCH] Is the silu activation function applied to
 MODEL_TENSOR.FFN_GATE_EXP here? If so, we must change this to w1 for DBRX.

Each expert in DBRX has 3 linear layers: w1, v1 and w2. For an input tensor x,
the output of the expert layer is (silu(x.w1_t) * x.v1_t).w2_t. The same math
is used in Mixtral, the only difference being that DBRX uses v1 where Mixtral
uses w3.

Co-authored-by: Megha Agarwal <16129366+megha95@users.noreply.github.com>
---
 gguf-py/gguf/tensor_mapping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index da900f103..6ef5bf311 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -259,7 +259,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.w1",        # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
         ),

         # Feed-forward down
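
For reference, a minimal sketch of the expert math described in the commit message, showing why FFN_GATE_EXP must map to w1 (the branch that the silu gate is applied to) while v1 plays the role of Mixtral's w3 (the up projection). The shapes, the row-vector convention, and applying the weights as transposed matrices are illustrative assumptions, not the exact DBRX storage layout.

```python
import numpy as np


def silu(x: np.ndarray) -> np.ndarray:
    # SiLU (swish): x * sigmoid(x)
    return x * (1.0 / (1.0 + np.exp(-x)))


def dbrx_expert_forward(x: np.ndarray, w1: np.ndarray, v1: np.ndarray, w2: np.ndarray) -> np.ndarray:
    """One DBRX expert: (silu(x @ w1.T) * (x @ v1.T)) @ w2.T.

    w1 -> gated branch  (Mixtral's w1, FFN_GATE_EXP)
    v1 -> linear branch (Mixtral's w3, FFN_UP_EXP)
    w2 -> down proj     (Mixtral's w2, FFN_DOWN_EXP)
    """
    gate = silu(x @ w1.T)  # silu is applied to the w1 branch, hence the mapping change
    up = x @ v1.T
    return (gate * up) @ w2.T


# Toy shapes (hypothetical): hidden size 8, expert FFN size 16.
rng = np.random.default_rng(0)
x = rng.standard_normal((1, 8))
w1 = rng.standard_normal((16, 8))
v1 = rng.standard_normal((16, 8))
w2 = rng.standard_normal((8, 16))
print(dbrx_expert_forward(x, w1, v1, w2).shape)  # (1, 8)
```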