From 5816ae687ea1c2f9add7c582d283e80cc5d089ba Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Thu, 1 Feb 2024 21:22:28 -0500
Subject: [PATCH] mamba : very basic quantization support

Mostly works, but there is currently no difference between the variants
of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).

Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)

Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more
important for the _M (and _L?) variants of k-quants for Mamba.

* convert : fix wrong name for layer norm weight of official Mamba models

I was using Q-bert/Mamba-* models before, which have a slightly
different naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
---
 gguf-py/gguf/tensor_mapping.py | 2 +-
 llama.cpp                      | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index d24d10dcb..85af29549 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -91,7 +91,7 @@ class TensorNameMap:
             "model.layers.layers.{bid}.norm",    # plamo
             "model.layers.{bid}.attention_norm", # internlm2
             "model.layers.{bid}.norm",           # mamba
-            "backbone.layers.{bid}.mixer.norm",  # mamba
+            "backbone.layers.{bid}.norm",        # mamba
         ),
 
         # Attention norm 2
diff --git a/llama.cpp b/llama.cpp
index bbf16e8f4..9dba8eeb2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11718,6 +11718,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
 
+        // do not quantize Mamba's small yet 2D weights
+        // NOTE: can't use LLM_TN here because the layer number is not known
+        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
+        quantize &= name.find("ssm_x.weight")      == std::string::npos;
+        quantize &= name.find("ssm_dt.weight")     == std::string::npos;
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
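
Note (not part of the patch): a minimal standalone sketch of the exclusion
logic in the llama.cpp hunk above, assuming GGUF tensor names of the form
"blk.{n}.ssm_*.weight"; the is_quantizable() helper is hypothetical and only
illustrates why a substring search is used instead of LLM_TN (the layer
number is baked into each tensor name, so no single LLM_TN call can match
every layer):

    #include <iostream>
    #include <string>

    // Hypothetical helper mirroring the patch's checks: keep Mamba's
    // small (yet 2D) SSM tensors in full precision, selected by suffix.
    static bool is_quantizable(const std::string & name) {
        static const char * ssm_keep[] = {
            "ssm_conv1d.weight",
            "ssm_x.weight",
            "ssm_dt.weight",
        };
        for (const char * suffix : ssm_keep) {
            if (name.find(suffix) != std::string::npos) {
                return false; // matched an SSM tensor to keep in f32
            }
        }
        return true;
    }

    int main() {
        std::cout << is_quantizable("blk.0.ssm_conv1d.weight") << '\n'; // 0
        std::cout << is_quantizable("blk.0.ssm_in.weight")     << '\n'; // 1
    }

The patch itself folds the same checks into the existing `quantize &= ...`
chain, which is the established pattern in llama_model_quantize_internal for
opting tensors out of quantization.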