From 5816ae687ea1c2f9add7c582d283e80cc5d089ba Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Thu, 1 Feb 2024 21:22:28 -0500
Subject: [PATCH] mamba : very basic quantization support

Mostly works, but there is currently no difference between the variants
of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).

Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)

Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more
important for the _M (and _L?) variants of k-quants for Mamba.

* convert : fix wrong name for layer norm weight of official Mamba models

I was using Q-bert/Mamba-* models before, which have a slightly
different naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
---
 gguf-py/gguf/tensor_mapping.py | 2 +-
 llama.cpp                      | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index d24d10dcb..85af29549 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -91,7 +91,7 @@ class TensorNameMap:
             "model.layers.layers.{bid}.norm",    # plamo
             "model.layers.{bid}.attention_norm", # internlm2
             "model.layers.{bid}.norm",           # mamba
-            "backbone.layers.{bid}.mixer.norm",  # mamba
+            "backbone.layers.{bid}.norm",        # mamba
         ),
 
         # Attention norm 2
diff --git a/llama.cpp b/llama.cpp
index bbf16e8f4..9dba8eeb2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11718,6 +11718,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
 
+        // do not quantize Mamba's small yet 2D weights
+        // NOTE: can't use LLM_TN here because the layer number is not known
+        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
+        quantize &= name.find("ssm_x.weight")      == std::string::npos;
+        quantize &= name.find("ssm_dt.weight")     == std::string::npos;
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
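
Note (not part of the patch): a minimal standalone sketch of the exclusion
logic in the llama.cpp hunk above, assuming GGUF tensor names of the form
"blk.{n}.ssm_*.weight"; the is_quantizable() helper is hypothetical and only
illustrates why a substring search is used instead of LLM_TN (the layer
number is baked into each tensor name, so no single LLM_TN call can match
every layer):

    #include <iostream>
    #include <string>

    // Hypothetical helper mirroring the patch's checks: keep Mamba's
    // small (yet 2D) SSM tensors in full precision, selected by suffix.
    static bool is_quantizable(const std::string & name) {
        static const char * ssm_keep[] = {
            "ssm_conv1d.weight",
            "ssm_x.weight",
            "ssm_dt.weight",
        };
        for (const char * suffix : ssm_keep) {
            if (name.find(suffix) != std::string::npos) {
                return false; // matched an SSM tensor to keep in f32
            }
        }
        return true;
    }

    int main() {
        std::cout << is_quantizable("blk.0.ssm_conv1d.weight") << '\n'; // 0
        std::cout << is_quantizable("blk.0.ssm_in.weight")     << '\n'; // 1
    }

The patch itself folds the same checks into the existing `quantize &= ...`
chain, which is the established pattern in llama_model_quantize_internal for
opting tensors out of quantization.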