diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index d24d10dcb..85af29549 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -91,7 +91,7 @@ class TensorNameMap:
             "model.layers.layers.{bid}.norm",    # plamo
             "model.layers.{bid}.attention_norm", # internlm2
             "model.layers.{bid}.norm",           # mamba
-            "backbone.layers.{bid}.mixer.norm",  # mamba
+            "backbone.layers.{bid}.norm",        # mamba
         ),
 
         # Attention norm 2
diff --git a/llama.cpp b/llama.cpp
index bbf16e8f4..9dba8eeb2 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11718,6 +11718,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
 
+        // do not quantize Mamba's small yet 2D weights
+        // NOTE: can't use LLM_TN here because the layer number is not known
+        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;
+        quantize &= name.find("ssm_x.weight")      == std::string::npos;
+        quantize &= name.find("ssm_dt.weight")     == std::string::npos;
+
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;