From 9f55809f7211bc58510ba501cfd681e9607cfb6a Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 4 Feb 2024 09:00:42 -0500 Subject: [PATCH] convert : for Mamba, also consider the "MambaLMHeadModel" arch name It's the class name in the official implementation, though it isn't used (yet) in the "architectures" field of config.json. --- convert-hf-to-gguf.py | 2 +- gguf-py/gguf/constants.py | 1 - llama.cpp | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index e49b2f4f6..42b0fb66e 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1844,7 +1844,7 @@ class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 -@Model.register("MambaForCausalLM") +@Model.register("MambaForCausalLM", "MambaLMHeadModel") class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a28108383..651323a1e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -211,7 +211,6 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", - # FIXME: NAMES FOR MAMBA ARE NOT FINAL MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", diff --git a/llama.cpp b/llama.cpp index 466f8bc0c..37ac7425d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -401,8 +401,6 @@ enum llm_tensor { LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_LAYER_OUT_NORM, - // TODO: maybe use longer names? - // TODO: can the in_proj and/or the out_proj instead re-use some of the above types? LLM_TENSOR_SSM_IN, LLM_TENSOR_SSM_CONV1D, LLM_TENSOR_SSM_X,