diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index e49b2f4f6..42b0fb66e 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1844,7 +1844,7 @@ class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 -@Model.register("MambaForCausalLM") +@Model.register("MambaForCausalLM", "MambaLMHeadModel") class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a28108383..651323a1e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -211,7 +211,6 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", - # FIXME: NAMES FOR MAMBA ARE NOT FINAL MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", diff --git a/llama.cpp b/llama.cpp index 466f8bc0c..37ac7425d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -401,8 +401,6 @@ enum llm_tensor { LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_LAYER_OUT_NORM, - // TODO: maybe use longer names? - // TODO: can the in_proj and/or the out_proj instead re-use some of the above types? LLM_TENSOR_SSM_IN, LLM_TENSOR_SSM_CONV1D, LLM_TENSOR_SSM_X,