From 17e4d6c96af7c20d8f3963f6e2465ee2753b3f6d Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Thu, 7 Mar 2024 21:32:48 -0500
Subject: [PATCH] mamba : rename metadata to be more similar to transformers library

This breaks existing converted-to-GGUF models, but the metadata names are
more "standard".

mamba : support mamba-*-hf models

These models share their token_embd.weight with their output.weight
---
 convert-hf-to-gguf.py          |  8 ++++----
 gguf-py/gguf/constants.py      | 16 ++++++++--------
 gguf-py/gguf/gguf_writer.py    | 16 ++++++++--------
 gguf-py/gguf/tensor_mapping.py |  7 ++++---
 llama.cpp                      | 35 ++++++++++++++++++++--------------
 5 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index d526e3157..3318be35c 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1903,10 +1903,10 @@ class MambaModel(Model):
         self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
         self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
         self.gguf_writer.add_block_count(self.hparams["n_layer"])
-        self.gguf_writer.add_ssm_conv_kernel_size(d_conv)
-        self.gguf_writer.add_ssm_inner_length(d_inner)
-        self.gguf_writer.add_ssm_state_length(d_state)
-        self.gguf_writer.add_ssm_dt_rank(dt_rank)
+        self.gguf_writer.add_ssm_conv_kernel(d_conv)
+        self.gguf_writer.add_ssm_inner_size(d_inner)
+        self.gguf_writer.add_ssm_state_size(d_state)
+        self.gguf_writer.add_ssm_time_step_rank(dt_rank)
         self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
         self.gguf_writer.add_file_type(self.ftype)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 8030023f3..b23badb10 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -62,10 +62,10 @@ class Keys:
         SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"

     class SSM:
-        CONV_KERNEL_SIZE = "{arch}.ssm.d_conv"
-        INNER_LENGTH = "{arch}.ssm.d_inner"
-        STATE_LENGTH = "{arch}.ssm.d_state"
-        DT_RANK = "{arch}.ssm.dt_rank"
+        CONV_KERNEL = "{arch}.ssm.conv_kernel"
+        INNER_SIZE = "{arch}.ssm.inner_size"
+        STATE_SIZE = "{arch}.ssm.state_size"
+        TIME_STEP_RANK = "{arch}.ssm.time_step_rank"

     class Tokenizer:
         MODEL = "tokenizer.ggml.model"
@@ -770,10 +770,10 @@ KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN
 KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED

 # SSM
-KEY_SSM_CONV_KERNEL_SIZE = Keys.SSM.CONV_KERNEL_SIZE
-KEY_SSM_INNER_LENGTH = Keys.SSM.INNER_LENGTH
-KEY_SSM_STATE_LENGTH = Keys.SSM.STATE_LENGTH
-KEY_SSM_DT_RANK = Keys.SSM.DT_RANK
+KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
+KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
+KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
+KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK

 # tokenization
 KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL

diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 146358e69..e49c5db68 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -382,17 +382,17 @@ class GGUFWriter:
     def add_rope_scaling_finetuned(self, value: bool) -> None:
         self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value)

-    def add_ssm_conv_kernel_size(self, value: int) -> None:
-        self.add_uint32(Keys.SSM.CONV_KERNEL_SIZE.format(arch=self.arch), value)
+    def add_ssm_conv_kernel(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value)

-    def add_ssm_inner_length(self, value: int) -> None:
-        self.add_uint32(Keys.SSM.INNER_LENGTH.format(arch=self.arch), value)
+    def add_ssm_inner_size(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.INNER_SIZE.format(arch=self.arch), value)

-    def add_ssm_state_length(self, value: int) -> None:
-        self.add_uint32(Keys.SSM.STATE_LENGTH.format(arch=self.arch), value)
+    def add_ssm_state_size(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.STATE_SIZE.format(arch=self.arch), value)

-    def add_ssm_dt_rank(self, value: int) -> None:
-        self.add_uint32(Keys.SSM.DT_RANK.format(arch=self.arch), value)
+    def add_ssm_time_step_rank(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)

     def add_tokenizer_model(self, model: str) -> None:
         self.add_string(Keys.Tokenizer.MODEL, model)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 85af29549..ed89955d8 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -20,8 +20,9 @@ class TensorNameMap:
             "wte", # gpt2
             "transformer.embd.wte", # phi2
             "model.tok_embeddings", # internlm2
-            "model.embedding", # mamba
+            "model.embedding", # mamba-qbert
             "backbone.embedding", # mamba
+            "backbone.embeddings", # mamba-hf
         ),

         # Token type embeddings
@@ -63,7 +64,7 @@ class TensorNameMap:
             "language_model.encoder.final_layernorm", # persimmon
             "model.final_layernorm", # persimmon
             "lm_head.ln", # phi2
-            "model.norm_f", # mamba
+            "model.norm_f", # mamba-qbert
             "backbone.norm_f", # mamba
         ),
@@ -90,7 +91,7 @@ class TensorNameMap:
             "transformer.h.{bid}.ln", # phi2
             "model.layers.layers.{bid}.norm", # plamo
             "model.layers.{bid}.attention_norm", # internlm2
-            "model.layers.{bid}.norm", # mamba
+            "model.layers.{bid}.norm", # mamba-qbert
             "backbone.layers.{bid}.norm", # mamba
         ),

diff --git a/llama.cpp b/llama.cpp
index 5c5b7a119..a54ce43e9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -286,10 +286,10 @@ enum llm_kv {
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,

-    LLM_KV_SSM_D_INNER,
-    LLM_KV_SSM_D_CONV,
-    LLM_KV_SSM_D_STATE,
-    LLM_KV_SSM_DT_RANK,
+    LLM_KV_SSM_INNER_SIZE,
+    LLM_KV_SSM_CONV_KERNEL,
+    LLM_KV_SSM_STATE_SIZE,
+    LLM_KV_SSM_TIME_STEP_RANK,

     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_LIST,
@@ -349,10 +349,10 @@ static const std::map LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },

-    { LLM_KV_SSM_D_CONV, "%s.ssm.d_conv" },
-    { LLM_KV_SSM_D_INNER, "%s.ssm.d_inner"},
-    { LLM_KV_SSM_D_STATE, "%s.ssm.d_state"},
-    { LLM_KV_SSM_DT_RANK, "%s.ssm.dt_rank"},
+    { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" },
+    { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" },
+    { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" },
+    { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },

     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -3599,10 +3599,10 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_MAMBA:
             {
-                ml.get_key(LLM_KV_SSM_D_CONV, hparams.ssm_d_conv);
-                ml.get_key(LLM_KV_SSM_D_INNER, hparams.ssm_d_inner);
-                ml.get_key(LLM_KV_SSM_D_STATE, hparams.ssm_d_state);
-                ml.get_key(LLM_KV_SSM_DT_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);

                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -4864,8 +4864,15 @@ static bool llm_load_tensors(
                 // output
                 {
-                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    // if output is NULL, init from the input tok embed, duplicated to allow offloading
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
+                    }
                 }

                 for (int i = 0; i < n_layer; ++i) {