mamba : rename metadata to be more similar to transformers library

This breaks existing converted-to-GGUF models, but the metadata names are
more "standard".

mamba : support mamba-*-hf models

These models share their token_embd.weight with their output.weight.
commit 17e4d6c96a (parent d8024a486b)
5 changed files with 45 additions and 37 deletions
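For converters built on gguf-py, the rename only changes the writer calls and the resulting key names (the "{arch}." prefix comes from the writer's arch, so arch="mamba" yields keys like "mamba.ssm.conv_kernel"). A minimal sketch of writing the renamed SSM metadata, assuming gguf-py from this tree is importable; the output path and the hyperparameter values (roughly those of mamba-130m) are illustrative:

from gguf import GGUFWriter

# Minimal sketch: emit the renamed SSM metadata keys for a mamba model.
writer = GGUFWriter("out.gguf", arch="mamba")  # hypothetical output path
writer.add_architecture()
writer.add_ssm_conv_kernel(4)      # was add_ssm_conv_kernel_size -> "mamba.ssm.conv_kernel"
writer.add_ssm_inner_size(1536)    # was add_ssm_inner_length     -> "mamba.ssm.inner_size"
writer.add_ssm_state_size(16)      # was add_ssm_state_length     -> "mamba.ssm.state_size"
writer.add_ssm_time_step_rank(48)  # was add_ssm_dt_rank          -> "mamba.ssm.time_step_rank"
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()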
convert-hf-to-gguf.py

@@ -1903,10 +1903,10 @@ class MambaModel(Model):
         self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
         self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading
         self.gguf_writer.add_block_count(self.hparams["n_layer"])
-        self.gguf_writer.add_ssm_conv_kernel_size(d_conv)
-        self.gguf_writer.add_ssm_inner_length(d_inner)
-        self.gguf_writer.add_ssm_state_length(d_state)
-        self.gguf_writer.add_ssm_dt_rank(dt_rank)
+        self.gguf_writer.add_ssm_conv_kernel(d_conv)
+        self.gguf_writer.add_ssm_inner_size(d_inner)
+        self.gguf_writer.add_ssm_state_size(d_state)
+        self.gguf_writer.add_ssm_time_step_rank(dt_rank)
         self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
         self.gguf_writer.add_file_type(self.ftype)
gguf-py/gguf/constants.py

@@ -62,10 +62,10 @@ class Keys:
         SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"

     class SSM:
-        CONV_KERNEL_SIZE = "{arch}.ssm.d_conv"
-        INNER_LENGTH = "{arch}.ssm.d_inner"
-        STATE_LENGTH = "{arch}.ssm.d_state"
-        DT_RANK = "{arch}.ssm.dt_rank"
+        CONV_KERNEL = "{arch}.ssm.conv_kernel"
+        INNER_SIZE = "{arch}.ssm.inner_size"
+        STATE_SIZE = "{arch}.ssm.state_size"
+        TIME_STEP_RANK = "{arch}.ssm.time_step_rank"

     class Tokenizer:
         MODEL = "tokenizer.ggml.model"
@@ -770,10 +770,10 @@ KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN
 KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED

 # SSM
-KEY_SSM_CONV_KERNEL_SIZE = Keys.SSM.CONV_KERNEL_SIZE
-KEY_SSM_INNER_LENGTH = Keys.SSM.INNER_LENGTH
-KEY_SSM_STATE_LENGTH = Keys.SSM.STATE_LENGTH
-KEY_SSM_DT_RANK = Keys.SSM.DT_RANK
+KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
+KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
+KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
+KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK

 # tokenization
 KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
gguf-py/gguf/gguf_writer.py

@@ -382,17 +382,17 @@ class GGUFWriter:
     def add_rope_scaling_finetuned(self, value: bool) -> None:
         self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value)

-    def add_ssm_conv_kernel_size(self, value: int) -> None:
-        self.add_uint32(Keys.SSM.CONV_KERNEL_SIZE.format(arch=self.arch), value)
+    def add_ssm_conv_kernel(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value)

-    def add_ssm_inner_length(self, value: int) -> None:
-        self.add_uint32(Keys.SSM.INNER_LENGTH.format(arch=self.arch), value)
+    def add_ssm_inner_size(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.INNER_SIZE.format(arch=self.arch), value)

-    def add_ssm_state_length(self, value: int) -> None:
-        self.add_uint32(Keys.SSM.STATE_LENGTH.format(arch=self.arch), value)
+    def add_ssm_state_size(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.STATE_SIZE.format(arch=self.arch), value)

-    def add_ssm_dt_rank(self, value: int) -> None:
-        self.add_uint32(Keys.SSM.DT_RANK.format(arch=self.arch), value)
+    def add_ssm_time_step_rank(self, value: int) -> None:
+        self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)

     def add_tokenizer_model(self, model: str) -> None:
         self.add_string(Keys.Tokenizer.MODEL, model)
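On the reading side, a quick way to tell whether an existing GGUF file predates this rename (and therefore needs reconversion) is to probe for either key. A sketch assuming gguf-py's GGUFReader; the file path is hypothetical:

from gguf import GGUFReader

# Minimal sketch: detect old- vs. new-style SSM metadata in a converted model.
reader = GGUFReader("mamba-130m.gguf")  # hypothetical path
if reader.get_field("mamba.ssm.conv_kernel") is not None:
    print("new-style metadata")
elif reader.get_field("mamba.ssm.d_conv") is not None:
    print("old-style metadata: reconvert this model")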
gguf-py/gguf/tensor_mapping.py

@@ -20,8 +20,9 @@ class TensorNameMap:
             "wte",                  # gpt2
             "transformer.embd.wte", # phi2
             "model.tok_embeddings", # internlm2
-            "model.embedding",      # mamba
+            "model.embedding",      # mamba-qbert
             "backbone.embedding",   # mamba
+            "backbone.embeddings",  # mamba-hf
         ),

         # Token type embeddings
@@ -63,7 +64,7 @@ class TensorNameMap:
             "language_model.encoder.final_layernorm", # persimmon
             "model.final_layernorm",                  # persimmon
             "lm_head.ln",                             # phi2
-            "model.norm_f",                           # mamba
+            "model.norm_f",                           # mamba-qbert
             "backbone.norm_f",                        # mamba
         ),
@@ -90,7 +91,7 @@ class TensorNameMap:
             "transformer.h.{bid}.ln",            # phi2
             "model.layers.layers.{bid}.norm",    # plamo
             "model.layers.{bid}.attention_norm", # internlm2
-            "model.layers.{bid}.norm",           # mamba
+            "model.layers.{bid}.norm",           # mamba-qbert
             "backbone.layers.{bid}.norm",        # mamba
         ),
llama.cpp
@@ -286,10 +286,10 @@ enum llm_kv {
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,

-    LLM_KV_SSM_D_INNER,
-    LLM_KV_SSM_D_CONV,
-    LLM_KV_SSM_D_STATE,
-    LLM_KV_SSM_DT_RANK,
+    LLM_KV_SSM_INNER_SIZE,
+    LLM_KV_SSM_CONV_KERNEL,
+    LLM_KV_SSM_STATE_SIZE,
+    LLM_KV_SSM_TIME_STEP_RANK,

     LLM_KV_TOKENIZER_MODEL,
     LLM_KV_TOKENIZER_LIST,
@@ -349,10 +349,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED,    "%s.rope.scaling.finetuned"               },

-    { LLM_KV_SSM_D_CONV,  "%s.ssm.d_conv"  },
-    { LLM_KV_SSM_D_INNER, "%s.ssm.d_inner" },
-    { LLM_KV_SSM_D_STATE, "%s.ssm.d_state" },
-    { LLM_KV_SSM_DT_RANK, "%s.ssm.dt_rank" },
+    { LLM_KV_SSM_CONV_KERNEL,    "%s.ssm.conv_kernel"    },
+    { LLM_KV_SSM_INNER_SIZE,     "%s.ssm.inner_size"     },
+    { LLM_KV_SSM_STATE_SIZE,     "%s.ssm.state_size"     },
+    { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },

     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model"  },
     { LLM_KV_TOKENIZER_LIST,  "tokenizer.ggml.tokens" },
@@ -3599,10 +3599,10 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_MAMBA:
             {
-                ml.get_key(LLM_KV_SSM_D_CONV,  hparams.ssm_d_conv);
-                ml.get_key(LLM_KV_SSM_D_INNER, hparams.ssm_d_inner);
-                ml.get_key(LLM_KV_SSM_D_STATE, hparams.ssm_d_state);
-                ml.get_key(LLM_KV_SSM_DT_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL,    hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE,     hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE,     hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);

                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -4865,7 +4865,14 @@ static bool llm_load_tensors(
                     // output
                     {
                         model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                        // if output is NULL, init from the input tok embed, duplicated to allow offloading
+                        if (model.output == NULL) {
+                            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                            ml.n_created--; // artificial tensor
+                            ml.size_data += ggml_nbytes(model.output);
+                        }
                     }

                     for (int i = 0; i < n_layer; ++i) {
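The fallback above is what lets mamba-*-hf models load: their checkpoints tie the output projection to the token embedding, so no separate output tensor exists in the file. In Python terms the equivalent lookup is roughly the sketch below; the state-dict access and the "lm_head.weight" name are assumptions about the HF checkpoint layout, while "backbone.embeddings.weight" follows the mapping added in tensor_mapping.py:

# Minimal sketch of the tied-weight fallback, assuming `tensors` is a dict of
# weights loaded from a mamba-*-hf checkpoint.
def output_weight(tensors: dict):
    # No separate output matrix in the checkpoint: fall back to the token
    # embedding, mirroring what llm_load_tensors now does for LLM_TENSOR_OUTPUT.
    return tensors.get("lm_head.weight", tensors["backbone.embeddings.weight"])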