model: dbrx: convert-hf-to-gguf.py: fix missing ftype, fix tensor names so they are not suffixed with .weight

commit 200ce21436
parent 1fb6d95c1d
Author: Pierrick HYMBERT
Date:   2024-04-07 15:54:19 +02:00
2 changed files with 39 additions and 41 deletions
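
For context on the tensor-name half of this fix: the converter looks tensors up by their base name and handles the ".weight"/".bias" suffix itself (strip before the lookup, re-append afterwards), so mapping entries that already carry ".weight" never match. A minimal sketch of that lookup pattern, simplified and illustrative rather than the exact gguf-py code:

    from typing import Optional

    # Mapping entries hold *base* names only (no ".weight"/".bias" suffix).
    # The dbrx -> GGUF pairs below are illustrative.
    HF_TO_GGUF: dict[str, str] = {
        "transformer.wte": "token_embd",
        "transformer.blocks.0.norm_attn_norm.attn.Wqkv": "blk.0.attn_qkv",
    }

    def map_tensor_name(name: str, try_suffixes: tuple[str, ...] = (".weight", ".bias")) -> Optional[str]:
        # Exact match first (covers names that have no suffix).
        if name in HF_TO_GGUF:
            return HF_TO_GGUF[name]
        # Otherwise strip a known suffix, look up the base name, then re-append the suffix.
        for suffix in try_suffixes:
            if name.endswith(suffix):
                base = HF_TO_GGUF.get(name[: -len(suffix)])
                if base is not None:
                    return base + suffix
        return None

    # Resolves only because the mapping key is stored without ".weight":
    assert map_tensor_name("transformer.wte.weight") == "token_embd.weight"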

convert-hf-to-gguf.py

@@ -95,7 +95,7 @@ class Model(ABC):
self.gguf_writer.add_context_length(n_ctx)
print(f"gguf: context length = {n_ctx}")
- n_embd = self.find_hparam(["hidden_size", "n_embd"])
+ if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
self.gguf_writer.add_embedding_length(n_embd)
print(f"gguf: embedding length = {n_embd}")
@@ -103,7 +103,7 @@ class Model(ABC):
self.gguf_writer.add_feed_forward_length(n_ff)
print(f"gguf: feed forward length = {n_ff}")
- n_head = self.find_hparam(["num_attention_heads", "n_head"])
+ if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
self.gguf_writer.add_head_count(n_head)
print(f"gguf: head count = {n_head}")
@@ -1474,6 +1474,7 @@ class DbrxModel(Model):
model_arch = gguf.MODEL_ARCH.DBRX
def set_gguf_parameters(self):
super().set_gguf_parameters()
ffn_config = self.hparams["ffn_config"]
attn_config = self.hparams["attn_config"]
self.gguf_writer.add_name(self.hparams["model_type"])
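
The base-class hunks above switch two required hyperparameter lookups to the optional walrus-operator form, so configs that name these fields differently (dbrx uses d_model and n_heads, for example) no longer make the shared set_gguf_parameters path fail. A small sketch of the pattern, using a hypothetical stand-alone find_hparam in place of the Model method:

    from typing import Any, Optional

    # Hypothetical stand-in for Model.find_hparam: return the first key that is
    # present, or None when optional=True and nothing matches (instead of raising).
    def find_hparam(hparams: dict[str, Any], keys: list[str], optional: bool = False) -> Optional[Any]:
        for key in keys:
            if key in hparams:
                return hparams[key]
        if optional:
            return None
        raise KeyError(f"could not find any of: {keys}")

    hparams = {"d_model": 6144, "n_heads": 48}  # dbrx-style config keys

    # The walrus pattern from the diff: emit the field only when a key was found.
    if (n_embd := find_hparam(hparams, ["hidden_size", "n_embd"], optional=True)) is not None:
        print(f"gguf: embedding length = {n_embd}")  # not reached for this dict
    if (n_head := find_hparam(hparams, ["num_attention_heads", "n_head"], optional=True)) is not None:
        print(f"gguf: head count = {n_head}")        # not reached for this dict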

gguf-py/gguf/tensor_mapping.py

@@ -10,7 +10,7 @@ class TensorNameMap:
# Token embeddings
MODEL_TENSOR.TOKEN_EMBD: (
"gpt_neox.embed_in", # gptneox
"transformer.wte", # gpt2 gpt-j mpt refact qwen
"transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx
"transformer.word_embeddings", # falcon
"word_embeddings", # bloom
"model.embed_tokens", # llama-hf
@@ -24,7 +24,6 @@ class TensorNameMap:
"backbone.embedding", # mamba
"backbone.embeddings", # mamba-hf
"transformer.in_out_embed", # Grok
"transformer.wte.weight", # dbrx
),
# Token type embeddings
@@ -49,11 +48,10 @@ class TensorNameMap:
# Output
MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx
"output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2
"lm_head.weight", # dbrx
),
# Output norm
@@ -62,7 +60,7 @@ class TensorNameMap:
"transformer.ln_f", # gpt2 gpt-j falcon
"model.norm", # llama-hf baichuan internlm2
"norm", # llama-pth
"transformer.norm_f", # mpt
"transformer.norm_f", # mpt dbrx
"ln_f", # refact bloom qwen gpt2
"language_model.encoder.final_layernorm", # persimmon
"model.final_layernorm", # persimmon
@@ -70,7 +68,6 @@ class TensorNameMap:
"model.norm_f", # mamba-qbert
"backbone.norm_f", # mamba
"transformer.rms_norm", # Grok
"transformer.norm_f.weight", # dbrx
),
# Rope frequencies
@@ -99,13 +96,13 @@ class TensorNameMap:
"model.layers.{bid}.norm", # mamba-qbert
"backbone.layers.{bid}.norm", # mamba
"transformer.decoder_layer.{bid}.rms_norm", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_1.weight", # dbrx
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
),
# Attention norm 2
MODEL_TENSOR.ATTN_NORM_2: (
"transformer.h.{bid}.ln_attn", # falcon40b
"transformer.blocks.{bid}.norm_attn_norm.norm_2.weight", # dbrx
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
),
# Attention query-key-value
@@ -113,7 +110,7 @@ class TensorNameMap:
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
"transformer.h.{bid}.attn.c_attn", # gpt2 qwen
"transformer.blocks.{bid}.attn.Wqkv", # mpt
"transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv.weight", # dbrx
"transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx
"transformer.h.{bid}.self_attention.query_key_value", # falcon
"h.{bid}.self_attention.query_key_value", # bloom
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
@@ -211,7 +208,7 @@ class TensorNameMap:
"layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
"transformer.decoder_layer.{bid}.router", # Grok
"transformer.blocks.{bid}.ffn.router.layer.weight", # dbrx
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
),
# Feed-forward up
@@ -312,7 +309,7 @@ class TensorNameMap:
"encoder.layer.{bid}.output.LayerNorm", # bert
"encoder.layers.{bid}.norm2", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # dbrx
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
),
MODEL_TENSOR.SSM_IN: (