model: dbrx: convert-hf-to-gguf.py: fix missing ftype, fix tensor names not being suffixed with .weight

commit 200ce21436
parent 1fb6d95c1d
Author: Pierrick HYMBERT
Date:   2024-04-07 15:54:19 +02:00

2 changed files with 39 additions and 41 deletions
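
The tensor-name fix hinges on how the converter builds GGUF names: HuggingFace tensor names are looked up in TensorNameMap without their .weight/.bias suffix, and the converter re-appends the suffix after the lookup, so the dbrx mapping entries must not carry a trailing ".weight". Below is a minimal sketch of that lookup idea, using a hypothetical map_tensor_name helper and illustrative GGUF-side names (this is not the actual gguf-py API):

# Minimal sketch (hypothetical helper, not the gguf-py API): why the mapping
# entries in the diff below must omit ".weight".  The converter strips the
# suffix from the HF tensor name, looks up the base name, then re-appends the
# suffix to form the GGUF tensor name.  GGUF-side names here are illustrative.
HF_TO_GGUF = {
    "transformer.wte":                               "token_embd",
    "transformer.norm_f":                            "output_norm",
    "transformer.blocks.0.norm_attn_norm.attn.Wqkv": "blk.0.attn_qkv",
}

def map_tensor_name(hf_name: str) -> str:
    """Map an HF tensor name to a GGUF name, preserving the .weight/.bias suffix."""
    for suffix in (".weight", ".bias"):
        if hf_name.endswith(suffix):
            base = hf_name[: -len(suffix)]
            if base in HF_TO_GGUF:
                return HF_TO_GGUF[base] + suffix
    # If a mapping key itself ended in ".weight" (the old dbrx entries), the
    # base-name lookup above would miss and the suffix would never be added.
    raise KeyError(f"cannot map tensor {hf_name!r}")

print(map_tensor_name("transformer.wte.weight"))  # -> token_embd.weight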

convert-hf-to-gguf.py

@@ -95,7 +95,7 @@ class Model(ABC):
             self.gguf_writer.add_context_length(n_ctx)
             print(f"gguf: context length = {n_ctx}")
 
-        n_embd = self.find_hparam(["hidden_size", "n_embd"])
-        self.gguf_writer.add_embedding_length(n_embd)
-        print(f"gguf: embedding length = {n_embd}")
+        if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
+            self.gguf_writer.add_embedding_length(n_embd)
+            print(f"gguf: embedding length = {n_embd}")
@@ -103,7 +103,7 @@ class Model(ABC):
             self.gguf_writer.add_feed_forward_length(n_ff)
             print(f"gguf: feed forward length = {n_ff}")
 
-        n_head = self.find_hparam(["num_attention_heads", "n_head"])
-        self.gguf_writer.add_head_count(n_head)
-        print(f"gguf: head count = {n_head}")
+        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
+            self.gguf_writer.add_head_count(n_head)
+            print(f"gguf: head count = {n_head}")
@@ -1474,6 +1474,7 @@ class DbrxModel(Model):
     model_arch = gguf.MODEL_ARCH.DBRX
 
     def set_gguf_parameters(self):
+        super().set_gguf_parameters()
         ffn_config = self.hparams["ffn_config"]
         attn_config = self.hparams["attn_config"]
         self.gguf_writer.add_name(self.hparams["model_type"])
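
Two things the hunks above depend on are worth spelling out: find_hparam(..., optional=True) returns None instead of raising when a hyperparameter key is absent (DBRX's config uses keys such as d_model and n_heads rather than hidden_size and num_attention_heads), and a subclass override of set_gguf_parameters must chain to super() so the base-class metadata is still written, which is presumably where the missing ftype came from. A minimal sketch with simplified stand-in classes, not the real converter:

# Minimal sketch (simplified stand-ins, not the real convert-hf-to-gguf.py
# classes) of the optional-hparam lookup and super() chaining shown above.
from typing import Any


class Model:
    def __init__(self, hparams: dict[str, Any]) -> None:
        self.hparams = hparams

    def find_hparam(self, keys: list[str], optional: bool = False) -> Any:
        # Return the first matching hyperparameter; None (not an exception)
        # when optional=True and no key is present.
        for key in keys:
            if key in self.hparams:
                return self.hparams[key]
        if optional:
            return None
        raise KeyError(f"none of {keys} found in hparams")

    def set_gguf_parameters(self) -> None:
        # Stands in for the base-class metadata writes (context length, head
        # count, file type, ...) that DbrxModel was previously skipping.
        if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
            print(f"gguf: head count = {n_head}")


class DbrxModel(Model):
    def set_gguf_parameters(self) -> None:
        super().set_gguf_parameters()  # without this call, no base metadata is written at all
        print(f"gguf: experts = {self.hparams['ffn_config']['moe_num_experts']}")


# DBRX-style hparams: "n_heads" matches neither lookup key, so optional=True
# is what keeps the base class from raising.
DbrxModel({"n_heads": 48, "ffn_config": {"moe_num_experts": 16}}).set_gguf_parameters()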

gguf-py/gguf/tensor_mapping.py

@@ -10,7 +10,7 @@ class TensorNameMap:
         # Token embeddings
         MODEL_TENSOR.TOKEN_EMBD: (
             "gpt_neox.embed_in",            # gptneox
-            "transformer.wte",              # gpt2 gpt-j mpt refact qwen
+            "transformer.wte",              # gpt2 gpt-j mpt refact qwen dbrx
             "transformer.word_embeddings",  # falcon
             "word_embeddings",              # bloom
             "model.embed_tokens",           # llama-hf
@@ -24,7 +24,6 @@ class TensorNameMap:
             "backbone.embedding",        # mamba
             "backbone.embeddings",       # mamba-hf
             "transformer.in_out_embed",  # Grok
-            "transformer.wte.weight",    # dbrx
         ),
 
         # Token type embeddings
@@ -49,11 +48,10 @@ class TensorNameMap:
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                 # gptneox
-            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba
+            "lm_head",                   # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx
             "output",                    # llama-pth bloom internlm2
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",            # phi2
-            "lm_head.weight",            # dbrx
         ),
 
         # Output norm
@@ -62,7 +60,7 @@ class TensorNameMap:
             "transformer.ln_f",                        # gpt2 gpt-j falcon
             "model.norm",                              # llama-hf baichuan internlm2
             "norm",                                    # llama-pth
-            "transformer.norm_f",                      # mpt
+            "transformer.norm_f",                      # mpt dbrx
             "ln_f",                                    # refact bloom qwen gpt2
             "language_model.encoder.final_layernorm",  # persimmon
             "model.final_layernorm",                   # persimmon
@@ -70,7 +68,6 @@ class TensorNameMap:
             "model.norm_f",               # mamba-qbert
             "backbone.norm_f",            # mamba
             "transformer.rms_norm",       # Grok
-            "transformer.norm_f.weight",  # dbrx
         ),
 
         # Rope frequencies
@@ -99,13 +96,13 @@ class TensorNameMap:
             "model.layers.{bid}.norm",                                # mamba-qbert
             "backbone.layers.{bid}.norm",                             # mamba
             "transformer.decoder_layer.{bid}.rms_norm",               # Grok
-            "transformer.blocks.{bid}.norm_attn_norm.norm_1.weight",  # dbrx
+            "transformer.blocks.{bid}.norm_attn_norm.norm_1",         # dbrx
         ),
 
         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
             "transformer.h.{bid}.ln_attn",                            # falcon40b
-            "transformer.blocks.{bid}.norm_attn_norm.norm_2.weight",  # dbrx
+            "transformer.blocks.{bid}.norm_attn_norm.norm_2",         # dbrx
         ),
 
         # Attention query-key-value
@@ -113,7 +110,7 @@ class TensorNameMap:
             "gpt_neox.layers.{bid}.attention.query_key_value",                     # gptneox
             "transformer.h.{bid}.attn.c_attn",                                      # gpt2 qwen
             "transformer.blocks.{bid}.attn.Wqkv",                                   # mpt
-            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv.weight",             # dbrx
+            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv",                    # dbrx
             "transformer.h.{bid}.self_attention.query_key_value",                   # falcon
             "h.{bid}.self_attention.query_key_value",                               # bloom
             "language_model.encoder.layers.{bid}.self_attention.query_key_value",   # persimmon
@@ -211,7 +208,7 @@ class TensorNameMap:
             "layers.{bid}.feed_forward.gate",                    # mixtral
             "model.layers.{bid}.block_sparse_moe.gate",          # mixtral
             "transformer.decoder_layer.{bid}.router",            # Grok
-            "transformer.blocks.{bid}.ffn.router.layer.weight",  # dbrx
+            "transformer.blocks.{bid}.ffn.router.layer",         # dbrx
         ),
 
         # Feed-forward up
@@ -312,7 +309,7 @@ class TensorNameMap:
             "encoder.layer.{bid}.output.LayerNorm",                          # bert
             "encoder.layers.{bid}.norm2",                                    # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3",                    # Grok
-            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight",  # dbrx
+            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",         # dbrx
         ),
 
         MODEL_TENSOR.SSM_IN: (