parent a307375c02
commit 1d8de31565
5 changed files with 123 additions and 40 deletions
@@ -1424,6 +1424,60 @@ class GrokModel(Model):
             self.gguf_writer.add_tensor(new_name, data)
 
 
+@Model.register("DbrxForCausalLM")
+class DbrxModel(Model):
+    model_arch = gguf.MODEL_ARCH.DBRX
+
+    def set_gguf_parameters(self):
+        ffn_config = self.hparams["ffn_config"]
+        attn_config = self.hparams["attn_config"]
+        self.gguf_writer.add_name(self.hparams["model_type"])
+        self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_block_count(self.hparams["n_layers"])
+        self.gguf_writer.add_head_count(self.hparams["n_heads"])
+        self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"])
+        self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"])
+        self.gguf_writer.add_clip_kqv(attn_config["clip_qkv"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+        self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"])
+        self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"])
+
+    def _set_vocab_gpt2(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model)
+        vocab_size = tokenizer.vocab_size
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+
 @Model.register("MiniCPMForCausalLM")
 class MiniCPMModel(Model):
     model_arch = gguf.MODEL_ARCH.MINICPM
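Note: the keys read by set_gguf_parameters above come straight from DBRX's config.json, where the attention and MoE settings live in nested attn_config and ffn_config objects. A minimal sketch of the expected shape (illustrative values, not part of the patch):

# Hypothetical DBRX-style hparams subset; only the keys the converter reads.
hparams = {
    "model_type": "dbrx",
    "max_seq_len": 32768,
    "d_model": 6144,
    "n_layers": 40,
    "n_heads": 48,
    "attn_config": {"kv_n_heads": 8, "rope_theta": 500000, "clip_qkv": 8},
    "ffn_config": {"moe_num_experts": 16, "moe_top_k": 4},
}

Each entry feeds exactly one gguf_writer call, e.g. hparams["max_seq_len"] -> add_context_length and ffn_config["moe_top_k"] -> add_expert_used_count.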
@@ -54,6 +54,7 @@ class Keys:
         LAYERNORM_EPS     = "{arch}.attention.layer_norm_epsilon"
         LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
         CAUSAL            = "{arch}.attention.causal"
+        CLIP_KQV          = "{arch}.attention.clip_kqv"
 
     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
@@ -125,6 +126,7 @@ class MODEL_ARCH(IntEnum):
     MAMBA     = auto()
     XVERSE    = auto()
     COMMAND_R = auto()
+    DBRX      = auto()
 
 
 class MODEL_TENSOR(IntEnum):
@@ -194,6 +196,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.MAMBA:     "mamba",
     MODEL_ARCH.XVERSE:    "xverse",
     MODEL_ARCH.COMMAND_R: "command-r",
+    MODEL_ARCH.DBRX:      "dbrx",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -639,6 +642,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.DBRX: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.LAYER_OUT_NORM,
+    ],
     # TODO
 }
 
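If useful as a reviewer sanity check, the new constants can be exercised directly from gguf-py (a sketch, assuming the package from this tree is importable as gguf):

# Quick check that the DBRX arch registration is self-consistent.
import gguf

assert gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.DBRX] == "dbrx"
assert gguf.MODEL_TENSOR.FFN_GATE_INP in gguf.MODEL_TENSORS[gguf.MODEL_ARCH.DBRX]
assert len(gguf.MODEL_TENSORS[gguf.MODEL_ARCH.DBRX]) == 12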
@@ -379,6 +379,9 @@ class GGUFWriter:
     def add_causal_attention(self, value: bool) -> None:
         self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
 
+    def add_clip_kqv(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.CLIP_KQV.format(arch=self.arch), value)
+
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
 
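For reference, add_clip_kqv writes a single architecture-scoped KV pair; with arch == "dbrx" the key formats as below (a sketch of the key formatting only, not the real writer):

# How the new attention.clip_kqv key is derived (illustrative).
CLIP_KQV = "{arch}.attention.clip_kqv"

key = CLIP_KQV.format(arch="dbrx")
print(key)  # dbrx.attention.clip_kqv -- stored as a uint32 in the GGUF KV section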
@@ -96,11 +96,13 @@ class TensorNameMap:
             "model.layers.{bid}.norm",                               # mamba-qbert
             "backbone.layers.{bid}.norm",                            # mamba
             "transformer.decoder_layer.{bid}.rms_norm",              # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.norm_1.weight", # DBRX
         ),
 
         # Attention norm 2
         MODEL_TENSOR.ATTN_NORM_2: (
             "transformer.h.{bid}.ln_attn",                           # falcon40b
+            "transformer.blocks.{bid}.norm_attn_norm.norm_2.weight", # DBRX
         ),
 
         # Attention query-key-value
@@ -108,6 +110,7 @@ class TensorNameMap:
             "gpt_neox.layers.{bid}.attention.query_key_value",                     # gptneox
             "transformer.h.{bid}.attn.c_attn",                                     # gpt2 qwen
             "transformer.blocks.{bid}.attn.Wqkv",                                  # mpt
+            "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv.weight",            # DBRX
             "transformer.h.{bid}.self_attention.query_key_value",                  # falcon
             "h.{bid}.self_attention.query_key_value",                              # bloom
             "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
@@ -168,7 +171,8 @@ class TensorNameMap:
             "model.layers.layers.{bid}.self_attn.o_proj",                   # plamo
             "model.layers.{bid}.attention.wo",                              # internlm2
             "encoder.layers.{bid}.attn.out_proj",                           # nomic-bert
-            "transformer.decoder_layer.{bid}.multi_head_attention.linear"# Grok
+            "transformer.decoder_layer.{bid}.multi_head_attention.linear",  # Grok
+            "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight", # DBRX
         ),
 
         # Attention output norm
@@ -204,7 +208,8 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate",                    # mixtral
             "model.layers.{bid}.block_sparse_moe.gate",          # mixtral
-            "transformer.decoder_layer.{bid}.router"             # Grok
+            "transformer.decoder_layer.{bid}.router",            # Grok
+            "transformer.blocks.{bid}.ffn.router.layer.weight",  # DBRX
         ),
 
         # Feed-forward up
@@ -233,6 +238,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.w3",          # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_v",  # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2",   # DBRX
         ),
 
         # AWQ-activation gate
@@ -252,7 +258,8 @@ class TensorNameMap:
 
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.w1",         # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear"    # Grok (merged)
+            "transformer.decoder_layer.{bid}.moe.linear",   # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1",  # DBRX
         ),
 
         # Feed-forward down
@@ -280,6 +287,7 @@ class TensorNameMap:
         MODEL_TENSOR.FFN_DOWN_EXP: (
             "layers.{bid}.feed_forward.experts.w2",          # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",   # DBRX
         ),
 
         MODEL_TENSOR.ATTN_Q_NORM: (
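Collecting the tensor_mapping additions in one place, each new DBRX checkpoint name resolves to a MODEL_TENSOR as follows (a flattened illustration; real lookups go through TensorNameMap):

# DBRX source-name -> MODEL_TENSOR summary of the entries added above.
DBRX_TENSOR_MAP = {
    "transformer.blocks.{bid}.norm_attn_norm.norm_1.weight":        "ATTN_NORM",
    "transformer.blocks.{bid}.norm_attn_norm.norm_2.weight":        "ATTN_NORM_2",
    "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv.weight":     "ATTN_QKV",
    "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj.weight": "ATTN_OUT",
    "transformer.blocks.{bid}.ffn.router.layer.weight":             "FFN_GATE_INP",
    "transformer.blocks.{bid}.ffn.experts.mlp.v1":                  "FFN_GATE_EXP",
    "transformer.blocks.{bid}.ffn.experts.mlp.w1":                  "FFN_DOWN_EXP",
    "transformer.blocks.{bid}.ffn.experts.mlp.w2":                  "FFN_UP_EXP",
}

Note that unlike most entries, the DBRX patterns include the .weight suffix, and the expert names v1/w1/w2 land on gate/down/up here rather than following Mixtral's w1/w2/w3 order.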
@@ -1,3 +1,4 @@
 -r ./requirements-convert.txt
 torch~=2.1.1
 einops~=0.7.0
+tiktoken~=0.6.0
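The tiktoken pin is presumably for DBRX's tokenizer, which is tiktoken-backed on the Hugging Face side. A trivial smoke test that the pinned version imports and round-trips (hypothetical; the cl100k_base choice is an assumption):

# Sanity check for the new tiktoken dependency.
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")  # assumed GPT-4-style encoding
assert enc.decode(enc.encode("hello world")) == "hello world"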