Removed autoformatting; resolved bug where model_arch was not selecting StableLM2

This commit is contained in:
Ashish 2024-04-12 22:48:21 -07:00
parent 0eb8492ccb
commit 15a5e7db4c
3 changed files with 614 additions and 1056 deletions

File diff suppressed because it is too large. (Load diff)

View file

@ -74,9 +74,7 @@ class Keys:
MODEL = "tokenizer.ggml.model"
LIST = "tokenizer.ggml.tokens"
TOKEN_TYPE = "tokenizer.ggml.token_type"
TOKEN_TYPE_COUNT = (
"tokenizer.ggml.token_type_count" # for BERT-style token types
)
TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types
SCORES = "tokenizer.ggml.scores"
MERGES = "tokenizer.ggml.merges"
BOS_ID = "tokenizer.ggml.bos_token_id"
@ -443,8 +441,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
],
MODEL_ARCH.STABLELM2: [
MODEL_TENSOR.TOKEN_EMBD,
@ -731,9 +727,9 @@ class TokenType(IntEnum):
class RopeScalingType(Enum):
NONE = "none"
LINEAR = "linear"
YARN = "yarn"
NONE = 'none'
LINEAR = 'linear'
YARN = 'yarn'
class PoolingType(IntEnum):

View file

@ -25,22 +25,26 @@ class TensorNameMap:
"backbone.embeddings", # mamba-hf
"transformer.in_out_embed", # Grok
),
# Token type embeddings
MODEL_TENSOR.TOKEN_TYPES: (
"embeddings.token_type_embeddings", # bert nomic-bert
),
# Normalization of token embeddings
MODEL_TENSOR.TOKEN_EMBD_NORM: (
"word_embeddings_layernorm", # bloom
"embeddings.LayerNorm", # bert
"emb_ln", # nomic-bert
),
# Position embeddings
MODEL_TENSOR.POS_EMBD: (
"transformer.wpe", # gpt2
"embeddings.position_embeddings", # bert
"wpe", # gpt2
),
# Output
MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox
@ -49,6 +53,7 @@ class TensorNameMap:
"word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2
),
# Output norm
MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox
@ -64,8 +69,11 @@ class TensorNameMap:
"backbone.norm_f", # mamba
"transformer.rms_norm", # Grok
),
# Rope frequencies
MODEL_TENSOR.ROPE_FREQS: ("rope.freqs",), # llama-pth
MODEL_TENSOR.ROPE_FREQS: (
"rope.freqs", # llama-pth
),
}
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@ -90,8 +98,12 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.rms_norm", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
),
# Attention norm 2
MODEL_TENSOR.ATTN_NORM_2: ("transformer.h.{bid}.ln_attn",), # falcon40b
MODEL_TENSOR.ATTN_NORM_2: (
"transformer.h.{bid}.ln_attn", # falcon40b
),
# Attention query-key-value
MODEL_TENSOR.ATTN_QKV: (
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
@ -106,6 +118,7 @@ class TensorNameMap:
"transformer.h.{bid}.mixer.Wqkv", # phi2
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
),
# Attention query
MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf
@ -114,8 +127,9 @@ class TensorNameMap:
"transformer.h.{bid}.attn.q_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
"model.layers.{bid}.attention.wq", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok
"transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok
),
# Attention key
MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf
@ -124,8 +138,9 @@ class TensorNameMap:
"transformer.h.{bid}.attn.k_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
"model.layers.{bid}.attention.wk", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok
"transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok
),
# Attention value
MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf
@ -134,8 +149,9 @@ class TensorNameMap:
"transformer.h.{bid}.attn.v_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
"model.layers.{bid}.attention.wv", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok
"transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
),
# Attention output
MODEL_TENSOR.ATTN_OUT: (
"gpt_neox.layers.{bid}.attention.dense", # gptneox
@ -164,6 +180,7 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.rms_norm_1", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
),
# Rotary embeddings
MODEL_TENSOR.ATTN_ROT_EMBD: (
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
@ -171,6 +188,7 @@ class TensorNameMap:
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
"transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell
),
# Feed-forward norm
MODEL_TENSOR.FFN_NORM: (
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
@ -185,12 +203,14 @@ class TensorNameMap:
"model.layers.{bid}.ffn_norm", # internlm2
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
),
MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
"transformer.decoder_layer.{bid}.router", # Grok
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
),
# Feed-forward up
MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
@ -213,13 +233,18 @@ class TensorNameMap:
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
"model.layers.{bid}.mlp.c_fc", # starcoder2
),
MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
),
# AWQ-activation gate
MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # mpt
MODEL_TENSOR.FFN_ACT: (
"transformer.blocks.{bid}.ffn.act", # mpt
),
# Feed-forward gate
MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
@ -229,6 +254,7 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.w1", # internlm2
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
),
MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
@ -255,95 +281,67 @@ class TensorNameMap:
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
"model.layers.{bid}.mlp.c_proj", # starcoder2
),
MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
),
MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.q_layernorm.norms.0", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.1", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.2", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.3", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.4", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.5", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.6", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.7", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.8", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.9", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.10", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.11", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.12", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.13", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.14", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.15", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.16", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.17", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.18", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.19", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.20", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.21", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.22", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.23", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.24", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.25", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.26", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.27", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.28", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.29", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.30", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.31", # stablelm
"model.layers.{bid}.self_attn.q_norm", # cohere
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
),
MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.k_layernorm.norms.0", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.1", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.2", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.3", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.4", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.5", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.6", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.7", # stablelm
"model.layers.{bid}.self_attn.k_norm", # cohere
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
),
MODEL_TENSOR.ROPE_FREQS: (
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
),
MODEL_TENSOR.LAYER_OUT_NORM: (
"encoder.layer.{bid}.output.LayerNorm", # bert
"encoder.layers.{bid}.norm2", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok
),
MODEL_TENSOR.SSM_IN: (
"model.layers.{bid}.in_proj",
"backbone.layers.{bid}.mixer.in_proj",
),
MODEL_TENSOR.SSM_CONV1D: (
"model.layers.{bid}.conv1d",
"backbone.layers.{bid}.mixer.conv1d",
),
MODEL_TENSOR.SSM_X: (
"model.layers.{bid}.x_proj",
"backbone.layers.{bid}.mixer.x_proj",
),
MODEL_TENSOR.SSM_DT: (
"model.layers.{bid}.dt_proj",
"backbone.layers.{bid}.mixer.dt_proj",
),
MODEL_TENSOR.SSM_A: (
"model.layers.{bid}.A_log",
"backbone.layers.{bid}.mixer.A_log",
),
MODEL_TENSOR.SSM_D: (
"model.layers.{bid}.D",
"backbone.layers.{bid}.mixer.D",
),
MODEL_TENSOR.SSM_OUT: (
"model.layers.{bid}.out_proj",
"backbone.layers.{bid}.mixer.out_proj",
@ -374,9 +372,7 @@ class TensorNameMap:
key = key.format(bid = bid, xid = xid)
self.mapping[key] = (tensor, tensor_name)
def get_type_and_name(
self, key: str, try_suffixes: Sequence[str] = ()
) -> tuple[MODEL_TENSOR, str] | None:
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
result = self.mapping.get(key)
if result is not None:
return result
@ -393,9 +389,7 @@ class TensorNameMap:
return None
return result[1]
def get_type(
self, key: str, try_suffixes: Sequence[str] = ()
) -> MODEL_TENSOR | None:
def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
if result is None:
return None