Removed autoformatting; resolved bug where model_arch was not selecting StableLM2
This commit is contained in:
parent
0eb8492ccb
commit
15a5e7db4c
3 changed files with 614 additions and 1056 deletions
File diff suppressed because it is too large
Load diff
|
@ -74,9 +74,7 @@ class Keys:
|
|||
MODEL = "tokenizer.ggml.model"
|
||||
LIST = "tokenizer.ggml.tokens"
|
||||
TOKEN_TYPE = "tokenizer.ggml.token_type"
|
||||
TOKEN_TYPE_COUNT = (
|
||||
"tokenizer.ggml.token_type_count" # for BERT-style token types
|
||||
)
|
||||
TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types
|
||||
SCORES = "tokenizer.ggml.scores"
|
||||
MERGES = "tokenizer.ggml.merges"
|
||||
BOS_ID = "tokenizer.ggml.bos_token_id"
|
||||
|
@ -443,8 +441,6 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
],
|
||||
MODEL_ARCH.STABLELM2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
|
@ -731,9 +727,9 @@ class TokenType(IntEnum):
|
|||
|
||||
|
||||
class RopeScalingType(Enum):
|
||||
NONE = "none"
|
||||
LINEAR = "linear"
|
||||
YARN = "yarn"
|
||||
NONE = 'none'
|
||||
LINEAR = 'linear'
|
||||
YARN = 'yarn'
|
||||
|
||||
|
||||
class PoolingType(IntEnum):
|
||||
|
|
|
@ -25,22 +25,26 @@ class TensorNameMap:
|
|||
"backbone.embeddings", # mamba-hf
|
||||
"transformer.in_out_embed", # Grok
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
MODEL_TENSOR.TOKEN_TYPES: (
|
||||
"embeddings.token_type_embeddings", # bert nomic-bert
|
||||
),
|
||||
|
||||
# Normalization of token embeddings
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM: (
|
||||
"word_embeddings_layernorm", # bloom
|
||||
"embeddings.LayerNorm", # bert
|
||||
"emb_ln", # nomic-bert
|
||||
),
|
||||
|
||||
# Position embeddings
|
||||
MODEL_TENSOR.POS_EMBD: (
|
||||
"transformer.wpe", # gpt2
|
||||
"embeddings.position_embeddings", # bert
|
||||
"wpe", # gpt2
|
||||
),
|
||||
|
||||
# Output
|
||||
MODEL_TENSOR.OUTPUT: (
|
||||
"embed_out", # gptneox
|
||||
|
@ -49,6 +53,7 @@ class TensorNameMap:
|
|||
"word_embeddings_for_head", # persimmon
|
||||
"lm_head.linear", # phi2
|
||||
),
|
||||
|
||||
# Output norm
|
||||
MODEL_TENSOR.OUTPUT_NORM: (
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
|
@ -64,8 +69,11 @@ class TensorNameMap:
|
|||
"backbone.norm_f", # mamba
|
||||
"transformer.rms_norm", # Grok
|
||||
),
|
||||
|
||||
# Rope frequencies
|
||||
MODEL_TENSOR.ROPE_FREQS: ("rope.freqs",), # llama-pth
|
||||
MODEL_TENSOR.ROPE_FREQS: (
|
||||
"rope.freqs", # llama-pth
|
||||
),
|
||||
}
|
||||
|
||||
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
|
@ -90,8 +98,12 @@ class TensorNameMap:
|
|||
"transformer.decoder_layer.{bid}.rms_norm", # Grok
|
||||
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
MODEL_TENSOR.ATTN_NORM_2: ("transformer.h.{bid}.ln_attn",), # falcon40b
|
||||
MODEL_TENSOR.ATTN_NORM_2: (
|
||||
"transformer.h.{bid}.ln_attn", # falcon40b
|
||||
),
|
||||
|
||||
# Attention query-key-value
|
||||
MODEL_TENSOR.ATTN_QKV: (
|
||||
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
|
||||
|
@ -106,6 +118,7 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.mixer.Wqkv", # phi2
|
||||
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
|
||||
),
|
||||
|
||||
# Attention query
|
||||
MODEL_TENSOR.ATTN_Q: (
|
||||
"model.layers.{bid}.self_attn.q_proj", # llama-hf
|
||||
|
@ -114,8 +127,9 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.attn.q_proj", # gpt-j
|
||||
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
|
||||
"model.layers.{bid}.attention.wq", # internlm2
|
||||
"transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok
|
||||
"transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok
|
||||
),
|
||||
|
||||
# Attention key
|
||||
MODEL_TENSOR.ATTN_K: (
|
||||
"model.layers.{bid}.self_attn.k_proj", # llama-hf
|
||||
|
@ -124,8 +138,9 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.attn.k_proj", # gpt-j
|
||||
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
|
||||
"model.layers.{bid}.attention.wk", # internlm2
|
||||
"transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok
|
||||
"transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok
|
||||
),
|
||||
|
||||
# Attention value
|
||||
MODEL_TENSOR.ATTN_V: (
|
||||
"model.layers.{bid}.self_attn.v_proj", # llama-hf
|
||||
|
@ -134,8 +149,9 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.attn.v_proj", # gpt-j
|
||||
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
|
||||
"model.layers.{bid}.attention.wv", # internlm2
|
||||
"transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok
|
||||
"transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
|
||||
),
|
||||
|
||||
# Attention output
|
||||
MODEL_TENSOR.ATTN_OUT: (
|
||||
"gpt_neox.layers.{bid}.attention.dense", # gptneox
|
||||
|
@ -164,6 +180,7 @@ class TensorNameMap:
|
|||
"transformer.decoder_layer.{bid}.rms_norm_1", # Grok
|
||||
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
|
||||
),
|
||||
|
||||
# Rotary embeddings
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: (
|
||||
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
|
||||
|
@ -171,6 +188,7 @@ class TensorNameMap:
|
|||
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
|
||||
"transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell
|
||||
),
|
||||
|
||||
# Feed-forward norm
|
||||
MODEL_TENSOR.FFN_NORM: (
|
||||
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
||||
|
@ -185,12 +203,14 @@ class TensorNameMap:
|
|||
"model.layers.{bid}.ffn_norm", # internlm2
|
||||
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
"layers.{bid}.feed_forward.gate", # mixtral
|
||||
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
|
||||
"transformer.decoder_layer.{bid}.router", # Grok
|
||||
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
|
||||
),
|
||||
|
||||
# Feed-forward up
|
||||
MODEL_TENSOR.FFN_UP: (
|
||||
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
|
||||
|
@ -213,13 +233,18 @@ class TensorNameMap:
|
|||
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||
"model.layers.{bid}.mlp.c_fc", # starcoder2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_UP_EXP: (
|
||||
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
|
||||
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
|
||||
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
|
||||
),
|
||||
|
||||
# AWQ-activation gate
|
||||
MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # mpt
|
||||
MODEL_TENSOR.FFN_ACT: (
|
||||
"transformer.blocks.{bid}.ffn.act", # mpt
|
||||
),
|
||||
|
||||
# Feed-forward gate
|
||||
MODEL_TENSOR.FFN_GATE: (
|
||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
|
||||
|
@ -229,6 +254,7 @@ class TensorNameMap:
|
|||
"model.layers.{bid}.feed_forward.w1", # internlm2
|
||||
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
|
||||
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
|
||||
|
@ -255,95 +281,67 @@ class TensorNameMap:
|
|||
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
|
||||
"model.layers.{bid}.mlp.c_proj", # starcoder2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
|
||||
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
|
||||
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_Q_NORM: (
|
||||
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
|
||||
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.0", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.1", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.2", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.3", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.4", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.5", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.6", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.7", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.8", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.9", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.10", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.11", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.12", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.13", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.14", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.15", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.16", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.17", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.18", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.19", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.20", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.21", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.22", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.23", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.24", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.25", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.26", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.27", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.28", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.29", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.30", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_layernorm.norms.31", # stablelm
|
||||
"model.layers.{bid}.self_attn.q_norm", # cohere
|
||||
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_K_NORM: (
|
||||
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
|
||||
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.k_layernorm.norms.0", # stablelm
|
||||
"model.layers.{bid}.self_attn.k_layernorm.norms.1", # stablelm
|
||||
"model.layers.{bid}.self_attn.k_layernorm.norms.2", # stablelm
|
||||
"model.layers.{bid}.self_attn.k_layernorm.norms.3", # stablelm
|
||||
"model.layers.{bid}.self_attn.k_layernorm.norms.4", # stablelm
|
||||
"model.layers.{bid}.self_attn.k_layernorm.norms.5", # stablelm
|
||||
"model.layers.{bid}.self_attn.k_layernorm.norms.6", # stablelm
|
||||
"model.layers.{bid}.self_attn.k_layernorm.norms.7", # stablelm
|
||||
"model.layers.{bid}.self_attn.k_norm", # cohere
|
||||
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ROPE_FREQS: (
|
||||
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
|
||||
),
|
||||
|
||||
MODEL_TENSOR.LAYER_OUT_NORM: (
|
||||
"encoder.layer.{bid}.output.LayerNorm", # bert
|
||||
"encoder.layers.{bid}.norm2", # nomic-bert
|
||||
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_IN: (
|
||||
"model.layers.{bid}.in_proj",
|
||||
"backbone.layers.{bid}.mixer.in_proj",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_CONV1D: (
|
||||
"model.layers.{bid}.conv1d",
|
||||
"backbone.layers.{bid}.mixer.conv1d",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_X: (
|
||||
"model.layers.{bid}.x_proj",
|
||||
"backbone.layers.{bid}.mixer.x_proj",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_DT: (
|
||||
"model.layers.{bid}.dt_proj",
|
||||
"backbone.layers.{bid}.mixer.dt_proj",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_A: (
|
||||
"model.layers.{bid}.A_log",
|
||||
"backbone.layers.{bid}.mixer.A_log",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_D: (
|
||||
"model.layers.{bid}.D",
|
||||
"backbone.layers.{bid}.mixer.D",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.SSM_OUT: (
|
||||
"model.layers.{bid}.out_proj",
|
||||
"backbone.layers.{bid}.mixer.out_proj",
|
||||
|
@ -374,9 +372,7 @@ class TensorNameMap:
|
|||
key = key.format(bid = bid, xid = xid)
|
||||
self.mapping[key] = (tensor, tensor_name)
|
||||
|
||||
def get_type_and_name(
|
||||
self, key: str, try_suffixes: Sequence[str] = ()
|
||||
) -> tuple[MODEL_TENSOR, str] | None:
|
||||
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
|
||||
result = self.mapping.get(key)
|
||||
if result is not None:
|
||||
return result
|
||||
|
@ -393,9 +389,7 @@ class TensorNameMap:
|
|||
return None
|
||||
return result[1]
|
||||
|
||||
def get_type(
|
||||
self, key: str, try_suffixes: Sequence[str] = ()
|
||||
) -> MODEL_TENSOR | None:
|
||||
def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
|
||||
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
|
||||
if result is None:
|
||||
return None
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue