StableLM2 tensor mapping and constants

This commit is contained in:
Ashish 2024-04-12 02:18:51 -07:00
parent d383c0d818
commit 13387d9c57
2 changed files with 482 additions and 457 deletions

View file

@ -74,7 +74,9 @@ class Keys:
MODEL = "tokenizer.ggml.model" MODEL = "tokenizer.ggml.model"
LIST = "tokenizer.ggml.tokens" LIST = "tokenizer.ggml.tokens"
TOKEN_TYPE = "tokenizer.ggml.token_type" TOKEN_TYPE = "tokenizer.ggml.token_type"
TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types TOKEN_TYPE_COUNT = (
"tokenizer.ggml.token_type_count" # for BERT-style token types
)
SCORES = "tokenizer.ggml.scores" SCORES = "tokenizer.ggml.scores"
MERGES = "tokenizer.ggml.merges" MERGES = "tokenizer.ggml.merges"
BOS_ID = "tokenizer.ggml.bos_token_id" BOS_ID = "tokenizer.ggml.bos_token_id"
@ -113,6 +115,7 @@ class MODEL_ARCH(IntEnum):
NOMIC_BERT = auto() NOMIC_BERT = auto()
BLOOM = auto() BLOOM = auto()
STABLELM = auto() STABLELM = auto()
STABLELM2 = auto()
QWEN = auto() QWEN = auto()
QWEN2 = auto() QWEN2 = auto()
PHI2 = auto() PHI2 = auto()
@ -183,6 +186,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.NOMIC_BERT: "nomic-bert", MODEL_ARCH.NOMIC_BERT: "nomic-bert",
MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.BLOOM: "bloom",
MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.STABLELM2: "stablelm2",
MODEL_ARCH.QWEN: "qwen", MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.QWEN2: "qwen2", MODEL_ARCH.QWEN2: "qwen2",
MODEL_ARCH.PHI2: "phi2", MODEL_ARCH.PHI2: "phi2",
@ -196,7 +200,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.DBRX: "dbrx",
} }
TENSOR_NAMES: dict[MODEL_TENSOR, str] = { TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -440,6 +443,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
],
MODEL_ARCH.STABLELM2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
], ],
MODEL_ARCH.QWEN: [ MODEL_ARCH.QWEN: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
@ -710,9 +731,9 @@ class TokenType(IntEnum):
class RopeScalingType(Enum): class RopeScalingType(Enum):
NONE = 'none' NONE = "none"
LINEAR = 'linear' LINEAR = "linear"
YARN = 'yarn' YARN = "yarn"
class PoolingType(IntEnum): class PoolingType(IntEnum):

View file

@ -25,26 +25,22 @@ class TensorNameMap:
"backbone.embeddings", # mamba-hf "backbone.embeddings", # mamba-hf
"transformer.in_out_embed", # Grok "transformer.in_out_embed", # Grok
), ),
# Token type embeddings # Token type embeddings
MODEL_TENSOR.TOKEN_TYPES: ( MODEL_TENSOR.TOKEN_TYPES: (
"embeddings.token_type_embeddings", # bert nomic-bert "embeddings.token_type_embeddings", # bert nomic-bert
), ),
# Normalization of token embeddings # Normalization of token embeddings
MODEL_TENSOR.TOKEN_EMBD_NORM: ( MODEL_TENSOR.TOKEN_EMBD_NORM: (
"word_embeddings_layernorm", # bloom "word_embeddings_layernorm", # bloom
"embeddings.LayerNorm", # bert "embeddings.LayerNorm", # bert
"emb_ln", # nomic-bert "emb_ln", # nomic-bert
), ),
# Position embeddings # Position embeddings
MODEL_TENSOR.POS_EMBD: ( MODEL_TENSOR.POS_EMBD: (
"transformer.wpe", # gpt2 "transformer.wpe", # gpt2
"embeddings.position_embeddings", # bert "embeddings.position_embeddings", # bert
"wpe", # gpt2 "wpe", # gpt2
), ),
# Output # Output
MODEL_TENSOR.OUTPUT: ( MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox "embed_out", # gptneox
@ -53,7 +49,6 @@ class TensorNameMap:
"word_embeddings_for_head", # persimmon "word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2 "lm_head.linear", # phi2
), ),
# Output norm # Output norm
MODEL_TENSOR.OUTPUT_NORM: ( MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox "gpt_neox.final_layer_norm", # gptneox
@ -69,11 +64,8 @@ class TensorNameMap:
"backbone.norm_f", # mamba "backbone.norm_f", # mamba
"transformer.rms_norm", # Grok "transformer.rms_norm", # Grok
), ),
# Rope frequencies # Rope frequencies
MODEL_TENSOR.ROPE_FREQS: ( MODEL_TENSOR.ROPE_FREQS: ("rope.freqs",), # llama-pth
"rope.freqs", # llama-pth
),
} }
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@ -98,12 +90,8 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.rms_norm", # Grok "transformer.decoder_layer.{bid}.rms_norm", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
), ),
# Attention norm 2 # Attention norm 2
MODEL_TENSOR.ATTN_NORM_2: ( MODEL_TENSOR.ATTN_NORM_2: ("transformer.h.{bid}.ln_attn",), # falcon40b
"transformer.h.{bid}.ln_attn", # falcon40b
),
# Attention query-key-value # Attention query-key-value
MODEL_TENSOR.ATTN_QKV: ( MODEL_TENSOR.ATTN_QKV: (
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
@ -118,7 +106,6 @@ class TensorNameMap:
"transformer.h.{bid}.mixer.Wqkv", # phi2 "transformer.h.{bid}.mixer.Wqkv", # phi2
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert "encoder.layers.{bid}.attn.Wqkv", # nomic-bert
), ),
# Attention query # Attention query
MODEL_TENSOR.ATTN_Q: ( MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf "model.layers.{bid}.self_attn.q_proj", # llama-hf
@ -127,9 +114,8 @@ class TensorNameMap:
"transformer.h.{bid}.attn.q_proj", # gpt-j "transformer.h.{bid}.attn.q_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.q_proj", # plamo "model.layers.layers.{bid}.self_attn.q_proj", # plamo
"model.layers.{bid}.attention.wq", # internlm2 "model.layers.{bid}.attention.wq", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok
), ),
# Attention key # Attention key
MODEL_TENSOR.ATTN_K: ( MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf "model.layers.{bid}.self_attn.k_proj", # llama-hf
@ -138,9 +124,8 @@ class TensorNameMap:
"transformer.h.{bid}.attn.k_proj", # gpt-j "transformer.h.{bid}.attn.k_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.k_proj", # plamo "model.layers.layers.{bid}.self_attn.k_proj", # plamo
"model.layers.{bid}.attention.wk", # internlm2 "model.layers.{bid}.attention.wk", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok
), ),
# Attention value # Attention value
MODEL_TENSOR.ATTN_V: ( MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf "model.layers.{bid}.self_attn.v_proj", # llama-hf
@ -149,9 +134,8 @@ class TensorNameMap:
"transformer.h.{bid}.attn.v_proj", # gpt-j "transformer.h.{bid}.attn.v_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.v_proj", # plamo "model.layers.layers.{bid}.self_attn.v_proj", # plamo
"model.layers.{bid}.attention.wv", # internlm2 "model.layers.{bid}.attention.wv", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok
), ),
# Attention output # Attention output
MODEL_TENSOR.ATTN_OUT: ( MODEL_TENSOR.ATTN_OUT: (
"gpt_neox.layers.{bid}.attention.dense", # gptneox "gpt_neox.layers.{bid}.attention.dense", # gptneox
@ -173,7 +157,6 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
), ),
# Attention output norm # Attention output norm
MODEL_TENSOR.ATTN_OUT_NORM: ( MODEL_TENSOR.ATTN_OUT_NORM: (
"encoder.layer.{bid}.attention.output.LayerNorm", # bert "encoder.layer.{bid}.attention.output.LayerNorm", # bert
@ -181,7 +164,6 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.rms_norm_1", # Grok "transformer.decoder_layer.{bid}.rms_norm_1", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
), ),
# Rotary embeddings # Rotary embeddings
MODEL_TENSOR.ATTN_ROT_EMBD: ( MODEL_TENSOR.ATTN_ROT_EMBD: (
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
@ -189,7 +171,6 @@ class TensorNameMap:
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
"transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell
), ),
# Feed-forward norm # Feed-forward norm
MODEL_TENSOR.FFN_NORM: ( MODEL_TENSOR.FFN_NORM: (
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
@ -204,14 +185,12 @@ class TensorNameMap:
"model.layers.{bid}.ffn_norm", # internlm2 "model.layers.{bid}.ffn_norm", # internlm2
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
), ),
MODEL_TENSOR.FFN_GATE_INP: ( MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral "layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral
"transformer.decoder_layer.{bid}.router", # Grok "transformer.decoder_layer.{bid}.router", # Grok
"transformer.blocks.{bid}.ffn.router.layer", # dbrx "transformer.blocks.{bid}.ffn.router.layer", # dbrx
), ),
# Feed-forward up # Feed-forward up
MODEL_TENSOR.FFN_UP: ( MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
@ -234,18 +213,13 @@ class TensorNameMap:
"encoder.layers.{bid}.mlp.fc11", # nomic-bert "encoder.layers.{bid}.mlp.fc11", # nomic-bert
"model.layers.{bid}.mlp.c_fc", # starcoder2 "model.layers.{bid}.mlp.c_fc", # starcoder2
), ),
MODEL_TENSOR.FFN_UP_EXP: ( MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
), ),
# AWQ-activation gate # AWQ-activation gate
MODEL_TENSOR.FFN_ACT: ( MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # mpt
"transformer.blocks.{bid}.ffn.act", # mpt
),
# Feed-forward gate # Feed-forward gate
MODEL_TENSOR.FFN_GATE: ( MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact "model.layers.{bid}.mlp.gate_proj", # llama-hf refact
@ -255,13 +229,11 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.w1", # internlm2 "model.layers.{bid}.feed_forward.w1", # internlm2
"encoder.layers.{bid}.mlp.fc12", # nomic-bert "encoder.layers.{bid}.mlp.fc12", # nomic-bert
), ),
MODEL_TENSOR.FFN_GATE_EXP: ( MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
), ),
# Feed-forward down # Feed-forward down
MODEL_TENSOR.FFN_DOWN: ( MODEL_TENSOR.FFN_DOWN: (
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
@ -283,67 +255,95 @@ class TensorNameMap:
"encoder.layers.{bid}.mlp.fc2", # nomic-bert "encoder.layers.{bid}.mlp.fc2", # nomic-bert
"model.layers.{bid}.mlp.c_proj", # starcoder2 "model.layers.{bid}.mlp.c_proj", # starcoder2
), ),
MODEL_TENSOR.FFN_DOWN_EXP: ( MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged) "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
), ),
MODEL_TENSOR.ATTN_Q_NORM: ( MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm", "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", # persimmon "model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.q_layernorm.norms.0", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.1", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.2", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.3", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.4", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.5", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.6", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.7", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.8", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.9", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.10", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.11", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.12", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.13", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.14", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.15", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.16", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.17", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.18", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.19", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.20", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.21", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.22", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.23", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.24", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.25", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.26", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.27", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.28", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.29", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.30", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.31", # stablelm
"model.layers.{bid}.self_attn.q_norm", # cohere "model.layers.{bid}.self_attn.q_norm", # cohere
"transformer.blocks.{bid}.attn.q_ln", # sea-lion "transformer.blocks.{bid}.attn.q_ln", # sea-lion
), ),
MODEL_TENSOR.ATTN_K_NORM: ( MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm", "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon "model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.k_layernorm.norms.0", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.1", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.2", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.3", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.4", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.5", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.6", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.7", # stablelm
"model.layers.{bid}.self_attn.k_norm", # cohere "model.layers.{bid}.self_attn.k_norm", # cohere
"transformer.blocks.{bid}.attn.k_ln", # sea-lion "transformer.blocks.{bid}.attn.k_ln", # sea-lion
), ),
MODEL_TENSOR.ROPE_FREQS: ( MODEL_TENSOR.ROPE_FREQS: (
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
), ),
MODEL_TENSOR.LAYER_OUT_NORM: ( MODEL_TENSOR.LAYER_OUT_NORM: (
"encoder.layer.{bid}.output.LayerNorm", # bert "encoder.layer.{bid}.output.LayerNorm", # bert
"encoder.layers.{bid}.norm2", # nomic-bert "encoder.layers.{bid}.norm2", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
), ),
MODEL_TENSOR.SSM_IN: ( MODEL_TENSOR.SSM_IN: (
"model.layers.{bid}.in_proj", "model.layers.{bid}.in_proj",
"backbone.layers.{bid}.mixer.in_proj", "backbone.layers.{bid}.mixer.in_proj",
), ),
MODEL_TENSOR.SSM_CONV1D: ( MODEL_TENSOR.SSM_CONV1D: (
"model.layers.{bid}.conv1d", "model.layers.{bid}.conv1d",
"backbone.layers.{bid}.mixer.conv1d", "backbone.layers.{bid}.mixer.conv1d",
), ),
MODEL_TENSOR.SSM_X: ( MODEL_TENSOR.SSM_X: (
"model.layers.{bid}.x_proj", "model.layers.{bid}.x_proj",
"backbone.layers.{bid}.mixer.x_proj", "backbone.layers.{bid}.mixer.x_proj",
), ),
MODEL_TENSOR.SSM_DT: ( MODEL_TENSOR.SSM_DT: (
"model.layers.{bid}.dt_proj", "model.layers.{bid}.dt_proj",
"backbone.layers.{bid}.mixer.dt_proj", "backbone.layers.{bid}.mixer.dt_proj",
), ),
MODEL_TENSOR.SSM_A: ( MODEL_TENSOR.SSM_A: (
"model.layers.{bid}.A_log", "model.layers.{bid}.A_log",
"backbone.layers.{bid}.mixer.A_log", "backbone.layers.{bid}.mixer.A_log",
), ),
MODEL_TENSOR.SSM_D: ( MODEL_TENSOR.SSM_D: (
"model.layers.{bid}.D", "model.layers.{bid}.D",
"backbone.layers.{bid}.mixer.D", "backbone.layers.{bid}.mixer.D",
), ),
MODEL_TENSOR.SSM_OUT: ( MODEL_TENSOR.SSM_OUT: (
"model.layers.{bid}.out_proj", "model.layers.{bid}.out_proj",
"backbone.layers.{bid}.mixer.out_proj", "backbone.layers.{bid}.mixer.out_proj",
@ -368,31 +368,35 @@ class TensorNameMap:
# TODO: make this configurable # TODO: make this configurable
n_experts = 8 n_experts = 8
for xid in range(n_experts): for xid in range(n_experts):
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) tensor_name = TENSOR_NAMES[tensor].format(bid=bid, xid=xid)
self.mapping[tensor_name] = (tensor, tensor_name) self.mapping[tensor_name] = (tensor, tensor_name)
for key in keys: for key in keys:
key = key.format(bid = bid, xid = xid) key = key.format(bid=bid, xid=xid)
self.mapping[key] = (tensor, tensor_name) self.mapping[key] = (tensor, tensor_name)
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None: def get_type_and_name(
self, key: str, try_suffixes: Sequence[str] = ()
) -> tuple[MODEL_TENSOR, str] | None:
result = self.mapping.get(key) result = self.mapping.get(key)
if result is not None: if result is not None:
return result return result
for suffix in try_suffixes: for suffix in try_suffixes:
if key.endswith(suffix): if key.endswith(suffix):
result = self.mapping.get(key[:-len(suffix)]) result = self.mapping.get(key[: -len(suffix)])
if result is not None: if result is not None:
return result[0], result[1] + suffix return result[0], result[1] + suffix
return None return None
def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None: def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
result = self.get_type_and_name(key, try_suffixes = try_suffixes) result = self.get_type_and_name(key, try_suffixes=try_suffixes)
if result is None: if result is None:
return None return None
return result[1] return result[1]
def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None: def get_type(
result = self.get_type_and_name(key, try_suffixes = try_suffixes) self, key: str, try_suffixes: Sequence[str] = ()
) -> MODEL_TENSOR | None:
result = self.get_type_and_name(key, try_suffixes=try_suffixes)
if result is None: if result is None:
return None return None
return result[0] return result[0]