StableLM2 tensor mapping and constants
parent d383c0d818
commit 13387d9c57
2 changed files with 482 additions and 457 deletions

Changed file 1 of 2 (constants):

@@ -74,7 +74,9 @@ class Keys:
         MODEL = "tokenizer.ggml.model"
         LIST = "tokenizer.ggml.tokens"
         TOKEN_TYPE = "tokenizer.ggml.token_type"
-        TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count"  # for BERT-style token types
+        TOKEN_TYPE_COUNT = (
+            "tokenizer.ggml.token_type_count"  # for BERT-style token types
+        )
         SCORES = "tokenizer.ggml.scores"
         MERGES = "tokenizer.ggml.merges"
         BOS_ID = "tokenizer.ggml.bos_token_id"

@@ -113,6 +115,7 @@ class MODEL_ARCH(IntEnum):
     NOMIC_BERT = auto()
     BLOOM = auto()
     STABLELM = auto()
+    STABLELM2 = auto()
     QWEN = auto()
     QWEN2 = auto()
     PHI2 = auto()

@@ -183,6 +186,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.NOMIC_BERT: "nomic-bert",
     MODEL_ARCH.BLOOM: "bloom",
     MODEL_ARCH.STABLELM: "stablelm",
+    MODEL_ARCH.STABLELM2: "stablelm2",
     MODEL_ARCH.QWEN: "qwen",
     MODEL_ARCH.QWEN2: "qwen2",
     MODEL_ARCH.PHI2: "phi2",
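
The two hunks above register the new architecture twice: once as an enum member and once in the name table keyed by that enum. A minimal illustrative check, assuming the usual gguf package layout (the module path is an assumption; the dictionary entries are the ones added above):

    from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES

    # The new enum member resolves to the architecture string "stablelm2".
    assert MODEL_ARCH_NAMES[MODEL_ARCH.STABLELM2] == "stablelm2"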

@@ -196,7 +200,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.MAMBA: "mamba",
     MODEL_ARCH.XVERSE: "xverse",
     MODEL_ARCH.COMMAND_R: "command-r",
-    MODEL_ARCH.DBRX: "dbrx",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {

@@ -440,6 +443,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+    ],
+    MODEL_ARCH.STABLELM2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
     ],
     MODEL_ARCH.QWEN: [
         MODEL_TENSOR.TOKEN_EMBD,
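
The first two added lines extend the existing STABLELM tensor list with the per-head attention Q/K norms; the rest registers an equivalent tensor list for the new STABLELM2 architecture. A rough sketch of how this list pairs with TENSOR_NAMES to enumerate the GGUF tensor names for one block (the module path and the "blk.{bid}..." template style are assumptions; the dictionaries are the ones in this file):

    from gguf.constants import MODEL_ARCH, MODEL_TENSORS, TENSOR_NAMES

    bid = 0  # block index
    for tensor in MODEL_TENSORS[MODEL_ARCH.STABLELM2]:
        # TENSOR_NAMES values are templates such as "blk.{bid}.attn_q_norm";
        # entries without a {bid} placeholder are returned unchanged.
        print(TENSOR_NAMES[tensor].format(bid=bid))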

@@ -710,9 +731,9 @@ class TokenType(IntEnum):


 class RopeScalingType(Enum):
-    NONE = 'none'
-    LINEAR = 'linear'
-    YARN = 'yarn'
+    NONE = "none"
+    LINEAR = "linear"
+    YARN = "yarn"


 class PoolingType(IntEnum):

Changed file 2 of 2 (tensor mapping):

@@ -25,26 +25,22 @@ class TensorNameMap:
             "backbone.embeddings",  # mamba-hf
             "transformer.in_out_embed",  # Grok
         ),

         # Token type embeddings
         MODEL_TENSOR.TOKEN_TYPES: (
             "embeddings.token_type_embeddings",  # bert nomic-bert
         ),

         # Normalization of token embeddings
         MODEL_TENSOR.TOKEN_EMBD_NORM: (
             "word_embeddings_layernorm",  # bloom
             "embeddings.LayerNorm",  # bert
             "emb_ln",  # nomic-bert
         ),

         # Position embeddings
         MODEL_TENSOR.POS_EMBD: (
             "transformer.wpe",  # gpt2
             "embeddings.position_embeddings",  # bert
             "wpe",  # gpt2
         ),

         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out",  # gptneox
@@ -53,7 +49,6 @@ class TensorNameMap:
             "word_embeddings_for_head",  # persimmon
             "lm_head.linear",  # phi2
         ),

         # Output norm
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm",  # gptneox
@@ -69,11 +64,8 @@ class TensorNameMap:
             "backbone.norm_f",  # mamba
             "transformer.rms_norm",  # Grok
         ),

         # Rope frequencies
-        MODEL_TENSOR.ROPE_FREQS: (
-            "rope.freqs",  # llama-pth
-        ),
+        MODEL_TENSOR.ROPE_FREQS: ("rope.freqs",),  # llama-pth
     }

     block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
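
The collapsed one-line form of MODEL_TENSOR.ROPE_FREQS above (and the similar ATTN_NORM_2 and FFN_ACT collapses further down) relies on Python's single-element tuple syntax: the trailing comma is what keeps the value a tuple of candidate names rather than a bare string. A quick illustrative check:

    # Without the trailing comma the parentheses are just grouping and the
    # value would be a plain str, which iterates character by character.
    assert isinstance(("rope.freqs",), tuple)
    assert isinstance(("rope.freqs"), str)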

@@ -98,12 +90,8 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.rms_norm",  # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_1",  # dbrx
         ),

         # Attention norm 2
-        MODEL_TENSOR.ATTN_NORM_2: (
-            "transformer.h.{bid}.ln_attn",  # falcon40b
-        ),
+        MODEL_TENSOR.ATTN_NORM_2: ("transformer.h.{bid}.ln_attn",),  # falcon40b

         # Attention query-key-value
         MODEL_TENSOR.ATTN_QKV: (
             "gpt_neox.layers.{bid}.attention.query_key_value",  # gptneox
@@ -118,7 +106,6 @@ class TensorNameMap:
             "transformer.h.{bid}.mixer.Wqkv",  # phi2
             "encoder.layers.{bid}.attn.Wqkv",  # nomic-bert
         ),

         # Attention query
         MODEL_TENSOR.ATTN_Q: (
             "model.layers.{bid}.self_attn.q_proj",  # llama-hf
@@ -127,9 +114,8 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.q_proj",  # gpt-j
             "model.layers.layers.{bid}.self_attn.q_proj",  # plamo
             "model.layers.{bid}.attention.wq",  # internlm2
-            "transformer.decoder_layer.{bid}.multi_head_attention.query"  # Grok
+            "transformer.decoder_layer.{bid}.multi_head_attention.query",  # Grok
         ),

         # Attention key
         MODEL_TENSOR.ATTN_K: (
             "model.layers.{bid}.self_attn.k_proj",  # llama-hf
@@ -138,9 +124,8 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.k_proj",  # gpt-j
             "model.layers.layers.{bid}.self_attn.k_proj",  # plamo
             "model.layers.{bid}.attention.wk",  # internlm2
-            "transformer.decoder_layer.{bid}.multi_head_attention.key"  # Grok
+            "transformer.decoder_layer.{bid}.multi_head_attention.key",  # Grok
         ),

         # Attention value
         MODEL_TENSOR.ATTN_V: (
             "model.layers.{bid}.self_attn.v_proj",  # llama-hf
@@ -149,9 +134,8 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.v_proj",  # gpt-j
             "model.layers.layers.{bid}.self_attn.v_proj",  # plamo
             "model.layers.{bid}.attention.wv",  # internlm2
-            "transformer.decoder_layer.{bid}.multi_head_attention.value"  # Grok
+            "transformer.decoder_layer.{bid}.multi_head_attention.value",  # Grok
         ),

         # Attention output
         MODEL_TENSOR.ATTN_OUT: (
             "gpt_neox.layers.{bid}.attention.dense",  # gptneox
@@ -173,7 +157,6 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.linear",  # Grok
             "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",  # dbrx
         ),

         # Attention output norm
         MODEL_TENSOR.ATTN_OUT_NORM: (
             "encoder.layer.{bid}.attention.output.LayerNorm",  # bert
@@ -181,7 +164,6 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.rms_norm_1",  # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
         ),

         # Rotary embeddings
         MODEL_TENSOR.ATTN_ROT_EMBD: (
             "model.layers.{bid}.self_attn.rotary_emb.inv_freq",  # llama-hf
@@ -189,7 +171,6 @@ class TensorNameMap:
             "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq",  # plamo
             "transformer.h.{bid}.attn.rotary_emb.inv_freq",  # codeshell
         ),

         # Feed-forward norm
         MODEL_TENSOR.FFN_NORM: (
             "gpt_neox.layers.{bid}.post_attention_layernorm",  # gptneox
@@ -204,14 +185,12 @@ class TensorNameMap:
             "model.layers.{bid}.ffn_norm",  # internlm2
             "transformer.decoder_layer.{bid}.rms_norm_2",  # Grok
         ),

         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate",  # mixtral
             "model.layers.{bid}.block_sparse_moe.gate",  # mixtral
             "transformer.decoder_layer.{bid}.router",  # Grok
             "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
         ),

         # Feed-forward up
         MODEL_TENSOR.FFN_UP: (
             "gpt_neox.layers.{bid}.mlp.dense_h_to_4h",  # gptneox
@@ -234,18 +213,13 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.fc11",  # nomic-bert
             "model.layers.{bid}.mlp.c_fc",  # starcoder2
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
             "layers.{bid}.feed_forward.experts.w3",  # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_v",  # Grok (merged)
             "transformer.blocks.{bid}.ffn.experts.mlp.v1",  # dbrx
         ),

         # AWQ-activation gate
-        MODEL_TENSOR.FFN_ACT: (
-            "transformer.blocks.{bid}.ffn.act",  # mpt
-        ),
+        MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",),  # mpt

         # Feed-forward gate
         MODEL_TENSOR.FFN_GATE: (
             "model.layers.{bid}.mlp.gate_proj",  # llama-hf refact
@@ -255,13 +229,11 @@ class TensorNameMap:
             "model.layers.{bid}.feed_forward.w1",  # internlm2
             "encoder.layers.{bid}.mlp.fc12",  # nomic-bert
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.w1",  # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear",  # Grok (merged)
             "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
         ),

         # Feed-forward down
         MODEL_TENSOR.FFN_DOWN: (
             "gpt_neox.layers.{bid}.mlp.dense_4h_to_h",  # gptneox
@@ -283,67 +255,95 @@ class TensorNameMap:
             "encoder.layers.{bid}.mlp.fc2",  # nomic-bert
             "model.layers.{bid}.mlp.c_proj",  # starcoder2
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
             "layers.{bid}.feed_forward.experts.w2",  # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
             "transformer.blocks.{bid}.ffn.experts.mlp.w2",  # dbrx
         ),

         MODEL_TENSOR.ATTN_Q_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm",  # persimmon
+            "model.layers.{bid}.self_attn.q_layernorm.norms.0",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.1",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.2",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.3",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.4",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.5",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.6",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.7",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.8",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.9",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.10",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.11",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.12",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.13",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.14",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.15",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.16",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.17",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.18",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.19",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.20",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.21",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.22",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.23",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.24",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.25",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.26",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.27",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.28",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.29",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.30",  # stablelm
+            "model.layers.{bid}.self_attn.q_layernorm.norms.31",  # stablelm
             "model.layers.{bid}.self_attn.q_norm",  # cohere
             "transformer.blocks.{bid}.attn.q_ln",  # sea-lion
         ),

         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm",  # persimmon
+            "model.layers.{bid}.self_attn.k_layernorm.norms.0",  # stablelm
+            "model.layers.{bid}.self_attn.k_layernorm.norms.1",  # stablelm
+            "model.layers.{bid}.self_attn.k_layernorm.norms.2",  # stablelm
+            "model.layers.{bid}.self_attn.k_layernorm.norms.3",  # stablelm
+            "model.layers.{bid}.self_attn.k_layernorm.norms.4",  # stablelm
+            "model.layers.{bid}.self_attn.k_layernorm.norms.5",  # stablelm
+            "model.layers.{bid}.self_attn.k_layernorm.norms.6",  # stablelm
+            "model.layers.{bid}.self_attn.k_layernorm.norms.7",  # stablelm
             "model.layers.{bid}.self_attn.k_norm",  # cohere
             "transformer.blocks.{bid}.attn.k_ln",  # sea-lion
         ),

         MODEL_TENSOR.ROPE_FREQS: (
             "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq",  # persimmon
         ),

         MODEL_TENSOR.LAYER_OUT_NORM: (
             "encoder.layer.{bid}.output.LayerNorm",  # bert
             "encoder.layers.{bid}.norm2",  # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_3",  # Grok
         ),

         MODEL_TENSOR.SSM_IN: (
             "model.layers.{bid}.in_proj",
             "backbone.layers.{bid}.mixer.in_proj",
         ),

         MODEL_TENSOR.SSM_CONV1D: (
             "model.layers.{bid}.conv1d",
             "backbone.layers.{bid}.mixer.conv1d",
         ),

         MODEL_TENSOR.SSM_X: (
             "model.layers.{bid}.x_proj",
             "backbone.layers.{bid}.mixer.x_proj",
         ),

         MODEL_TENSOR.SSM_DT: (
             "model.layers.{bid}.dt_proj",
             "backbone.layers.{bid}.mixer.dt_proj",
         ),

         MODEL_TENSOR.SSM_A: (
             "model.layers.{bid}.A_log",
             "backbone.layers.{bid}.mixer.A_log",
         ),

         MODEL_TENSOR.SSM_D: (
             "model.layers.{bid}.D",
             "backbone.layers.{bid}.mixer.D",
         ),

         MODEL_TENSOR.SSM_OUT: (
             "model.layers.{bid}.out_proj",
             "backbone.layers.{bid}.mixer.out_proj",
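
The ATTN_Q_NORM and ATTN_K_NORM entries added in the hunk above map the per-head LayerNorm tensors that StableLM checkpoints expose as self_attn.q_layernorm.norms.{i} (indices 0 through 31) and self_attn.k_layernorm.norms.{i} (indices 0 through 7); every index resolves to the same GGUF tensor type. A rough sketch of a lookup, assuming the TensorNameMap constructor takes the architecture and a block count (that signature and the module paths are assumptions; get_type_and_name and try_suffixes appear in the next hunk):

    from gguf.constants import MODEL_ARCH
    from gguf.tensor_mapping import TensorNameMap

    # Hypothetical: build the mapping for the new architecture and resolve one
    # of the per-head query-norm tensors from a HF-style checkpoint name.
    tmap = TensorNameMap(MODEL_ARCH.STABLELM2, 24)  # 24 is an arbitrary block count
    hit = tmap.get_type_and_name(
        "model.layers.3.self_attn.q_layernorm.norms.7.weight",
        try_suffixes=(".weight", ".bias"),
    )
    print(hit)  # expected: (MODEL_TENSOR.ATTN_Q_NORM, <gguf name for block 3>)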

@@ -374,7 +374,9 @@ class TensorNameMap:
                 key = key.format(bid=bid, xid=xid)
                 self.mapping[key] = (tensor, tensor_name)

-    def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
+    def get_type_and_name(
+        self, key: str, try_suffixes: Sequence[str] = ()
+    ) -> tuple[MODEL_TENSOR, str] | None:
         result = self.mapping.get(key)
         if result is not None:
             return result
@@ -391,7 +393,9 @@ class TensorNameMap:
             return None
         return result[1]

-    def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
+    def get_type(
+        self, key: str, try_suffixes: Sequence[str] = ()
+    ) -> MODEL_TENSOR | None:
         result = self.get_type_and_name(key, try_suffixes=try_suffixes)
         if result is None:
             return None
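
The two reformatted methods above only change line wrapping, not behavior: get_type is a thin wrapper that keeps the tensor enum from get_type_and_name, and both take try_suffixes so checkpoint names ending in ".weight" or ".bias" can still match the suffix-less keys stored in the mapping. An illustrative, self-contained use under the same assumptions as the earlier sketch:

    from gguf.constants import MODEL_ARCH, MODEL_TENSOR
    from gguf.tensor_mapping import TensorNameMap

    tmap = TensorNameMap(MODEL_ARCH.STABLELM2, 24)  # constructor signature assumed
    tensor_type = tmap.get_type(
        "model.layers.0.self_attn.k_layernorm.norms.0.weight",
        try_suffixes=(".weight", ".bias"),
    )
    assert tensor_type == MODEL_TENSOR.ATTN_K_NORM  # per-head key norm mapped above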