StableLM12 tensor mapping and constants

This commit is contained in:
Ashish 2024-04-12 02:18:51 -07:00
parent d383c0d818
commit 13387d9c57
2 changed files with 482 additions and 457 deletions

View file

@ -74,7 +74,9 @@ class Keys:
MODEL = "tokenizer.ggml.model"
LIST = "tokenizer.ggml.tokens"
TOKEN_TYPE = "tokenizer.ggml.token_type"
TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types
TOKEN_TYPE_COUNT = (
"tokenizer.ggml.token_type_count" # for BERT-style token types
)
SCORES = "tokenizer.ggml.scores"
MERGES = "tokenizer.ggml.merges"
BOS_ID = "tokenizer.ggml.bos_token_id"
@ -113,6 +115,7 @@ class MODEL_ARCH(IntEnum):
NOMIC_BERT = auto()
BLOOM = auto()
STABLELM = auto()
STABLELM2 = auto()
QWEN = auto()
QWEN2 = auto()
PHI2 = auto()
@ -183,6 +186,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
MODEL_ARCH.BLOOM: "bloom",
MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.STABLELM2: "stablelm2",
MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.QWEN2: "qwen2",
MODEL_ARCH.PHI2: "phi2",
@ -196,7 +200,6 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.DBRX: "dbrx",
}
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@ -440,6 +443,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
],
MODEL_ARCH.STABLELM2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
],
MODEL_ARCH.QWEN: [
MODEL_TENSOR.TOKEN_EMBD,
@ -710,9 +731,9 @@ class TokenType(IntEnum):
class RopeScalingType(Enum):
NONE = 'none'
LINEAR = 'linear'
YARN = 'yarn'
NONE = "none"
LINEAR = "linear"
YARN = "yarn"
class PoolingType(IntEnum):

View file

@ -25,26 +25,22 @@ class TensorNameMap:
"backbone.embeddings", # mamba-hf
"transformer.in_out_embed", # Grok
),
# Token type embeddings
MODEL_TENSOR.TOKEN_TYPES: (
"embeddings.token_type_embeddings", # bert nomic-bert
),
# Normalization of token embeddings
MODEL_TENSOR.TOKEN_EMBD_NORM: (
"word_embeddings_layernorm", # bloom
"embeddings.LayerNorm", # bert
"emb_ln", # nomic-bert
),
# Position embeddings
MODEL_TENSOR.POS_EMBD: (
"transformer.wpe", # gpt2
"embeddings.position_embeddings", # bert
"wpe", # gpt2
),
# Output
MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox
@ -53,7 +49,6 @@ class TensorNameMap:
"word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2
),
# Output norm
MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox
@ -69,11 +64,8 @@ class TensorNameMap:
"backbone.norm_f", # mamba
"transformer.rms_norm", # Grok
),
# Rope frequencies
MODEL_TENSOR.ROPE_FREQS: (
"rope.freqs", # llama-pth
),
MODEL_TENSOR.ROPE_FREQS: ("rope.freqs",), # llama-pth
}
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
@ -98,12 +90,8 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.rms_norm", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
),
# Attention norm 2
MODEL_TENSOR.ATTN_NORM_2: (
"transformer.h.{bid}.ln_attn", # falcon40b
),
MODEL_TENSOR.ATTN_NORM_2: ("transformer.h.{bid}.ln_attn",), # falcon40b
# Attention query-key-value
MODEL_TENSOR.ATTN_QKV: (
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
@ -118,7 +106,6 @@ class TensorNameMap:
"transformer.h.{bid}.mixer.Wqkv", # phi2
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
),
# Attention query
MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf
@ -127,9 +114,8 @@ class TensorNameMap:
"transformer.h.{bid}.attn.q_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
"model.layers.{bid}.attention.wq", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok
"transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok
),
# Attention key
MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf
@ -138,9 +124,8 @@ class TensorNameMap:
"transformer.h.{bid}.attn.k_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
"model.layers.{bid}.attention.wk", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok
"transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok
),
# Attention value
MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf
@ -149,9 +134,8 @@ class TensorNameMap:
"transformer.h.{bid}.attn.v_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
"model.layers.{bid}.attention.wv", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok
"transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok
),
# Attention output
MODEL_TENSOR.ATTN_OUT: (
"gpt_neox.layers.{bid}.attention.dense", # gptneox
@ -173,7 +157,6 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
),
# Attention output norm
MODEL_TENSOR.ATTN_OUT_NORM: (
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
@ -181,7 +164,6 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.rms_norm_1", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
),
# Rotary embeddings
MODEL_TENSOR.ATTN_ROT_EMBD: (
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
@ -189,7 +171,6 @@ class TensorNameMap:
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
"transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell
),
# Feed-forward norm
MODEL_TENSOR.FFN_NORM: (
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
@ -204,14 +185,12 @@ class TensorNameMap:
"model.layers.{bid}.ffn_norm", # internlm2
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
),
MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral
"transformer.decoder_layer.{bid}.router", # Grok
"transformer.blocks.{bid}.ffn.router.layer", # dbrx
),
# Feed-forward up
MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
@ -234,18 +213,13 @@ class TensorNameMap:
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
"model.layers.{bid}.mlp.c_fc", # starcoder2
),
MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
),
# AWQ-activation gate
MODEL_TENSOR.FFN_ACT: (
"transformer.blocks.{bid}.ffn.act", # mpt
),
MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # mpt
# Feed-forward gate
MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
@ -255,13 +229,11 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.w1", # internlm2
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
),
MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
),
# Feed-forward down
MODEL_TENSOR.FFN_DOWN: (
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
@ -283,67 +255,95 @@ class TensorNameMap:
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
"model.layers.{bid}.mlp.c_proj", # starcoder2
),
MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
),
MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.q_layernorm.norms.0", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.1", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.2", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.3", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.4", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.5", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.6", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.7", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.8", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.9", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.10", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.11", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.12", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.13", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.14", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.15", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.16", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.17", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.18", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.19", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.20", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.21", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.22", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.23", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.24", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.25", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.26", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.27", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.28", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.29", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.30", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.31", # stablelm
"model.layers.{bid}.self_attn.q_norm", # cohere
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
),
MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.k_layernorm.norms.0", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.1", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.2", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.3", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.4", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.5", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.6", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.7", # stablelm
"model.layers.{bid}.self_attn.k_norm", # cohere
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
),
MODEL_TENSOR.ROPE_FREQS: (
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
),
MODEL_TENSOR.LAYER_OUT_NORM: (
"encoder.layer.{bid}.output.LayerNorm", # bert
"encoder.layers.{bid}.norm2", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok
),
MODEL_TENSOR.SSM_IN: (
"model.layers.{bid}.in_proj",
"backbone.layers.{bid}.mixer.in_proj",
),
MODEL_TENSOR.SSM_CONV1D: (
"model.layers.{bid}.conv1d",
"backbone.layers.{bid}.mixer.conv1d",
),
MODEL_TENSOR.SSM_X: (
"model.layers.{bid}.x_proj",
"backbone.layers.{bid}.mixer.x_proj",
),
MODEL_TENSOR.SSM_DT: (
"model.layers.{bid}.dt_proj",
"backbone.layers.{bid}.mixer.dt_proj",
),
MODEL_TENSOR.SSM_A: (
"model.layers.{bid}.A_log",
"backbone.layers.{bid}.mixer.A_log",
),
MODEL_TENSOR.SSM_D: (
"model.layers.{bid}.D",
"backbone.layers.{bid}.mixer.D",
),
MODEL_TENSOR.SSM_OUT: (
"model.layers.{bid}.out_proj",
"backbone.layers.{bid}.mixer.out_proj",
@ -374,7 +374,9 @@ class TensorNameMap:
key = key.format(bid=bid, xid=xid)
self.mapping[key] = (tensor, tensor_name)
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
def get_type_and_name(
self, key: str, try_suffixes: Sequence[str] = ()
) -> tuple[MODEL_TENSOR, str] | None:
result = self.mapping.get(key)
if result is not None:
return result
@ -391,7 +393,9 @@ class TensorNameMap:
return None
return result[1]
def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
def get_type(
self, key: str, try_suffixes: Sequence[str] = ()
) -> MODEL_TENSOR | None:
result = self.get_type_and_name(key, try_suffixes=try_suffixes)
if result is None:
return None