Merge branch 'master' into compilade/refactor-kv-cache
commit 10c3c419e9
518 changed files with 78202 additions and 66427 deletions
@@ -24,6 +24,7 @@ class TensorNameMap:
             "backbone.embedding",       # mamba
             "backbone.embeddings",      # mamba-hf
             "transformer.in_out_embed", # Grok
+            "shared",                   # t5
         ),

         # Token type embeddings
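The new "shared" entry reflects T5's single embedding matrix, tied between encoder and decoder. Entries in this first table carry no "{bid}" placeholder, so they match a checkpoint tensor name exactly. A minimal, self-contained sketch of that behavior, using a hypothetical subset of the table rather than the gguf-py API:

# Hypothetical subset of the model-wide (non-block) mapping table above.
token_embd_names = (
    "backbone.embedding",       # mamba
    "backbone.embeddings",      # mamba-hf
    "transformer.in_out_embed", # Grok
    "shared",                   # t5
)

def is_token_embd(name: str) -> bool:
    # No "{bid}" to substitute: a plain exact-match test suffices.
    return name in token_embd_names

assert is_token_embd("shared")            # t5's tied embedding matrix
assert not is_token_embd("decoder.embed") # unmapped names are rejected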
@@ -186,6 +187,10 @@ class TensorNameMap:
             "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
         ),

+        MODEL_TENSOR.ATTN_POST_NORM: (
+            "model.layers.{bid}.post_attention_layernorm", # gemma2
+        ),
+
         # Rotary embeddings
         MODEL_TENSOR.ATTN_ROT_EMBD: (
             "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
@@ -211,6 +216,16 @@ class TensorNameMap:
             "model.layers.{bid}.pre_moe_layernorm", # mini-jamba
         ),

+        # Pre feed-forward norm
+        MODEL_TENSOR.FFN_PRE_NORM: (
+            "model.layers.{bid}.pre_feedforward_layernorm", # gemma2
+        ),
+
+        # Post feed-forward norm
+        MODEL_TENSOR.FFN_POST_NORM: (
+            "model.layers.{bid}.post_feedforward_layernorm", # gemma2
+        ),
+
         MODEL_TENSOR.FFN_GATE_INP: (
             "layers.{bid}.feed_forward.gate",           # mixtral
             "model.layers.{bid}.block_sparse_moe.gate", # mixtral
@@ -440,6 +455,128 @@ class TensorNameMap:
         MODEL_TENSOR.ATTN_KV_A_NORM: (
             "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
         ),

+        MODEL_TENSOR.ATTN_SUB_NORM: (
+            "model.layers.{bid}.self_attn.inner_attn_ln", # bitnet
+        ),
+
+        MODEL_TENSOR.FFN_SUB_NORM: (
+            "model.layers.{bid}.mlp.ffn_layernorm", # bitnet
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_NORM: (
+            "decoder.block.{bid}.layer.0.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_Q: (
+            "decoder.block.{bid}.layer.0.SelfAttention.q", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_K: (
+            "decoder.block.{bid}.layer.0.SelfAttention.k", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_V: (
+            "decoder.block.{bid}.layer.0.SelfAttention.v", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_OUT: (
+            "decoder.block.{bid}.layer.0.SelfAttention.o", # t5
+        ),
+
+        MODEL_TENSOR.DEC_ATTN_REL_B: (
+            "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_NORM: (
+            "decoder.block.{bid}.layer.1.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_Q: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_K: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_V: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_OUT: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5
+        ),
+
+        MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: (
+            "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_NORM: (
+            "decoder.block.{bid}.layer.2.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_GATE: (
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_UP: (
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi",   # t5
+            "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5
+        ),
+
+        MODEL_TENSOR.DEC_FFN_DOWN: (
+            "decoder.block.{bid}.layer.2.DenseReluDense.wo", # t5
+        ),
+
+        MODEL_TENSOR.DEC_OUTPUT_NORM: (
+            "decoder.final_layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_NORM: (
+            "encoder.block.{bid}.layer.0.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_Q: (
+            "encoder.block.{bid}.layer.0.SelfAttention.q", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_K: (
+            "encoder.block.{bid}.layer.0.SelfAttention.k", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_V: (
+            "encoder.block.{bid}.layer.0.SelfAttention.v", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_OUT: (
+            "encoder.block.{bid}.layer.0.SelfAttention.o", # t5
+        ),
+
+        MODEL_TENSOR.ENC_ATTN_REL_B: (
+            "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_NORM: (
+            "encoder.block.{bid}.layer.1.layer_norm", # t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_GATE: (
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_UP: (
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi",   # t5
+            "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5
+        ),
+
+        MODEL_TENSOR.ENC_FFN_DOWN: (
+            "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5
+        ),
+
+        MODEL_TENSOR.ENC_OUTPUT_NORM: (
+            "encoder.final_layer_norm", # t5
+        ),
     }

     # architecture-specific block mappings
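The encoder/decoder entries above are consumed through the same lookup path as every other architecture. A hedged usage sketch, assuming the standard gguf-py layout with get_tensor_name_map in gguf/tensor_mapping.py and a MODEL_ARCH.T5 member in gguf/constants.py (the block count of 24 is illustrative):

from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import get_tensor_name_map

# Build the combined mapping for a hypothetical 24-block T5 model.
tmap = get_tensor_name_map(MODEL_ARCH.T5, n_blocks=24)

# Resolve an HF-style checkpoint name to its GGUF tensor name; the
# try_suffixes argument strips ".weight"/".bias" before matching.
name = tmap.get_name("decoder.block.3.layer.1.EncDecAttention.q.weight",
                     try_suffixes=(".weight", ".bias"))
print(name)  # GGUF name for decoder block 3's cross-attention query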
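The trailing "# architecture-specific block mappings" comment points at a per-architecture override table layered over the shared one. A speculative, self-contained sketch of that pattern; all names below are illustrative stand-ins, not the branch's actual attributes:

from enum import Enum, auto

class Arch(Enum):  # stand-in for gguf's MODEL_ARCH enum
    T5 = auto()
    LLAMA = auto()

# Shared table, plus per-architecture entries that take precedence.
shared_block_mappings = {
    "FFN_DOWN": ("model.layers.{bid}.mlp.down_proj",),
}
arch_block_mappings = {
    Arch.T5: {
        "FFN_DOWN": ("decoder.block.{bid}.layer.2.DenseReluDense.wo",),
    },
}

def mappings_for(arch: Arch) -> dict:
    merged = dict(shared_block_mappings)
    merged.update(arch_block_mappings.get(arch, {}))  # arch-specific wins
    return merged

print(mappings_for(Arch.T5)["FFN_DOWN"][0])  # the t5-specific name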