llama : add OpenELM support (#7359)
* Initial OpenELM support (270M only so far) * Fill out missing entries in llama_model_type_name * fixup! Initial OpenELM support (270M only so far) Fix formatting * llama : support all OpenELM models * llama : add variable GQA and variable FFN sizes Some metadata keys can now also be arrays to support setting their value per-layer for models like OpenELM. * llama : minor spacing changes Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * llama : use std::array for per-layer hparams * llama : fix save/load state * llama : do not print hparams for vocab-only models * llama : handle n_head == 0 * llama : use const ref for print_f and fix division by zero * llama : fix t5 uses of n_head and n_ff * llama : minor comment --------- Co-authored-by: Francis Couture-Harpin <git@compilade.net> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
6f63d646c1
commit
d7fd29fff1
5 changed files with 676 additions and 176 deletions
|
@ -24,6 +24,7 @@ class TensorNameMap:
|
|||
"backbone.embedding", # mamba
|
||||
"backbone.embeddings", # mamba-hf
|
||||
"transformer.in_out_embed", # Grok
|
||||
"transformer.token_embeddings", # openelm
|
||||
"shared", # t5
|
||||
),
|
||||
|
||||
|
@ -37,6 +38,7 @@ class TensorNameMap:
|
|||
"word_embeddings_layernorm", # bloom
|
||||
"embeddings.LayerNorm", # bert
|
||||
"emb_ln", # nomic-bert
|
||||
"transformer.norm", # openelm
|
||||
),
|
||||
|
||||
# Position embeddings
|
||||
|
@ -69,6 +71,7 @@ class TensorNameMap:
|
|||
"model.norm_f", # mamba-qbert
|
||||
"backbone.norm_f", # mamba
|
||||
"transformer.rms_norm", # Grok
|
||||
"transformer.norm", # openelm
|
||||
),
|
||||
|
||||
# Rope frequencies
|
||||
|
@ -98,6 +101,7 @@ class TensorNameMap:
|
|||
"backbone.layers.{bid}.norm", # mamba
|
||||
"transformer.decoder_layer.{bid}.rms_norm", # Grok
|
||||
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
|
||||
"transformer.layers.{bid}.attn_norm", # openelm
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
|
@ -119,7 +123,8 @@ class TensorNameMap:
|
|||
"h.{bid}.attn.c_attn", # gpt2
|
||||
"transformer.h.{bid}.mixer.Wqkv", # phi2
|
||||
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert
|
||||
"model.layers.{bid}.self_attn.qkv_proj" # phi3
|
||||
"model.layers.{bid}.self_attn.qkv_proj", # phi3
|
||||
"transformer.layers.{bid}.attn.qkv_proj", # openelm
|
||||
),
|
||||
|
||||
# Attention query
|
||||
|
@ -177,6 +182,7 @@ class TensorNameMap:
|
|||
"encoder.layers.{bid}.attn.out_proj", # nomic-bert
|
||||
"transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
|
||||
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
|
||||
"transformer.layers.{bid}.attn.out_proj", # openelm
|
||||
),
|
||||
|
||||
# Attention output norm
|
||||
|
@ -212,6 +218,7 @@ class TensorNameMap:
|
|||
"h.{bid}.ln_2", # gpt2
|
||||
"model.layers.{bid}.ffn_norm", # internlm2
|
||||
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok
|
||||
"transformer.layers.{bid}.ffn_norm", # openelm
|
||||
),
|
||||
|
||||
# Post feed-forward norm
|
||||
|
@ -327,6 +334,7 @@ class TensorNameMap:
|
|||
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
|
||||
"model.layers.{bid}.mlp.c_proj", # starcoder2
|
||||
"encoder.layer.{bid}.mlp.wo", # jina-bert-v2
|
||||
"transformer.layers.{bid}.ffn.proj_2", # openelm
|
||||
"model.layers.{bid}.residual_mlp.w2", # arctic
|
||||
"encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2
|
||||
),
|
||||
|
@ -348,7 +356,8 @@ class TensorNameMap:
|
|||
"model.layers.{bid}.self_attn.q_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.q_norm", # cohere
|
||||
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_q" # jina-bert-v2
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.q_norm", # openelm
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_K_NORM: (
|
||||
|
@ -356,7 +365,8 @@ class TensorNameMap:
|
|||
"model.layers.{bid}.self_attn.k_layernorm", # persimmon
|
||||
"model.layers.{bid}.self_attn.k_norm", # cohere
|
||||
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_k" # jina-bert-v2
|
||||
"encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2
|
||||
"transformer.layers.{bid}.attn.k_norm", # openelm
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ROPE_FREQS: (
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue