StableLM2 tensor mapping and constants
parent d383c0d818
commit 13387d9c57
2 changed files with 482 additions and 457 deletions
@@ -8,8 +8,8 @@ from typing import Any
 # constants
 #

 GGUF_MAGIC = 0x46554747  # "GGUF"
 GGUF_VERSION = 3
 GGUF_DEFAULT_ALIGNMENT = 32

 #

@@ -19,77 +19,79 @@ GGUF_DEFAULT_ALIGNMENT = 32

 class Keys:
     class General:
         ARCHITECTURE = "general.architecture"
         QUANTIZATION_VERSION = "general.quantization_version"
         ALIGNMENT = "general.alignment"
         NAME = "general.name"
         AUTHOR = "general.author"
         VERSION = "general.version"
         URL = "general.url"
         DESCRIPTION = "general.description"
         LICENSE = "general.license"
         SOURCE_URL = "general.source.url"
         SOURCE_HF_REPO = "general.source.huggingface.repository"
         FILE_TYPE = "general.file_type"

     class LLM:
         VOCAB_SIZE = "{arch}.vocab_size"
         CONTEXT_LENGTH = "{arch}.context_length"
         EMBEDDING_LENGTH = "{arch}.embedding_length"
         BLOCK_COUNT = "{arch}.block_count"
         FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
         USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
         TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
         EXPERT_COUNT = "{arch}.expert_count"
         EXPERT_USED_COUNT = "{arch}.expert_used_count"
         POOLING_TYPE = "{arch}.pooling_type"
         LOGIT_SCALE = "{arch}.logit_scale"

     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"
         HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
         MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
         CLAMP_KQV = "{arch}.attention.clamp_kqv"
         KEY_LENGTH = "{arch}.attention.key_length"
         VALUE_LENGTH = "{arch}.attention.value_length"
         LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
         LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
         CAUSAL = "{arch}.attention.causal"

     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"
         FREQ_BASE = "{arch}.rope.freq_base"
         SCALING_TYPE = "{arch}.rope.scaling.type"
         SCALING_FACTOR = "{arch}.rope.scaling.factor"
         SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
         SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"

     class SSM:
         CONV_KERNEL = "{arch}.ssm.conv_kernel"
         INNER_SIZE = "{arch}.ssm.inner_size"
         STATE_SIZE = "{arch}.ssm.state_size"
         TIME_STEP_RANK = "{arch}.ssm.time_step_rank"

     class Tokenizer:
         MODEL = "tokenizer.ggml.model"
         LIST = "tokenizer.ggml.tokens"
         TOKEN_TYPE = "tokenizer.ggml.token_type"
-        TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count"  # for BERT-style token types
+        TOKEN_TYPE_COUNT = (
+            "tokenizer.ggml.token_type_count"  # for BERT-style token types
+        )
         SCORES = "tokenizer.ggml.scores"
         MERGES = "tokenizer.ggml.merges"
         BOS_ID = "tokenizer.ggml.bos_token_id"
         EOS_ID = "tokenizer.ggml.eos_token_id"
         UNK_ID = "tokenizer.ggml.unknown_token_id"
         SEP_ID = "tokenizer.ggml.seperator_token_id"
         PAD_ID = "tokenizer.ggml.padding_token_id"
         CLS_ID = "tokenizer.ggml.cls_token_id"
         MASK_ID = "tokenizer.ggml.mask_token_id"
         ADD_BOS = "tokenizer.ggml.add_bos_token"
         ADD_EOS = "tokenizer.ggml.add_eos_token"
         ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
         HF_JSON = "tokenizer.huggingface.json"
         RWKV = "tokenizer.rwkv.world"
         CHAT_TEMPLATE = "tokenizer.chat_template"


 #

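The `{arch}`-templated keys above become concrete GGUF metadata keys once the architecture name is substituted. A minimal sketch of that substitution, using only the Keys class from this hunk (the print calls are illustrative, not part of the module):

    arch = "stablelm2"  # the architecture name registered later in this commit
    print(Keys.LLM.CONTEXT_LENGTH.format(arch=arch))       # -> "stablelm2.context_length"
    print(Keys.Attention.LAYERNORM_EPS.format(arch=arch))  # -> "stablelm2.attention.layer_norm_epsilon"
    print(Keys.Tokenizer.BOS_ID)                           # -> "tokenizer.ggml.bos_token_id" (not templated)
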
@@ -98,30 +100,31 @@ class Keys:


 class MODEL_ARCH(IntEnum):
     LLAMA = auto()
     FALCON = auto()
     BAICHUAN = auto()
     GROK = auto()
     GPT2 = auto()
     GPTJ = auto()
     GPTNEOX = auto()
     MPT = auto()
     STARCODER = auto()
     PERSIMMON = auto()
     REFACT = auto()
     BERT = auto()
     NOMIC_BERT = auto()
     BLOOM = auto()
     STABLELM = auto()
+    STABLELM2 = auto()
     QWEN = auto()
     QWEN2 = auto()
     PHI2 = auto()
     PLAMO = auto()
     CODESHELL = auto()
     ORION = auto()
     INTERNLM2 = auto()
     MINICPM = auto()
     GEMMA = auto()
     STARCODER2 = auto()
     MAMBA = auto()
     XVERSE = auto()

@@ -130,111 +133,111 @@ class MODEL_ARCH(IntEnum):


 class MODEL_TENSOR(IntEnum):
     TOKEN_EMBD = auto()
     TOKEN_EMBD_NORM = auto()
     TOKEN_TYPES = auto()
     POS_EMBD = auto()
     OUTPUT = auto()
     OUTPUT_NORM = auto()
     ROPE_FREQS = auto()
     ATTN_Q = auto()
     ATTN_K = auto()
     ATTN_V = auto()
     ATTN_QKV = auto()
     ATTN_OUT = auto()
     ATTN_NORM = auto()
     ATTN_NORM_2 = auto()
     ATTN_OUT_NORM = auto()
     ATTN_ROT_EMBD = auto()
     FFN_GATE_INP = auto()
     FFN_NORM = auto()
     FFN_GATE = auto()
     FFN_DOWN = auto()
     FFN_UP = auto()
     FFN_ACT = auto()
     FFN_GATE_EXP = auto()
     FFN_DOWN_EXP = auto()
     FFN_UP_EXP = auto()
     ATTN_Q_NORM = auto()
     ATTN_K_NORM = auto()
     LAYER_OUT_NORM = auto()
     SSM_IN = auto()
     SSM_CONV1D = auto()
     SSM_X = auto()
     SSM_DT = auto()
     SSM_A = auto()
     SSM_D = auto()
     SSM_OUT = auto()


 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.LLAMA: "llama",
     MODEL_ARCH.FALCON: "falcon",
     MODEL_ARCH.BAICHUAN: "baichuan",
     MODEL_ARCH.GROK: "grok",
     MODEL_ARCH.GPT2: "gpt2",
     MODEL_ARCH.GPTJ: "gptj",
     MODEL_ARCH.GPTNEOX: "gptneox",
     MODEL_ARCH.MPT: "mpt",
     MODEL_ARCH.STARCODER: "starcoder",
     MODEL_ARCH.PERSIMMON: "persimmon",
     MODEL_ARCH.REFACT: "refact",
     MODEL_ARCH.BERT: "bert",
     MODEL_ARCH.NOMIC_BERT: "nomic-bert",
     MODEL_ARCH.BLOOM: "bloom",
     MODEL_ARCH.STABLELM: "stablelm",
+    MODEL_ARCH.STABLELM2: "stablelm2",
     MODEL_ARCH.QWEN: "qwen",
     MODEL_ARCH.QWEN2: "qwen2",
     MODEL_ARCH.PHI2: "phi2",
     MODEL_ARCH.PLAMO: "plamo",
     MODEL_ARCH.CODESHELL: "codeshell",
     MODEL_ARCH.ORION: "orion",
     MODEL_ARCH.INTERNLM2: "internlm2",
     MODEL_ARCH.MINICPM: "minicpm",
     MODEL_ARCH.GEMMA: "gemma",
     MODEL_ARCH.STARCODER2: "starcoder2",
     MODEL_ARCH.MAMBA: "mamba",
     MODEL_ARCH.XVERSE: "xverse",
     MODEL_ARCH.COMMAND_R: "command-r",
-    MODEL_ARCH.DBRX: "dbrx",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.TOKEN_EMBD: "token_embd",
     MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
     MODEL_TENSOR.TOKEN_TYPES: "token_types",
     MODEL_TENSOR.POS_EMBD: "position_embd",
     MODEL_TENSOR.OUTPUT_NORM: "output_norm",
     MODEL_TENSOR.OUTPUT: "output",
     MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
     MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
     MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
     MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
     MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
     MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
     MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
     MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
     MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
     MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
     MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
     MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
     MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
     MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
     MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
     MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
     MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
     MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
     MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
     MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
     MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
     MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
     MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
     MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
     MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
     MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
     MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
     MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
     MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
 }

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {

@@ -440,6 +443,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
     ],
+    MODEL_ARCH.STABLELM2: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+    ],
     MODEL_ARCH.QWEN: [
         MODEL_TENSOR.TOKEN_EMBD,

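Taken together, the new MODEL_ARCH.STABLELM2 member, its entry in MODEL_ARCH_NAMES and its per-architecture tensor list in MODEL_TENSORS determine which GGUF tensor names the architecture uses. A short illustrative sketch (the loop itself is not part of the change):

    arch = MODEL_ARCH.STABLELM2
    print(MODEL_ARCH_NAMES[arch])  # -> "stablelm2"
    for t in MODEL_TENSORS[arch]:
        # e.g. "token_embd", "blk.0.attn_q_norm", "blk.0.ffn_up", ...
        print(TENSOR_NAMES[t].format(bid=0))
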
@@ -701,55 +722,55 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {


 class TokenType(IntEnum):
     NORMAL = 1
     UNKNOWN = 2
     CONTROL = 3
     USER_DEFINED = 4
     UNUSED = 5
     BYTE = 6


 class RopeScalingType(Enum):
-    NONE = 'none'
-    LINEAR = 'linear'
-    YARN = 'yarn'
+    NONE = "none"
+    LINEAR = "linear"
+    YARN = "yarn"


 class PoolingType(IntEnum):
     NONE = 0
     MEAN = 1
     CLS = 2


 class GGMLQuantizationType(IntEnum):
     F32 = 0
     F16 = 1
     Q4_0 = 2
     Q4_1 = 3
     Q5_0 = 6
     Q5_1 = 7
     Q8_0 = 8
     Q8_1 = 9
     Q2_K = 10
     Q3_K = 11
     Q4_K = 12
     Q5_K = 13
     Q6_K = 14
     Q8_K = 15
     IQ2_XXS = 16
     IQ2_XS = 17
     IQ3_XXS = 18
     IQ1_S = 19
     IQ4_NL = 20
     IQ3_S = 21
     IQ2_S = 22
     IQ4_XS = 23
     I8 = 24
     I16 = 25
     I32 = 26
     I64 = 27
     F64 = 28
     IQ1_M = 29


 class GGUFEndian(IntEnum):

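Apart from the quote style, RopeScalingType is unchanged: it is still a plain string-valued Enum, so the values above are the exact strings stored under the `{arch}.rope.scaling.type` metadata key. A two-line sketch:

    print(RopeScalingType.YARN.value)  # -> "yarn"
    print(RopeScalingType("linear"))   # -> RopeScalingType.LINEAR (lookup by stored value)
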
@@ -758,18 +779,18 @@ class GGUFEndian(IntEnum):


 class GGUFValueType(IntEnum):
     UINT8 = 0
     INT8 = 1
     UINT16 = 2
     INT16 = 3
     UINT32 = 4
     INT32 = 5
     FLOAT32 = 6
     BOOL = 7
     STRING = 8
     ARRAY = 9
     UINT64 = 10
     INT64 = 11
     FLOAT64 = 12

     @staticmethod

@@ -794,94 +815,94 @@ class GGUFValueType(IntEnum):
 QK_K = 256
 # Items here are (block size, type size)
 GGML_QUANT_SIZES = {
     GGMLQuantizationType.F32: (1, 4),
     GGMLQuantizationType.F16: (1, 2),
     GGMLQuantizationType.Q4_0: (32, 2 + 16),
     GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
     GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
     GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
     GGMLQuantizationType.Q8_0: (32, 2 + 32),
     GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
     GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4),
     GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12),
     GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12),
     GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
     GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
     GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8),
     GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4),
     GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32),
     GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8),
     GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16),
     GGMLQuantizationType.IQ4_NL: (32, 2 + 16),
     GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
     GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16),
     GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64),
     GGMLQuantizationType.I8: (1, 1),
     GGMLQuantizationType.I16: (1, 2),
     GGMLQuantizationType.I32: (1, 4),
     GGMLQuantizationType.I64: (1, 8),
     GGMLQuantizationType.F64: (1, 8),
 }


 # Aliases for backward compatibility.

 # general
 KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE
 KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION
 KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT
 KEY_GENERAL_NAME = Keys.General.NAME
 KEY_GENERAL_AUTHOR = Keys.General.AUTHOR
 KEY_GENERAL_URL = Keys.General.URL
 KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION
 KEY_GENERAL_LICENSE = Keys.General.LICENSE
 KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL
 KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO
 KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE

 # LLM
 KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE
 KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH
 KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH
 KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT
 KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH
 KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
 KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT

 # attention
 KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT
 KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV
 KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS
 KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV
 KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS
 KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS

 # RoPE
 KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT
 KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE
 KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE
 KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR
 KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN
 KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED

 # SSM
 KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
 KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
 KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
 KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK

 # tokenization
 KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
 KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST
 KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE
 KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES
 KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
 KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
 KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
 KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
 KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
 KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
 KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
 KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
 KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
 KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
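The GGML_QUANT_SIZES table above gives (elements per block, bytes per block) for each quantization type, which is enough to compute the packed size of a tensor. A minimal illustrative helper (not part of the module; assumes the element count is a whole number of blocks):

    def quantized_nbytes(n_elements: int, qtype: GGMLQuantizationType) -> int:
        # look up the block geometry from the table defined above
        block_size, type_size = GGML_QUANT_SIZES[qtype]
        assert n_elements % block_size == 0
        return (n_elements // block_size) * type_size

    # e.g. a 4096 x 4096 weight in Q4_K: 16_777_216 // 256 * 144 = 9_437_184 bytes
    print(quantized_nbytes(4096 * 4096, GGMLQuantizationType.Q4_K))
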
@@ -17,43 +17,38 @@ class TensorNameMap:
         "tok_embeddings",  # llama-pth
         "embeddings.word_embeddings",  # bert nomic-bert
         "language_model.embedding.word_embeddings",  # persimmon
         "wte",  # gpt2
         "transformer.embd.wte",  # phi2
         "model.tok_embeddings",  # internlm2
         "model.embedding",  # mamba-qbert
         "backbone.embedding",  # mamba
         "backbone.embeddings",  # mamba-hf
         "transformer.in_out_embed",  # Grok
     ),

     # Token type embeddings
     MODEL_TENSOR.TOKEN_TYPES: (
         "embeddings.token_type_embeddings",  # bert nomic-bert
     ),

     # Normalization of token embeddings
     MODEL_TENSOR.TOKEN_EMBD_NORM: (
         "word_embeddings_layernorm",  # bloom
         "embeddings.LayerNorm",  # bert
         "emb_ln",  # nomic-bert
     ),

     # Position embeddings
     MODEL_TENSOR.POS_EMBD: (
         "transformer.wpe",  # gpt2
         "embeddings.position_embeddings",  # bert
         "wpe",  # gpt2
     ),

     # Output
     MODEL_TENSOR.OUTPUT: (
         "embed_out",  # gptneox
         "lm_head",  # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx
         "output",  # llama-pth bloom internlm2
         "word_embeddings_for_head",  # persimmon
         "lm_head.linear",  # phi2
     ),

     # Output norm
     MODEL_TENSOR.OUTPUT_NORM: (
         "gpt_neox.final_layer_norm",  # gptneox

@@ -63,30 +58,27 @@ class TensorNameMap:
         "transformer.norm_f",  # mpt dbrx
         "ln_f",  # refact bloom qwen gpt2
         "language_model.encoder.final_layernorm",  # persimmon
         "model.final_layernorm",  # persimmon
         "lm_head.ln",  # phi2
         "model.norm_f",  # mamba-qbert
         "backbone.norm_f",  # mamba
         "transformer.rms_norm",  # Grok
     ),

     # Rope frequencies
-    MODEL_TENSOR.ROPE_FREQS: (
-        "rope.freqs",  # llama-pth
-    ),
+    MODEL_TENSOR.ROPE_FREQS: ("rope.freqs",),  # llama-pth
 }

 block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
     # Attention norm
     MODEL_TENSOR.ATTN_NORM: (
         "gpt_neox.layers.{bid}.input_layernorm",  # gptneox
         "transformer.h.{bid}.ln_1",  # gpt2 gpt-j refact qwen
         "transformer.blocks.{bid}.norm_1",  # mpt
         "transformer.h.{bid}.input_layernorm",  # falcon7b
         "h.{bid}.input_layernorm",  # bloom
         "transformer.h.{bid}.ln_mlp",  # falcon40b
         "model.layers.{bid}.input_layernorm",  # llama-hf
         "layers.{bid}.attention_norm",  # llama-pth
         "language_model.encoder.layers.{bid}.input_layernorm",  # persimmon
         "model.layers.{bid}.ln1",  # yi
         "h.{bid}.ln_1",  # gpt2

@@ -98,12 +90,8 @@ class TensorNameMap:
         "transformer.decoder_layer.{bid}.rms_norm",  # Grok
         "transformer.blocks.{bid}.norm_attn_norm.norm_1",  # dbrx
     ),

     # Attention norm 2
-    MODEL_TENSOR.ATTN_NORM_2: (
-        "transformer.h.{bid}.ln_attn",  # falcon40b
-    ),
+    MODEL_TENSOR.ATTN_NORM_2: ("transformer.h.{bid}.ln_attn",),  # falcon40b

     # Attention query-key-value
     MODEL_TENSOR.ATTN_QKV: (
         "gpt_neox.layers.{bid}.attention.query_key_value",  # gptneox

@@ -113,45 +101,41 @@ class TensorNameMap:
         "transformer.h.{bid}.self_attention.query_key_value",  # falcon
         "h.{bid}.self_attention.query_key_value",  # bloom
         "language_model.encoder.layers.{bid}.self_attention.query_key_value",  # persimmon
         "model.layers.{bid}.self_attn.query_key_value",  # persimmon
         "h.{bid}.attn.c_attn",  # gpt2
         "transformer.h.{bid}.mixer.Wqkv",  # phi2
         "encoder.layers.{bid}.attn.Wqkv",  # nomic-bert
     ),

     # Attention query
     MODEL_TENSOR.ATTN_Q: (
         "model.layers.{bid}.self_attn.q_proj",  # llama-hf
         "layers.{bid}.attention.wq",  # llama-pth
         "encoder.layer.{bid}.attention.self.query",  # bert
         "transformer.h.{bid}.attn.q_proj",  # gpt-j
         "model.layers.layers.{bid}.self_attn.q_proj",  # plamo
         "model.layers.{bid}.attention.wq",  # internlm2
-        "transformer.decoder_layer.{bid}.multi_head_attention.query"  # Grok
+        "transformer.decoder_layer.{bid}.multi_head_attention.query",  # Grok
     ),

     # Attention key
     MODEL_TENSOR.ATTN_K: (
         "model.layers.{bid}.self_attn.k_proj",  # llama-hf
         "layers.{bid}.attention.wk",  # llama-pth
         "encoder.layer.{bid}.attention.self.key",  # bert
         "transformer.h.{bid}.attn.k_proj",  # gpt-j
         "model.layers.layers.{bid}.self_attn.k_proj",  # plamo
         "model.layers.{bid}.attention.wk",  # internlm2
-        "transformer.decoder_layer.{bid}.multi_head_attention.key"  # Grok
+        "transformer.decoder_layer.{bid}.multi_head_attention.key",  # Grok
     ),

     # Attention value
     MODEL_TENSOR.ATTN_V: (
         "model.layers.{bid}.self_attn.v_proj",  # llama-hf
         "layers.{bid}.attention.wv",  # llama-pth
         "encoder.layer.{bid}.attention.self.value",  # bert
         "transformer.h.{bid}.attn.v_proj",  # gpt-j
         "model.layers.layers.{bid}.self_attn.v_proj",  # plamo
         "model.layers.{bid}.attention.wv",  # internlm2
-        "transformer.decoder_layer.{bid}.multi_head_attention.value"  # Grok
+        "transformer.decoder_layer.{bid}.multi_head_attention.value",  # Grok
     ),

     # Attention output
     MODEL_TENSOR.ATTN_OUT: (
         "gpt_neox.layers.{bid}.attention.dense",  # gptneox

@@ -172,8 +156,7 @@ class TensorNameMap:
         "encoder.layers.{bid}.attn.out_proj",  # nomic-bert
         "transformer.decoder_layer.{bid}.multi_head_attention.linear",  # Grok
         "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj",  # dbrx
     ),

     # Attention output norm
     MODEL_TENSOR.ATTN_OUT_NORM: (
         "encoder.layer.{bid}.attention.output.LayerNorm",  # bert

@@ -181,169 +164,186 @@ class TensorNameMap:
         "transformer.decoder_layer.{bid}.rms_norm_1",  # Grok
         "transformer.blocks.{bid}.norm_attn_norm.norm_2",  # dbrx
     ),

     # Rotary embeddings
     MODEL_TENSOR.ATTN_ROT_EMBD: (
         "model.layers.{bid}.self_attn.rotary_emb.inv_freq",  # llama-hf
         "layers.{bid}.attention.inner_attention.rope.freqs",  # llama-pth
         "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq",  # plamo
         "transformer.h.{bid}.attn.rotary_emb.inv_freq",  # codeshell
     ),

     # Feed-forward norm
     MODEL_TENSOR.FFN_NORM: (
         "gpt_neox.layers.{bid}.post_attention_layernorm",  # gptneox
         "transformer.h.{bid}.ln_2",  # gpt2 refact qwen
         "h.{bid}.post_attention_layernorm",  # bloom
         "transformer.blocks.{bid}.norm_2",  # mpt
         "model.layers.{bid}.post_attention_layernorm",  # llama-hf
         "layers.{bid}.ffn_norm",  # llama-pth
         "language_model.encoder.layers.{bid}.post_attention_layernorm",  # persimmon
         "model.layers.{bid}.ln2",  # yi
         "h.{bid}.ln_2",  # gpt2
         "model.layers.{bid}.ffn_norm",  # internlm2
         "transformer.decoder_layer.{bid}.rms_norm_2",  # Grok
     ),

     MODEL_TENSOR.FFN_GATE_INP: (
         "layers.{bid}.feed_forward.gate",  # mixtral
         "model.layers.{bid}.block_sparse_moe.gate",  # mixtral
         "transformer.decoder_layer.{bid}.router",  # Grok
         "transformer.blocks.{bid}.ffn.router.layer",  # dbrx
     ),

     # Feed-forward up
     MODEL_TENSOR.FFN_UP: (
         "gpt_neox.layers.{bid}.mlp.dense_h_to_4h",  # gptneox
         "transformer.h.{bid}.mlp.c_fc",  # gpt2
         "transformer.blocks.{bid}.ffn.up_proj",  # mpt
         "transformer.h.{bid}.mlp.dense_h_to_4h",  # falcon
         "h.{bid}.mlp.dense_h_to_4h",  # bloom
         "model.layers.{bid}.mlp.up_proj",  # llama-hf refact
         "layers.{bid}.feed_forward.w3",  # llama-pth
         "encoder.layer.{bid}.intermediate.dense",  # bert
         "transformer.h.{bid}.mlp.fc_in",  # gpt-j
         "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
         "model.layers.{bid}.mlp.dense_h_to_4h",  # persimmon
         "transformer.h.{bid}.mlp.w1",  # qwen
         "h.{bid}.mlp.c_fc",  # gpt2
         "transformer.h.{bid}.mlp.fc1",  # phi2
         "model.layers.{bid}.mlp.fc1",  # phi2
         "model.layers.layers.{bid}.mlp.up_proj",  # plamo
         "model.layers.{bid}.feed_forward.w3",  # internlm2
         "encoder.layers.{bid}.mlp.fc11",  # nomic-bert
         "model.layers.{bid}.mlp.c_fc",  # starcoder2
     ),

     MODEL_TENSOR.FFN_UP_EXP: (
         "layers.{bid}.feed_forward.experts.w3",  # mixtral (merged)
         "transformer.decoder_layer.{bid}.moe.linear_v",  # Grok (merged)
         "transformer.blocks.{bid}.ffn.experts.mlp.v1",  # dbrx
     ),

     # AWQ-activation gate
-    MODEL_TENSOR.FFN_ACT: (
-        "transformer.blocks.{bid}.ffn.act",  # mpt
-    ),
+    MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",),  # mpt

     # Feed-forward gate
     MODEL_TENSOR.FFN_GATE: (
         "model.layers.{bid}.mlp.gate_proj",  # llama-hf refact
         "layers.{bid}.feed_forward.w1",  # llama-pth
         "transformer.h.{bid}.mlp.w2",  # qwen
         "model.layers.layers.{bid}.mlp.gate_proj",  # plamo
         "model.layers.{bid}.feed_forward.w1",  # internlm2
         "encoder.layers.{bid}.mlp.fc12",  # nomic-bert
     ),

     MODEL_TENSOR.FFN_GATE_EXP: (
         "layers.{bid}.feed_forward.experts.w1",  # mixtral (merged)
         "transformer.decoder_layer.{bid}.moe.linear",  # Grok (merged)
         "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
     ),

     # Feed-forward down
     MODEL_TENSOR.FFN_DOWN: (
         "gpt_neox.layers.{bid}.mlp.dense_4h_to_h",  # gptneox
         "transformer.h.{bid}.mlp.c_proj",  # gpt2 refact qwen
         "transformer.blocks.{bid}.ffn.down_proj",  # mpt
         "transformer.h.{bid}.mlp.dense_4h_to_h",  # falcon
         "h.{bid}.mlp.dense_4h_to_h",  # bloom
         "model.layers.{bid}.mlp.down_proj",  # llama-hf
         "layers.{bid}.feed_forward.w2",  # llama-pth
         "encoder.layer.{bid}.output.dense",  # bert
         "transformer.h.{bid}.mlp.fc_out",  # gpt-j
         "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
         "model.layers.{bid}.mlp.dense_4h_to_h",  # persimmon
         "h.{bid}.mlp.c_proj",  # gpt2
         "transformer.h.{bid}.mlp.fc2",  # phi2
         "model.layers.{bid}.mlp.fc2",  # phi2
         "model.layers.layers.{bid}.mlp.down_proj",  # plamo
         "model.layers.{bid}.feed_forward.w2",  # internlm2
         "encoder.layers.{bid}.mlp.fc2",  # nomic-bert
         "model.layers.{bid}.mlp.c_proj",  # starcoder2
     ),

     MODEL_TENSOR.FFN_DOWN_EXP: (
         "layers.{bid}.feed_forward.experts.w2",  # mixtral (merged)
         "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
         "transformer.blocks.{bid}.ffn.experts.mlp.w2",  # dbrx
     ),

     MODEL_TENSOR.ATTN_Q_NORM: (
         "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
         "model.layers.{bid}.self_attn.q_layernorm",  # persimmon
+        "model.layers.{bid}.self_attn.q_layernorm.norms.0",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.1",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.2",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.3",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.4",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.5",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.6",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.7",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.8",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.9",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.10",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.11",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.12",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.13",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.14",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.15",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.16",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.17",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.18",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.19",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.20",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.21",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.22",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.23",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.24",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.25",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.26",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.27",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.28",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.29",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.30",  # stablelm
+        "model.layers.{bid}.self_attn.q_layernorm.norms.31",  # stablelm
         "model.layers.{bid}.self_attn.q_norm",  # cohere
         "transformer.blocks.{bid}.attn.q_ln",  # sea-lion
     ),

     MODEL_TENSOR.ATTN_K_NORM: (
         "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
         "model.layers.{bid}.self_attn.k_layernorm",  # persimmon
+        "model.layers.{bid}.self_attn.k_layernorm.norms.0",  # stablelm
+        "model.layers.{bid}.self_attn.k_layernorm.norms.1",  # stablelm
+        "model.layers.{bid}.self_attn.k_layernorm.norms.2",  # stablelm
+        "model.layers.{bid}.self_attn.k_layernorm.norms.3",  # stablelm
+        "model.layers.{bid}.self_attn.k_layernorm.norms.4",  # stablelm
+        "model.layers.{bid}.self_attn.k_layernorm.norms.5",  # stablelm
+        "model.layers.{bid}.self_attn.k_layernorm.norms.6",  # stablelm
+        "model.layers.{bid}.self_attn.k_layernorm.norms.7",  # stablelm
         "model.layers.{bid}.self_attn.k_norm",  # cohere
         "transformer.blocks.{bid}.attn.k_ln",  # sea-lion
     ),

     MODEL_TENSOR.ROPE_FREQS: (
         "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq",  # persimmon
     ),

     MODEL_TENSOR.LAYER_OUT_NORM: (
         "encoder.layer.{bid}.output.LayerNorm",  # bert
         "encoder.layers.{bid}.norm2",  # nomic-bert
         "transformer.decoder_layer.{bid}.rms_norm_3",  # Grok
     ),

     MODEL_TENSOR.SSM_IN: (
         "model.layers.{bid}.in_proj",
         "backbone.layers.{bid}.mixer.in_proj",
     ),

     MODEL_TENSOR.SSM_CONV1D: (
         "model.layers.{bid}.conv1d",
         "backbone.layers.{bid}.mixer.conv1d",
     ),

     MODEL_TENSOR.SSM_X: (
         "model.layers.{bid}.x_proj",
         "backbone.layers.{bid}.mixer.x_proj",
     ),

     MODEL_TENSOR.SSM_DT: (
         "model.layers.{bid}.dt_proj",
         "backbone.layers.{bid}.mixer.dt_proj",
     ),

     MODEL_TENSOR.SSM_A: (
         "model.layers.{bid}.A_log",
         "backbone.layers.{bid}.mixer.A_log",
     ),

     MODEL_TENSOR.SSM_D: (
         "model.layers.{bid}.D",
         "backbone.layers.{bid}.mixer.D",
     ),

     MODEL_TENSOR.SSM_OUT: (
         "model.layers.{bid}.out_proj",
         "backbone.layers.{bid}.mixer.out_proj",

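The per-head StableLM layernorm entries above are spelled out one by one because only `{bid}` is templated in these tuples; there is no placeholder for the norm index. For a given block they are meant to match checkpoint tensor names of the following shape (illustrative list comprehensions mirroring the 32 query-norm and 8 key-norm entries added above):

    bid = 3  # any block index
    q_norm_names = [f"model.layers.{bid}.self_attn.q_layernorm.norms.{i}" for i in range(32)]
    k_norm_names = [f"model.layers.{bid}.self_attn.k_layernorm.norms.{i}" for i in range(8)]
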
@@ -368,31 +368,35 @@ class TensorNameMap:
                 # TODO: make this configurable
                 n_experts = 8
                 for xid in range(n_experts):
-                    tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
+                    tensor_name = TENSOR_NAMES[tensor].format(bid=bid, xid=xid)
                     self.mapping[tensor_name] = (tensor, tensor_name)
                     for key in keys:
-                        key = key.format(bid = bid, xid = xid)
+                        key = key.format(bid=bid, xid=xid)
                         self.mapping[key] = (tensor, tensor_name)

-    def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
+    def get_type_and_name(
+        self, key: str, try_suffixes: Sequence[str] = ()
+    ) -> tuple[MODEL_TENSOR, str] | None:
         result = self.mapping.get(key)
         if result is not None:
             return result
         for suffix in try_suffixes:
             if key.endswith(suffix):
-                result = self.mapping.get(key[:-len(suffix)])
+                result = self.mapping.get(key[: -len(suffix)])
                 if result is not None:
                     return result[0], result[1] + suffix
         return None

     def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
-        result = self.get_type_and_name(key, try_suffixes = try_suffixes)
+        result = self.get_type_and_name(key, try_suffixes=try_suffixes)
         if result is None:
             return None
         return result[1]

-    def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
-        result = self.get_type_and_name(key, try_suffixes = try_suffixes)
+    def get_type(
+        self, key: str, try_suffixes: Sequence[str] = ()
+    ) -> MODEL_TENSOR | None:
+        result = self.get_type_and_name(key, try_suffixes=try_suffixes)
         if result is None:
             return None
         return result[0]

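Finally, a short usage sketch of the reformatted lookup helpers. The constructor call follows the (arch, n_blocks) convention used elsewhere in gguf-py and is an assumption here, since the diff only shows the method bodies:

    # assumed constructor signature: TensorNameMap(arch, n_blocks)
    tmap = TensorNameMap(MODEL_ARCH.STABLELM2, 32)
    name = tmap.get_name("model.layers.0.self_attn.q_proj.weight", try_suffixes=(".weight", ".bias"))
    print(name)  # -> "blk.0.attn_q.weight"
    print(tmap.get_type("model.layers.0.self_attn.q_proj.bias", try_suffixes=(".weight", ".bias")))  # -> MODEL_TENSOR.ATTN_Q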