StableLM12 tensormapping and constants

This commit is contained in:
Ashish 2024-04-12 02:18:51 -07:00
parent d383c0d818
commit 13387d9c57
2 changed files with 482 additions and 457 deletions

View file

@ -8,8 +8,8 @@ from typing import Any
# constants # constants
# #
GGUF_MAGIC = 0x46554747 # "GGUF" GGUF_MAGIC = 0x46554747 # "GGUF"
GGUF_VERSION = 3 GGUF_VERSION = 3
GGUF_DEFAULT_ALIGNMENT = 32 GGUF_DEFAULT_ALIGNMENT = 32
# #
@ -19,77 +19,79 @@ GGUF_DEFAULT_ALIGNMENT = 32
class Keys: class Keys:
class General: class General:
ARCHITECTURE = "general.architecture" ARCHITECTURE = "general.architecture"
QUANTIZATION_VERSION = "general.quantization_version" QUANTIZATION_VERSION = "general.quantization_version"
ALIGNMENT = "general.alignment" ALIGNMENT = "general.alignment"
NAME = "general.name" NAME = "general.name"
AUTHOR = "general.author" AUTHOR = "general.author"
VERSION = "general.version" VERSION = "general.version"
URL = "general.url" URL = "general.url"
DESCRIPTION = "general.description" DESCRIPTION = "general.description"
LICENSE = "general.license" LICENSE = "general.license"
SOURCE_URL = "general.source.url" SOURCE_URL = "general.source.url"
SOURCE_HF_REPO = "general.source.huggingface.repository" SOURCE_HF_REPO = "general.source.huggingface.repository"
FILE_TYPE = "general.file_type" FILE_TYPE = "general.file_type"
class LLM: class LLM:
VOCAB_SIZE = "{arch}.vocab_size" VOCAB_SIZE = "{arch}.vocab_size"
CONTEXT_LENGTH = "{arch}.context_length" CONTEXT_LENGTH = "{arch}.context_length"
EMBEDDING_LENGTH = "{arch}.embedding_length" EMBEDDING_LENGTH = "{arch}.embedding_length"
BLOCK_COUNT = "{arch}.block_count" BLOCK_COUNT = "{arch}.block_count"
FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
EXPERT_COUNT = "{arch}.expert_count" EXPERT_COUNT = "{arch}.expert_count"
EXPERT_USED_COUNT = "{arch}.expert_used_count" EXPERT_USED_COUNT = "{arch}.expert_used_count"
POOLING_TYPE = "{arch}.pooling_type" POOLING_TYPE = "{arch}.pooling_type"
LOGIT_SCALE = "{arch}.logit_scale" LOGIT_SCALE = "{arch}.logit_scale"
class Attention: class Attention:
HEAD_COUNT = "{arch}.attention.head_count" HEAD_COUNT = "{arch}.attention.head_count"
HEAD_COUNT_KV = "{arch}.attention.head_count_kv" HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias" MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
CLAMP_KQV = "{arch}.attention.clamp_kqv" CLAMP_KQV = "{arch}.attention.clamp_kqv"
KEY_LENGTH = "{arch}.attention.key_length" KEY_LENGTH = "{arch}.attention.key_length"
VALUE_LENGTH = "{arch}.attention.value_length" VALUE_LENGTH = "{arch}.attention.value_length"
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
CAUSAL = "{arch}.attention.causal" CAUSAL = "{arch}.attention.causal"
class Rope: class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count" DIMENSION_COUNT = "{arch}.rope.dimension_count"
FREQ_BASE = "{arch}.rope.freq_base" FREQ_BASE = "{arch}.rope.freq_base"
SCALING_TYPE = "{arch}.rope.scaling.type" SCALING_TYPE = "{arch}.rope.scaling.type"
SCALING_FACTOR = "{arch}.rope.scaling.factor" SCALING_FACTOR = "{arch}.rope.scaling.factor"
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
class SSM: class SSM:
CONV_KERNEL = "{arch}.ssm.conv_kernel" CONV_KERNEL = "{arch}.ssm.conv_kernel"
INNER_SIZE = "{arch}.ssm.inner_size" INNER_SIZE = "{arch}.ssm.inner_size"
STATE_SIZE = "{arch}.ssm.state_size" STATE_SIZE = "{arch}.ssm.state_size"
TIME_STEP_RANK = "{arch}.ssm.time_step_rank" TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
class Tokenizer: class Tokenizer:
MODEL = "tokenizer.ggml.model" MODEL = "tokenizer.ggml.model"
LIST = "tokenizer.ggml.tokens" LIST = "tokenizer.ggml.tokens"
TOKEN_TYPE = "tokenizer.ggml.token_type" TOKEN_TYPE = "tokenizer.ggml.token_type"
TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types TOKEN_TYPE_COUNT = (
SCORES = "tokenizer.ggml.scores" "tokenizer.ggml.token_type_count" # for BERT-style token types
MERGES = "tokenizer.ggml.merges" )
BOS_ID = "tokenizer.ggml.bos_token_id" SCORES = "tokenizer.ggml.scores"
EOS_ID = "tokenizer.ggml.eos_token_id" MERGES = "tokenizer.ggml.merges"
UNK_ID = "tokenizer.ggml.unknown_token_id" BOS_ID = "tokenizer.ggml.bos_token_id"
SEP_ID = "tokenizer.ggml.seperator_token_id" EOS_ID = "tokenizer.ggml.eos_token_id"
PAD_ID = "tokenizer.ggml.padding_token_id" UNK_ID = "tokenizer.ggml.unknown_token_id"
CLS_ID = "tokenizer.ggml.cls_token_id" SEP_ID = "tokenizer.ggml.seperator_token_id"
MASK_ID = "tokenizer.ggml.mask_token_id" PAD_ID = "tokenizer.ggml.padding_token_id"
ADD_BOS = "tokenizer.ggml.add_bos_token" CLS_ID = "tokenizer.ggml.cls_token_id"
ADD_EOS = "tokenizer.ggml.add_eos_token" MASK_ID = "tokenizer.ggml.mask_token_id"
ADD_PREFIX = "tokenizer.ggml.add_space_prefix" ADD_BOS = "tokenizer.ggml.add_bos_token"
HF_JSON = "tokenizer.huggingface.json" ADD_EOS = "tokenizer.ggml.add_eos_token"
RWKV = "tokenizer.rwkv.world" ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
CHAT_TEMPLATE = "tokenizer.chat_template" HF_JSON = "tokenizer.huggingface.json"
RWKV = "tokenizer.rwkv.world"
CHAT_TEMPLATE = "tokenizer.chat_template"
# #
@ -98,30 +100,31 @@ class Keys:
class MODEL_ARCH(IntEnum): class MODEL_ARCH(IntEnum):
LLAMA = auto() LLAMA = auto()
FALCON = auto() FALCON = auto()
BAICHUAN = auto() BAICHUAN = auto()
GROK = auto() GROK = auto()
GPT2 = auto() GPT2 = auto()
GPTJ = auto() GPTJ = auto()
GPTNEOX = auto() GPTNEOX = auto()
MPT = auto() MPT = auto()
STARCODER = auto() STARCODER = auto()
PERSIMMON = auto() PERSIMMON = auto()
REFACT = auto() REFACT = auto()
BERT = auto() BERT = auto()
NOMIC_BERT = auto() NOMIC_BERT = auto()
BLOOM = auto() BLOOM = auto()
STABLELM = auto() STABLELM = auto()
QWEN = auto() STABLELM2 = auto()
QWEN2 = auto() QWEN = auto()
PHI2 = auto() QWEN2 = auto()
PLAMO = auto() PHI2 = auto()
CODESHELL = auto() PLAMO = auto()
ORION = auto() CODESHELL = auto()
INTERNLM2 = auto() ORION = auto()
MINICPM = auto() INTERNLM2 = auto()
GEMMA = auto() MINICPM = auto()
GEMMA = auto()
STARCODER2 = auto() STARCODER2 = auto()
MAMBA = auto() MAMBA = auto()
XVERSE = auto() XVERSE = auto()
@ -130,111 +133,111 @@ class MODEL_ARCH(IntEnum):
class MODEL_TENSOR(IntEnum): class MODEL_TENSOR(IntEnum):
TOKEN_EMBD = auto() TOKEN_EMBD = auto()
TOKEN_EMBD_NORM = auto() TOKEN_EMBD_NORM = auto()
TOKEN_TYPES = auto() TOKEN_TYPES = auto()
POS_EMBD = auto() POS_EMBD = auto()
OUTPUT = auto() OUTPUT = auto()
OUTPUT_NORM = auto() OUTPUT_NORM = auto()
ROPE_FREQS = auto() ROPE_FREQS = auto()
ATTN_Q = auto() ATTN_Q = auto()
ATTN_K = auto() ATTN_K = auto()
ATTN_V = auto() ATTN_V = auto()
ATTN_QKV = auto() ATTN_QKV = auto()
ATTN_OUT = auto() ATTN_OUT = auto()
ATTN_NORM = auto() ATTN_NORM = auto()
ATTN_NORM_2 = auto() ATTN_NORM_2 = auto()
ATTN_OUT_NORM = auto() ATTN_OUT_NORM = auto()
ATTN_ROT_EMBD = auto() ATTN_ROT_EMBD = auto()
FFN_GATE_INP = auto() FFN_GATE_INP = auto()
FFN_NORM = auto() FFN_NORM = auto()
FFN_GATE = auto() FFN_GATE = auto()
FFN_DOWN = auto() FFN_DOWN = auto()
FFN_UP = auto() FFN_UP = auto()
FFN_ACT = auto() FFN_ACT = auto()
FFN_GATE_EXP = auto() FFN_GATE_EXP = auto()
FFN_DOWN_EXP = auto() FFN_DOWN_EXP = auto()
FFN_UP_EXP = auto() FFN_UP_EXP = auto()
ATTN_Q_NORM = auto() ATTN_Q_NORM = auto()
ATTN_K_NORM = auto() ATTN_K_NORM = auto()
LAYER_OUT_NORM = auto() LAYER_OUT_NORM = auto()
SSM_IN = auto() SSM_IN = auto()
SSM_CONV1D = auto() SSM_CONV1D = auto()
SSM_X = auto() SSM_X = auto()
SSM_DT = auto() SSM_DT = auto()
SSM_A = auto() SSM_A = auto()
SSM_D = auto() SSM_D = auto()
SSM_OUT = auto() SSM_OUT = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.FALCON: "falcon", MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.BAICHUAN: "baichuan", MODEL_ARCH.BAICHUAN: "baichuan",
MODEL_ARCH.GROK: "grok", MODEL_ARCH.GROK: "grok",
MODEL_ARCH.GPT2: "gpt2", MODEL_ARCH.GPT2: "gpt2",
MODEL_ARCH.GPTJ: "gptj", MODEL_ARCH.GPTJ: "gptj",
MODEL_ARCH.GPTNEOX: "gptneox", MODEL_ARCH.GPTNEOX: "gptneox",
MODEL_ARCH.MPT: "mpt", MODEL_ARCH.MPT: "mpt",
MODEL_ARCH.STARCODER: "starcoder", MODEL_ARCH.STARCODER: "starcoder",
MODEL_ARCH.PERSIMMON: "persimmon", MODEL_ARCH.PERSIMMON: "persimmon",
MODEL_ARCH.REFACT: "refact", MODEL_ARCH.REFACT: "refact",
MODEL_ARCH.BERT: "bert", MODEL_ARCH.BERT: "bert",
MODEL_ARCH.NOMIC_BERT: "nomic-bert", MODEL_ARCH.NOMIC_BERT: "nomic-bert",
MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.BLOOM: "bloom",
MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.QWEN: "qwen", MODEL_ARCH.STABLELM2: "stablelm2",
MODEL_ARCH.QWEN2: "qwen2", MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.PHI2: "phi2", MODEL_ARCH.QWEN2: "qwen2",
MODEL_ARCH.PLAMO: "plamo", MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.CODESHELL: "codeshell", MODEL_ARCH.PLAMO: "plamo",
MODEL_ARCH.ORION: "orion", MODEL_ARCH.CODESHELL: "codeshell",
MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.ORION: "orion",
MODEL_ARCH.MINICPM: "minicpm", MODEL_ARCH.INTERNLM2: "internlm2",
MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.MINICPM: "minicpm",
MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.GEMMA: "gemma",
MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.STARCODER2: "starcoder2",
MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.COMMAND_R: "command-r",
} }
TENSOR_NAMES: dict[MODEL_TENSOR, str] = { TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
MODEL_TENSOR.TOKEN_TYPES: "token_types", MODEL_TENSOR.TOKEN_TYPES: "token_types",
MODEL_TENSOR.POS_EMBD: "position_embd", MODEL_TENSOR.POS_EMBD: "position_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm", MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output", MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ROPE_FREQS: "rope_freqs", MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
} }
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -440,6 +443,24 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
],
MODEL_ARCH.STABLELM2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
], ],
MODEL_ARCH.QWEN: [ MODEL_ARCH.QWEN: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,
@ -701,55 +722,55 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
class TokenType(IntEnum): class TokenType(IntEnum):
NORMAL = 1 NORMAL = 1
UNKNOWN = 2 UNKNOWN = 2
CONTROL = 3 CONTROL = 3
USER_DEFINED = 4 USER_DEFINED = 4
UNUSED = 5 UNUSED = 5
BYTE = 6 BYTE = 6
class RopeScalingType(Enum): class RopeScalingType(Enum):
NONE = 'none' NONE = "none"
LINEAR = 'linear' LINEAR = "linear"
YARN = 'yarn' YARN = "yarn"
class PoolingType(IntEnum): class PoolingType(IntEnum):
NONE = 0 NONE = 0
MEAN = 1 MEAN = 1
CLS = 2 CLS = 2
class GGMLQuantizationType(IntEnum): class GGMLQuantizationType(IntEnum):
F32 = 0 F32 = 0
F16 = 1 F16 = 1
Q4_0 = 2 Q4_0 = 2
Q4_1 = 3 Q4_1 = 3
Q5_0 = 6 Q5_0 = 6
Q5_1 = 7 Q5_1 = 7
Q8_0 = 8 Q8_0 = 8
Q8_1 = 9 Q8_1 = 9
Q2_K = 10 Q2_K = 10
Q3_K = 11 Q3_K = 11
Q4_K = 12 Q4_K = 12
Q5_K = 13 Q5_K = 13
Q6_K = 14 Q6_K = 14
Q8_K = 15 Q8_K = 15
IQ2_XXS = 16 IQ2_XXS = 16
IQ2_XS = 17 IQ2_XS = 17
IQ3_XXS = 18 IQ3_XXS = 18
IQ1_S = 19 IQ1_S = 19
IQ4_NL = 20 IQ4_NL = 20
IQ3_S = 21 IQ3_S = 21
IQ2_S = 22 IQ2_S = 22
IQ4_XS = 23 IQ4_XS = 23
I8 = 24 I8 = 24
I16 = 25 I16 = 25
I32 = 26 I32 = 26
I64 = 27 I64 = 27
F64 = 28 F64 = 28
IQ1_M = 29 IQ1_M = 29
class GGUFEndian(IntEnum): class GGUFEndian(IntEnum):
@ -758,18 +779,18 @@ class GGUFEndian(IntEnum):
class GGUFValueType(IntEnum): class GGUFValueType(IntEnum):
UINT8 = 0 UINT8 = 0
INT8 = 1 INT8 = 1
UINT16 = 2 UINT16 = 2
INT16 = 3 INT16 = 3
UINT32 = 4 UINT32 = 4
INT32 = 5 INT32 = 5
FLOAT32 = 6 FLOAT32 = 6
BOOL = 7 BOOL = 7
STRING = 8 STRING = 8
ARRAY = 9 ARRAY = 9
UINT64 = 10 UINT64 = 10
INT64 = 11 INT64 = 11
FLOAT64 = 12 FLOAT64 = 12
@staticmethod @staticmethod
@ -794,94 +815,94 @@ class GGUFValueType(IntEnum):
QK_K = 256 QK_K = 256
# Items here are (block size, type size) # Items here are (block size, type size)
GGML_QUANT_SIZES = { GGML_QUANT_SIZES = {
GGMLQuantizationType.F32: (1, 4), GGMLQuantizationType.F32: (1, 4),
GGMLQuantizationType.F16: (1, 2), GGMLQuantizationType.F16: (1, 2),
GGMLQuantizationType.Q4_0: (32, 2 + 16), GGMLQuantizationType.Q4_0: (32, 2 + 16),
GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16), GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16), GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16), GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
GGMLQuantizationType.Q8_0: (32, 2 + 32), GGMLQuantizationType.Q8_0: (32, 2 + 32),
GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32), GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4), GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4),
GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12), GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12),
GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12), GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12),
GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12), GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16), GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8), GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8),
GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4), GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4),
GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32), GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32),
GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8), GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8),
GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16), GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16),
GGMLQuantizationType.IQ4_NL: (32, 2 + 16), GGMLQuantizationType.IQ4_NL: (32, 2 + 16),
GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4), GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16), GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16),
GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64), GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64),
GGMLQuantizationType.I8: (1, 1), GGMLQuantizationType.I8: (1, 1),
GGMLQuantizationType.I16: (1, 2), GGMLQuantizationType.I16: (1, 2),
GGMLQuantizationType.I32: (1, 4), GGMLQuantizationType.I32: (1, 4),
GGMLQuantizationType.I64: (1, 8), GGMLQuantizationType.I64: (1, 8),
GGMLQuantizationType.F64: (1, 8), GGMLQuantizationType.F64: (1, 8),
} }
# Aliases for backward compatibility. # Aliases for backward compatibility.
# general # general
KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE
KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION
KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT
KEY_GENERAL_NAME = Keys.General.NAME KEY_GENERAL_NAME = Keys.General.NAME
KEY_GENERAL_AUTHOR = Keys.General.AUTHOR KEY_GENERAL_AUTHOR = Keys.General.AUTHOR
KEY_GENERAL_URL = Keys.General.URL KEY_GENERAL_URL = Keys.General.URL
KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION KEY_GENERAL_DESCRIPTION = Keys.General.DESCRIPTION
KEY_GENERAL_LICENSE = Keys.General.LICENSE KEY_GENERAL_LICENSE = Keys.General.LICENSE
KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL
KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO
KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE
# LLM # LLM
KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE
KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH
KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH
KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT
KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH KEY_FEED_FORWARD_LENGTH = Keys.LLM.FEED_FORWARD_LENGTH
KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT
# attention # attention
KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT
KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV
KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS
KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV KEY_ATTENTION_CLAMP_KQV = Keys.Attention.CLAMP_KQV
KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS
KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS
# RoPE # RoPE
KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT
KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE
KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE
KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR KEY_ROPE_SCALING_FACTOR = Keys.Rope.SCALING_FACTOR
KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN
KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED
# SSM # SSM
KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
# tokenization # tokenization
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST
KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE
KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES
KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV

View file

@ -17,43 +17,38 @@ class TensorNameMap:
"tok_embeddings", # llama-pth "tok_embeddings", # llama-pth
"embeddings.word_embeddings", # bert nomic-bert "embeddings.word_embeddings", # bert nomic-bert
"language_model.embedding.word_embeddings", # persimmon "language_model.embedding.word_embeddings", # persimmon
"wte", # gpt2 "wte", # gpt2
"transformer.embd.wte", # phi2 "transformer.embd.wte", # phi2
"model.tok_embeddings", # internlm2 "model.tok_embeddings", # internlm2
"model.embedding", # mamba-qbert "model.embedding", # mamba-qbert
"backbone.embedding", # mamba "backbone.embedding", # mamba
"backbone.embeddings", # mamba-hf "backbone.embeddings", # mamba-hf
"transformer.in_out_embed", # Grok "transformer.in_out_embed", # Grok
), ),
# Token type embeddings # Token type embeddings
MODEL_TENSOR.TOKEN_TYPES: ( MODEL_TENSOR.TOKEN_TYPES: (
"embeddings.token_type_embeddings", # bert nomic-bert "embeddings.token_type_embeddings", # bert nomic-bert
), ),
# Normalization of token embeddings # Normalization of token embeddings
MODEL_TENSOR.TOKEN_EMBD_NORM: ( MODEL_TENSOR.TOKEN_EMBD_NORM: (
"word_embeddings_layernorm", # bloom "word_embeddings_layernorm", # bloom
"embeddings.LayerNorm", # bert "embeddings.LayerNorm", # bert
"emb_ln", # nomic-bert "emb_ln", # nomic-bert
), ),
# Position embeddings # Position embeddings
MODEL_TENSOR.POS_EMBD: ( MODEL_TENSOR.POS_EMBD: (
"transformer.wpe", # gpt2 "transformer.wpe", # gpt2
"embeddings.position_embeddings", # bert "embeddings.position_embeddings", # bert
"wpe", # gpt2 "wpe", # gpt2
), ),
# Output # Output
MODEL_TENSOR.OUTPUT: ( MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox "embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx
"output", # llama-pth bloom internlm2 "output", # llama-pth bloom internlm2
"word_embeddings_for_head", # persimmon "word_embeddings_for_head", # persimmon
"lm_head.linear", # phi2 "lm_head.linear", # phi2
), ),
# Output norm # Output norm
MODEL_TENSOR.OUTPUT_NORM: ( MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox "gpt_neox.final_layer_norm", # gptneox
@ -63,30 +58,27 @@ class TensorNameMap:
"transformer.norm_f", # mpt dbrx "transformer.norm_f", # mpt dbrx
"ln_f", # refact bloom qwen gpt2 "ln_f", # refact bloom qwen gpt2
"language_model.encoder.final_layernorm", # persimmon "language_model.encoder.final_layernorm", # persimmon
"model.final_layernorm", # persimmon "model.final_layernorm", # persimmon
"lm_head.ln", # phi2 "lm_head.ln", # phi2
"model.norm_f", # mamba-qbert "model.norm_f", # mamba-qbert
"backbone.norm_f", # mamba "backbone.norm_f", # mamba
"transformer.rms_norm", # Grok "transformer.rms_norm", # Grok
), ),
# Rope frequencies # Rope frequencies
MODEL_TENSOR.ROPE_FREQS: ( MODEL_TENSOR.ROPE_FREQS: ("rope.freqs",), # llama-pth
"rope.freqs", # llama-pth
),
} }
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
# Attention norm # Attention norm
MODEL_TENSOR.ATTN_NORM: ( MODEL_TENSOR.ATTN_NORM: (
"gpt_neox.layers.{bid}.input_layernorm", # gptneox "gpt_neox.layers.{bid}.input_layernorm", # gptneox
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen
"transformer.blocks.{bid}.norm_1", # mpt "transformer.blocks.{bid}.norm_1", # mpt
"transformer.h.{bid}.input_layernorm", # falcon7b "transformer.h.{bid}.input_layernorm", # falcon7b
"h.{bid}.input_layernorm", # bloom "h.{bid}.input_layernorm", # bloom
"transformer.h.{bid}.ln_mlp", # falcon40b "transformer.h.{bid}.ln_mlp", # falcon40b
"model.layers.{bid}.input_layernorm", # llama-hf "model.layers.{bid}.input_layernorm", # llama-hf
"layers.{bid}.attention_norm", # llama-pth "layers.{bid}.attention_norm", # llama-pth
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon "language_model.encoder.layers.{bid}.input_layernorm", # persimmon
"model.layers.{bid}.ln1", # yi "model.layers.{bid}.ln1", # yi
"h.{bid}.ln_1", # gpt2 "h.{bid}.ln_1", # gpt2
@ -98,12 +90,8 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.rms_norm", # Grok "transformer.decoder_layer.{bid}.rms_norm", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
), ),
# Attention norm 2 # Attention norm 2
MODEL_TENSOR.ATTN_NORM_2: ( MODEL_TENSOR.ATTN_NORM_2: ("transformer.h.{bid}.ln_attn",), # falcon40b
"transformer.h.{bid}.ln_attn", # falcon40b
),
# Attention query-key-value # Attention query-key-value
MODEL_TENSOR.ATTN_QKV: ( MODEL_TENSOR.ATTN_QKV: (
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
@ -113,45 +101,41 @@ class TensorNameMap:
"transformer.h.{bid}.self_attention.query_key_value", # falcon "transformer.h.{bid}.self_attention.query_key_value", # falcon
"h.{bid}.self_attention.query_key_value", # bloom "h.{bid}.self_attention.query_key_value", # bloom
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
"model.layers.{bid}.self_attn.query_key_value", # persimmon "model.layers.{bid}.self_attn.query_key_value", # persimmon
"h.{bid}.attn.c_attn", # gpt2 "h.{bid}.attn.c_attn", # gpt2
"transformer.h.{bid}.mixer.Wqkv", # phi2 "transformer.h.{bid}.mixer.Wqkv", # phi2
"encoder.layers.{bid}.attn.Wqkv", # nomic-bert "encoder.layers.{bid}.attn.Wqkv", # nomic-bert
), ),
# Attention query # Attention query
MODEL_TENSOR.ATTN_Q: ( MODEL_TENSOR.ATTN_Q: (
"model.layers.{bid}.self_attn.q_proj", # llama-hf "model.layers.{bid}.self_attn.q_proj", # llama-hf
"layers.{bid}.attention.wq", # llama-pth "layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert "encoder.layer.{bid}.attention.self.query", # bert
"transformer.h.{bid}.attn.q_proj", # gpt-j "transformer.h.{bid}.attn.q_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.q_proj", # plamo "model.layers.layers.{bid}.self_attn.q_proj", # plamo
"model.layers.{bid}.attention.wq", # internlm2 "model.layers.{bid}.attention.wq", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok
), ),
# Attention key # Attention key
MODEL_TENSOR.ATTN_K: ( MODEL_TENSOR.ATTN_K: (
"model.layers.{bid}.self_attn.k_proj", # llama-hf "model.layers.{bid}.self_attn.k_proj", # llama-hf
"layers.{bid}.attention.wk", # llama-pth "layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert "encoder.layer.{bid}.attention.self.key", # bert
"transformer.h.{bid}.attn.k_proj", # gpt-j "transformer.h.{bid}.attn.k_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.k_proj", # plamo "model.layers.layers.{bid}.self_attn.k_proj", # plamo
"model.layers.{bid}.attention.wk", # internlm2 "model.layers.{bid}.attention.wk", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok
), ),
# Attention value # Attention value
MODEL_TENSOR.ATTN_V: ( MODEL_TENSOR.ATTN_V: (
"model.layers.{bid}.self_attn.v_proj", # llama-hf "model.layers.{bid}.self_attn.v_proj", # llama-hf
"layers.{bid}.attention.wv", # llama-pth "layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert "encoder.layer.{bid}.attention.self.value", # bert
"transformer.h.{bid}.attn.v_proj", # gpt-j "transformer.h.{bid}.attn.v_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.v_proj", # plamo "model.layers.layers.{bid}.self_attn.v_proj", # plamo
"model.layers.{bid}.attention.wv", # internlm2 "model.layers.{bid}.attention.wv", # internlm2
"transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok
), ),
# Attention output # Attention output
MODEL_TENSOR.ATTN_OUT: ( MODEL_TENSOR.ATTN_OUT: (
"gpt_neox.layers.{bid}.attention.dense", # gptneox "gpt_neox.layers.{bid}.attention.dense", # gptneox
@ -172,8 +156,7 @@ class TensorNameMap:
"encoder.layers.{bid}.attn.out_proj", # nomic-bert "encoder.layers.{bid}.attn.out_proj", # nomic-bert
"transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx
), ),
# Attention output norm # Attention output norm
MODEL_TENSOR.ATTN_OUT_NORM: ( MODEL_TENSOR.ATTN_OUT_NORM: (
"encoder.layer.{bid}.attention.output.LayerNorm", # bert "encoder.layer.{bid}.attention.output.LayerNorm", # bert
@ -181,169 +164,186 @@ class TensorNameMap:
"transformer.decoder_layer.{bid}.rms_norm_1", # Grok "transformer.decoder_layer.{bid}.rms_norm_1", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
), ),
# Rotary embeddings # Rotary embeddings
MODEL_TENSOR.ATTN_ROT_EMBD: ( MODEL_TENSOR.ATTN_ROT_EMBD: (
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
"transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell
), ),
# Feed-forward norm # Feed-forward norm
MODEL_TENSOR.FFN_NORM: ( MODEL_TENSOR.FFN_NORM: (
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
"transformer.h.{bid}.ln_2", # gpt2 refact qwen "transformer.h.{bid}.ln_2", # gpt2 refact qwen
"h.{bid}.post_attention_layernorm", # bloom "h.{bid}.post_attention_layernorm", # bloom
"transformer.blocks.{bid}.norm_2", # mpt "transformer.blocks.{bid}.norm_2", # mpt
"model.layers.{bid}.post_attention_layernorm", # llama-hf "model.layers.{bid}.post_attention_layernorm", # llama-hf
"layers.{bid}.ffn_norm", # llama-pth "layers.{bid}.ffn_norm", # llama-pth
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
"model.layers.{bid}.ln2", # yi "model.layers.{bid}.ln2", # yi
"h.{bid}.ln_2", # gpt2 "h.{bid}.ln_2", # gpt2
"model.layers.{bid}.ffn_norm", # internlm2 "model.layers.{bid}.ffn_norm", # internlm2
"transformer.decoder_layer.{bid}.rms_norm_2", # Grok "transformer.decoder_layer.{bid}.rms_norm_2", # Grok
), ),
MODEL_TENSOR.FFN_GATE_INP: ( MODEL_TENSOR.FFN_GATE_INP: (
"layers.{bid}.feed_forward.gate", # mixtral "layers.{bid}.feed_forward.gate", # mixtral
"model.layers.{bid}.block_sparse_moe.gate", # mixtral "model.layers.{bid}.block_sparse_moe.gate", # mixtral
"transformer.decoder_layer.{bid}.router", # Grok "transformer.decoder_layer.{bid}.router", # Grok
"transformer.blocks.{bid}.ffn.router.layer", # dbrx "transformer.blocks.{bid}.ffn.router.layer", # dbrx
), ),
# Feed-forward up # Feed-forward up
MODEL_TENSOR.FFN_UP: ( MODEL_TENSOR.FFN_UP: (
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
"transformer.h.{bid}.mlp.c_fc", # gpt2 "transformer.h.{bid}.mlp.c_fc", # gpt2
"transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.blocks.{bid}.ffn.up_proj", # mpt
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
"h.{bid}.mlp.dense_h_to_4h", # bloom "h.{bid}.mlp.dense_h_to_4h", # bloom
"model.layers.{bid}.mlp.up_proj", # llama-hf refact "model.layers.{bid}.mlp.up_proj", # llama-hf refact
"layers.{bid}.feed_forward.w3", # llama-pth "layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", # bert "encoder.layer.{bid}.intermediate.dense", # bert
"transformer.h.{bid}.mlp.fc_in", # gpt-j "transformer.h.{bid}.mlp.fc_in", # gpt-j
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
"model.layers.{bid}.mlp.dense_h_to_4h", # persimmon "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon
"transformer.h.{bid}.mlp.w1", # qwen "transformer.h.{bid}.mlp.w1", # qwen
"h.{bid}.mlp.c_fc", # gpt2 "h.{bid}.mlp.c_fc", # gpt2
"transformer.h.{bid}.mlp.fc1", # phi2 "transformer.h.{bid}.mlp.fc1", # phi2
"model.layers.{bid}.mlp.fc1", # phi2 "model.layers.{bid}.mlp.fc1", # phi2
"model.layers.layers.{bid}.mlp.up_proj", # plamo "model.layers.layers.{bid}.mlp.up_proj", # plamo
"model.layers.{bid}.feed_forward.w3", # internlm2 "model.layers.{bid}.feed_forward.w3", # internlm2
"encoder.layers.{bid}.mlp.fc11", # nomic-bert "encoder.layers.{bid}.mlp.fc11", # nomic-bert
"model.layers.{bid}.mlp.c_fc", # starcoder2 "model.layers.{bid}.mlp.c_fc", # starcoder2
), ),
MODEL_TENSOR.FFN_UP_EXP: ( MODEL_TENSOR.FFN_UP_EXP: (
"layers.{bid}.feed_forward.experts.w3", # mixtral (merged) "layers.{bid}.feed_forward.experts.w3", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx
), ),
# AWQ-activation gate # AWQ-activation gate
MODEL_TENSOR.FFN_ACT: ( MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # mpt
"transformer.blocks.{bid}.ffn.act", # mpt
),
# Feed-forward gate # Feed-forward gate
MODEL_TENSOR.FFN_GATE: ( MODEL_TENSOR.FFN_GATE: (
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact "model.layers.{bid}.mlp.gate_proj", # llama-hf refact
"layers.{bid}.feed_forward.w1", # llama-pth "layers.{bid}.feed_forward.w1", # llama-pth
"transformer.h.{bid}.mlp.w2", # qwen "transformer.h.{bid}.mlp.w2", # qwen
"model.layers.layers.{bid}.mlp.gate_proj", # plamo "model.layers.layers.{bid}.mlp.gate_proj", # plamo
"model.layers.{bid}.feed_forward.w1", # internlm2 "model.layers.{bid}.feed_forward.w1", # internlm2
"encoder.layers.{bid}.mlp.fc12", # nomic-bert "encoder.layers.{bid}.mlp.fc12", # nomic-bert
), ),
MODEL_TENSOR.FFN_GATE_EXP: ( MODEL_TENSOR.FFN_GATE_EXP: (
"layers.{bid}.feed_forward.experts.w1", # mixtral (merged) "layers.{bid}.feed_forward.experts.w1", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx
), ),
# Feed-forward down # Feed-forward down
MODEL_TENSOR.FFN_DOWN: ( MODEL_TENSOR.FFN_DOWN: (
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen
"transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.blocks.{bid}.ffn.down_proj", # mpt
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
"h.{bid}.mlp.dense_4h_to_h", # bloom "h.{bid}.mlp.dense_4h_to_h", # bloom
"model.layers.{bid}.mlp.down_proj", # llama-hf "model.layers.{bid}.mlp.down_proj", # llama-hf
"layers.{bid}.feed_forward.w2", # llama-pth "layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", # bert "encoder.layer.{bid}.output.dense", # bert
"transformer.h.{bid}.mlp.fc_out", # gpt-j "transformer.h.{bid}.mlp.fc_out", # gpt-j
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
"model.layers.{bid}.mlp.dense_4h_to_h", # persimmon "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon
"h.{bid}.mlp.c_proj", # gpt2 "h.{bid}.mlp.c_proj", # gpt2
"transformer.h.{bid}.mlp.fc2", # phi2 "transformer.h.{bid}.mlp.fc2", # phi2
"model.layers.{bid}.mlp.fc2", # phi2 "model.layers.{bid}.mlp.fc2", # phi2
"model.layers.layers.{bid}.mlp.down_proj", # plamo "model.layers.layers.{bid}.mlp.down_proj", # plamo
"model.layers.{bid}.feed_forward.w2", # internlm2 "model.layers.{bid}.feed_forward.w2", # internlm2
"encoder.layers.{bid}.mlp.fc2", # nomic-bert "encoder.layers.{bid}.mlp.fc2", # nomic-bert
"model.layers.{bid}.mlp.c_proj", # starcoder2 "model.layers.{bid}.mlp.c_proj", # starcoder2
), ),
MODEL_TENSOR.FFN_DOWN_EXP: ( MODEL_TENSOR.FFN_DOWN_EXP: (
"layers.{bid}.feed_forward.experts.w2", # mixtral (merged) "layers.{bid}.feed_forward.experts.w2", # mixtral (merged)
"transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged)
"transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx
), ),
MODEL_TENSOR.ATTN_Q_NORM: ( MODEL_TENSOR.ATTN_Q_NORM: (
"language_model.encoder.layers.{bid}.self_attention.q_layernorm", "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
"model.layers.{bid}.self_attn.q_layernorm", # persimmon "model.layers.{bid}.self_attn.q_layernorm", # persimmon
"model.layers.{bid}.self_attn.q_norm", # cohere "model.layers.{bid}.self_attn.q_layernorm.norms.0", # stablelm
"transformer.blocks.{bid}.attn.q_ln", # sea-lion "model.layers.{bid}.self_attn.q_layernorm.norms.1", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.2", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.3", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.4", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.5", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.6", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.7", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.8", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.9", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.10", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.11", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.12", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.13", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.14", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.15", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.16", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.17", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.18", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.19", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.20", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.21", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.22", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.23", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.24", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.25", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.26", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.27", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.28", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.29", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.30", # stablelm
"model.layers.{bid}.self_attn.q_layernorm.norms.31", # stablelm
"model.layers.{bid}.self_attn.q_norm", # cohere
"transformer.blocks.{bid}.attn.q_ln", # sea-lion
), ),
MODEL_TENSOR.ATTN_K_NORM: ( MODEL_TENSOR.ATTN_K_NORM: (
"language_model.encoder.layers.{bid}.self_attention.k_layernorm", "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
"model.layers.{bid}.self_attn.k_layernorm", # persimmon "model.layers.{bid}.self_attn.k_layernorm", # persimmon
"model.layers.{bid}.self_attn.k_norm", # cohere "model.layers.{bid}.self_attn.k_layernorm.norms.0", # stablelm
"transformer.blocks.{bid}.attn.k_ln", # sea-lion "model.layers.{bid}.self_attn.k_layernorm.norms.1", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.2", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.3", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.4", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.5", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.6", # stablelm
"model.layers.{bid}.self_attn.k_layernorm.norms.7", # stablelm
"model.layers.{bid}.self_attn.k_norm", # cohere
"transformer.blocks.{bid}.attn.k_ln", # sea-lion
), ),
MODEL_TENSOR.ROPE_FREQS: ( MODEL_TENSOR.ROPE_FREQS: (
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
), ),
MODEL_TENSOR.LAYER_OUT_NORM: ( MODEL_TENSOR.LAYER_OUT_NORM: (
"encoder.layer.{bid}.output.LayerNorm", # bert "encoder.layer.{bid}.output.LayerNorm", # bert
"encoder.layers.{bid}.norm2", # nomic-bert "encoder.layers.{bid}.norm2", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok "transformer.decoder_layer.{bid}.rms_norm_3", # Grok
), ),
MODEL_TENSOR.SSM_IN: ( MODEL_TENSOR.SSM_IN: (
"model.layers.{bid}.in_proj", "model.layers.{bid}.in_proj",
"backbone.layers.{bid}.mixer.in_proj", "backbone.layers.{bid}.mixer.in_proj",
), ),
MODEL_TENSOR.SSM_CONV1D: ( MODEL_TENSOR.SSM_CONV1D: (
"model.layers.{bid}.conv1d", "model.layers.{bid}.conv1d",
"backbone.layers.{bid}.mixer.conv1d", "backbone.layers.{bid}.mixer.conv1d",
), ),
MODEL_TENSOR.SSM_X: ( MODEL_TENSOR.SSM_X: (
"model.layers.{bid}.x_proj", "model.layers.{bid}.x_proj",
"backbone.layers.{bid}.mixer.x_proj", "backbone.layers.{bid}.mixer.x_proj",
), ),
MODEL_TENSOR.SSM_DT: ( MODEL_TENSOR.SSM_DT: (
"model.layers.{bid}.dt_proj", "model.layers.{bid}.dt_proj",
"backbone.layers.{bid}.mixer.dt_proj", "backbone.layers.{bid}.mixer.dt_proj",
), ),
MODEL_TENSOR.SSM_A: ( MODEL_TENSOR.SSM_A: (
"model.layers.{bid}.A_log", "model.layers.{bid}.A_log",
"backbone.layers.{bid}.mixer.A_log", "backbone.layers.{bid}.mixer.A_log",
), ),
MODEL_TENSOR.SSM_D: ( MODEL_TENSOR.SSM_D: (
"model.layers.{bid}.D", "model.layers.{bid}.D",
"backbone.layers.{bid}.mixer.D", "backbone.layers.{bid}.mixer.D",
), ),
MODEL_TENSOR.SSM_OUT: ( MODEL_TENSOR.SSM_OUT: (
"model.layers.{bid}.out_proj", "model.layers.{bid}.out_proj",
"backbone.layers.{bid}.mixer.out_proj", "backbone.layers.{bid}.mixer.out_proj",
@ -368,31 +368,35 @@ class TensorNameMap:
# TODO: make this configurable # TODO: make this configurable
n_experts = 8 n_experts = 8
for xid in range(n_experts): for xid in range(n_experts):
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) tensor_name = TENSOR_NAMES[tensor].format(bid=bid, xid=xid)
self.mapping[tensor_name] = (tensor, tensor_name) self.mapping[tensor_name] = (tensor, tensor_name)
for key in keys: for key in keys:
key = key.format(bid = bid, xid = xid) key = key.format(bid=bid, xid=xid)
self.mapping[key] = (tensor, tensor_name) self.mapping[key] = (tensor, tensor_name)
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None: def get_type_and_name(
self, key: str, try_suffixes: Sequence[str] = ()
) -> tuple[MODEL_TENSOR, str] | None:
result = self.mapping.get(key) result = self.mapping.get(key)
if result is not None: if result is not None:
return result return result
for suffix in try_suffixes: for suffix in try_suffixes:
if key.endswith(suffix): if key.endswith(suffix):
result = self.mapping.get(key[:-len(suffix)]) result = self.mapping.get(key[: -len(suffix)])
if result is not None: if result is not None:
return result[0], result[1] + suffix return result[0], result[1] + suffix
return None return None
def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None: def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
result = self.get_type_and_name(key, try_suffixes = try_suffixes) result = self.get_type_and_name(key, try_suffixes=try_suffixes)
if result is None: if result is None:
return None return None
return result[1] return result[1]
def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None: def get_type(
result = self.get_type_and_name(key, try_suffixes = try_suffixes) self, key: str, try_suffixes: Sequence[str] = ()
) -> MODEL_TENSOR | None:
result = self.get_type_and_name(key, try_suffixes=try_suffixes)
if result is None: if result is None:
return None return None
return result[0] return result[0]