Merge branch 'master' into compilade/imatrix-batched-chunks
This commit is contained in:
commit
3ad0603c65
165 changed files with 22092 additions and 13591 deletions
|
@ -94,6 +94,9 @@ class Keys:
|
|||
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
||||
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
|
||||
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
|
||||
RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
|
||||
TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim"
|
||||
TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "{arch}.attention.head_count"
|
||||
|
@ -130,6 +133,10 @@ class Keys:
|
|||
INNER_SIZE = "{arch}.ssm.inner_size"
|
||||
STATE_SIZE = "{arch}.ssm.state_size"
|
||||
TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
|
||||
DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"
|
||||
|
||||
class WKV:
|
||||
HEAD_SIZE = "{arch}.wkv.head_size"
|
||||
|
||||
class Tokenizer:
|
||||
MODEL = "tokenizer.ggml.model"
|
||||
|
@ -213,6 +220,7 @@ class MODEL_ARCH(IntEnum):
|
|||
GEMMA = auto()
|
||||
GEMMA2 = auto()
|
||||
STARCODER2 = auto()
|
||||
RWKV6 = auto()
|
||||
MAMBA = auto()
|
||||
XVERSE = auto()
|
||||
COMMAND_R = auto()
|
||||
|
@ -276,6 +284,29 @@ class MODEL_TENSOR(IntEnum):
|
|||
SSM_A = auto()
|
||||
SSM_D = auto()
|
||||
SSM_OUT = auto()
|
||||
TIME_MIX_W1 = auto()
|
||||
TIME_MIX_W2 = auto()
|
||||
TIME_MIX_LERP_X = auto()
|
||||
TIME_MIX_LERP_K = auto()
|
||||
TIME_MIX_LERP_V = auto()
|
||||
TIME_MIX_LERP_R = auto()
|
||||
TIME_MIX_LERP_G = auto()
|
||||
TIME_MIX_LERP_W = auto()
|
||||
TIME_MIX_FIRST = auto()
|
||||
TIME_MIX_DECAY = auto()
|
||||
TIME_MIX_DECAY_W1 = auto()
|
||||
TIME_MIX_DECAY_W2 = auto()
|
||||
TIME_MIX_KEY = auto()
|
||||
TIME_MIX_VALUE = auto()
|
||||
TIME_MIX_RECEPTANCE = auto()
|
||||
TIME_MIX_GATE = auto()
|
||||
TIME_MIX_LN = auto()
|
||||
TIME_MIX_OUTPUT = auto()
|
||||
CHANNEL_MIX_LERP_K = auto()
|
||||
CHANNEL_MIX_LERP_R = auto()
|
||||
CHANNEL_MIX_KEY = auto()
|
||||
CHANNEL_MIX_RECEPTANCE = auto()
|
||||
CHANNEL_MIX_VALUE = auto()
|
||||
ATTN_Q_A = auto()
|
||||
ATTN_Q_B = auto()
|
||||
ATTN_KV_A_MQA = auto()
|
||||
|
@ -343,6 +374,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|||
MODEL_ARCH.GEMMA: "gemma",
|
||||
MODEL_ARCH.GEMMA2: "gemma2",
|
||||
MODEL_ARCH.STARCODER2: "starcoder2",
|
||||
MODEL_ARCH.RWKV6: "rwkv6",
|
||||
MODEL_ARCH.MAMBA: "mamba",
|
||||
MODEL_ARCH.XVERSE: "xverse",
|
||||
MODEL_ARCH.COMMAND_R: "command-r",
|
||||
|
@ -361,87 +393,110 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
|
||||
MODEL_TENSOR.TOKEN_TYPES: "token_types",
|
||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
|
||||
MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
||||
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
||||
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
|
||||
MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
|
||||
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
|
||||
MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
|
||||
MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
|
||||
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
||||
MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
|
||||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
|
||||
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
|
||||
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
|
||||
MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
|
||||
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
|
||||
MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
|
||||
MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
|
||||
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
|
||||
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
|
||||
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
|
||||
MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
|
||||
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
|
||||
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
|
||||
MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
|
||||
MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
|
||||
MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
|
||||
MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
|
||||
MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
|
||||
MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
|
||||
MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
|
||||
MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
|
||||
MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
|
||||
MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
|
||||
MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
|
||||
MODEL_TENSOR.TOKEN_TYPES: "token_types",
|
||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
|
||||
MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
||||
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
||||
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
|
||||
MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
|
||||
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
|
||||
MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
|
||||
MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
|
||||
MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
|
||||
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
||||
MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
|
||||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
|
||||
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
|
||||
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
|
||||
MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
|
||||
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
|
||||
MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
|
||||
MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
|
||||
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
|
||||
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
|
||||
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
|
||||
MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
|
||||
MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
|
||||
MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x",
|
||||
MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k",
|
||||
MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v",
|
||||
MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r",
|
||||
MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g",
|
||||
MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w",
|
||||
MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first",
|
||||
MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay",
|
||||
MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1",
|
||||
MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2",
|
||||
MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key",
|
||||
MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value",
|
||||
MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance",
|
||||
MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate",
|
||||
MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln",
|
||||
MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output",
|
||||
MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k",
|
||||
MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r",
|
||||
MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key",
|
||||
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance",
|
||||
MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value",
|
||||
MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
|
||||
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
|
||||
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
|
||||
MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
|
||||
MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
|
||||
MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
|
||||
MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
|
||||
MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
|
||||
MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
|
||||
MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
|
||||
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
|
||||
MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
|
||||
MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
|
||||
MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
|
||||
MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
|
||||
}
|
||||
|
||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
|
@ -862,6 +917,37 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.RWKV6: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM_2,
|
||||
MODEL_TENSOR.TIME_MIX_W1,
|
||||
MODEL_TENSOR.TIME_MIX_W2,
|
||||
MODEL_TENSOR.TIME_MIX_LERP_X,
|
||||
MODEL_TENSOR.TIME_MIX_LERP_K,
|
||||
MODEL_TENSOR.TIME_MIX_LERP_V,
|
||||
MODEL_TENSOR.TIME_MIX_LERP_R,
|
||||
MODEL_TENSOR.TIME_MIX_LERP_G,
|
||||
MODEL_TENSOR.TIME_MIX_LERP_W,
|
||||
MODEL_TENSOR.TIME_MIX_FIRST,
|
||||
MODEL_TENSOR.TIME_MIX_DECAY,
|
||||
MODEL_TENSOR.TIME_MIX_DECAY_W1,
|
||||
MODEL_TENSOR.TIME_MIX_DECAY_W2,
|
||||
MODEL_TENSOR.TIME_MIX_KEY,
|
||||
MODEL_TENSOR.TIME_MIX_VALUE,
|
||||
MODEL_TENSOR.TIME_MIX_RECEPTANCE,
|
||||
MODEL_TENSOR.TIME_MIX_GATE,
|
||||
MODEL_TENSOR.TIME_MIX_LN,
|
||||
MODEL_TENSOR.TIME_MIX_OUTPUT,
|
||||
MODEL_TENSOR.CHANNEL_MIX_LERP_K,
|
||||
MODEL_TENSOR.CHANNEL_MIX_LERP_R,
|
||||
MODEL_TENSOR.CHANNEL_MIX_KEY,
|
||||
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE,
|
||||
MODEL_TENSOR.CHANNEL_MIX_VALUE,
|
||||
],
|
||||
MODEL_ARCH.MAMBA: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
@ -1212,6 +1298,8 @@ class GGMLQuantizationType(IntEnum):
|
|||
Q4_0_4_4 = 31
|
||||
Q4_0_4_8 = 32
|
||||
Q4_0_8_8 = 33
|
||||
TQ1_0 = 34
|
||||
TQ2_0 = 35
|
||||
|
||||
|
||||
# TODO: add GGMLFileType from ggml_ftype in ggml.h
|
||||
|
@ -1256,6 +1344,8 @@ class LlamaFileType(IntEnum):
|
|||
MOSTLY_Q4_0_4_4 = 33 # except 1d tensors
|
||||
MOSTLY_Q4_0_4_8 = 34 # except 1d tensors
|
||||
MOSTLY_Q4_0_8_8 = 35 # except 1d tensors
|
||||
MOSTLY_TQ1_0 = 36 # except 1d tensors
|
||||
MOSTLY_TQ2_0 = 37 # except 1d tensors
|
||||
|
||||
GUESSED = 1024 # not specified in the model file
|
||||
|
||||
|
@ -1332,6 +1422,8 @@ GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
|||
GGMLQuantizationType.Q4_0_4_4:(32, 2 + 16),
|
||||
GGMLQuantizationType.Q4_0_4_8:(32, 2 + 16),
|
||||
GGMLQuantizationType.Q4_0_8_8:(32, 2 + 16),
|
||||
GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
|
||||
GGMLQuantizationType.TQ2_0: (256, 2 + 64),
|
||||
}
|
||||
|
||||
|
||||
|
@ -1379,6 +1471,7 @@ KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
|
|||
KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
|
||||
KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
|
||||
KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
|
||||
KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS
|
||||
|
||||
# tokenization
|
||||
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
|
||||
|
|
|
@ -670,6 +670,18 @@ class GGUFWriter:
|
|||
def add_expert_weights_scale(self, value: float) -> None:
|
||||
self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value)
|
||||
|
||||
def add_rescale_every_n_layers(self, count: int) -> None:
|
||||
self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count)
|
||||
|
||||
def add_time_mix_extra_dim(self, dim: int) -> None:
|
||||
self.add_uint32(Keys.LLM.TIME_MIX_EXTRA_DIM.format(arch=self.arch), dim)
|
||||
|
||||
def add_time_decay_extra_dim(self, dim: int) -> None:
|
||||
self.add_uint32(Keys.LLM.TIME_DECAY_EXTRA_DIM.format(arch=self.arch), dim)
|
||||
|
||||
def add_wkv_head_size(self, size: int) -> None:
|
||||
self.add_uint32(Keys.WKV.HEAD_SIZE.format(arch=self.arch), size)
|
||||
|
||||
def add_layer_norm_eps(self, value: float) -> None:
|
||||
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)
|
||||
|
||||
|
@ -730,6 +742,9 @@ class GGUFWriter:
|
|||
def add_ssm_time_step_rank(self, value: int) -> None:
|
||||
self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value)
|
||||
|
||||
def add_ssm_dt_b_c_rms(self, value: bool) -> None:
|
||||
self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)
|
||||
|
||||
def add_tokenizer_model(self, model: str) -> None:
|
||||
self.add_string(Keys.Tokenizer.MODEL, model)
|
||||
|
||||
|
|
|
@ -574,6 +574,87 @@ class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K):
|
|||
return (d * q).reshape((n_blocks, QK_K))
|
||||
|
||||
|
||||
class TQ1_0(__Quant, qtype=GGMLQuantizationType.TQ1_0):
|
||||
@classmethod
|
||||
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||
n_blocks = blocks.shape[0]
|
||||
|
||||
d = abs(blocks).max(axis=-1, keepdims=True)
|
||||
with np.errstate(divide="ignore"):
|
||||
id = np.where(d == 0, 0, 1 / d)
|
||||
qs = np_roundf(blocks * id)
|
||||
qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8)
|
||||
|
||||
qs0, qs1, qh = qs[..., :(32 * 5)], qs[..., (32 * 5):(48 * 5)], qs[..., (48 * 5):]
|
||||
qs0 = qs0.reshape((n_blocks, -1, 5, 32)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
|
||||
qs0 = np.sum(qs0, axis=-2).reshape((n_blocks, -1))
|
||||
qs1 = qs1.reshape((n_blocks, -1, 5, 16)) * np.array([81, 27, 9, 3, 1], dtype=np.uint8).reshape((1, 1, 5, 1))
|
||||
qs1 = np.sum(qs1, axis=-2).reshape((n_blocks, -1))
|
||||
qh = qh.reshape((n_blocks, -1, 4, 4)) * np.array([81, 27, 9, 3], dtype=np.uint8).reshape((1, 1, 4, 1))
|
||||
qh = np.sum(qh, axis=-2).reshape((n_blocks, -1))
|
||||
qs = np.concatenate([qs0, qs1, qh], axis=-1)
|
||||
qs = (qs.astype(np.uint16) * 256 + (243 - 1)) // 243
|
||||
|
||||
qs = qs.astype(np.uint8)
|
||||
d = d.astype(np.float16).view(np.uint8)
|
||||
|
||||
return np.concatenate([qs, d], axis=-1)
|
||||
|
||||
@classmethod
|
||||
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||
n_blocks = blocks.shape[0]
|
||||
|
||||
qs, rest = np.hsplit(blocks, [(QK_K - 4 * QK_K // 64) // 5])
|
||||
qh, d = np.hsplit(rest, [QK_K // 64])
|
||||
|
||||
d = d.view(np.float16).astype(np.float32)
|
||||
|
||||
qs0, qs1 = qs[..., :32], qs[..., 32:]
|
||||
qs0 = qs0.reshape((n_blocks, -1, 1, 32)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1))
|
||||
qs0 = qs0.reshape((n_blocks, -1))
|
||||
qs1 = qs1.reshape((n_blocks, -1, 1, 16)) * np.array([1, 3, 9, 27, 81], dtype=np.uint8).reshape((1, 1, 5, 1))
|
||||
qs1 = qs1.reshape((n_blocks, -1))
|
||||
qh = qh.reshape((n_blocks, -1, 1, 4)) * np.array([1, 3, 9, 27], dtype=np.uint8).reshape((1, 1, 4, 1))
|
||||
qh = qh.reshape((n_blocks, -1))
|
||||
qs = np.concatenate([qs0, qs1, qh], axis=-1)
|
||||
qs = ((qs.astype(np.uint16) * 3) >> 8).astype(np.int8) - np.int8(1)
|
||||
|
||||
return (d * qs.astype(np.float32))
|
||||
|
||||
|
||||
class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0):
|
||||
@classmethod
|
||||
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||
n_blocks = blocks.shape[0]
|
||||
|
||||
d = abs(blocks).max(axis=-1, keepdims=True)
|
||||
with np.errstate(divide="ignore"):
|
||||
id = np.where(d == 0, 0, 1 / d)
|
||||
qs = np_roundf(blocks * id)
|
||||
qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8)
|
||||
|
||||
qs = qs.reshape((n_blocks, -1, 4, 32)) << np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
|
||||
qs = qs[..., 0, :] | qs[..., 1, :] | qs[..., 2, :] | qs[..., 3, :]
|
||||
qs = qs.reshape((n_blocks, -1))
|
||||
|
||||
d = d.astype(np.float16).view(np.uint8)
|
||||
|
||||
return np.concatenate([qs, d], axis=-1)
|
||||
|
||||
@classmethod
|
||||
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
|
||||
n_blocks = blocks.shape[0]
|
||||
|
||||
qs, d = np.hsplit(blocks, [QK_K // 4])
|
||||
|
||||
d = d.view(np.float16).astype(np.float32)
|
||||
|
||||
qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1))
|
||||
qs = (qs & 0x03).reshape((n_blocks, -1)).astype(np.int8) - np.int8(1)
|
||||
|
||||
return (d * qs.astype(np.float32))
|
||||
|
||||
|
||||
class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
|
||||
ksigns: bytes = (
|
||||
b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"
|
||||
|
|
|
@ -27,6 +27,7 @@ class TensorNameMap:
|
|||
"embedding.word_embeddings", # chatglm
|
||||
"transformer.token_embeddings", # openelm
|
||||
"shared", # t5
|
||||
"rwkv.embeddings", # rwkv
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
|
@ -40,6 +41,7 @@ class TensorNameMap:
|
|||
"embeddings.LayerNorm", # bert
|
||||
"emb_ln", # nomic-bert
|
||||
"transformer.norm", # openelm
|
||||
"rwkv.blocks.0.pre_ln", # rwkv
|
||||
),
|
||||
|
||||
# Position embeddings
|
||||
|
@ -57,6 +59,7 @@ class TensorNameMap:
|
|||
"word_embeddings_for_head", # persimmon
|
||||
"lm_head.linear", # phi2
|
||||
"output_layer", # chatglm
|
||||
"head", # rwkv
|
||||
),
|
||||
|
||||
# Output norm
|
||||
|
@ -76,6 +79,7 @@ class TensorNameMap:
|
|||
"encoder.final_layernorm", # chatglm
|
||||
"transformer.norm", # openelm
|
||||
"model.norm", # nemotron
|
||||
"rwkv.ln_out", # rwkv
|
||||
),
|
||||
|
||||
# Rope frequencies
|
||||
|
@ -108,12 +112,14 @@ class TensorNameMap:
|
|||
"transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx
|
||||
"encoder.layers.{bid}.input_layernorm", # chatglm
|
||||
"transformer.layers.{bid}.attn_norm", # openelm
|
||||
"rwkv.blocks.{bid}.ln1", # rwkv
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
MODEL_TENSOR.ATTN_NORM_2: (
|
||||
"transformer.h.{bid}.ln_attn", # falcon40b
|
||||
"transformer.h.{bid}.ln_attn", # falcon40b
|
||||
"encoder.layer.{bid}.layer_norm_1", # jina-v2-code
|
||||
"rwkv.blocks.{bid}.ln2", # rwkv
|
||||
),
|
||||
|
||||
# Attention query-key-value
|
||||
|
@ -434,6 +440,98 @@ class TensorNameMap:
|
|||
"backbone.layers.{bid}.mixer.out_proj",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_W1: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_W2: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_LERP_X: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_LERP_K: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_LERP_V: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_LERP_R: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_LERP_G: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_LERP_W: (
|
||||
"rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_FIRST: (
|
||||
"rwkv.blocks.{bid}.attention.time_faaaa", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_DECAY: (
|
||||
"rwkv.blocks.{bid}.attention.time_decay", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_DECAY_W1: (
|
||||
"rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_DECAY_W2: (
|
||||
"rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_KEY: (
|
||||
"rwkv.blocks.{bid}.attention.key", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_VALUE: (
|
||||
"rwkv.blocks.{bid}.attention.value", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_RECEPTANCE: (
|
||||
"rwkv.blocks.{bid}.attention.receptance", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_GATE: (
|
||||
"rwkv.blocks.{bid}.attention.gate", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_LN: (
|
||||
"rwkv.blocks.{bid}.attention.ln_x", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.TIME_MIX_OUTPUT: (
|
||||
"rwkv.blocks.{bid}.attention.output", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CHANNEL_MIX_LERP_K: (
|
||||
"rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CHANNEL_MIX_LERP_R: (
|
||||
"rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CHANNEL_MIX_KEY: (
|
||||
"rwkv.blocks.{bid}.feed_forward.key", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: (
|
||||
"rwkv.blocks.{bid}.feed_forward.receptance", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.CHANNEL_MIX_VALUE: (
|
||||
"rwkv.blocks.{bid}.feed_forward.value", # rwkv
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_Q_A: (
|
||||
"model.layers.{bid}.self_attn.q_a_proj", # deepseek2
|
||||
),
|
||||
|
|
|
@ -23,6 +23,7 @@ python = ">=3.8"
|
|||
numpy = ">=1.17"
|
||||
tqdm = ">=4.27"
|
||||
pyyaml = ">=5.1"
|
||||
sentencepiece = ">=0.1.98,<=0.2.0"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
pytest = "^5.2"
|
||||
|
|
|
@ -66,6 +66,7 @@ class GGMLQuants:
|
|||
for t in (
|
||||
"q4_0", "q4_1", "q5_0", "q5_1", "q8_0",
|
||||
"q2_K", "q3_K", "q4_K", "q5_K", "q6_K",
|
||||
"tq1_0", "tq2_0",
|
||||
"iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m",
|
||||
"iq4_nl", "iq4_xs",
|
||||
):
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue