llama : initial Mamba-2 support
This commit is contained in:
parent
a1631e53f6
commit
1f0fea70fb
7 changed files with 490 additions and 82 deletions
|
@ -130,6 +130,7 @@ class Keys:
|
|||
INNER_SIZE = "{arch}.ssm.inner_size"
|
||||
STATE_SIZE = "{arch}.ssm.state_size"
|
||||
TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
|
||||
GROUP_COUNT = "{arch}.ssm.group_count"
|
||||
DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"
|
||||
|
||||
class Tokenizer:
|
||||
|
@ -208,6 +209,7 @@ class MODEL_ARCH(IntEnum):
|
|||
GEMMA2 = auto()
|
||||
STARCODER2 = auto()
|
||||
MAMBA = auto()
|
||||
MAMBA2 = auto()
|
||||
XVERSE = auto()
|
||||
COMMAND_R = auto()
|
||||
DBRX = auto()
|
||||
|
@ -269,6 +271,7 @@ class MODEL_TENSOR(IntEnum):
|
|||
SSM_DT = auto()
|
||||
SSM_A = auto()
|
||||
SSM_D = auto()
|
||||
SSM_NORM = auto()
|
||||
SSM_OUT = auto()
|
||||
ATTN_Q_A = auto()
|
||||
ATTN_Q_B = auto()
|
||||
|
@ -338,6 +341,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|||
MODEL_ARCH.GEMMA2: "gemma2",
|
||||
MODEL_ARCH.STARCODER2: "starcoder2",
|
||||
MODEL_ARCH.MAMBA: "mamba",
|
||||
MODEL_ARCH.MAMBA2: "mamba2",
|
||||
MODEL_ARCH.XVERSE: "xverse",
|
||||
MODEL_ARCH.COMMAND_R: "command-r",
|
||||
MODEL_ARCH.DBRX: "dbrx",
|
||||
|
@ -399,6 +403,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
|
||||
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
|
||||
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
|
||||
MODEL_TENSOR.SSM_NORM: "blk.{bid}.ssm_norm",
|
||||
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
|
||||
MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
|
||||
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
|
||||
|
@ -869,6 +874,19 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.SSM_D,
|
||||
MODEL_TENSOR.SSM_OUT,
|
||||
],
|
||||
MODEL_ARCH.MAMBA2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.SSM_IN,
|
||||
MODEL_TENSOR.SSM_CONV1D,
|
||||
MODEL_TENSOR.SSM_DT,
|
||||
MODEL_TENSOR.SSM_A,
|
||||
MODEL_TENSOR.SSM_D,
|
||||
MODEL_TENSOR.SSM_NORM,
|
||||
MODEL_TENSOR.SSM_OUT,
|
||||
],
|
||||
MODEL_ARCH.XVERSE: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
@ -1373,6 +1391,7 @@ KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL
|
|||
KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE
|
||||
KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE
|
||||
KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
|
||||
KEY_SSM_GROUP_COUNT = Keys.SSM.GROUP_COUNT
|
||||
KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS
|
||||
|
||||
# tokenization
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue