add SmolVLM
parent 25a97ce4cb
commit c3a654c0fb

9 changed files with 171 additions and 10 deletions
gguf-py/gguf/constants.py

```diff
@@ -238,6 +238,7 @@ class Keys:
         PATCH_MERGE_TYPE = "vision.vit.patch_merge_type"
         HEAD_COUNT       = "vision.vit.attention.head_count"
         LAYERNORM_EPS    = "vision.vit.attention.layer_norm_epsilon"
+        SCALE_FACTOR     = "vision.vit.scale_factor" # only used by idefics3 for now
 
 #
 # recommended mapping of model tensor names for storage in gguf
```
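The new `vision.vit.scale_factor` entry is an ordinary GGUF metadata key, so it can be read back from a converted file like any other field. A minimal sketch, assuming gguf-py's `GGUFReader` API and a hypothetical file path:

```python
from gguf import GGUFReader
from gguf.constants import Keys

reader = GGUFReader("smolvlm.gguf")  # hypothetical converted model
field = reader.get_field(Keys.Vision.Vit.SCALE_FACTOR)  # "vision.vit.scale_factor"
if field is not None:
    # scalar fields keep their value in the single part indexed by data[0]
    print("scale factor:", int(field.parts[field.data[0]][0]))
```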
```diff
@@ -311,6 +312,7 @@ class MODEL_ARCH(IntEnum):
     VISION_LLAVA     = auto()
     VISION_MOBILEVLM = auto()
     VISION_MINICPMV  = auto()
+    VISION_IDEFICS3  = auto()
 
 
 class MODEL_TENSOR(IntEnum):
```
```diff
@@ -441,6 +443,7 @@ class MODEL_TENSOR(IntEnum):
     POSNET_ATTN_OUT  = auto()
     # vision
     V_MMPROJ         = auto()
+    V_MMPROJ_FC      = auto()
     V_MMPROJ_MLP     = auto()
     V_MMPROJ_PEG     = auto()
     V_ENC_EMBD_CLS   = auto()
```
```diff
@@ -535,6 +538,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.VISION_LLAVA:     "llava",
     MODEL_ARCH.VISION_MOBILEVLM: "mobilevlm",
     MODEL_ARCH.VISION_MINICPMV:  "minicpmv",
+    MODEL_ARCH.VISION_IDEFICS3:  "idefics3",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
```
```diff
@@ -664,6 +668,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
     # vision
     MODEL_TENSOR.V_MMPROJ:        "v.mmproj_{bid}",
+    MODEL_TENSOR.V_MMPROJ_FC:     "v.mmproj.fc",
     MODEL_TENSOR.V_MMPROJ_MLP:    "v.mmproj.mlp.{bid}",
     MODEL_TENSOR.V_MMPROJ_PEG:    "v.mmproj.peg.{bid}",
     MODEL_TENSOR.V_ENC_EMBD_CLS:  "v.enc.embd.cls",
```
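Entries with a `{bid}` placeholder are expanded once per block index, while the new `V_MMPROJ_FC` entry names a single tensor. A quick illustration of how these templates resolve:

```python
from gguf.constants import MODEL_TENSOR, TENSOR_NAMES

print(TENSOR_NAMES[MODEL_TENSOR.V_MMPROJ_FC])                 # v.mmproj.fc
print(TENSOR_NAMES[MODEL_TENSOR.V_MMPROJ_MLP].format(bid=0))  # v.mmproj.mlp.0
```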
```diff
@@ -1695,6 +1700,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_TOK_EMBD_SLICE,
         MODEL_TENSOR.V_TOK_EMBD_END_SLICE,
     ],
+    MODEL_ARCH.VISION_IDEFICS3: [
+        MODEL_TENSOR.V_MMPROJ_FC,
+        MODEL_TENSOR.V_ENC_EMBD_PATCH,
+        MODEL_TENSOR.V_ENC_EMBD_POS,
+        MODEL_TENSOR.V_ENC_ATTN_Q,
+        MODEL_TENSOR.V_ENC_ATTN_K,
+        MODEL_TENSOR.V_ENC_ATTN_V,
+        MODEL_TENSOR.V_ENC_INPUT_NORM,
+        MODEL_TENSOR.V_ENC_OUTPUT,
+        MODEL_TENSOR.V_ENC_OUTPUT_NORM,
+        MODEL_TENSOR.V_ENC_FFN_UP,
+        MODEL_TENSOR.V_ENC_FFN_DOWN,
+        MODEL_TENSOR.V_POST_NORM,
+    ],
     # TODO
 }
 
```
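The per-arch list is what downstream code can consult to see which tensors an `idefics3` vision model is expected to carry. A small sketch of such a check, with the membership facts taken directly from the list above:

```python
from gguf.constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS

allowed = set(MODEL_TENSORS[MODEL_ARCH.VISION_IDEFICS3])
assert MODEL_TENSOR.V_MMPROJ_FC in allowed
# idefics3 registers no CLS embedding, so it is absent from the list
assert MODEL_TENSOR.V_ENC_EMBD_CLS not in allowed
```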
gguf-py/gguf/gguf_writer.py

```diff
@@ -928,6 +928,9 @@ class GGUFWriter:
     def add_vision_vit_image_std(self, value: Sequence[float]) -> None:
         self.add_array(Keys.Vision.IMAGE_STD, value)
 
+    def add_vision_vit_scale_factor(self, value: int) -> None:
+        self.add_int32(Keys.Vision.Vit.SCALE_FACTOR, value)
+
     def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
         if not isinstance(value, str):
             template_default = None
```
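From a conversion script the new setter is used like the other vision setters. A minimal, hypothetical sketch (the output path, arch string, and the value 2 are placeholders, not taken from this commit):

```python
from gguf import GGUFWriter

writer = GGUFWriter("smolvlm.gguf", arch="idefics3")  # hypothetical output file
writer.add_vision_vit_scale_factor(2)  # stored as int32 under "vision.vit.scale_factor"
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()
```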
gguf-py/gguf/tensor_mapping.py

```diff
@@ -794,6 +794,10 @@ class TensorNameMap:
             "multi_modal_projector.linear_{bid}",
         ),
 
+        MODEL_TENSOR.V_MMPROJ_FC: (
+            "model.connector.modality_projection.proj", # SmolVLM
+        ),
+
         MODEL_TENSOR.V_MMPROJ_MLP: (
             "model.mm_projector.mlp.mlp.{bid}",
         ),
```
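This is the table `TensorNameMap` consults when renaming checkpoint tensors, so the SmolVLM projector weight resolves to the GGUF-side `v.mmproj.fc` name. A sketch, assuming the vision arch goes through the usual `get_tensor_name_map` path (the block count is illustrative):

```python
import gguf
from gguf.constants import MODEL_ARCH

tmap = gguf.get_tensor_name_map(MODEL_ARCH.VISION_IDEFICS3, n_blocks=27)
name = tmap.get_name("model.connector.modality_projection.proj.weight",
                     try_suffixes=(".weight", ".bias"))
print(name)  # v.mmproj.fc.weight
```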
```diff
@@ -809,51 +813,61 @@ class TensorNameMap:
         MODEL_TENSOR.V_ENC_EMBD_PATCH: (
             "vision_tower.vision_model.embeddings.patch_embedding",
             "vpm.embeddings.patch_embedding",
+            "model.vision_model.embeddings.patch_embedding", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_POS: (
             "vision_tower.vision_model.embeddings.position_embedding",
             "vpm.embeddings.position_embedding",
+            "model.vision_model.embeddings.position_embedding", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_Q: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
             "vpm.encoder.layers.{bid}.self_attn.q_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_K: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
             "vpm.encoder.layers.{bid}.self_attn.k_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_V: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
             "vpm.encoder.layers.{bid}.self_attn.v_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_INPUT_NORM: (
             "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
             "vpm.encoder.layers.{bid}.layer_norm1",
+            "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_OUTPUT: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
             "vpm.encoder.layers.{bid}.self_attn.out_proj",
+            "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
             "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
             "vpm.encoder.layers.{bid}.layer_norm2",
+            "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_FFN_UP: (
             "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
             "vpm.encoder.layers.{bid}.mlp.fc1",
+            "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM
         ),
 
         MODEL_TENSOR.V_ENC_FFN_DOWN: (
             "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
             "vpm.encoder.layers.{bid}.mlp.fc2",
+            "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM
         ),
 
         MODEL_TENSOR.V_PRE_NORM: (
```
```diff
@@ -862,6 +876,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_POST_NORM: (
             "vision_tower.vision_model.post_layernorm",
+            "model.vision_model.post_layernorm", # SmolVLM
         ),
 
         MODEL_TENSOR.V_RESMPL_POS_EMBD_K: (
```