wip minicpmv

Xuan Son Nguyen 2025-01-19 22:33:05 +01:00
parent d0068ef0ed
commit 4a7ab89d75
9 changed files with 491 additions and 77 deletions

gguf-py/gguf/constants.py

@@ -310,6 +310,7 @@ class MODEL_ARCH(IntEnum):
    # vision models
    VISION_LLAVA = auto()
    VISION_MOBILEVLM = auto()
    VISION_MINICPMV = auto()
class MODEL_TENSOR(IntEnum):
@@ -455,6 +456,15 @@ class MODEL_TENSOR(IntEnum):
    V_ENC_FFN_DOWN = auto()
    V_PRE_NORM = auto()
    V_POST_NORM = auto()
    V_RESMPL_POS_EMBD_K = auto() # minicpmv
    V_RESMPL_ATTN_IN = auto() # minicpmv
    V_RESMPL_ATTN_OUT = auto() # minicpmv
    V_RESMPL_KV_PROJ = auto() # minicpmv
    V_RESMPL_NORM_POST = auto() # minicpmv
    V_RESMPL_NORM_KV = auto() # minicpmv
    V_RESMPL_NORM_Q = auto() # minicpmv
    V_RESMPL_PROJ = auto() # minicpmv
    V_RESMPL_QUERY = auto() # minicpmv
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -518,6 +528,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
    # vision
    MODEL_ARCH.VISION_LLAVA: "llava",
    MODEL_ARCH.VISION_MOBILEVLM: "mobilevlm",
    MODEL_ARCH.VISION_MINICPMV: "minicpmv",
}
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -662,6 +673,15 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.V_ENC_FFN_DOWN: "v.enc.blk.{bid}.ffn_down",
    MODEL_TENSOR.V_PRE_NORM: "v.pre_norm",
    MODEL_TENSOR.V_POST_NORM: "v.post_norm",
    MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "v.resmpl.pos_embd_k",
    MODEL_TENSOR.V_RESMPL_ATTN_IN: "v.resmpl.attn_in",
    MODEL_TENSOR.V_RESMPL_ATTN_OUT: "v.resmpl.attn_out",
    MODEL_TENSOR.V_RESMPL_KV_PROJ: "v.resmpl.kv_proj",
    MODEL_TENSOR.V_RESMPL_NORM_POST: "v.resmpl.norm_post",
    MODEL_TENSOR.V_RESMPL_NORM_KV: "v.resmpl.norm_kv",
    MODEL_TENSOR.V_RESMPL_NORM_Q: "v.resmpl.norm_q",
    MODEL_TENSOR.V_RESMPL_PROJ: "v.resmpl.proj",
    MODEL_TENSOR.V_RESMPL_QUERY: "v.resmpl.query",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
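For orientation, the `v.resmpl.*` names above correspond to MiniCPM-V's Perceiver-style resampler, which compresses the vision encoder output into a fixed set of learned query tokens via cross-attention. Below is a rough, hedged sketch of that dataflow keyed by the GGUF base names defined above; head splitting, biases, and the `pos_embd_k` term are simplified or omitted, and none of this code is part of the patch.

```python
import torch
import torch.nn.functional as F

def resampler_sketch(x: torch.Tensor, w: dict[str, torch.Tensor]) -> torch.Tensor:
    """Simplified single-head stand-in for the MiniCPM-V resampler (assumption)."""
    # x: [n_patches, hidden] -- output of the vision encoder for one image slice
    ln = lambda t, name: F.layer_norm(t, (t.shape[-1],), w[f"{name}.weight"], w[f"{name}.bias"])
    kv = ln(x @ w["v.resmpl.kv_proj.weight"].T, "v.resmpl.norm_kv")  # project + normalize keys/values
    q  = ln(w["v.resmpl.query"], "v.resmpl.norm_q")                  # learned query tokens
    wq, wk, wv = w["v.resmpl.attn_in.weight"].chunk(3, dim=0)        # packed q/k/v projection
    att = F.softmax((q @ wq.T) @ (kv @ wk.T).T / q.shape[-1] ** 0.5, dim=-1)
    out = (att @ (kv @ wv.T)) @ w["v.resmpl.attn_out.weight"].T      # cross-attention output
    out = ln(out, "v.resmpl.norm_post")
    return out @ w["v.resmpl.proj"]                                   # into the LLM embedding space
```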
@@ -1636,6 +1656,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.V_PRE_NORM,
        MODEL_TENSOR.V_POST_NORM,
    ],
    MODEL_ARCH.VISION_MINICPMV: [
        MODEL_TENSOR.V_ENC_EMBD_PATCH,
        MODEL_TENSOR.V_ENC_EMBD_POS,
        MODEL_TENSOR.V_ENC_ATTN_Q,
        MODEL_TENSOR.V_ENC_ATTN_K,
        MODEL_TENSOR.V_ENC_ATTN_V,
        MODEL_TENSOR.V_ENC_INPUT_NORM,
        MODEL_TENSOR.V_ENC_OUTPUT,
        MODEL_TENSOR.V_ENC_OUTPUT_NORM,
        MODEL_TENSOR.V_ENC_FFN_UP,
        MODEL_TENSOR.V_ENC_FFN_DOWN,
        MODEL_TENSOR.V_RESMPL_ATTN_IN,
        MODEL_TENSOR.V_RESMPL_ATTN_OUT,
        MODEL_TENSOR.V_RESMPL_KV_PROJ,
        MODEL_TENSOR.V_RESMPL_NORM_POST,
        MODEL_TENSOR.V_RESMPL_NORM_KV,
        MODEL_TENSOR.V_RESMPL_NORM_Q,
        MODEL_TENSOR.V_RESMPL_PROJ,
        MODEL_TENSOR.V_RESMPL_QUERY,
    ],
    # TODO
}
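MODEL_TENSORS declares which tensors a model of a given architecture is expected to carry; a conversion or validation script can cross-reference it with TENSOR_NAMES to enumerate the GGUF names. A minimal sketch (not part of the patch), using only the definitions shown above:

```python
from gguf.constants import MODEL_ARCH, MODEL_TENSORS, TENSOR_NAMES

# List the GGUF base names expected for the new MiniCPM-V vision arch.
# Per-layer encoder tensors carry a {bid} placeholder (block 0 shown here);
# resampler tensors exist once per model, so format() leaves them unchanged.
for tensor in MODEL_TENSORS[MODEL_ARCH.VISION_MINICPMV]:
    print(TENSOR_NAMES[tensor].format(bid=0))
```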
@@ -1718,8 +1758,10 @@ class PoolingType(IntEnum):
class CLIPProjectorType(Enum):
    MLP = 'mlp'
    LDPV2 = 'ldpv2'
    MINICPMV_2_5 = 'minicpmv-2.5' # resampler
    MINICPMV_2_6 = 'minicpmv-2.6' # resampler

class CLIPPatchMergeType(Enum):
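The new projector-type values are plain string enums, presumably written into the GGUF metadata by the conversion script so the runtime knows to build a resampler rather than an MLP/LDP projector. A hedged illustration; the version check below is an assumption, not taken from this patch:

```python
from gguf.constants import CLIPProjectorType

# Hypothetical selection between the two resampler variants by model version.
version = "2.6"
proj = CLIPProjectorType.MINICPMV_2_6 if version == "2.6" else CLIPProjectorType.MINICPMV_2_5
print(proj.value)  # -> 'minicpmv-2.6'
```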

gguf-py/gguf/tensor_mapping.py

@@ -808,42 +808,52 @@ class TensorNameMap:
        MODEL_TENSOR.V_ENC_EMBD_PATCH: (
            "vision_tower.vision_model.embeddings.patch_embedding",
            "vpm.embeddings.patch_embedding",
        ),
        MODEL_TENSOR.V_ENC_EMBD_POS: (
            "vision_tower.vision_model.embeddings.position_embedding",
            "vpm.embeddings.position_embedding",
        ),
        MODEL_TENSOR.V_ENC_ATTN_Q: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
            "vpm.encoder.layers.{bid}.self_attn.q_proj",
        ),
        MODEL_TENSOR.V_ENC_ATTN_K: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
            "vpm.encoder.layers.{bid}.self_attn.k_proj",
        ),
        MODEL_TENSOR.V_ENC_ATTN_V: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
            "vpm.encoder.layers.{bid}.self_attn.v_proj",
        ),
        MODEL_TENSOR.V_ENC_INPUT_NORM: (
            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
            "vpm.encoder.layers.{bid}.layer_norm1",
        ),
        MODEL_TENSOR.V_ENC_OUTPUT: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
            "vpm.encoder.layers.{bid}.self_attn.out_proj",
        ),
        MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
            "vpm.encoder.layers.{bid}.layer_norm2",
        ),
        MODEL_TENSOR.V_ENC_FFN_UP: (
            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
            "vpm.encoder.layers.{bid}.mlp.fc1",
        ),
        MODEL_TENSOR.V_ENC_FFN_DOWN: (
            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
            "vpm.encoder.layers.{bid}.mlp.fc2",
        ),
        MODEL_TENSOR.V_PRE_NORM: (
@@ -853,6 +863,42 @@ class TensorNameMap:
        MODEL_TENSOR.V_POST_NORM: (
            "vision_tower.vision_model.post_layernorm",
        ),
        MODEL_TENSOR.V_RESMPL_POS_EMBD_K: (
            "resampler.pos_embed_k",
        ),
        MODEL_TENSOR.V_RESMPL_ATTN_IN: (
            "resampler.attn.in_proj",
        ),
        MODEL_TENSOR.V_RESMPL_ATTN_OUT: (
            "resampler.attn.out_proj",
        ),
        MODEL_TENSOR.V_RESMPL_KV_PROJ: (
            "resampler.kv_proj",
        ),
        MODEL_TENSOR.V_RESMPL_NORM_POST: (
            "resampler.ln_post",
        ),
        MODEL_TENSOR.V_RESMPL_NORM_KV: (
            "resampler.ln_kv",
        ),
        MODEL_TENSOR.V_RESMPL_NORM_Q: (
            "resampler.ln_q",
        ),
        MODEL_TENSOR.V_RESMPL_PROJ: (
            "resampler.proj",
        ),
        MODEL_TENSOR.V_RESMPL_QUERY: (
            "resampler.query",
        ),
    }

    # architecture-specific block mappings
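These mappings let the converter translate checkpoint tensor names from either Hugging Face layout (`vision_tower.*` for LLaVA-style models, `vpm.*` / `resampler.*` for MiniCPM-V) into the GGUF names defined in constants.py. A rough usage sketch, assuming gguf-py's existing get_tensor_name_map helper and a 27-block vision encoder (both assumptions, not stated in this patch):

```python
from gguf.constants import MODEL_ARCH
from gguf.tensor_mapping import get_tensor_name_map

# Build the name map for the new vision arch (27 encoder blocks assumed).
tmap = get_tensor_name_map(MODEL_ARCH.VISION_MINICPMV, 27)

# Resampler tensor: matched by the "resampler.*" entries added above.
print(tmap.get_name("resampler.attn.in_proj.weight", try_suffixes=(".weight", ".bias")))
# -> v.resmpl.attn_in.weight

# Per-layer encoder tensor: matched by the new "vpm.*" alternatives.
print(tmap.get_name("vpm.encoder.layers.0.mlp.fc2.weight", try_suffixes=(".weight", ".bias")))
# -> v.enc.blk.0.ffn_down.weight
```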