diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index d351a56d1..3e1a676c0 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -375,8 +375,7 @@ class MODEL_TENSOR(IntEnum):
     ENC_FFN_UP = auto()
     ENC_OUTPUT_NORM = auto()
     # vision
-    V_MMPROJ_A = auto()
-    V_MMPROJ_B = auto()
+    V_MMPROJ = auto()
     V_ENC_EMBD_CLS = auto()
     V_ENC_EMBD_PATCH = auto()
     V_ENC_EMBD_POS = auto()
@@ -552,8 +551,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
     MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
     # vision
-    MODEL_TENSOR.V_MMPROJ_A: "v.mmproj_a",
-    MODEL_TENSOR.V_MMPROJ_B: "v.mmproj_b",
+    MODEL_TENSOR.V_MMPROJ: "v.mmproj_{bid}",
     MODEL_TENSOR.V_ENC_EMBD_CLS: "v.enc.embd.cls",
     MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.enc.embd.patch",
     MODEL_TENSOR.V_ENC_EMBD_POS: "v.enc.embd.pos",
@@ -1343,8 +1341,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_UP,
     ],
     MODEL_ARCH.LLAVA_VISION: [
-        MODEL_TENSOR.V_MMPROJ_A,
-        MODEL_TENSOR.V_MMPROJ_B,
+        MODEL_TENSOR.V_MMPROJ,
         MODEL_TENSOR.V_ENC_EMBD_CLS,
         MODEL_TENSOR.V_ENC_EMBD_PATCH,
         MODEL_TENSOR.V_ENC_EMBD_POS,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 4e73706a0..5ae4d65c7 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -680,12 +680,8 @@ class TensorNameMap:
             "encoder.final_layer_norm", # t5
         ),
 
-        MODEL_TENSOR.V_MMPROJ_A: (
-            "multi_modal_projector.linear_1",
-        ),
-
-        MODEL_TENSOR.V_MMPROJ_B: (
-            "multi_modal_projector.linear_2",
+        MODEL_TENSOR.V_MMPROJ: (
+            "multi_modal_projector.linear_{bid}",
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_CLS: (
diff --git a/src/llama.cpp b/src/llama.cpp
index c4f1c1d76..0dd60cd81 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -658,8 +658,7 @@ enum llm_tensor {
 };
 
 enum vision_tensor {
-    VISION_TENSOR_MMPROJ_A,
-    VISION_TENSOR_MMPROJ_B,
+    VISION_TENSOR_MMPROJ,
     VISION_TENSOR_ENC_EMBD_CLS,
     VISION_TENSOR_ENC_EMBD_PATCH,
     VISION_TENSOR_ENC_EMBD_POS,
@@ -1601,8 +1600,7 @@ static const std::map<vision_arch, std::map<vision_tensor, const char *>> VISION_TENSOR_NAMES = {
     {
         VISION_ARCH_LLAVA,
         {
-            { VISION_TENSOR_MMPROJ_A, "v.mmproj_a" },
-            { VISION_TENSOR_MMPROJ_B, "v.mmproj_b" },
+            { VISION_TENSOR_MMPROJ, "v.mmproj" },
            { VISION_TENSOR_ENC_EMBD_CLS, "v.enc.embd.cls" },
             { VISION_TENSOR_ENC_EMBD_PATCH, "v.enc.embd.patch" },
             { VISION_TENSOR_ENC_EMBD_POS, "v.enc.embd.pos" },
@@ -8992,10 +8990,10 @@ static bool llm_load_tensors(
     switch (vparams.arch) {
         case VISION_ARCH_LLAVA:
             {
-                model.clip.mm_a_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_A, "weight"), {n_embd, n_ff});
-                model.clip.mm_a_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_A, "bias" ), {n_ff});
-                model.clip.mm_b_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_B, "weight"), {n_ff, n_ff});
-                model.clip.mm_b_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ_B, "bias" ), {n_ff});
+                model.clip.mm_a_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff});
+                model.clip.mm_a_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_ff});
+                model.clip.mm_b_w = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff});
+                model.clip.mm_b_b = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_ff});
                 model.clip.class_embedding = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_CLS ), {n_embd});
                 model.clip.patch_embeddings = ml.create_tensor(ctx_vision, tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_embd});
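
Note: the net effect of the gguf-py changes above is that the two fixed projector entries (V_MMPROJ_A and V_MMPROJ_B) collapse into a single block-indexed V_MMPROJ entry whose names carry a "{bid}" placeholder. The sketch below illustrates how such a placeholder can resolve per-index names; it is illustrative only, not the actual gguf-py implementation: resolve_name() is a hypothetical helper, and the real TensorNameMap instead pre-expands "{bid}" for each block index when the map is built.

# Minimal sketch (hypothetical helper, not gguf-py API) of "{bid}"-indexed
# tensor-name resolution, using the names introduced in the diff above.
from __future__ import annotations

import re

GGUF_NAME = "v.mmproj_{bid}"                       # TENSOR_NAMES[MODEL_TENSOR.V_MMPROJ]
HF_PATTERN = "multi_modal_projector.linear_{bid}"  # TensorNameMap entry for V_MMPROJ

def resolve_name(hf_name: str) -> str | None:
    """Map an HF checkpoint tensor name to its GGUF name, extracting the block index."""
    # Turn the "{bid}" placeholder into a capture group for the index.
    regex = "^" + re.escape(HF_PATTERN).replace(re.escape("{bid}"), r"(\d+)") + "$"
    m = re.match(regex, hf_name)
    return GGUF_NAME.format(bid=int(m.group(1))) if m else None

# The old V_MMPROJ_A/V_MMPROJ_B pair becomes two instances of one mapping:
assert resolve_name("multi_modal_projector.linear_1") == "v.mmproj_1"  # was v.mmproj_a
assert resolve_name("multi_modal_projector.linear_2") == "v.mmproj_2"  # was v.mmproj_b

A side benefit of indexing by "{bid}" is extensibility: a projector with more than two linear layers needs no new enum members on either the Python or the C++ side, only a different index passed to tn().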