From 32b47f600f6a4759959f354b1b9e985090c6a803 Mon Sep 17 00:00:00 2001 From: caitianchi Date: Sat, 10 Aug 2024 21:51:04 +0800 Subject: [PATCH] fix type-check --- examples/llava/clip.cpp | 3 +-- examples/llava/llava.cpp | 4 ++-- examples/llava/minicpmv-cli.cpp | 8 ++++---- .../minicpmv2_5-convert-image-encoder-to-gguf.py | 2 +- .../minicpmv2_6-convert-image-encoder-to-gguf.py | 2 +- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index afb7929d0..ed7195bc3 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -776,8 +776,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_gelu(ctx0, embeddings); embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); - - } + } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index c5f909cc7..851af0f00 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -254,7 +254,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); int patch_size=14; load_image_size->width = img_res_v.data[i].nx; - load_image_size->height = img_res_v.data[i].ny; + load_image_size->height = img_res_v.data[i].ny; clip_add_load_image_size(ctx_clip, load_image_size); bool encoded = false; int has_minicpmv_projector = clip_is_minicpmv(ctx_clip); @@ -263,7 +263,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } else if (has_minicpmv_projector == 3) { encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); - } + } if (!encoded) { LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index f108b86ec..379fc295f 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -140,7 +140,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e } else if (has_minicpmv_projector == 3) { system_prompt = "<|im_start|>user\n"; - } + } LOG_TEE("%s: image token past: %d\n", __func__, n_past); eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); @@ -223,7 +223,7 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla } else if (has_minicpmv_projector == 3) { user_prompt = "<|im_start|>user\n" + prompt; - } + } } eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); @@ -232,8 +232,8 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla } else if (has_minicpmv_projector == 3) { eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false); - } - + } + // generate the response LOG_TEE("\n"); diff --git a/examples/llava/minicpmv-convert/minicpmv2_5-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert/minicpmv2_5-convert-image-encoder-to-gguf.py index 4519c57ab..781a7f7a6 100644 --- a/examples/llava/minicpmv-convert/minicpmv2_5-convert-image-encoder-to-gguf.py +++ b/examples/llava/minicpmv-convert/minicpmv2_5-convert-image-encoder-to-gguf.py @@ -6,7 +6,6 @@ import re import torch import numpy as np from gguf import * -import timm from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig TEXT = "clip.text" @@ -159,6 +158,7 @@ fname_middle = None has_text_encoder = True has_vision_encoder = True has_minicpmv_projector = False +minicpmv_version = 2 if args.text_only: fname_middle = "text-" has_vision_encoder = False diff --git a/examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py index 19f70eb3e..bf6849cfe 100644 --- a/examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py +++ b/examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py @@ -133,7 +133,6 @@ class SiglipVisionConfig(PretrainedConfig): ) return cls.from_dict(config_dict, **kwargs) - _CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224" @@ -1096,6 +1095,7 @@ fname_middle = None has_text_encoder = True has_vision_encoder = True has_minicpmv_projector = False +minicpmv_version = 3 if args.text_only: fname_middle = "text-" has_vision_encoder = False