From 32b47f600f6a4759959f354b1b9e985090c6a803 Mon Sep 17 00:00:00 2001
From: caitianchi <caitianchi@modelbest.cn>
Date: Sat, 10 Aug 2024 21:51:04 +0800
Subject: [PATCH] fix type-check

---
 examples/llava/clip.cpp                                   | 3 +--
 examples/llava/llava.cpp                                  | 4 ++--
 examples/llava/minicpmv-cli.cpp                           | 8 ++++----
 .../minicpmv2_5-convert-image-encoder-to-gguf.py          | 2 +-
 .../minicpmv2_6-convert-image-encoder-to-gguf.py          | 2 +-
 5 files changed, 9 insertions(+), 10 deletions(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index afb7929d0..ed7195bc3 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -776,8 +776,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             embeddings = ggml_gelu(ctx0, embeddings);
             embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-
-        } 
+        }
         else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
             embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index c5f909cc7..851af0f00 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -254,7 +254,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
             image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
             int patch_size=14;
             load_image_size->width = img_res_v.data[i].nx;
-            load_image_size->height = img_res_v.data[i].ny; 
+            load_image_size->height = img_res_v.data[i].ny;
             clip_add_load_image_size(ctx_clip, load_image_size);
             bool encoded = false;
             int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
@@ -263,7 +263,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
             }
             else if (has_minicpmv_projector == 3) {
                 encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
-            }            
+            }
             if (!encoded) {
                 LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                 return false;
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index f108b86ec..379fc295f 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -140,7 +140,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
     }
     else if (has_minicpmv_projector == 3) {
         system_prompt = "<|im_start|>user\n";
-    }     
+    }
     LOG_TEE("%s: image token past: %d\n", __func__, n_past);
     eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
     process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
@@ -223,7 +223,7 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla
         }
         else if (has_minicpmv_projector == 3) {
             user_prompt = "<|im_start|>user\n" + prompt;
-        }    
+        }
     }
 
     eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
@@ -232,8 +232,8 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla
     }
     else if (has_minicpmv_projector == 3) {
         eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
-    } 
-    
+    }
+
     // generate the response
 
     LOG_TEE("\n");
diff --git a/examples/llava/minicpmv-convert/minicpmv2_5-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert/minicpmv2_5-convert-image-encoder-to-gguf.py
index 4519c57ab..781a7f7a6 100644
--- a/examples/llava/minicpmv-convert/minicpmv2_5-convert-image-encoder-to-gguf.py
+++ b/examples/llava/minicpmv-convert/minicpmv2_5-convert-image-encoder-to-gguf.py
@@ -6,7 +6,6 @@ import re
 import torch
 import numpy as np
 from gguf import *
-import timm
 from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
 
 TEXT = "clip.text"
@@ -159,6 +158,7 @@ fname_middle = None
 has_text_encoder = True
 has_vision_encoder = True
 has_minicpmv_projector = False
+minicpmv_version = 2
 if args.text_only:
     fname_middle = "text-"
     has_vision_encoder = False
diff --git a/examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py
index 19f70eb3e..bf6849cfe 100644
--- a/examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py
+++ b/examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py
@@ -133,7 +133,6 @@ class SiglipVisionConfig(PretrainedConfig):
             )
 
         return cls.from_dict(config_dict, **kwargs)
-        
 
 _CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
 
@@ -1096,6 +1095,7 @@ fname_middle = None
 has_text_encoder = True
 has_vision_encoder = True
 has_minicpmv_projector = False
+minicpmv_version = 3
 if args.text_only:
     fname_middle = "text-"
     has_vision_encoder = False