fix type-check

2024-08-10 21:51:04 +08:00 · 2024-08-10 21:51:04 +08:00 · 32b47f600f
commit 32b47f600f
parent 4a87d1d93e
5 changed files with 9 additions and 10 deletions
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -776,8 +776,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            embeddings = ggml_gelu(ctx0, embeddings);
            embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-
-        } 
+        }
        else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -254,7 +254,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
            int patch_size=14;
            load_image_size->width = img_res_v.data[i].nx;
-            load_image_size->height = img_res_v.data[i].ny; 
+            load_image_size->height = img_res_v.data[i].ny;
            clip_add_load_image_size(ctx_clip, load_image_size);
            bool encoded = false;
            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
@@ -263,7 +263,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
            }
            else if (has_minicpmv_projector == 3) {
                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
-            }            
+            }
            if (!encoded) {
                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
                return false;
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -140,7 +140,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
    }
    else if (has_minicpmv_projector == 3) {
        system_prompt = "<|im_start|>user\n";
-    }     
+    }
    LOG_TEE("%s: image token past: %d\n", __func__, n_past);
    eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
    process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
@@ -223,7 +223,7 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla
        }
        else if (has_minicpmv_projector == 3) {
            user_prompt = "<|im_start|>user\n" + prompt;
-        }    
+        }
    }

    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
@@ -232,8 +232,8 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla
    }
    else if (has_minicpmv_projector == 3) {
        eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
-    } 
-    
+    }
+
    // generate the response

    LOG_TEE("\n");
--- a/examples/llava/minicpmv-convert/minicpmv2_5-convert-image-encoder-to-gguf.py
+++ b/examples/llava/minicpmv-convert/minicpmv2_5-convert-image-encoder-to-gguf.py
@@ -6,7 +6,6 @@ import re
 import torch
 import numpy as np
 from gguf import *
-import timm
 from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig

 TEXT = "clip.text"
@@ -159,6 +158,7 @@ fname_middle = None
 has_text_encoder = True
 has_vision_encoder = True
 has_minicpmv_projector = False
+minicpmv_version = 2
 if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
--- a/examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py
+++ b/examples/llava/minicpmv-convert/minicpmv2_6-convert-image-encoder-to-gguf.py
@@ -133,7 +133,6 @@ class SiglipVisionConfig(PretrainedConfig):
            )

        return cls.from_dict(config_dict, **kwargs)
-        

 _CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"

@@ -1096,6 +1095,7 @@ fname_middle = None
 has_text_encoder = True
 has_vision_encoder = True
 has_minicpmv_projector = False
+minicpmv_version = 3
 if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False