fix type-check

This commit is contained in:
caitianchi 2024-08-10 21:51:04 +08:00
parent 4a87d1d93e
commit 32b47f600f
5 changed files with 9 additions and 10 deletions

View file

@@ -776,8 +776,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_gelu(ctx0, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
}
}
else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);

View file

@@ -254,7 +254,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
int patch_size=14;
load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = false;
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
@@ -263,7 +263,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
if (!encoded) {
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;

View file

@@ -140,7 +140,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
}
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
@@ -223,7 +223,7 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla
}
else if (has_minicpmv_projector == 3) {
user_prompt = "<|im_start|>user\n" + prompt;
}
}
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
@@ -232,8 +232,8 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla
}
else if (has_minicpmv_projector == 3) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}
// generate the response
LOG_TEE("\n");

View file

@@ -6,7 +6,6 @@ import re
import torch
import numpy as np
from gguf import *
import timm
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
TEXT = "clip.text"
@@ -159,6 +158,7 @@ fname_middle = None
has_text_encoder = True
has_vision_encoder = True
has_minicpmv_projector = False
minicpmv_version = 2
if args.text_only:
fname_middle = "text-"
has_vision_encoder = False

View file

@@ -133,7 +133,6 @@ class SiglipVisionConfig(PretrainedConfig):
)
return cls.from_dict(config_dict, **kwargs)
_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
@@ -1096,6 +1095,7 @@ fname_middle = None
has_text_encoder = True
has_vision_encoder = True
has_minicpmv_projector = False
minicpmv_version = 3
if args.text_only:
fname_middle = "text-"
has_vision_encoder = False