fix type-check
This commit is contained in:
parent
4a87d1d93e
commit
32b47f600f
5 changed files with 9 additions and 10 deletions
|
@ -776,8 +776,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
||||||
embeddings = ggml_gelu(ctx0, embeddings);
|
embeddings = ggml_gelu(ctx0, embeddings);
|
||||||
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
||||||
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||||
|
}
|
||||||
}
|
|
||||||
else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||||
|
|
|
@ -254,7 +254,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||||
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
|
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
|
||||||
int patch_size=14;
|
int patch_size=14;
|
||||||
load_image_size->width = img_res_v.data[i].nx;
|
load_image_size->width = img_res_v.data[i].nx;
|
||||||
load_image_size->height = img_res_v.data[i].ny;
|
load_image_size->height = img_res_v.data[i].ny;
|
||||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
clip_add_load_image_size(ctx_clip, load_image_size);
|
||||||
bool encoded = false;
|
bool encoded = false;
|
||||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
|
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
|
||||||
|
@ -263,7 +263,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||||
}
|
}
|
||||||
else if (has_minicpmv_projector == 3) {
|
else if (has_minicpmv_projector == 3) {
|
||||||
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
|
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
|
||||||
}
|
}
|
||||||
if (!encoded) {
|
if (!encoded) {
|
||||||
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -140,7 +140,7 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
|
||||||
}
|
}
|
||||||
else if (has_minicpmv_projector == 3) {
|
else if (has_minicpmv_projector == 3) {
|
||||||
system_prompt = "<|im_start|>user\n";
|
system_prompt = "<|im_start|>user\n";
|
||||||
}
|
}
|
||||||
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
|
||||||
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
|
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
|
||||||
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
|
||||||
|
@ -223,7 +223,7 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla
|
||||||
}
|
}
|
||||||
else if (has_minicpmv_projector == 3) {
|
else if (has_minicpmv_projector == 3) {
|
||||||
user_prompt = "<|im_start|>user\n" + prompt;
|
user_prompt = "<|im_start|>user\n" + prompt;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||||
|
@ -232,8 +232,8 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla
|
||||||
}
|
}
|
||||||
else if (has_minicpmv_projector == 3) {
|
else if (has_minicpmv_projector == 3) {
|
||||||
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
|
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
// generate the response
|
// generate the response
|
||||||
|
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
|
|
|
@ -6,7 +6,6 @@ import re
|
||||||
import torch
|
import torch
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from gguf import *
|
from gguf import *
|
||||||
import timm
|
|
||||||
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
|
from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer, Idefics2VisionConfig
|
||||||
|
|
||||||
TEXT = "clip.text"
|
TEXT = "clip.text"
|
||||||
|
@ -159,6 +158,7 @@ fname_middle = None
|
||||||
has_text_encoder = True
|
has_text_encoder = True
|
||||||
has_vision_encoder = True
|
has_vision_encoder = True
|
||||||
has_minicpmv_projector = False
|
has_minicpmv_projector = False
|
||||||
|
minicpmv_version = 2
|
||||||
if args.text_only:
|
if args.text_only:
|
||||||
fname_middle = "text-"
|
fname_middle = "text-"
|
||||||
has_vision_encoder = False
|
has_vision_encoder = False
|
||||||
|
|
|
@ -133,7 +133,6 @@ class SiglipVisionConfig(PretrainedConfig):
|
||||||
)
|
)
|
||||||
|
|
||||||
return cls.from_dict(config_dict, **kwargs)
|
return cls.from_dict(config_dict, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
|
_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
|
||||||
|
|
||||||
|
@ -1096,6 +1095,7 @@ fname_middle = None
|
||||||
has_text_encoder = True
|
has_text_encoder = True
|
||||||
has_vision_encoder = True
|
has_vision_encoder = True
|
||||||
has_minicpmv_projector = False
|
has_minicpmv_projector = False
|
||||||
|
minicpmv_version = 3
|
||||||
if args.text_only:
|
if args.text_only:
|
||||||
fname_middle = "text-"
|
fname_middle = "text-"
|
||||||
has_vision_encoder = False
|
has_vision_encoder = False
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue