fix uhd code for review comment

This commit is contained in:
caitianchi 2024-07-25 15:22:11 +08:00
parent 6fd0937e9f
commit 107e1edb20
4 changed files with 18 additions and 62 deletions

View file

@ -413,32 +413,6 @@ void llava_image_embed_free(struct llava_image_embed * embed) {
free(embed);
}
// Encodes one image with CLIP for the UHD path.
//
// The u8 image `img` is converted into a temporary f32 image via
// uhd_normalize_image_u8_to_f32(), *n_img_pos is set to clip_n_patches(ctx_clip),
// and clip_image_encode() writes the embedding into the caller-provided
// `image_embd` buffer.
//
// Returns true when clip_image_encode() succeeds, false otherwise. The temporary
// f32 image is released on both paths.
static bool encode_image_with_clip_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
    // original note: format VectN x H x W x RGB (N x 448 x 448 x 3) -- not verifiable from this function alone
    clip_image_f32 * img_res_v = clip_image_f32_init();
    uhd_normalize_image_u8_to_f32(ctx_clip, img, img_res_v);

    // The timer starts after normalization, so the reported time covers the encode step only.
    const int64_t t_img_enc_start_us = ggml_time_us();

    const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
    LOG_TEE("\n%s: mm_patch_merge_type is %s.\n", __func__, mm_patch_merge_type);

    *n_img_pos = clip_n_patches(ctx_clip);

    // original note: image_embd shape is 96 x 4096 -- TODO confirm against the loaded model
    const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v, image_embd);

    // The normalized image is only needed as encoder input; free it before any
    // return so it is not leaked on either the success or the failure path.
    clip_image_f32_free(img_res_v);

    if (!encoded) {
        LOG_TEE("Unable to encode image\n");
        return false;
    }
    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

    const int64_t t_img_enc_end_us = ggml_time_us();
    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);

    return true;
}
// Snaps `length` to the nearest multiple of `patch_size` (std::round semantics
// on the float quotient), never returning less than one `patch_size`.
static int ensure_divide(int length, int patch_size) {
    const float ratio  = static_cast<float>(length) / patch_size;
    const int  snapped = static_cast<int>(std::round(ratio) * patch_size);
    // Clamp from below so the result always spans at least one patch.
    return snapped < patch_size ? patch_size : snapped;
}
@ -691,7 +665,7 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
load_image_size->height = imgs[i][j]->ny;
LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height);
clip_add_load_image_size(ctx_clip, load_image_size);
bool image_embed_result = llava_image_embed_make_with_clip_img_uhd(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos);
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos);
if (!image_embed_result) {
LOG_TEE("%s: coulnd't embed the image\n", __func__);
return NULL;
@ -706,25 +680,6 @@ struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx *
return results;
}
// Allocates an embedding buffer with malloc(), encodes `img` into it through
// encode_image_with_clip_uhd(), and hands the buffer and the image position
// count back through `image_embd_out` / `n_img_pos_out`.
//
// Returns false (leaving the out-parameters untouched) when the allocation or
// the encoding fails; on encoding failure the buffer is freed before returning.
// On success the malloc'ed buffer is passed to the caller through `image_embd_out`.
bool llava_image_embed_make_with_clip_img_uhd(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
    // TODO: base on gridsize/llava model
    const auto buf_bytes = clip_embd_nbytes(ctx_clip) * 6;
    float * embd_buf = static_cast<float *>(malloc(buf_bytes));
    if (embd_buf == nullptr) {
        LOG_TEE("Unable to allocate memory for image embeddings\n");
        return false;
    }

    int pos_count = 0;
    const bool ok = encode_image_with_clip_uhd(ctx_clip, n_threads, img, embd_buf, &pos_count);
    if (!ok) {
        LOG_TEE("%s: cannot encode image, aborting\n", __func__);
        free(embd_buf);
        return false;
    }

    // Publish results only after a successful encode.
    *n_img_pos_out  = pos_count;
    *image_embd_out = embd_buf;
    return true;
}
struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
unsigned char* image_bytes;
long image_bytes_length;