From f3d400dac097dffb3de10037e01ce3813e7b005b Mon Sep 17 00:00:00 2001 From: caitianchi Date: Fri, 26 Jul 2024 21:15:03 +0800 Subject: [PATCH] remove uhd_image_embed --- Makefile | 2 +- examples/llava/clip.cpp | 186 +++++++++++++++- examples/llava/clip.h | 2 + examples/llava/llava.cpp | 378 ++++++-------------------------- examples/llava/llava.h | 10 +- examples/llava/minicpmv-cli.cpp | 78 +++---- 6 files changed, 291 insertions(+), 365 deletions(-) diff --git a/Makefile b/Makefile index 1b12431b6..32d027221 100644 --- a/Makefile +++ b/Makefile @@ -954,7 +954,7 @@ llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp examples/llava/clip.h exampl $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual $(CXX) $(CXXFLAGS) -c examples/llava/llava.cpp -o $(call GET_OBJ_FILE, examples/llava/llava.cpp) - $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp $^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) $(filter-out %.h $< examples/llava/clip.cpp examples/llava/llava.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) $(call GET_OBJ_FILE, examples/llava/llava.cpp) -o $@ $(LDFLAGS) llama-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 934409e1f..6430e2954 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -563,14 +563,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; + const int image_size = hparams.image_size; int image_size_width = image_size; int image_size_height = image_size; if (ctx->has_minicpmv_projector) { if(load_image_size==nullptr){ load_image_size= clip_image_size_init(); } - LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height); + LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height); image_size_width = load_image_size->width; image_size_height = load_image_size->height; if (is_inf){ @@ -578,8 +578,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 image_size_height = imgs->data->ny; } } - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; @@ -1748,17 +1748,182 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im return patches; } +static int ensure_divide(int length, int patch_size) { + return std::max(static_cast(std::round(static_cast(length) / patch_size) * patch_size), patch_size); +} + +static std::pair uhd_find_best_resize(std::pair original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width = original_size.first; + int height = original_size.second; + if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { + float r = static_cast(width) / height; + height = static_cast(scale_resolution / std::sqrt(r)); + width = static_cast(height * r); + } + int best_width = ensure_divide(width, patch_size); + int best_height = ensure_divide(height, patch_size); + return std::make_pair(best_width, best_height); +} + +static std::pair uhd_get_refine_size(std::pair original_size, std::pair grid, int scale_resolution, int patch_size, bool allow_upscale = false) { + int width, height; + std::tie(width, height) = original_size; + int grid_x, grid_y; + std::tie(grid_x, grid_y) = grid; + + int refine_width = ensure_divide(width, grid_x); + int refine_height = ensure_divide(height, grid_y); + + int grid_width = refine_width / grid_x; + int grid_height = refine_height / grid_y; + + // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line) + auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair + int best_grid_width, best_grid_height; + std::tie(best_grid_width, best_grid_height) = best_grid_size; + + // std::pair refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line) + std::pair refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line) + return refine_size; +} + +inline int clip(int x, int lower, int upper) { + return std::max(lower, std::min(x, upper)); +} + +static std::pair uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) { + std::vector candidate_split_grids_nums; + for (int i : {multiple - 1, multiple, multiple + 1}) { + if (i == 1 || i > max_slice_nums) { + continue; + } + candidate_split_grids_nums.push_back(i); + } + + std::vector> candidate_grids; + for (int split_grids_nums : candidate_split_grids_nums) { + int m = 1; + while (m <= split_grids_nums) { + if (split_grids_nums % m == 0) { + candidate_grids.emplace_back(m, split_grids_nums / m); + } + ++m; + } + } + + std::pair best_grid{1, 1}; + float min_error = std::numeric_limits::infinity(); + for (const auto& grid : candidate_grids) { + float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second)); + if (error < min_error) { + best_grid = grid; + min_error = error; + } + } + return best_grid; +} + +// inspired from LLaVA-UHD: +// -> https://arxiv.org/pdf/2403.11703 +// -> https://github.com/thunlp/LLaVA-UHD +// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 +static std::vector> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) { + const std::pair original_size={img->nx,img->ny}; + const int original_width = img->nx; + const int original_height = img->ny; + const float log_ratio = log(1.0*original_width/original_height); + const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); + const int multiple = fmin(ceil(ratio), max_slice_nums); + + std::vector> images; + LOG_TEE("%s: multiple %d\n", __func__, multiple); + images.push_back(std::vector()); + + if(multiple <= 1){ + auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true); + clip_image_u8 *source_image = clip_image_u8_init(); + bicubic_resize(*img, *source_image, best_size.first, best_size.second); + // source_image = image.resize(best_size, Image.Resampling.BICUBIC) + images[images.size()-1].push_back(source_image); + } + else if(multiple > 1){ + auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size); + clip_image_u8 *source_image = clip_image_u8_init(); + bicubic_resize(*img, *source_image, best_size.first, best_size.second); + // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) + LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); + images[images.size()-1].push_back(source_image); + + std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); + LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); + + auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); + clip_image_u8 *refine_image = clip_image_u8_init(); + bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); + + LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); + + // split_to_patches + int width = refine_image->nx; + int height = refine_image->ny; + int grid_x = int(width / best_grid.first); + int grid_y = int(height / best_grid.second); + for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){ + images.push_back(std::vector()); + for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){ + clip_image_u8 * patch = clip_image_u8_init(); + patch->nx = grid_x; + patch->ny = grid_y; + patch->buf.resize(3 * patch->nx * patch->ny); + for (int y = patches_i; y < patches_i + grid_y; ++y) { + for (int x = patches_j; x < patches_j + grid_x; ++x) { + const int i = 3 * (y * refine_image->nx + x); + const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j)); + patch->buf[j] = refine_image->buf[i]; + patch->buf[j+1] = refine_image->buf[i+1]; + patch->buf[j+2] = refine_image->buf[i+2]; + } + } + images[images.size()-1].push_back(patch); + } + } + } + return images; +} + +int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip){ + const int max_slice_nums=9; + const int scale_resolution=448; + const int original_width = ctx_clip->load_image_size->width; + const int original_height = ctx_clip->load_image_size->height; + const float log_ratio = log(1.0*original_width/original_height); + const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); + const int multiple = fmin(ceil(ratio), max_slice_nums); + std::pair best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio); + return best_grid.first; +} + // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { - + if(clip_is_minicpmv(ctx)){ - clip_image_f32 * res = clip_image_f32_init(); - normalize_image_u8_to_f32(img, res, ctx->image_mean, ctx->image_std); - res_imgs->size = 1; + std::vector> imgs = uhd_slice_image(img); + res_imgs->size = 0; + for (size_t i = 0; i < imgs.size(); ++i){ + res_imgs->size += imgs[i].size(); + } res_imgs->data = new clip_image_f32[res_imgs->size]; - res_imgs->data[0] = *res; - clip_image_f32_free(res); + int idx = 0; + for (size_t i = 0; i < imgs.size(); ++i){ + for (size_t j = 0; j < imgs[i].size(); ++j) { + LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); + clip_image_f32 * res = clip_image_f32_init(); + normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std); + res_imgs->data[idx++] = *res; + clip_image_f32_free(res); + } + } return true; } @@ -2163,7 +2328,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima if(ctx->load_image_size==nullptr){ ctx->load_image_size= clip_image_size_init(); } - LOG_TEE("%s : %d %d\n", __func__, ctx->load_image_size->width, ctx->load_image_size->height); int pos_w = ctx->load_image_size->width/patch_size; int pos_h = ctx->load_image_size->height/patch_size; int embed_dim = 4096; diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 19efa88c2..232b5dc7d 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -59,7 +59,9 @@ CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API int clip_n_patches (const struct clip_ctx * ctx); CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); +CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); + CLIP_API struct clip_image_size * clip_image_size_init(); CLIP_API struct clip_image_u8 * clip_image_u8_init (); CLIP_API struct clip_image_f32 * clip_image_f32_init(); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 5d3c3dba5..7d10a84f3 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -31,9 +31,6 @@ struct clip_image_grid_shape { int second; }; -struct uhd_image_embed { - std::vector> image_embeds; -}; /** * Selects the best resolution from a list of possible resolutions based on the original size. * @@ -205,6 +202,33 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector return true; } +static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) { + int width = image->nx; + int height = image->ny; + int num_patches = (height / patch_size) * (width / patch_size); + clip_image_f32 * patch = clip_image_f32_init(); + patch->nx = patch_size * num_patches; + patch->ny = patch_size; + patch->buf.resize(3 * patch->nx * patch->ny); + + int patch_index = 0; + + for (int i = 0; i < height; i += patch_size) { + for (int j = 0; j < width; j += patch_size) { + for (int pi = 0; pi < patch_size; ++pi) { + for (int pj = 0; pj < patch_size; ++pj) { + int input_index = ((i + pi) * width + (j + pj)) * 3; + int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3; + patch->buf[output_index] = image->buf[input_index]; + patch->buf[output_index+1] = image->buf[input_index+1]; + patch->buf[output_index+2] = image->buf[input_index+2]; + } + } + patch_index++; + } + } + return patch; +} static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) { // std::vector img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336 @@ -221,7 +245,44 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); - if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { + if (clip_is_minicpmv(ctx_clip)) { + std::vector image_embd_v; + image_embd_v.resize(img_res_v.size); + struct clip_image_size * load_image_size = clip_image_size_init(); + for (size_t i = 0; i < img_res_v.size; i++) { + const int64_t t_img_enc_step_start_us = ggml_time_us(); + image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); + int patch_size=14; + load_image_size->width = img_res_v.data[i].nx; + load_image_size->height = img_res_v.data[i].ny; + clip_add_load_image_size(ctx_clip, load_image_size); + const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside + if (!encoded) { + LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + return false; + } + const int64_t t_img_enc_steop_batch_us = ggml_time_us(); + LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); + } + const int64_t t_img_enc_batch_us = ggml_time_us(); + LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + + int n_img_pos_out = 0; + for (size_t i = 0; i < image_embd_v.size(); i++) { + std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip)); + n_img_pos_out += clip_n_patches(ctx_clip); + } + *n_img_pos = n_img_pos_out; + for (size_t i = 0; i < image_embd_v.size(); i++) { + free(image_embd_v[i]); + } + image_embd_v.clear(); + load_image_size->width = img->nx; + load_image_size->height = img->ny; + clip_add_load_image_size(ctx_clip, load_image_size); + LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); + } + else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding *n_img_pos = clip_n_patches(ctx_clip); bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 @@ -231,7 +292,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli return false; } - } else { + } + else { // spatial_unpad llava-1.6 type embedding // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working std::vector image_embd_v; @@ -300,7 +362,11 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * } bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { - float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model + int num_max_patches = 6; // + if (clip_is_minicpmv(ctx_clip)) { + num_max_patches = 10; + } + float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model if (!image_embd) { LOG_TEE("Unable to allocate memory for image embeddings\n"); return false; @@ -412,303 +478,3 @@ void llava_image_embed_free(struct llava_image_embed * embed) { free(embed->embed); free(embed); } - -static int ensure_divide(int length, int patch_size) { - return std::max(static_cast(std::round(static_cast(length) / patch_size) * patch_size), patch_size); -} - -static std::pair uhd_find_best_resize(std::pair original_size, int scale_resolution, int patch_size, bool allow_upscale = false) { - int width = original_size.first; - int height = original_size.second; - if ((width * height > scale_resolution * scale_resolution) || allow_upscale) { - float r = static_cast(width) / height; - height = static_cast(scale_resolution / std::sqrt(r)); - width = static_cast(height * r); - } - int best_width = ensure_divide(width, patch_size); - int best_height = ensure_divide(height, patch_size); - return std::make_pair(best_width, best_height); -} - -static std::pair uhd_get_refine_size(std::pair original_size, std::pair grid, int scale_resolution, int patch_size, bool allow_upscale = false) { - int width, height; - std::tie(width, height) = original_size; - int grid_x, grid_y; - std::tie(grid_x, grid_y) = grid; - - int refine_width = ensure_divide(width, grid_x); - int refine_height = ensure_divide(height, grid_y); - - int grid_width = refine_width / grid_x; - int grid_height = refine_height / grid_y; - - // auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line) - auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair - int best_grid_width, best_grid_height; - std::tie(best_grid_width, best_grid_height) = best_grid_size; - - // std::pair refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line) - std::pair refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line) - return refine_size; -} - -inline int clip(int x, int lower, int upper) { - return std::max(lower, std::min(x, upper)); -} - -static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) { - const int nx = img.nx; - const int ny = img.ny; - - dst.nx = target_width; - dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); - - float Cc; - float C[5]; - float d0, d2, d3, a0, a1, a2, a3; - int i, j, k, jj; - int x, y; - float dx, dy; - float tx, ty; - - tx = (float)nx / (float)target_width; - ty = (float)ny / (float)target_height; - - // Bicubic interpolation; adapted from ViT.cpp, inspired from : - // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 - // -> https://en.wikipedia.org/wiki/Bicubic_interpolation - - for (i = 0; i < target_height; i++) { - for (j = 0; j < target_width; j++) { - x = (int)(tx * j); - y = (int)(ty * i); - - dx = tx * j - x; - dy = ty * i - y; - - for (k = 0; k < 3; k++) { - for (jj = 0; jj <= 3; jj++) { - d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; - - a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; - a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; - a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; - - C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx; - - d0 = C[0] - C[1]; - d2 = C[2] - C[1]; - d3 = C[3] - C[1]; - a0 = C[1]; - a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; - a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; - a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3; - Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy; - - const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f); - dst.buf[(i * target_width + j) * 3 + k] = float(Cc2); - } - } - } - } - - return true; -} - -static clip_image_u8 * only_v2_5_reshape_by_patch(clip_image_u8 * image, int patch_size) { - int width = image->nx; - int height = image->ny; - int num_patches = (height / patch_size) * (width / patch_size); - clip_image_u8 * patch = clip_image_u8_init(); - patch->nx = patch_size * num_patches; - patch->ny = patch_size; - patch->buf.resize(3 * patch->nx * patch->ny); - - int patch_index = 0; - - for (int i = 0; i < height; i += patch_size) { - for (int j = 0; j < width; j += patch_size) { - for (int pi = 0; pi < patch_size; ++pi) { - for (int pj = 0; pj < patch_size; ++pj) { - int input_index = ((i + pi) * width + (j + pj)) * 3; - int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3; - patch->buf[output_index] = image->buf[input_index]; - patch->buf[output_index+1] = image->buf[input_index+1]; - patch->buf[output_index+2] = image->buf[input_index+2]; - } - } - patch_index++; - } - } - return patch; -} - -// inspired from LLaVA-UHD: -// -> https://arxiv.org/pdf/2403.11703 -// -> https://github.com/thunlp/LLaVA-UHD -// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118 -static std::vector> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) { - const std::pair original_size={img->nx,img->ny}; - const int original_width = img->nx; - const int original_height = img->ny; - const float log_ratio = log(1.0*original_width/original_height); // - const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution); - const int multiple = fmin(ceil(ratio), max_slice_nums); - - std::vector> images; - LOG_TEE("%s: multiple %d\n", __func__, multiple); - images.push_back(std::vector()); - - if(multiple <= 1){ - auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true); - clip_image_u8 *source_image = clip_image_u8_init(); - bicubic_resize(*img, *source_image, best_size.first, best_size.second); - // source_image = image.resize(best_size, Image.Resampling.BICUBIC) - images[images.size()-1].push_back(source_image); - } - else if(multiple > 1){ - - std::vector candidate_split_grids_nums; - for (int i : {multiple - 1, multiple, multiple + 1}) { - if (i == 1 || i > max_slice_nums) { - continue; - } - candidate_split_grids_nums.push_back(i); - } - - auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size); - clip_image_u8 *source_image = clip_image_u8_init(); - bicubic_resize(*img, *source_image, best_size.first, best_size.second); - // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) - LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second); - images[images.size()-1].push_back(source_image); - - std::vector> candidate_grids; - - for (int split_grids_nums : candidate_split_grids_nums) { - int m = 1; - while (m <= split_grids_nums) { - if (split_grids_nums % m == 0) { - candidate_grids.emplace_back(m, split_grids_nums / m); - } - ++m; - } - } - - std::pair best_grid{1, 1}; - float min_error = std::numeric_limits::infinity(); - - for (const auto& grid : candidate_grids) { - float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second)); - if (error < min_error) { - best_grid = grid; - min_error = error; - } - } - LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second); - - auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true); - clip_image_u8 *refine_image = clip_image_u8_init(); - bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second); - - LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second); - - // split_to_patches - int width = refine_image->nx; - int height = refine_image->ny; - int grid_x = int(width / best_grid.first); - int grid_y = int(height / best_grid.second); - for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){ - images.push_back(std::vector()); - for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){ - clip_image_u8 * patch = clip_image_u8_init(); - patch->nx = grid_x; - patch->ny = grid_y; - patch->buf.resize(3 * patch->nx * patch->ny); - for (int y = patches_i; y < patches_i + grid_y; ++y) { - for (int x = patches_j; x < patches_j + grid_x; ++x) { - const int i = 3 * (y * refine_image->nx + x); - const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j)); - patch->buf[j] = refine_image->buf[i]; - patch->buf[j+1] = refine_image->buf[i+1]; - patch->buf[j+2] = refine_image->buf[i+2]; - } - } - images[images.size()-1].push_back(patch); - } - } - } - return images; -} - -struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img) { - std::vector> imgs = uhd_slice_image(img); - for (size_t i = 0; i < imgs.size(); ++i){ - for (size_t j = 0; j < imgs[i].size(); ++j) { - LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny); - } - } - struct uhd_image_embed * results = new uhd_image_embed(); - - for (size_t i = 0; i < imgs.size(); ++i){ - results->image_embeds.push_back(std::vector()); - for (size_t j = 0; j < imgs[i].size(); ++j) { - float* image_embed = NULL; - int n_image_pos = 0; - int patch_size=14; - struct clip_image_size * load_image_size = clip_image_size_init(); - load_image_size->width = imgs[i][j]->nx; - load_image_size->height = imgs[i][j]->ny; - LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height); - clip_add_load_image_size(ctx_clip, load_image_size); - bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, only_v2_5_reshape_by_patch(imgs[i][j], patch_size), &image_embed, &n_image_pos); - if (!image_embed_result) { - LOG_TEE("%s: coulnd't embed the image\n", __func__); - return NULL; - } - - auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed)); - result->embed = image_embed; - result->n_image_pos = n_image_pos; - results->image_embeds[i].push_back(result); - } - } - return results; -} - -struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) { - unsigned char* image_bytes; - long image_bytes_length; - auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); - if (!loaded) { - LOG_TEE("%s: failed to load %s\n", __func__, image_path); - return NULL; - } - clip_image_u8 * img = clip_image_u8_init(); - if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { - clip_image_u8_free(img); - LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__); - return NULL; - } - - struct uhd_image_embed * embeds = llava_image_embed_make_with_bytes_uhd(ctx_clip, n_threads, img); - - clip_image_u8_free(img); - free(image_bytes); - return embeds; -} - -void llava_image_embed_free_uhd(struct uhd_image_embed * embed) { - for (size_t i = 0; i < embed->image_embeds.size(); ++i){ - for (size_t j = 0; j < embed->image_embeds[i].size(); ++j){ - free(embed->image_embeds[i][j]->embed); - free(embed->image_embeds[i][j]); - } - embed->image_embeds[i] = std::vector(); - } - embed->image_embeds = std::vector>(); -} \ No newline at end of file diff --git a/examples/llava/llava.h b/examples/llava/llava.h index 49cb0274e..f37302aa0 100644 --- a/examples/llava/llava.h +++ b/examples/llava/llava.h @@ -18,13 +18,11 @@ #endif struct clip_ctx; -struct uhd_image_embed; #ifdef __cplusplus extern "C" { #endif -struct uhd_image_embed; struct llava_image_embed { float * embed; int n_image_pos; @@ -39,14 +37,8 @@ LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length); /** build an image embed from a path to an image filename */ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); -LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); /** free an embedding made with llava_image_embed_make_* */ - -/** build an image embed from image file bytes */ -LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_bytes_uhd(struct clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img); -/** build an image embed from a path to an image filename */ -LLAVA_API struct uhd_image_embed * llava_image_embed_make_with_filename_uhd(struct clip_ctx * ctx_clip, int n_threads, const char * image_path); -LLAVA_API void llava_image_embed_free_uhd(struct uhd_image_embed * embed); +LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */ LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past); diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 483a0fbb0..6a1ae358d 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -9,10 +9,6 @@ #include #include -struct uhd_image_embed { - std::vector> image_embeds; -}; - struct llava_context { struct clip_ctx * ctx_clip = NULL; struct llama_context * ctx_llama = NULL; @@ -30,7 +26,7 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v LOG_TEE("%s", text); } -struct llama_model * llava_init(gpt_params * params) { +static struct llama_model * llava_init(gpt_params * params) { llama_backend_init(); llama_numa_init(params->numa); @@ -44,7 +40,7 @@ struct llama_model * llava_init(gpt_params * params) { return model; } -struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { +static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { auto prompt = params->prompt; if (prompt.empty()) { prompt = "describe the image in detail."; @@ -73,13 +69,18 @@ struct llava_context * llava_init_context(gpt_params * params, llama_model * mod return ctx_llava; } -void llava_free(struct llava_context * ctx_llava) { +static void llava_free(struct llava_context * ctx_llava) { + if (ctx_llava->ctx_clip) { + clip_free(ctx_llava->ctx_clip); + ctx_llava->ctx_clip = NULL; + } + llama_free(ctx_llava->ctx_llama); llama_free_model(ctx_llava->model); llama_backend_free(); } -struct clip_ctx * clip_init_context(gpt_params * params) { +static struct clip_ctx * clip_init_context(gpt_params * params) { const char * clip_path = params->mmproj.c_str(); auto prompt = params->prompt; @@ -90,18 +91,7 @@ struct clip_ctx * clip_init_context(gpt_params * params) { return ctx_clip; } -struct uhd_image_embed * minicpmv_image_embed(gpt_params * params, const std::string & fname){ - auto ctx_clip = clip_init_context(params); - auto image_embed_and_slices = llava_image_embed_make_with_filename_uhd(ctx_clip, params->n_threads, fname.c_str()); - if (ctx_clip) { - clip_free(ctx_clip); - ctx_clip = NULL; - } - return image_embed_and_slices; -} - - -bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { +static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { int N = (int) tokens.size(); for (int i = 0; i < N; i += n_batch) { int n_eval = (int) tokens.size() - i; @@ -117,45 +107,57 @@ bool eval_tokens(struct llama_context * ctx_llama, std::vector toke return true; } -bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { +static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { std::vector tokens; tokens.push_back(id); return eval_tokens(ctx_llama, tokens, 1, n_past); } -bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ +static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ std::string str2 = str; std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); return eval_tokens(ctx_llama, embd_inp, n_batch, n_past); } -void process_image(struct llava_context * ctx_llava, struct uhd_image_embed * image_embed_slices, gpt_params * params, int &n_past) { - std::string system_prompt; +static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) { + float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip)); + std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip)); + + auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); + slice_embed->embed = image_embed; + slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip); + llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past); + llava_image_embed_free(slice_embed); +} +static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) { + std::string system_prompt; + int idx = 0; + int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip); system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"; LOG_TEE("%s: image token past: %d\n", __func__, n_past); eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); - llava_eval_image_embed(ctx_llava->ctx_llama, image_embed_slices->image_embeds[0][0], params->n_batch, &n_past); + process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (image_embed_slices->image_embeds.size() > 1) { + if (num_image_embeds > 1) { + size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - for (size_t i = 1; i < image_embed_slices->image_embeds.size(); ++i) { - for (size_t j = 0; j < image_embed_slices->image_embeds[i].size(); ++j) { + for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { + for (size_t j = 0; j < num_image_embeds_col; ++j) { eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - llava_eval_image_embed(ctx_llava->ctx_llama, image_embed_slices->image_embeds[i][j], params->n_batch, &n_past); + process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (j == image_embed_slices->image_embeds[i].size() - 1) { + if (j == num_image_embeds_col - 1) { eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); } } } eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - } LOG_TEE("%s: image token past: %d\n", __func__, n_past); } -const char * sample(struct llama_sampling_context * ctx_sampling, +static const char * sample(struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_llama, int * n_past) { const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL); @@ -171,9 +173,9 @@ const char * sample(struct llama_sampling_context * ctx_sampling, } static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){ - auto embeds = minicpmv_image_embed(params, fname); - auto image_embed_slices = embeds->image_embeds; - if (!image_embed_slices[0][0]) { + auto ctx_clip = clip_init_context(params); + auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str()); + if (!embeds) { std::cerr << "error: failed to load image " << fname << ". Terminating\n\n"; return NULL; } @@ -191,7 +193,7 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri } const int64_t t_llava_init_start_us = ggml_time_us(); auto ctx_llava = llava_init_context(params, model); - + ctx_llava->ctx_clip = ctx_clip; const int64_t t_llava_init_end_us = ggml_time_us(); float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0; LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); @@ -202,7 +204,7 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0; LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); - llava_image_embed_free_uhd(embeds); + llava_image_embed_free(embeds); return ctx_llava; } @@ -220,7 +222,7 @@ static struct llama_sampling_context * llama_init(struct llava_context * ctx_lla return ctx_sampling; } -static const char * llama_loop(struct minicpmv_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){ +static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){ const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); return tmp;