remove uhd_image_embed
This commit is contained in:
parent
72b962925b
commit
f3d400dac0
6 changed files with 291 additions and 365 deletions
|
@ -563,14 +563,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
const auto & model = ctx->vision_model;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int image_size = hparams.image_size;
|
||||
const int image_size = hparams.image_size;
|
||||
int image_size_width = image_size;
|
||||
int image_size_height = image_size;
|
||||
if (ctx->has_minicpmv_projector) {
|
||||
if(load_image_size==nullptr){
|
||||
load_image_size= clip_image_size_init();
|
||||
}
|
||||
LOG_TEE("%s : %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||
LOG_TEE("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
||||
image_size_width = load_image_size->width;
|
||||
image_size_height = load_image_size->height;
|
||||
if (is_inf){
|
||||
|
@ -578,8 +578,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
image_size_height = imgs->data->ny;
|
||||
}
|
||||
}
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||
const int patch_size = hparams.patch_size;
|
||||
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
|
||||
const int hidden_size = hparams.hidden_size;
|
||||
const int n_head = hparams.n_head;
|
||||
|
@ -1748,17 +1748,182 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
|
|||
return patches;
|
||||
}
|
||||
|
||||
static int ensure_divide(int length, int patch_size) {
|
||||
return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
|
||||
}
|
||||
|
||||
static std::pair<int, int> uhd_find_best_resize(std::pair<int, int> original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
|
||||
int width = original_size.first;
|
||||
int height = original_size.second;
|
||||
if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
|
||||
float r = static_cast<float>(width) / height;
|
||||
height = static_cast<int>(scale_resolution / std::sqrt(r));
|
||||
width = static_cast<int>(height * r);
|
||||
}
|
||||
int best_width = ensure_divide(width, patch_size);
|
||||
int best_height = ensure_divide(height, patch_size);
|
||||
return std::make_pair(best_width, best_height);
|
||||
}
|
||||
|
||||
static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
|
||||
int width, height;
|
||||
std::tie(width, height) = original_size;
|
||||
int grid_x, grid_y;
|
||||
std::tie(grid_x, grid_y) = grid;
|
||||
|
||||
int refine_width = ensure_divide(width, grid_x);
|
||||
int refine_height = ensure_divide(height, grid_y);
|
||||
|
||||
int grid_width = refine_width / grid_x;
|
||||
int grid_height = refine_height / grid_y;
|
||||
|
||||
// auto best_grid_size = find_best_resize(std::make_tuple(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); (old line)
|
||||
auto best_grid_size = uhd_find_best_resize(std::make_pair(grid_width, grid_height), scale_resolution, patch_size, allow_upscale); // (new line) => fixes conversion for make_tuple to make_pair
|
||||
int best_grid_width, best_grid_height;
|
||||
std::tie(best_grid_width, best_grid_height) = best_grid_size;
|
||||
|
||||
// std::pair<int, int> refine_size = std::make_tuple(best_grid_width * grid_x, best_grid_height * grid_y); (old line)
|
||||
std::pair<int, int> refine_size = std::make_pair(best_grid_width * grid_x, best_grid_height * grid_y); // (new line)
|
||||
return refine_size;
|
||||
}
|
||||
|
||||
inline int clip(int x, int lower, int upper) {
|
||||
return std::max(lower, std::min(x, upper));
|
||||
}
|
||||
|
||||
static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
|
||||
std::vector<int> candidate_split_grids_nums;
|
||||
for (int i : {multiple - 1, multiple, multiple + 1}) {
|
||||
if (i == 1 || i > max_slice_nums) {
|
||||
continue;
|
||||
}
|
||||
candidate_split_grids_nums.push_back(i);
|
||||
}
|
||||
|
||||
std::vector<std::pair<int, int>> candidate_grids;
|
||||
for (int split_grids_nums : candidate_split_grids_nums) {
|
||||
int m = 1;
|
||||
while (m <= split_grids_nums) {
|
||||
if (split_grids_nums % m == 0) {
|
||||
candidate_grids.emplace_back(m, split_grids_nums / m);
|
||||
}
|
||||
++m;
|
||||
}
|
||||
}
|
||||
|
||||
std::pair<int, int> best_grid{1, 1};
|
||||
float min_error = std::numeric_limits<float>::infinity();
|
||||
for (const auto& grid : candidate_grids) {
|
||||
float error = std::abs(log_ratio - std::log(1.0 * grid.first / grid.second));
|
||||
if (error < min_error) {
|
||||
best_grid = grid;
|
||||
min_error = error;
|
||||
}
|
||||
}
|
||||
return best_grid;
|
||||
}
|
||||
|
||||
// inspired from LLaVA-UHD:
|
||||
// -> https://arxiv.org/pdf/2403.11703
|
||||
// -> https://github.com/thunlp/LLaVA-UHD
|
||||
// -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
|
||||
static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
|
||||
const std::pair<int, int> original_size={img->nx,img->ny};
|
||||
const int original_width = img->nx;
|
||||
const int original_height = img->ny;
|
||||
const float log_ratio = log(1.0*original_width/original_height);
|
||||
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
|
||||
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
||||
|
||||
std::vector<std::vector<clip_image_u8 *>> images;
|
||||
LOG_TEE("%s: multiple %d\n", __func__, multiple);
|
||||
images.push_back(std::vector<clip_image_u8 *>());
|
||||
|
||||
if(multiple <= 1){
|
||||
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
|
||||
clip_image_u8 *source_image = clip_image_u8_init();
|
||||
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
||||
// source_image = image.resize(best_size, Image.Resampling.BICUBIC)
|
||||
images[images.size()-1].push_back(source_image);
|
||||
}
|
||||
else if(multiple > 1){
|
||||
auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
|
||||
clip_image_u8 *source_image = clip_image_u8_init();
|
||||
bicubic_resize(*img, *source_image, best_size.first, best_size.second);
|
||||
// source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
|
||||
LOG_TEE("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
|
||||
images[images.size()-1].push_back(source_image);
|
||||
|
||||
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
||||
LOG_TEE("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
|
||||
|
||||
auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
|
||||
clip_image_u8 *refine_image = clip_image_u8_init();
|
||||
bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
|
||||
|
||||
LOG_TEE("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
|
||||
|
||||
// split_to_patches
|
||||
int width = refine_image->nx;
|
||||
int height = refine_image->ny;
|
||||
int grid_x = int(width / best_grid.first);
|
||||
int grid_y = int(height / best_grid.second);
|
||||
for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
|
||||
images.push_back(std::vector<clip_image_u8 *>());
|
||||
for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
|
||||
clip_image_u8 * patch = clip_image_u8_init();
|
||||
patch->nx = grid_x;
|
||||
patch->ny = grid_y;
|
||||
patch->buf.resize(3 * patch->nx * patch->ny);
|
||||
for (int y = patches_i; y < patches_i + grid_y; ++y) {
|
||||
for (int x = patches_j; x < patches_j + grid_x; ++x) {
|
||||
const int i = 3 * (y * refine_image->nx + x);
|
||||
const int j = 3 * ((y-patches_i) * patch->nx + (x-patches_j));
|
||||
patch->buf[j] = refine_image->buf[i];
|
||||
patch->buf[j+1] = refine_image->buf[i+1];
|
||||
patch->buf[j+2] = refine_image->buf[i+2];
|
||||
}
|
||||
}
|
||||
images[images.size()-1].push_back(patch);
|
||||
}
|
||||
}
|
||||
}
|
||||
return images;
|
||||
}
|
||||
|
||||
int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip){
|
||||
const int max_slice_nums=9;
|
||||
const int scale_resolution=448;
|
||||
const int original_width = ctx_clip->load_image_size->width;
|
||||
const int original_height = ctx_clip->load_image_size->height;
|
||||
const float log_ratio = log(1.0*original_width/original_height);
|
||||
const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
|
||||
const int multiple = fmin(ceil(ratio), max_slice_nums);
|
||||
std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
|
||||
return best_grid.first;
|
||||
}
|
||||
|
||||
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
|
||||
// res_imgs memory is being allocated here, previous allocations will be freed if found
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
|
||||
|
||||
|
||||
if(clip_is_minicpmv(ctx)){
|
||||
clip_image_f32 * res = clip_image_f32_init();
|
||||
normalize_image_u8_to_f32(img, res, ctx->image_mean, ctx->image_std);
|
||||
res_imgs->size = 1;
|
||||
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);
|
||||
res_imgs->size = 0;
|
||||
for (size_t i = 0; i < imgs.size(); ++i){
|
||||
res_imgs->size += imgs[i].size();
|
||||
}
|
||||
res_imgs->data = new clip_image_f32[res_imgs->size];
|
||||
res_imgs->data[0] = *res;
|
||||
clip_image_f32_free(res);
|
||||
int idx = 0;
|
||||
for (size_t i = 0; i < imgs.size(); ++i){
|
||||
for (size_t j = 0; j < imgs[i].size(); ++j) {
|
||||
LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
|
||||
clip_image_f32 * res = clip_image_f32_init();
|
||||
normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
|
||||
res_imgs->data[idx++] = *res;
|
||||
clip_image_f32_free(res);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -2163,7 +2328,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
if(ctx->load_image_size==nullptr){
|
||||
ctx->load_image_size= clip_image_size_init();
|
||||
}
|
||||
LOG_TEE("%s : %d %d\n", __func__, ctx->load_image_size->width, ctx->load_image_size->height);
|
||||
int pos_w = ctx->load_image_size->width/patch_size;
|
||||
int pos_h = ctx->load_image_size->height/patch_size;
|
||||
int embed_dim = 4096;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue