From 7e9fc7202ebfebd5b62251b2fe08517f0c751541 Mon Sep 17 00:00:00 2001 From: HimariO Date: Fri, 18 Oct 2024 18:59:47 +0800 Subject: [PATCH] make batch and clip utils compatible with qwen2vl --- examples/llava/clip.cpp | 51 +++++++++++++++++++++++++++------------- examples/llava/clip.h | 3 +++ examples/llava/llava.cpp | 37 +++++++++++++++++++++-------- src/llama.cpp | 9 +++++-- 4 files changed, 72 insertions(+), 28 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index fe744a6c1..0b648c5b7 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -673,6 +673,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ctx0, inp, hidden_size, patches_w * patches_h, batch_size); + // ggml_build_forward_expand(gf, inp); + // ggml_free(ctx0); + + // return gf; } else { inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); @@ -756,7 +760,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 if (ctx->has_qwen2vl_merger) { Q = ggml_mrope_ext( ctx0, Q, positions, nullptr, - d_head/2, mrope_sections, 2 /*LLAMA_ROPE_TYPE_NEOX8*/, 32768, 1000000, 1, 0, 1, 32, 1); + d_head/2, mrope_sections, 2 /*LLAMA_ROPE_TYPE_NEOX8*/, 32768, 10000, 1, 0, 1, 32, 1); } Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head)); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); @@ -769,7 +773,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 if (ctx->has_qwen2vl_merger) { K = ggml_mrope_ext( ctx0, K, positions, nullptr, - d_head/2, mrope_sections, 2 /*LLAMA_ROPE_TYPE_NEOX8*/, 32768, 1000000, 1, 0, 1, 32, 1); + d_head/2, mrope_sections, 2 /*LLAMA_ROPE_TYPE_NEOX8*/, 32768, 10000, 1, 0, 1, 32, 1); } K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); @@ -823,7 +827,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 cur = ggml_add(ctx0, embeddings, cur); embeddings = cur; + } + // ggml_build_forward_expand(gf, embeddings); + // ggml_free(ctx0); + + // return gf; // post-layernorm if (ctx->has_post_norm) { @@ -1623,6 +1632,10 @@ void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size ctx_clip->load_image_size = load_image_size; } +struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) { + return ctx_clip->load_image_size; +} + struct clip_image_size * clip_image_size_init() { struct clip_image_size * load_image_size = new struct clip_image_size(); load_image_size->width = 448; @@ -2086,6 +2099,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli // clip_image_f32 * res = clip_image_f32_init(); normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std); // res_imgs->data[0] = *res; + res_imgs->size = 1; // clip_image_f32_free(res); clip_image_u8_free(resized); @@ -2280,6 +2294,13 @@ size_t clip_embd_nbytes(const struct clip_ctx * ctx) { return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); } +size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) { + clip_image_f32 img; + img.nx = img_w; + img.ny = img_h; + return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); +} + int32_t clip_image_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.image_size; } @@ -2561,26 +2582,17 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int pw = image_size_width / patch_size; const int ph = image_size_height / patch_size; int* positions_data = (int*)malloc(ggml_nbytes(positions)); - // for (size_t y = 0; y < ph; y++) - // { - // for (size_t x = 0; x < pw; x++) - // { - // positions_data[y * pw + x] = y; - // positions_data[num_patches + (y * pw + x)] = x; - // positions_data[num_patches * 2 + (y * pw + x)] = 0; - // } - // } int ptr = -1; for (size_t y = 0; y < ph; y+=2) { for (size_t x = 0; x < pw; x+=2) { - for (size_t dy = 0; y < 2; y++) { - for (size_t dx = 0; x < 2; x++) { - positions_data[ptr++] = y + dy; - positions_data[ptr++] = x + dx; - positions_data[ptr++] = 0; + for (size_t dy = 0; dy < 2; dy++) { + for (size_t dx = 0; dx < 2; dx++) { + positions_data[ptr++] = y + dy; + positions_data[num_patches + ptr] = x + dx; + positions_data[num_patches * 2 + ptr] = 0; } } } @@ -2780,6 +2792,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return 3584; } } + if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + return ctx->vision_model.mm_1_b->ne[0]; + } std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); @@ -2792,6 +2807,10 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) { return 0; } +bool clip_is_qwen2vl(const struct clip_ctx * ctx) { + return ctx->has_qwen2vl_merger; +} + bool tmp_clip_image_encode (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { clip_image_f32 clip_img; diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 6d992d901..9f75c67db 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -45,6 +45,7 @@ CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); +CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx); @@ -61,6 +62,7 @@ CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx); CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); +CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip); CLIP_API struct clip_image_size * clip_image_size_init(); CLIP_API struct clip_image_u8 * clip_image_u8_init (); @@ -87,6 +89,7 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx); +CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx); CLIP_API bool tmp_clip_image_encode (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); #ifdef __cplusplus diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 4ca53a0b8..80fba1c51 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -259,25 +259,33 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip); - if (clip_is_minicpmv(ctx_clip)) { + if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) { std::vector image_embd_v; image_embd_v.resize(img_res_v.size); struct clip_image_size * load_image_size = clip_image_size_init(); + for (size_t i = 0; i < img_res_v.size; i++) { const int64_t t_img_enc_step_start_us = ggml_time_us(); - image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); + image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny)); int patch_size=14; load_image_size->width = img_res_v.data[i].nx; load_image_size->height = img_res_v.data[i].ny; clip_add_load_image_size(ctx_clip, load_image_size); + bool encoded = false; - int has_minicpmv_projector = clip_is_minicpmv(ctx_clip); - if (has_minicpmv_projector == 2) { - encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); - } - else if (has_minicpmv_projector == 3) { + if (clip_is_qwen2vl(ctx_clip)) { encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); } + else { + int has_minicpmv_projector = clip_is_minicpmv(ctx_clip); + if (has_minicpmv_projector == 2) { + encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); + } + else if (has_minicpmv_projector == 3) { + encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); + } + } + if (!encoded) { LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; @@ -290,8 +298,11 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli int n_img_pos_out = 0; for (size_t i = 0; i < image_embd_v.size(); i++) { - std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip)); - n_img_pos_out += clip_n_patches(ctx_clip); + std::memcpy( + image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), + image_embd_v[i], + clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny)); + n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]); } *n_img_pos = n_img_pos_out; for (size_t i = 0; i < image_embd_v.size(); i++) { @@ -387,7 +398,13 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co if (clip_is_minicpmv(ctx_clip)) { num_max_patches = 10; } - float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model + float * image_embd; + if (clip_is_qwen2vl(ctx_clip)) { + // qwen2vl don't split image into chunks, so `num_max_patches` is not needed. + image_embd = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img->nx, img->ny)); + } else { + image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model + } if (!image_embd) { LOG_ERR("Unable to allocate memory for image embeddings\n"); return false; diff --git a/src/llama.cpp b/src/llama.cpp index 2e42d206d..aa09dc98c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3332,6 +3332,10 @@ struct llama_context { // whether we are computing encoder output or decoder output bool is_encoding = false; + + // number of position id each token get, 1 for each token in most cases. + // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate. + int n_pos_per_token = 1; // output of the encoder part of the encoder-decoder models std::vector embd_enc; @@ -16874,6 +16878,7 @@ static struct ggml_cgraph * llama_build_graph( } break; case LLM_ARCH_QWEN2VL: { + lctx.n_pos_per_token = 3; result = llm.build_qwen2vl(); } break; case LLM_ARCH_QWEN2MOE: @@ -17098,8 +17103,8 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) if (ubatch.pos && lctx.inp_pos) { const int64_t n_tokens = ubatch.n_tokens; - - ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos)); + auto n_pos = lctx.n_pos_per_token; + ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos)); } if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {