From 07baee57c91b06bb698ede2871a6d54b12e4c2cd Mon Sep 17 00:00:00 2001
From: Yutong Dai
Date: Tue, 27 Aug 2024 00:18:21 +0000
Subject: [PATCH] xgenmm: add perceiver resampler projector and anyres patch
 handling

---
 .gitignore                                    |  20 +
 examples/xgenmm/CMakeLists.txt                |   7 +
 examples/xgenmm/clip.cpp                      | 214 +++++-
 examples/xgenmm/convert.sh                    |  14 +
 examples/xgenmm/playground.ipynb              |  48 ++
 .../xgenmm/test_anyres_handle_patches.cpp     | 637 ++++++++++++++++++
 examples/xgenmm/test_anyres_img.cpp           |  89 ++-
 .../xgenmm_convert_image_encoder_to_gguf.py   | 271 ++++++--
 8 files changed, 1189 insertions(+), 111 deletions(-)
 create mode 100644 examples/xgenmm/convert.sh
 create mode 100644 examples/xgenmm/test_anyres_handle_patches.cpp

diff --git a/.gitignore b/.gitignore
index 0223be685..99a22c4a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -133,3 +133,23 @@ poetry.toml
 # Test models for lora adapters
 /lora-tests
 examples/xgenmm/imgs/*.csv
+examples/xgenmm copy/clip.cpp
+examples/xgenmm copy/clip.h
+examples/xgenmm copy/CMakeLists.txt
+examples/xgenmm copy/convert.sh
+examples/xgenmm copy/debug.py
+examples/xgenmm copy/playground.ipynb
+examples/xgenmm copy/test_anyres_img.cpp
+examples/xgenmm copy/xgenmm_convert_image_encoder_to_gguf.py
+examples/xgenmm copy/xgenmm_surgery.py
+examples/xgenmm copy/xgenmm.cpp
+examples/xgenmm copy/xgenmm.h
+examples/xgenmm copy/bak/xgenmm-surgery copy.py
+examples/xgenmm copy/imgs/image_original_resize.csv
+examples/xgenmm copy/imgs/image_res_0.csv
+examples/xgenmm copy/imgs/image_res_1.csv
+examples/xgenmm copy/imgs/image_res_2.csv
+examples/xgenmm copy/imgs/image_res_3.csv
+examples/xgenmm copy/imgs/image_res_4.csv
+examples/xgenmm copy/imgs/image-1d100e9-1.jpg
+examples/xgenmm copy/imgs/image-1d100e9.jpg
diff --git a/examples/xgenmm/CMakeLists.txt b/examples/xgenmm/CMakeLists.txt
index 40b745fb5..2d7d81588 100644
--- a/examples/xgenmm/CMakeLists.txt
+++ b/examples/xgenmm/CMakeLists.txt
@@ -38,6 +38,13 @@ target_link_libraries(test_anyres_img PRIVATE common xgenmm ${CMAKE_THREAD_LIBS_
 target_compile_features(xgenmm PRIVATE cxx_std_11)
 
+set(TARGET test_anyres_handle_patches)
+add_executable(test_anyres_handle_patches test_anyres_handle_patches.cpp)
+install(TARGETS test_anyres_handle_patches RUNTIME)
+target_link_libraries(test_anyres_handle_patches PRIVATE common xgenmm ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(test_anyres_handle_patches PRIVATE cxx_std_11)
+
+
 # not implemented yet
 # set(TARGET xgenmm-cli)
 # add_executable(xgenmm-cli xgenmm-cli.cpp)
diff --git a/examples/xgenmm/clip.cpp b/examples/xgenmm/clip.cpp
index 1afc3e316..19d628405 100644
--- a/examples/xgenmm/clip.cpp
+++ b/examples/xgenmm/clip.cpp
@@ -85,6 +85,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
 #define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
 #define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
+#define KEY_HAS_XGENMM_PROJ "clip.has_xgenmm_projector"
 #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
 #define KEY_USE_GELU "clip.use_gelu"
 #define KEY_N_EMBD "clip.%s.embedding_length"
@@ -140,13 +141,17 @@ static std::string format(const char * fmt, ...) {
 #define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
 #define TN_MINICPMV_LN "resampler.ln_%s.%s"
+#define TN_XGENMM_ATTN "perceiver_resampler.blk.%d.attn.%s.%s"
+#define TN_XGENMM_FFN "perceiver_resampler.blk.%d.ffn.%s.%s"
 
-enum projector_type {
+enum projector_type
+{
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_LDPV2,
     PROJECTOR_TYPE_RESAMPLER,
+    PROJECTOR_TYPE_PERCEIVER_RESAMPLER,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -155,6 +160,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LDP, "ldp" },
     { PROJECTOR_TYPE_LDPV2, "ldpv2"},
     { PROJECTOR_TYPE_RESAMPLER, "resampler"},
+    { PROJECTOR_TYPE_PERCEIVER_RESAMPLER, "perceiver_resampler"}
 };
 
 
@@ -436,6 +442,30 @@ struct clip_layer {
     struct ggml_tensor * ln_2_b;
 };
 
+struct xgenmm_perceiver_resampler_layer
+{
+    // PerceiverAttention
+    int   dim      = 1152;
+    int   dim_head = 96;
+    int   heads    = 16;
+    float scale    = std::pow(dim_head, -0.5);
+    struct ggml_tensor *mm_model_k_w;
+    struct ggml_tensor *mm_model_q_w;
+    struct ggml_tensor *mm_model_v_w;
+    struct ggml_tensor *mm_model_o_w;
+    struct ggml_tensor *mm_model_ln_media_w;
+    struct ggml_tensor *mm_model_ln_media_b;
+    struct ggml_tensor *mm_model_ln_latents_w;
+    struct ggml_tensor *mm_model_ln_latents_b;
+
+    // FeedForward (hidden width = mult * dim)
+    int mult = 4;
+    struct ggml_tensor *mm_model_ffn_ln_w;
+    struct ggml_tensor *mm_model_ffn_ln_b;
+    struct ggml_tensor *mm_model_ffn_linear_up_w;
+    struct ggml_tensor *mm_model_ffn_linear_down_w;
+};
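+
+// Shape bookkeeping for the resampler above (derived from the default values
+// in xgenmm_perceiver_resampler_layer; once these stop being hard coded they
+// should come from the gguf metadata instead):
+//   - inner attention width = heads * dim_head = 16 * 96 = 1536; to_q/to_k/to_v
+//     project dim (1152) -> 1536 and to_out projects 1536 -> dim
+//   - scale = dim_head^(-0.5) = 96^(-0.5) ~= 0.102, applied to Q before K^T Q
+//   - per layer the latents attend to concat(image tokens, latents), so
+//     q_len = n_latents and kv_len = n_image_tokens + n_latents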
 
 struct clip_vision_model {
     struct clip_hparams hparams;
 
@@ -524,13 +554,25 @@ struct clip_vision_model {
     struct ggml_tensor * mm_model_ln_kv_b;
     struct ggml_tensor * mm_model_ln_post_w;
     struct ggml_tensor * mm_model_ln_post_b;
+
+    // XGenMM projection
+    struct ggml_tensor *mm_model_latents;
+    struct ggml_tensor *mm_model_projection_w;
+    struct ggml_tensor *mm_model_projection_b;
+    std::vector<xgenmm_perceiver_resampler_layer> mm_model_layers;
+    struct ggml_tensor *mm_model_norm_w;
+    struct ggml_tensor *mm_model_norm_b;
 };
 
+
+
+
 struct clip_ctx {
     bool has_text_encoder = false;
     bool has_vision_encoder = false;
     bool has_llava_projector = false;
     bool has_minicpmv_projector = false;
+    bool has_xgenmm_projector = false;
     int minicpmv_version = 2;
 
     struct clip_vision_model vision_model;
@@ -560,7 +602,7 @@ struct clip_ctx {
     struct clip_image_size * load_image_size;
 };
 
-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false, ggml_tensor *attn_bias_input = nullptr) {
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
         return nullptr;
@@ -584,6 +626,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             image_size_height = imgs->data->ny;
         }
     }
+    if (ctx->has_xgenmm_projector) {
+        // TODO: prepare xgenmm-specific inputs here (e.g. image attention masks)
+        printf("use has_xgenmm_projector\n");
+    }
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
@@ -591,7 +637,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
     int n_layer = hparams.n_layer;
-    const float eps = hparams.eps;
+    const float eps  = hparams.eps;
 
     const int batch_size = imgs->size;
 
@@ -625,25 +671,30 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     struct ggml_tensor * pos_embed = nullptr;
 
     if (ctx->has_llava_projector) {
+        printf("use has_llava_projector\n");
         // concat class_embeddings and patch_embeddings
         if (ctx->has_class_embedding) {
+            printf("clip: concatenating class and patch embeddings\n");
             embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+            printf("clip: created embeddings tensor\n");
             ggml_set_name(embeddings, "embeddings");
             ggml_set_input(embeddings);
+            printf("clip: embeddings marked as graph input\n");
             embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
                     embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
             embeddings = ggml_acc(ctx0, embeddings, inp,
                     embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
         }
     }
-
+    printf("clip: creating positions input\n");
     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
     ggml_set_input(positions);
+    printf("clip: positions input set\n");
     embeddings =
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
-
+    printf("clip: position embeddings added\n");
     if (ctx->has_minicpmv_projector) {
         int pos_w = image_size_width/patch_size;
         int pos_h = image_size_height/patch_size;
@@ -1008,6 +1059,124 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             GGML_ASSERT(false);
         }
     }
+    // xgenmm projector
+    else if (ctx->has_xgenmm_projector)
+    {
+        if (ctx->proj_type == PROJECTOR_TYPE_PERCEIVER_RESAMPLER)
+        {
+            struct ggml_tensor *self_latents   = model.mm_model_latents;
+            struct ggml_tensor *img_embeddings = embeddings;
+            // FIXME: hard coded for now
+            int n_layer = 6;
+            const float scale    = model.mm_model_layers[0].scale;
+            const int   num_head = model.mm_model_layers[0].heads;
+            const int   dim_head = model.mm_model_layers[0].dim_head;
+            const int   q_len    = self_latents->ne[1];
+            const int   kv_len   = img_embeddings->ne[1] + self_latents->ne[1];  // img_embeddings and latents are concatenated below
+            const int   hidden_size = dim_head * num_head;
+            // TODO: repeat for (batch_size, n_query_tokens, dim)
+            ggml_tensor *latents = self_latents;
+
+            for (int il = 0; il < n_layer; ++il)
+            {
+                struct ggml_tensor *residual = latents;
+                auto &layer = model.mm_model_layers[il];
+
+                // layer norm on both attention inputs
+                struct ggml_tensor *img_embeddings_normalized = ggml_norm(ctx0, img_embeddings, eps);
+                img_embeddings_normalized =
+                    ggml_add(ctx0, ggml_mul(ctx0, img_embeddings_normalized, layer.mm_model_ln_media_w),
+                             layer.mm_model_ln_media_b);
+
+                latents = ggml_norm(ctx0, latents, eps);
+                latents =
+                    ggml_add(ctx0, ggml_mul(ctx0, latents, layer.mm_model_ln_latents_w), layer.mm_model_ln_latents_b);
+
+                // cross attention: latents attend to concat(image tokens, latents)
+                {
+                    struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.mm_model_q_w, latents);
+                    Q = ggml_scale_inplace(ctx0, Q, scale);
+                    struct ggml_tensor *kv_inputs = ggml_concat(ctx0, img_embeddings_normalized, latents, 1);
+                    // if (vision_attn_masks) {
+                    //     // pad the mask with ones for the latent positions:
+                    //     const int dim0 = latents->ne[1];  // seq length
+                    //     const int dim1 = batch_size;
+                    //     struct ggml_tensor *all_one_tensor = ggml_new_tensor_2d(ctx0, latents->type, dim0, dim1);
+                    //     ggml_set_name(all_one_tensor, "all_one_tensor");
+                    //     ggml_set_input(all_one_tensor);
+                    //     vision_attn_masks = ggml_concat(ctx0, vision_attn_masks, all_one_tensor, 0);
+                    // }
+                    struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.mm_model_k_w, kv_inputs);
+                    struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.mm_model_v_w, kv_inputs);
+
+                    // split heads: (dim_head, len, num_head * batch) for the per-head matmuls
+                    Q = ggml_reshape_4d(ctx0, Q, dim_head, num_head, q_len, batch_size);
+                    Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
+                    Q = ggml_reshape_3d(ctx0, Q, dim_head, q_len, num_head * batch_size);
+
+                    K = ggml_reshape_4d(ctx0, K, dim_head, num_head, kv_len, batch_size);
+                    K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
+                    K = ggml_reshape_3d(ctx0, K, dim_head, kv_len, num_head * batch_size);
+
+                    V = ggml_reshape_4d(ctx0, V, dim_head, num_head, kv_len, batch_size);
+                    V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
+                    V = ggml_reshape_3d(ctx0, V, kv_len, dim_head, num_head * batch_size);
+
+                    struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
+
+                    // TODO: apply the vision attention mask here (see the commented-out block above)
+                    if (attn_bias_input)
+                    {
+                        KQ = ggml_add(ctx0, KQ, attn_bias_input);
+                    }
+
+                    // ggml_soft_max_inplace uses a numerically stable softmax, i.e. it computes
+                    // (sim - sim.amax(dim=-1, keepdim=True)).softmax(dim=-1)
+                    KQ = ggml_soft_max_inplace(ctx0, KQ);
+
+                    struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
+                    KQV = ggml_reshape_4d(ctx0, KQV, dim_head, q_len, num_head, batch_size);
+                    KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+                    KQV = ggml_cont_3d(ctx0, KQV, hidden_size, q_len, batch_size);
+
+                    latents = ggml_mul_mat(ctx0, layer.mm_model_o_w, KQV);
+                }
+
+                // residual connection
+                latents  = ggml_add(ctx0, latents, residual);
+                residual = latents;  // update residual
+
+                // FFN
+                {
+                    // layer norm
+                    latents = ggml_norm(ctx0, latents, eps);
+                    latents = ggml_add(ctx0, ggml_mul(ctx0, latents, layer.mm_model_ffn_ln_w), layer.mm_model_ffn_ln_b);
+                    // feed forward
+                    latents = ggml_mul_mat(ctx0, layer.mm_model_ffn_linear_up_w, latents);
+                    latents = ggml_gelu_inplace(ctx0, latents);
+                    latents = ggml_mul_mat(ctx0, layer.mm_model_ffn_linear_down_w, latents);
+                }
+
+                // residual connection
+                latents = ggml_add(ctx0, latents, residual);
+            }
+
+            // post layer norm and final projection
+            latents = ggml_norm(ctx0, latents, eps);
+            latents = ggml_add(ctx0, ggml_mul(ctx0, latents, model.mm_model_norm_w), model.mm_model_norm_b);
+            latents =
+                ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_projection_w, latents), model.mm_model_projection_b);
+            embeddings = latents;
+        }
+        else
+        {
+            GGML_ASSERT(false);
+        }
+    }
 
     // build the graph
     ggml_build_forward_expand(gf, embeddings);
@@ -1450,6 +1619,40 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             vision_model.mm_model_ln_post_w = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "weight"));
             vision_model.mm_model_ln_post_b = get_tensor(new_clip->ctx_data, format(TN_MINICPMV_LN, "post", "bias"));
         }
+        else if (new_clip->proj_type == PROJECTOR_TYPE_PERCEIVER_RESAMPLER) {
+            vision_model.mm_model_latents = get_tensor(new_clip->ctx_data, "perceiver_resampler.latents");
+            vision_model.mm_model_projection_w =
+                get_tensor(new_clip->ctx_data, "perceiver_resampler.projection.weight");
+            vision_model.mm_model_projection_b =
+                get_tensor(new_clip->ctx_data, "perceiver_resampler.projection.bias");
+            // FIXME: hard coded for now
+            int n_layer = 6;
+            vision_model.mm_model_layers.resize(n_layer);
+            for (int il = 0; il < n_layer; ++il)
+            {
+                auto &layer = vision_model.mm_model_layers[il];
+                layer.mm_model_k_w = get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "to_k", "weight"));
+                layer.mm_model_q_w = get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "to_q", "weight"));
+                layer.mm_model_v_w = get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "to_v", "weight"));
+                layer.mm_model_o_w = get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "to_out", "weight"));
+                layer.mm_model_ln_media_w =
+                    get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "norm_media", "weight"));
+                layer.mm_model_ln_media_b =
+                    get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "norm_media", "bias"));
+                layer.mm_model_ln_latents_w =
+                    get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "norm_latents", "weight"));
+                layer.mm_model_ln_latents_b =
+                    get_tensor(new_clip->ctx_data, format(TN_XGENMM_ATTN, il, "norm_latents", "bias"));
+                layer.mm_model_ffn_ln_w = get_tensor(new_clip->ctx_data, format(TN_XGENMM_FFN, il, "ln", "weight"));
+                layer.mm_model_ffn_ln_b = get_tensor(new_clip->ctx_data, format(TN_XGENMM_FFN, il, "ln", "bias"));
+                layer.mm_model_ffn_linear_up_w =
+                    get_tensor(new_clip->ctx_data, format(TN_XGENMM_FFN, il, "linear_up", "weight"));
+                layer.mm_model_ffn_linear_down_w =
+                    get_tensor(new_clip->ctx_data, format(TN_XGENMM_FFN, il, "linear_down", "weight"));
+            }
+            vision_model.mm_model_norm_w = get_tensor(new_clip->ctx_data, "perceiver_resampler.ln.weight");
+            vision_model.mm_model_norm_b = get_tensor(new_clip->ctx_data, "perceiver_resampler.ln.bias");
+        }
         else {
             std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
             throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
@@ -2009,6 +2212,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
             possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
         }
         std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
+        printf("best_resolution: %d %d\n", best_resolution.first, best_resolution.second);
         // clip_image_save_to_bmp(*img, "input.bmp");
         resize_and_pad_image(*img, *temp, best_resolution);  // we do not pad with mean-bg color anymore in llava-1.6
         // clip_image_save_to_bmp(*temp, "resized.bmp");
diff --git a/examples/xgenmm/convert.sh b/examples/xgenmm/convert.sh
new file mode 100644
index 000000000..75217c946
--- /dev/null
+++ b/examples/xgenmm/convert.sh
@@ -0,0 +1,14 @@
+source /export/share/yutong/miniconda3/bin/activate
+conda activate xgenmm-flamingo
+which python
+
+# # step 1: surgery
+# python xgenmm_surgery.py
+
+# step 2: convert to gguf (vit + projector)
+python xgenmm_convert_image_encoder_to_gguf.py \
+    --surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
+    --output_dirname gguf_test \
+    --version siglip_kosmos_phi3_4k_instruct \
+    --use_f32 \
+
diff --git a/examples/xgenmm/playground.ipynb b/examples/xgenmm/playground.ipynb
index b77350436..38feb1e95 100644
--- a/examples/xgenmm/playground.ipynb
+++ b/examples/xgenmm/playground.ipynb
@@ -5,6 +5,54 @@
    "metadata": {},
    "source": []
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# llama.cpp image layout\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "channel: 0\n",
+      "row 0:   0   3   6   9  12 \n",
+      "row 1:  15  18  21  24  27 \n",
+      "row 2:  30  33  36  39  42 \n",
\n", + "------------------------------\n", + "channel: 1\n", + "row 0: 1 4 7 10 13 \n", + "row 1: 16 19 22 25 28 \n", + "row 2: 31 34 37 40 43 \n", + "------------------------------\n", + "channel: 2\n", + "row 0: 2 5 8 11 14 \n", + "row 1: 17 20 23 26 29 \n", + "row 2: 32 35 38 41 44 \n", + "------------------------------\n" + ] + } + ], + "source": [ + "nx = 5\n", + "ny = 3\n", + "for k in range(3):\n", + " print(f'channel: {k}')\n", + " for y in range(ny):\n", + " print(f'row {y}:', end=' ')\n", + " for x in range(nx):\n", + " print(f\"{3*(y*nx + x) + k: 3d}\", end=' ')\n", + " print()\n", + " print('-'*30)\n", + " " + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/examples/xgenmm/test_anyres_handle_patches.cpp b/examples/xgenmm/test_anyres_handle_patches.cpp new file mode 100644 index 000000000..53fc5164d --- /dev/null +++ b/examples/xgenmm/test_anyres_handle_patches.cpp @@ -0,0 +1,637 @@ +#include +#include +#include +#include +#include +#include + +#include "clip.h" +#include "common.h" +#include "ggml.h" +#include "llama.h" +#include "xgenmm.h" +#ifndef _MSC_VER +#include +#endif +#include +#include +#include + +template +std::string type_name() +{ + typedef typename std::remove_reference::type TR; + std::unique_ptr own( +#ifndef _MSC_VER + abi::__cxa_demangle(typeid(TR).name(), nullptr, nullptr, nullptr), +#else + nullptr, +#endif + std::free); + std::string r = own != nullptr ? own.get() : typeid(TR).name(); + if (std::is_const::value) r += " const"; + if (std::is_volatile::value) r += " volatile"; + if (std::is_lvalue_reference::value) + r += "&"; + else if (std::is_rvalue_reference::value) + r += "&&"; + return r; +} + +struct clip_image_u8 +{ + int nx; + int ny; + + std::vector buf; +}; + +struct clip_image_f32 +{ + int nx; + int ny; + + std::vector buf; +}; + +inline int clip(int x, int lower, int upper) { return std::max(lower, std::min(x, upper)); } + +static bool bicubic_resize(const clip_image_u8& img, clip_image_u8& dst, int target_width, int target_height) +{ + const int nx = img.nx; + const int ny = img.ny; + + dst.nx = target_width; + dst.ny = target_height; + dst.buf.resize(3 * target_width * target_height); + + float Cc; + float C[5]; + float d0, d2, d3, a0, a1, a2, a3; + int i, j, k, jj; + int x, y; + float dx, dy; + float tx, ty; + + tx = (float)nx / (float)target_width; + ty = (float)ny / (float)target_height; + + // Bicubic interpolation; adapted from ViT.cpp, inspired from : + // -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36 + // -> https://en.wikipedia.org/wiki/Bicubic_interpolation + + for (i = 0; i < target_height; i++) + { + for (j = 0; j < target_width; j++) + { + x = (int)(tx * j); + y = (int)(ty * i); + + dx = tx * j - x; + dy = ty * i - y; + + for (k = 0; k < 3; k++) + { + for (jj = 0; jj <= 3; jj++) + { + d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - + img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - + img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - + img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k]; + + a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3; + a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2; + a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * 
  {
   "cell_type": "markdown",
   "metadata": {},
diff --git a/examples/xgenmm/test_anyres_handle_patches.cpp b/examples/xgenmm/test_anyres_handle_patches.cpp
new file mode 100644
index 000000000..53fc5164d
--- /dev/null
+++ b/examples/xgenmm/test_anyres_handle_patches.cpp
@@ -0,0 +1,637 @@
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+
+#include "clip.h"
+#include "common.h"
+#include "ggml.h"
+#include "llama.h"
+#include "xgenmm.h"
+#ifndef _MSC_VER
+#include <cxxabi.h>
+#endif
+#include <algorithm>
+#include <cerrno>
+#include <cstdint>
+#include <limits>
+#include <map>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <typeinfo>
+#include <utility>
+#include <vector>
+
+template <class T>
+std::string type_name()
+{
+    typedef typename std::remove_reference<T>::type TR;
+    std::unique_ptr<char, void (*)(void*)> own(
+#ifndef _MSC_VER
+        abi::__cxa_demangle(typeid(TR).name(), nullptr, nullptr, nullptr),
+#else
+        nullptr,
+#endif
+        std::free);
+    std::string r = own != nullptr ? own.get() : typeid(TR).name();
+    if (std::is_const<TR>::value) r += " const";
+    if (std::is_volatile<TR>::value) r += " volatile";
+    if (std::is_lvalue_reference<T>::value)
+        r += "&";
+    else if (std::is_rvalue_reference<T>::value)
+        r += "&&";
+    return r;
+}
+
+struct clip_image_u8
+{
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+struct clip_image_f32
+{
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
+inline int clip(int x, int lower, int upper) { return std::max(lower, std::min(x, upper)); }
+
+static bool bicubic_resize(const clip_image_u8& img, clip_image_u8& dst, int target_width, int target_height)
+{
+    const int nx = img.nx;
+    const int ny = img.ny;
+
+    dst.nx = target_width;
+    dst.ny = target_height;
+    dst.buf.resize(3 * target_width * target_height);
+
+    float Cc;
+    float C[5];
+    float d0, d2, d3, a0, a1, a2, a3;
+    int   i, j, k, jj;
+    int   x, y;
+    float dx, dy;
+    float tx, ty;
+
+    tx = (float)nx / (float)target_width;
+    ty = (float)ny / (float)target_height;
+
+    // Bicubic interpolation; adapted from ViT.cpp, inspired from :
+    //  -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
+    //  -> https://en.wikipedia.org/wiki/Bicubic_interpolation
+
+    for (i = 0; i < target_height; i++)
+    {
+        for (j = 0; j < target_width; j++)
+        {
+            x = (int)(tx * j);
+            y = (int)(ty * i);
+
+            dx = tx * j - x;
+            dy = ty * i - y;
+
+            for (k = 0; k < 3; k++)
+            {
+                for (jj = 0; jj <= 3; jj++)
+                {
+                    d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] -
+                         img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+                    d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] -
+                         img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+                    d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] -
+                         img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+                    a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+
+                    a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
+                    a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
+                    a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
+
+                    C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
+
+                    d0 = C[0] - C[1];
+                    d2 = C[2] - C[1];
+                    d3 = C[3] - C[1];
+                    a0 = C[1];
+                    a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
+                    a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
+                    a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
+                    Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
+
+                    const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
+                    dst.buf[(i * target_width + j) * 3 + k] = Cc2;
+                }
+            }
+        }
+    }
+
+    return true;
+}
+
+enum projector_type
+{
+    PROJECTOR_TYPE_MLP,
+    PROJECTOR_TYPE_MLP_NORM,
+    PROJECTOR_TYPE_LDP,
+    PROJECTOR_TYPE_LDPV2,
+    PROJECTOR_TYPE_RESAMPLER,
+    PROJECTOR_TYPE_UNKNOWN,
+};
+
+static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
+    {PROJECTOR_TYPE_MLP, "mlp"},
+    {PROJECTOR_TYPE_LDP, "ldp"},
+    {PROJECTOR_TYPE_LDPV2, "ldpv2"},
+    {PROJECTOR_TYPE_RESAMPLER, "resampler"},
+};
+
+struct clip_hparams
+{
+    int32_t image_size;
+    int32_t patch_size;
+    int32_t hidden_size;
+    int32_t n_intermediate;
+    int32_t projection_dim;
+    int32_t n_head;
+    int32_t n_layer;
+
+    float eps;
+
+    char mm_patch_merge_type[32] = "flat";  // spatial_unpad or flat (default)
+
+    int32_t image_grid_pinpoints[32];
+    int32_t image_crop_resolution;
+};
+
+struct clip_layer
+{
+    // attention
+    struct ggml_tensor* k_w;
+    struct ggml_tensor* k_b;
+    struct ggml_tensor* q_w;
+    struct ggml_tensor* q_b;
+    struct ggml_tensor* v_w;
+    struct ggml_tensor* v_b;
+
+    struct ggml_tensor* o_w;
+    struct ggml_tensor* o_b;
+
+    // layernorm 1
+    struct ggml_tensor* ln_1_w;
+    struct ggml_tensor* ln_1_b;
+
+    // ff
+    struct ggml_tensor* ff_i_w;
+    struct ggml_tensor* ff_i_b;
+
+    struct ggml_tensor* ff_o_w;
+    struct ggml_tensor* ff_o_b;
+
+    // layernorm 2
+    struct ggml_tensor* ln_2_w;
+    struct ggml_tensor* ln_2_b;
+};
+
+struct clip_vision_model
+{
+    struct clip_hparams hparams;
+
+    // embeddings
+    struct ggml_tensor* class_embedding;
+    struct ggml_tensor* patch_embeddings;
+    struct ggml_tensor* patch_bias;
+    struct ggml_tensor* position_embeddings;
+
+    struct ggml_tensor* pre_ln_w;
+    struct ggml_tensor* pre_ln_b;
+
+    std::vector<clip_layer> layers;
+
+    struct ggml_tensor* post_ln_w;
+    struct ggml_tensor* post_ln_b;
+
+    struct ggml_tensor* projection;
+
+    // LLaVA projection
+    struct ggml_tensor* mm_0_w = NULL;
+    struct ggml_tensor* mm_0_b = NULL;
+    struct ggml_tensor* mm_2_w = NULL;
+    struct ggml_tensor* mm_2_b = NULL;
+
+    struct ggml_tensor* image_newline = NULL;
+
+    // Yi type models with mlp+normalization projection
+    struct ggml_tensor* mm_1_w = NULL;  // Yi type models have 0, 1, 3, 4
+    struct ggml_tensor* mm_1_b = NULL;
+    struct ggml_tensor* mm_3_w = NULL;
+    struct ggml_tensor* mm_3_b = NULL;
+    struct ggml_tensor* mm_4_w = NULL;
+    struct ggml_tensor* mm_4_b = NULL;
+
+    // MobileVLM projection
+    struct ggml_tensor* mm_model_mlp_1_w;
+    struct ggml_tensor* mm_model_mlp_1_b;
+    struct ggml_tensor* mm_model_mlp_3_w;
+    struct ggml_tensor* mm_model_mlp_3_b;
+    struct ggml_tensor* mm_model_block_1_block_0_0_w;
+    struct ggml_tensor* mm_model_block_1_block_0_1_w;
+    struct ggml_tensor* mm_model_block_1_block_0_1_b;
+    struct ggml_tensor* mm_model_block_1_block_1_fc1_w;
+    struct ggml_tensor* mm_model_block_1_block_1_fc1_b;
+    struct ggml_tensor* mm_model_block_1_block_1_fc2_w;
+    struct ggml_tensor* mm_model_block_1_block_1_fc2_b;
+    struct ggml_tensor* mm_model_block_1_block_2_0_w;
+    struct ggml_tensor* mm_model_block_1_block_2_1_w;
+    struct ggml_tensor* mm_model_block_1_block_2_1_b;
+    struct ggml_tensor* mm_model_block_2_block_0_0_w;
+    struct ggml_tensor* mm_model_block_2_block_0_1_w;
+    struct ggml_tensor* mm_model_block_2_block_0_1_b;
+    struct ggml_tensor* mm_model_block_2_block_1_fc1_w;
+    struct ggml_tensor* mm_model_block_2_block_1_fc1_b;
+    struct ggml_tensor* mm_model_block_2_block_1_fc2_w;
+    struct ggml_tensor* mm_model_block_2_block_1_fc2_b;
+    struct ggml_tensor* mm_model_block_2_block_2_0_w;
+    struct ggml_tensor* mm_model_block_2_block_2_1_w;
+    struct ggml_tensor* mm_model_block_2_block_2_1_b;
+
+    // MobileVLM_V2 projection
+    struct ggml_tensor* mm_model_mlp_0_w;
+    struct ggml_tensor* mm_model_mlp_0_b;
+    struct ggml_tensor* mm_model_mlp_2_w;
+    struct ggml_tensor* mm_model_mlp_2_b;
+    struct ggml_tensor* mm_model_peg_0_w;
+    struct ggml_tensor* mm_model_peg_0_b;
+
+    // MINICPMV projection
+    struct ggml_tensor* mm_model_pos_embed_k;
+    struct ggml_tensor* mm_model_query;
+    struct ggml_tensor* mm_model_proj;
+    struct ggml_tensor* mm_model_kv_proj;
+    struct ggml_tensor* mm_model_attn_q_w;
+    struct ggml_tensor* mm_model_attn_q_b;
+    struct ggml_tensor* mm_model_attn_k_w;
+    struct ggml_tensor* mm_model_attn_k_b;
+    struct ggml_tensor* mm_model_attn_v_w;
+    struct ggml_tensor* mm_model_attn_v_b;
+    struct ggml_tensor* mm_model_attn_o_w;
+    struct ggml_tensor* mm_model_attn_o_b;
+    struct ggml_tensor* mm_model_ln_q_w;
+    struct ggml_tensor* mm_model_ln_q_b;
+    struct ggml_tensor* mm_model_ln_kv_w;
+    struct ggml_tensor* mm_model_ln_kv_b;
+    struct ggml_tensor* mm_model_ln_post_w;
+    struct ggml_tensor* mm_model_ln_post_b;
+};
+
+struct clip_ctx
+{
+    bool has_text_encoder       = false;
+    bool has_vision_encoder     = false;
+    bool has_llava_projector    = false;
+    bool has_minicpmv_projector = false;
+    bool has_xgenmm_projector   = false;
+    int  minicpmv_version       = 2;
+
+    struct clip_vision_model vision_model;
+    projector_type           proj_type = PROJECTOR_TYPE_MLP;
+
+    float   image_mean[3];
+    float   image_std[3];
+    bool    use_gelu = false;
+    int32_t ftype    = 1;
+
+    bool has_class_embedding = true;
+    bool has_pre_norm        = true;
+    bool has_post_norm       = false;
+    bool has_patch_bias      = false;
+
+    struct gguf_context* ctx_gguf;
+    struct ggml_context* ctx_data;
+
+    std::vector<uint8_t> buf_compute_meta;
+
+    // memory buffers to evaluate the model
+    ggml_backend_buffer_t params_buffer = NULL;
+
+    ggml_backend_t backend       = NULL;
+    ggml_gallocr_t compute_alloc = NULL;
+
+    struct clip_image_size* load_image_size;
+};
+
+static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long* sizeOut)
+{
+    auto file = fopen(path, "rb");
+    if (file == NULL)
+    {
+        LOG_TEE("%s: can't read file %s\n", __func__, path);
+        return false;
+    }
+
+    fseek(file, 0, SEEK_END);
+    auto fileSize = ftell(file);
+    fseek(file, 0, SEEK_SET);
+
+    auto buffer = (unsigned char*)malloc(fileSize);  // Allocate memory to hold the file data
+    if (buffer == NULL)
+    {
+        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
+        perror("Memory allocation error");
+        fclose(file);
+        return false;
+    }
+    errno = 0;
+    size_t ret = fread(buffer, 1, fileSize, file);  // Read the file into the buffer
+    if (ferror(file))
+    {
+        die_fmt("read error: %s", strerror(errno));
+    }
+    if (ret != (size_t)fileSize)
+    {
+        die("unexpectedly reached end of file");
+    }
+    fclose(file);  // Close the file
+
+    *bytesOut = buffer;
+    *sizeOut  = fileSize;
+    return true;
+}
+
+void print_img(clip_image_u8* img)
+{
+    const int nx = img->nx;
+    const int ny = img->ny;
+    printf("num pixels: %zu\n", img->buf.size());
+    printf("raw img: nx:%d | ny:%d\n", nx, ny);
+
+    const int n = nx * ny;
+    for (int k = 0; k < 3; k++)
+    {
+        for (int y = 0; y < 5; y++)
+        {
+            for (int x = 0; x < 10; x++)
+            {
+                // data[(i * 3 * n) + k * n + y * nx + x] = imgs->data[i].buf[3 * (y * nx + x) + k];
+                printf("%d ", img->buf[3 * (y * nx + x) + k]);
+            }
+            printf("\n");
+        }
+        printf("\n");
+    }
+}
+
+void img_to_csv(clip_image_u8* img, const char* filename)
+{
+    std::ofstream outFile(filename);
+    if (!outFile.is_open())
+    {
+        std::cerr << "Error opening file!" << std::endl;
+        return;
+    }
+    const int nx = img->nx;
+    const int ny = img->ny;
+
+    for (int k = 0; k < 3; k++)
+    {
+        for (int y = 0; y < ny; y++)
+        {
+            for (int x = 0; x < nx; x++)
+            {
+                outFile << int(img->buf[3 * (y * nx + x) + k]);
+                if (x < nx - 1)
+                {
+                    outFile << ",";
+                }
+            }
+            outFile << std::endl;
+        }
+        outFile << std::endl;
+    }
+
+    outFile.close();
+    printf("file saved to %s\n", filename);
+}
+
+void tensor_to_csv(clip_image_f32* img, const char* filename)
+{
+    std::ofstream outFile(filename);
+    if (!outFile.is_open())
+    {
+        std::cerr << "Error opening file!" << std::endl;
+        return;
+    }
+    const int nx = img->nx;
+    const int ny = img->ny;
+
+    for (int k = 0; k < 3; k++)
+    {
+        for (int y = 0; y < ny; y++)
+        {
+            for (int x = 0; x < nx; x++)
+            {
+                outFile << float(img->buf[3 * (y * nx + x) + k]);
+                if (x < nx - 1)
+                {
+                    outFile << ",";
+                }
+            }
+            outFile << std::endl;
+        }
+        outFile << std::endl;
+    }
+
+    outFile.close();
+    printf("file saved to %s\n", filename);
+}
+
+struct clip_image_grid_shape
+{
+    int first;
+    int second;
+};
+
+static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size,
+                                                  const std::vector<std::pair<int, int>>& possible_resolutions)
+{
+    int original_width  = original_size.first;
+    int original_height = original_size.second;
+
+    std::pair<int, int> best_fit;
+    int max_effective_resolution = 0;
+    int min_wasted_resolution    = std::numeric_limits<int>::max();
+
+    for (const auto& resolution : possible_resolutions)
+    {
+        int   width  = resolution.first;
+        int   height = resolution.second;
+        float scale =
+            std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
+        int downscaled_width     = static_cast<int>(original_width * scale);
+        int downscaled_height    = static_cast<int>(original_height * scale);
+        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
+        int wasted_resolution    = (width * height) - effective_resolution;
+        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale,
+        //         downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
+        if (effective_resolution > max_effective_resolution ||
+            (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution))
+        {
+            max_effective_resolution = effective_resolution;
+            min_wasted_resolution    = wasted_resolution;
+            best_fit                 = resolution;
+        }
+    }
+
+    return best_fit;
+}
+
+static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int>& image_size,
+                                                                const std::vector<std::pair<int, int>>& grid_pinpoints,
+                                                                int image_patch_size)
+{
+    /**
+        Conversion from gguf flat array to vector:
+        std::vector<std::pair<int, int>> possible_resolutions;
+        for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i += 2) {
+            possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
+        }
+     */
+    auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
+    return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
+}
+
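+// Worked example for the two helpers above, using the pinpoints configured in
+// main() below (hypothetical 500x300 input):
+//   (384,768):  scale = min(384/500, 768/300) = 0.768 -> 384x230, effective =  88320
+//   (768,384):  scale = min(768/500, 384/300) = 1.28  -> 640x384, effective = 150000, wasted = 144912
+//   (768,768) and (1152,384) reach the same effective resolution (capped at 500*300)
+//   but waste more area, so select_best_resolution returns (768,384), and with a
+//   384 px patch get_anyres_image_grid_shape yields {2, 1}.
+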
+int main()
+{
+    const char* clip_path = "/export/share/yutong/xgenmm/llamacpp_wd/llava-1.6/vit/mmproj-model-f16.gguf";
+    struct clip_ctx* ctx = clip_model_load(clip_path, /*verbosity=*/2);
+    printf("Model loaded\n");
+    for (int i = 0; i < 3; i++)
+    {
+        ctx->image_mean[i] = 0.5;
+        ctx->image_std[i]  = 0.5;
+    }
+    LOG_TEE("v_image_mean %f %f %f\n", ctx->image_mean[0], ctx->image_mean[1], ctx->image_mean[2]);
+    LOG_TEE("v_image_std %f %f %f\n", ctx->image_std[0], ctx->image_std[1], ctx->image_std[2]);
+    // [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
+    ctx->vision_model.hparams.image_grid_pinpoints[0] = 384;
+    ctx->vision_model.hparams.image_grid_pinpoints[1] = 768;
+    ctx->vision_model.hparams.image_grid_pinpoints[2] = 768;
+    ctx->vision_model.hparams.image_grid_pinpoints[3] = 384;
+    ctx->vision_model.hparams.image_grid_pinpoints[4] = 768;
+    ctx->vision_model.hparams.image_grid_pinpoints[5] = 768;
+    ctx->vision_model.hparams.image_grid_pinpoints[6] = 1152;
+    ctx->vision_model.hparams.image_grid_pinpoints[7] = 384;
+    ctx->vision_model.hparams.image_grid_pinpoints[8] = 384;
+    ctx->vision_model.hparams.image_grid_pinpoints[9] = 1152;
+    for (int i = 0; i < 10; i++)
+    {
+        printf("grid[%d]:%d ", i, ctx->vision_model.hparams.image_grid_pinpoints[i]);
+    }
+    printf("\n");
+    ctx->vision_model.hparams.image_size = 384;
+    printf("in test_anyres: params.image_size:%d\n", ctx->vision_model.hparams.image_size);
+    /*
+        part of:
+        llava_image_embed_make_with_filename
+    */
+    const char* image_path = "/export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg";  // Porcelain
+    // const char* image_path = "/export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9-1.jpg";
+    unsigned char* image_bytes;
+    long           image_bytes_length;
+    auto           loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
+    if (!loaded)
+    {
+        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
+        return 1;
+    }
+
+    /*
+        part of:
+        llava_image_embed_make_with_bytes
+    */
+    clip_image_u8* img = clip_image_u8_init();
+    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img))
+    {
+        clip_image_u8_free(img);
+        LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
+        return 1;
+    }
+
+    /*
+        part of:
+        encode_image_with_clip
+    */
+    clip_image_f32_batch img_res_v;
+    img_res_v.size = 0;
+    img_res_v.data = nullptr;
+    if (!clip_image_preprocess(ctx, img, &img_res_v))
+    {
+        LOG_TEE("%s: unable to preprocess image\n", __func__);
+        delete[] img_res_v.data;
+        return 1;
+    }
+    printf("img->nx:%d | img->ny:%d\n", img->nx, img->ny);
+    printf("Batch size: img_res_v.size:%zu\n", img_res_v.size);
+
+    // std::cout << "decltype(img_res_v.data) is " << type_name<decltype(img_res_v.data)>() << '\n';
+
+    // printf("Image dimensions in this batch: img_res_v.data->nx:%d | img_res_v.data->ny:%d\n", img_res_v.data->nx,
+    //        img_res_v.data->ny);
+    // printf("img_res_v.data->buf.size():%zu\n", img_res_v.data->buf.size());
+
+    // std::cout << "decltype(img_res_v.data[0]) is " << type_name<decltype(img_res_v.data[0])>() << '\n';
+    // std::cout << "decltype(img_res_v.data[0].buf[0]) is " << type_name<decltype(img_res_v.data[0].buf[0])>() << '\n';
+    // for (size_t i = 0; i < img_res_v.size; i++) {
+    //     const int nx = img_res_v.data[i].nx;
+    //     const int ny = img_res_v.data[i].ny;
+    //     const int vec_len = img_res_v.data[i].buf.size();
+    //     printf("i:%zu | nx:%d | ny:%d | vec len:%d\n", i, nx, ny, vec_len);
+    // }
+
+    const char* mm_patch_merge_type = clip_patch_merge_type(ctx);
+    printf("mm_patch_merge_type:%s\n", mm_patch_merge_type);
+
+    struct clip_ctx* ctx_clip   = ctx;
+    const int32_t*   image_grid = clip_image_grid(ctx_clip);
+
+    std::vector<std::pair<int, int>> grid_pinpoints;
+    for (int i = 0; i < 32 && image_grid[i] != 0; i += 2)
+    {
+        grid_pinpoints.push_back({image_grid[i], image_grid[i + 1]});
+    }
+    for (const auto& point : grid_pinpoints)
+    {
+        std::cout << "(" << point.first << ", " << point.second << ")" << std::endl;
+    }
+
+    const int32_t image_size = clip_image_size(ctx_clip);
+    printf("image_size:%d\n", image_size);
+
+    struct clip_image_grid_shape grid_shape =
+        get_anyres_image_grid_shape({img->nx, img->ny}, grid_pinpoints, image_size);
+
+    printf("grid_shape.first:%d | grid_shape.second:%d\n", grid_shape.first, grid_shape.second);
+
+    std::vector<float*> image_embd_v;
+    image_embd_v.resize(img_res_v.size);
+    printf("image_embd_v.size():%zu\n", image_embd_v.size());
+    for (size_t i = 0; i < img_res_v.size; i++)
+    {
+        image_embd_v[i] =
+            (float*)malloc(clip_embd_nbytes(ctx_clip));  // 576 patches * 4096 embeddings * 4 bytes = 9437184
+        const bool encoded = clip_image_encode(
+            ctx_clip, 1, &img_res_v.data[i],
+            image_embd_v[i]);  // image data is in 3x336x336 format and will be converted to 336x336x3 inside
+        if (!encoded)
+        {
+            LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int)i + 1, (int)img_res_v.size);
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+// make test_anyres_handle_patches && ./bin/test_anyres_handle_patches
\ No newline at end of file
diff --git a/examples/xgenmm/test_anyres_img.cpp b/examples/xgenmm/test_anyres_img.cpp
index 91015df1c..71b1f209d 100644
--- a/examples/xgenmm/test_anyres_img.cpp
+++ b/examples/xgenmm/test_anyres_img.cpp
@@ -255,19 +255,21 @@ struct clip_vision_model
     struct ggml_tensor* mm_model_ln_post_b;
 };
 
-struct clip_ctx {
-    bool has_text_encoder = false;
-    bool has_vision_encoder = false;
+struct clip_ctx
+{
+    bool has_text_encoder       = false;
+    bool has_vision_encoder     = false;
     bool has_llava_projector = false;
     bool has_minicpmv_projector = false;
-    int minicpmv_version = 2;
+    bool has_xgenmm_projector   = true;
+    int  minicpmv_version       = 2;
 
     struct clip_vision_model vision_model;
-    projector_type proj_type = PROJECTOR_TYPE_MLP;
+    projector_type           proj_type = PROJECTOR_TYPE_MLP;
 
-    float image_mean[3];
-    float image_std[3];
-    bool use_gelu = false;
+    float   image_mean[3];
+    float   image_std[3];
+    bool    use_gelu = false;
     int32_t ftype = 1;
 
     bool has_class_embedding = true;
@@ -275,18 +277,18 @@ struct clip_ctx {
     bool has_post_norm = false;
     bool has_patch_bias = false;
 
-    struct gguf_context * ctx_gguf;
-    struct ggml_context * ctx_data;
+    struct gguf_context* ctx_gguf;
+    struct ggml_context* ctx_data;
 
     std::vector<uint8_t> buf_compute_meta;
 
     // memory buffers to evaluate the model
-    ggml_backend_buffer_t params_buffer = NULL;
+    ggml_backend_buffer_t params_buffer  = NULL;
 
-    ggml_backend_t backend = NULL;
+    ggml_backend_t backend       = NULL;
     ggml_gallocr_t compute_alloc = NULL;
 
-    struct clip_image_size * load_image_size;
+    struct clip_image_size* load_image_size;
 };
 
 static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long* sizeOut)
@@ -476,7 +478,7 @@ int main(){
     }
     printf("\n");
     ctx->vision_model.hparams.image_size = 384;
-    printf("params.image_size:%d\n", ctx->vision_model.hparams.image_size);
+    printf("in test_anyres: params.image_size:%d\n", ctx->vision_model.hparams.image_size);
     /*
         part of:
         llava_image_embed_make_with_filename
@@ -504,17 +506,17 @@ int main(){
         return NULL;
     }
 
-    print_img(img);
+    // print_img(img);
 
-    clip_image_u8* image_original_resize = clip_image_u8_init();
-    bicubic_resize(*img, *image_original_resize, 384, 384);
+    // clip_image_u8* image_original_resize = clip_image_u8_init();
+    // bicubic_resize(*img, *image_original_resize, 384, 384);
 
-    printf("**********************************\n");
printf("**********************************\n"); + // printf("**********************************\n"); - print_img(image_original_resize); - img_to_csv(image_original_resize, "/export/home/llama.cpp/examples/xgenmm/imgs/image_original_resize.csv"); - printf("num pixels: %d\n", image_original_resize->buf.size()); - printf("raw img: nx:%d | ny:%d\n", image_original_resize->nx, image_original_resize->ny); + // print_img(image_original_resize); + // img_to_csv(image_original_resize, "/export/home/llama.cpp/examples/xgenmm/imgs/image_original_resize.csv"); + // printf("num pixels: %d\n", image_original_resize->buf.size()); + // printf("raw img: nx:%d | ny:%d\n", image_original_resize->nx, image_original_resize->ny); /* part of: @@ -523,7 +525,6 @@ int main(){ clip_image_f32_batch img_res_v; img_res_v.size = 0; img_res_v.data = nullptr; - if (!clip_image_preprocess(ctx, img, &img_res_v)) { LOG_TEE("%s: unable to preprocess image\n", __func__); @@ -531,36 +532,34 @@ int main(){ return false; } printf("img->nx:%ld | img->ny:%ld\n", img->nx, img->ny); - // printf("img_res_v.size:%ld\n", img_res_v.size); + printf("img_res_v.size:%ld\n", img_res_v.size); printf("img_res_v->nx:%ld | img_res_v->ny:%ld\n", img_res_v.data->nx, img_res_v.data->ny); - // std::cout << img_res_v.data->nx << " | " << img_res_v.data->ny << std::endl; - // std::cout << img_res_v.data->buf.size() << std::endl; const char* mm_patch_merge_type = clip_patch_merge_type(ctx); printf("mm_patch_merge_type:%s\n", mm_patch_merge_type); - std::string basename = "/export/home/llama.cpp/examples/xgenmm/imgs/image_res"; - for (size_t i = 0; i < img_res_v.size; i++) { - const int nx = img_res_v.data[i].nx; - const int ny = img_res_v.data[i].ny; - printf("i:%d | nx:%d | ny:%d\n", i, nx, ny); + // std::string basename = "/export/home/llama.cpp/examples/xgenmm/imgs/image_res"; + // for (size_t i = 0; i < img_res_v.size; i++) { + // const int nx = img_res_v.data[i].nx; + // const int ny = img_res_v.data[i].ny; + // printf("i:%d | nx:%d | ny:%d\n", i, nx, ny); - const int n = nx * ny; + // const int n = nx * ny; - for (int k = 0; k < 1; k++) { - for (int y = 0; y < 5; y++) { - for (int x = 0; x < 10; x++) { - // data[(i * 3 * n) + k * n + y * nx + x] = imgs->data[i].buf[3 * (y * nx + x) + k]; - printf("%.4f ", img_res_v.data[i].buf[3 * (y * nx + x) + k]); - } - printf("\n"); - } - printf("\n"); - } - std::string fileName = basename + "_" + std::to_string(i) + ".csv"; - tensor_to_csv(&img_res_v.data[i], fileName.c_str()); - } + // for (int k = 0; k < 1; k++) { + // for (int y = 0; y < 5; y++) { + // for (int x = 0; x < 10; x++) { + // // data[(i * 3 * n) + k * n + y * nx + x] = imgs->data[i].buf[3 * (y * nx + x) + k]; + // printf("%.4f ", img_res_v.data[i].buf[3 * (y * nx + x) + k]); + // } + // printf("\n"); + // } + // printf("\n"); + // } + // std::string fileName = basename + "_" + std::to_string(i) + ".csv"; + // tensor_to_csv(&img_res_v.data[i], fileName.c_str()); + // } // /* diff --git a/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py b/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py index a550a9f8e..2699ab131 100644 --- a/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py +++ b/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py @@ -72,6 +72,60 @@ def bytes_to_unicode(): cs = [chr(n) for n in cs] return dict(zip(bs, cs)) +######################################### +#### belows are added for xgenmm +######################################## + +def _replace_name_vit(s,v): + s = "vision_model." 
+    s = "vision_model." + s
+    if re.match("vision_model.embeddings.position_embedding", s):
+        v = v.unsqueeze(0)
+    return {s: v}
+
+def _replace_attn_layer(key, value):
+    # the fused key/value projection is a special case: split it into separate k and v tensors
+    if re.match(r'layers\.(\d+)\.0\.to_kv\.weight', key):
+        idx = re.search(r'layers\.(\d+)\.0\.to_kv\.weight', key).group(1)
+        KVweight = value.chunk(2, dim=0)
+        return {f'blk.{idx}.attn.to_k.weight': KVweight[0],
+                f'blk.{idx}.attn.to_v.weight': KVweight[1]
+                }
+
+    # apply the general renaming patterns
+    patterns = [
+        (r'layers\.(\d+)\.0\.norm_media\.(weight|bias)', r'blk.\1.attn.norm_media.\2'),
+        (r'layers\.(\d+)\.0\.norm_latents\.(weight|bias)', r'blk.\1.attn.norm_latents.\2'),
+        (r'layers\.(\d+)\.0\.to_q\.(weight)', r'blk.\1.attn.to_q.\2'),
+        (r'layers\.(\d+)\.0\.to_out\.(weight)', r'blk.\1.attn.to_out.\2'),
+        (r'layers\.(\d+)\.1\.0\.(weight|bias)', r'blk.\1.ffn.ln.\2'),
+        (r'layers\.(\d+)\.1\.1\.weight', r'blk.\1.ffn.linear_up.weight'),
+        (r'layers\.(\d+)\.1\.3\.weight', r'blk.\1.ffn.linear_down.weight'),
+    ]
+    for pattern, replacement in patterns:
+        key = re.sub(pattern, replacement, key)
+
+    return {key: value}
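+
+# Illustration of the to_kv split (toy call; the shapes follow the resampler
+# defaults -- inner width 1536 = 16 heads * 96, latent dim 1152 -- and are not
+# taken from a real checkpoint):
+#   >>> w = torch.zeros(2 * 1536, 1152)   # fused to_kv weight
+#   >>> out = _replace_attn_layer('layers.0.0.to_kv.weight', w)
+#   >>> sorted(out.keys())
+#   ['blk.0.attn.to_k.weight', 'blk.0.attn.to_v.weight']
+#   >>> out['blk.0.attn.to_k.weight'].shape
+#   torch.Size([1536, 1152])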
+
+def replace_tensor_name_xgenmm_projector(ckpt):
+    identifier = 'perceiver_resampler.'
+    new_state_dict = {}
+    for k, v in ckpt.items():
+        # handle the resampler layers
+        if 'layers' in k:
+            new_kvs = _replace_attn_layer(k, v)
+            for new_k, new_v in new_kvs.items():
+                new_state_dict[identifier+new_k] = new_v
+        elif k == 'norm.weight':
+            new_k = 'ln.weight'
+            new_state_dict[identifier+new_k] = v
+        elif k == 'norm.bias':
+            new_k = 'ln.bias'
+            new_state_dict[identifier+new_k] = v
+        else:
+            new_state_dict[identifier+k] = v
+    return new_state_dict
 
 class print_time():
     def __init__(self, task):
@@ -90,12 +144,14 @@ def get_args():
     parser.add_argument("--surgery_dir", type=str, default='/export/share/yutong/xgenmm/llamacpp_wd')
     parser.add_argument('--version', type=str, default='siglip_kosmos_phi3_4k_instruct', help='help identify the version of the saved ckpt')
     # options kept from llama.cpp projects
-    parser.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
-    parser.add_argument("--text-only", action="store_true", required=False,
+    parser.add_argument("--use_f32", action="store_true", default=False, help="Use f32 instead of f16")
+    parser.add_argument("--text_only", action="store_true", required=False,
                         help="Save a text-only model. It can't be used to encode images")
-    parser.add_argument("--vision-only", action="store_true", required=False,
+    parser.add_argument("--vision_only", action="store_true", required=False,
                         help="Save a vision-only model. It can't be used to encode texts")
-    parser.add_argument("--xgenmm-projector", help="Path to minicpmv.projector file. If specified, save an image encoder for XgenMM models.")
+    parser.add_argument("--xgenmm_projector", help="Path to xgenmm projector file. If specified, save an image encoder for XgenMM models.")
+    parser.add_argument("--xgenmm_vit", help="Path to vit file.")
+    parser.add_argument("--output_dirname", default="gguf", help="Output directory")
 
     return parser.parse_args()
 
@@ -108,7 +164,7 @@ if __name__ == "__main__":
         exit(1)
 
     if args.use_f32:
-        print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
+        print("🟡 WARNING: Weights for the convolution op are always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
 
     # possible data types
     #   ftype == 0 -> float32
@@ -122,16 +178,16 @@ if __name__ == "__main__":
         ftype = 0
 
     ckpt_dir = f"{args.surgery_dir}/{args.version}"
-    args.xgenmm_projector = f"ckpt_dir/xgenmm.projector"
+    if args.xgenmm_projector is None:
+        args.xgenmm_projector = f"{ckpt_dir}/xgenmm.projector"
+    if args.xgenmm_vit is None:
+        args.xgenmm_vit = f"{ckpt_dir}/vision_encoder/xgenmm.vision_encoder"
+    output_dir = f"{ckpt_dir}/{args.output_dirname}"
 
-    with print_time("Loading vision encoder"):
-        vision_encoder_config_path = f"{args.surgery_dir}/{args.version}/vision_encoder/config.json"
-        with open(vision_encoder_config_path, 'r') as f:
-            vision_config = json.load(f)
-        vision_encoder_config = SiglipVisionConfig(**vision_config)
-        # vision_encoder = SiglipVisionTransformer(vision_encoder_config)
-        # vision_encoder_ckpt = torch.load(f'{ckpt_dir}/vision_encoder/xgenmm.vision_encoder')
-        # vision_encoder.load_state_dict(vision_encoder_ckpt)
+
+    vision_encoder_config_path = f"{args.surgery_dir}/{args.version}/vision_encoder/config.json"
+    with open(vision_encoder_config_path, 'r') as f:
+        vision_config = json.load(f)
 
     fname_middle = None
     has_text_encoder = True
@@ -151,10 +207,9 @@ if __name__ == "__main__":
         fname_middle = ""
 
-    output_dir = f"{ckpt_dir}/gguf"
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
-    output_prefix = os.path.basename(output_dir).replace("ggml_", "")
+
     fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
     fout = GGUFWriter(path=fname_out, arch="clip")
 
@@ -181,52 +236,146 @@ if __name__ == "__main__":
     "num_attention_heads", "layer_norm_eps", "num_hidden_layers", "hidden_act"
     unused: "attention_dropout", "model_type", "num_channels"
     """
-    fout.add_uint32("clip.vision.image_size", vision_config["image_size"])
-    fout.add_uint32("clip.vision.patch_size", vision_config["patch_size"])
-    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vision_config["hidden_size"])
-    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), vision_config["intermediate_size"])
-    # TODO: need to check the value of projection_dim; follow minicpmv to set it as 0
-    fout.add_uint32("clip.vision.projection_dim", 0)
-    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vision_config["num_attention_heads"])
-    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), vision_config["layer_norm_eps"])
-    # TODO: chekck this as it might causes bugs
-    # orginial llaval implementation:
-    # block_count = vision_config["num_hidden_layers"] - 1 if has_xgenmm_projector else vision_config["num_hidden_layers"]
-    # we are different from llama1.6, which used the second to the last layer's hidden states as the image features.
-    block_count = vision_config["num_hidden_layers"]
-    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
-    print(KEY_BLOCK_COUNT)
-    # xgenmm use anyres with grids configuration
-    # 1*2, 2*1, 2*2, 3*1, 1*3, the same as the llava1.6, we just hard code it here
-    image_grid_pinpoints = [336, 672, 672, 336, 672, 672, 1008, 336, 336, 1008]
-    fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
+    with print_time("add vit configs to gguf"):
+        fout.add_uint32("clip.vision.image_size", vision_config["image_size"])
+        fout.add_uint32("clip.vision.patch_size", vision_config["patch_size"])
+        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vision_config["hidden_size"])
+        fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), vision_config["intermediate_size"])
+        # TODO: need to check the value of projection_dim; follow minicpmv to set it as 0
+        fout.add_uint32("clip.vision.projection_dim", 0)
+        fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vision_config["num_attention_heads"])
+        fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), vision_config["layer_norm_eps"])
+        # TODO: check this as it might cause bugs
+        # original llava implementation:
+        # block_count = vision_config["num_hidden_layers"] - 1 if has_xgenmm_projector else vision_config["num_hidden_layers"]
+        # we differ from llava-1.6, which used the second-to-last layer's hidden states as the image features
+        block_count = vision_config["num_hidden_layers"]
+        fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
+        # xgenmm uses anyres with a grid configuration:
+        # 1*2, 2*1, 2*2, 3*1, 1*3 grids, the same as llava-1.6; we just hard code it here
+        # the base resolution is 384
+        image_grid_pinpoints = [384, 768, 768, 384, 768, 768, 1152, 384, 384, 1152]
+        fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
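+        # The flat array encodes (width, height) pairs, read back in clip.cpp as
+        # std::pair<int, int> entries:
+        #   >>> list(zip(image_grid_pinpoints[0::2], image_grid_pinpoints[1::2]))
+        #   [(384, 768), (768, 384), (768, 768), (1152, 384), (384, 1152)]
+        # i.e. 1x2, 2x1, 2x2, 3x1 and 1x3 grids of 384x384 tiles.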
"vision_model.head.attention.v_proj_bias" + nv_3 = nv[2*dim:] + new_state_dict[nk_1] = nv_1 + new_state_dict[nk_2] = nv_2 + new_state_dict[nk_3] = nv_3 + else: + new_state_dict[nk] = nv + + state_dict = new_state_dict + for name, data in state_dict.items(): + if should_skip_tensor(name, has_text_encoder, has_vision_encoder, has_xgenmm_projector): + # we don't need this + print(f"skipping parameter: {name}") + continue + + name = get_tensor_name(name) + data = data.squeeze().numpy() + + n_dims = len(data.shape) + + ftype_cur = 0 + if n_dims == 4: + print(f"tensor {name} is always saved in f16") + data = data.astype(np.float16) + ftype_cur = 1 + elif ftype == 1: + if name[-7:] == ".weight" and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}") + fout.add_tensor(name, data) - - image_mean = [0.5, 0.5, 0.5] - image_std = [0.5, 0.5, 0.5] - fout.add_array("clip.vision.image_mean", image_mean) - fout.add_array("clip.vision.image_std", image_std) - - # TODO: need to check; vision_config["hidden_act"] is gelu_pytorch_tanh - use_gelu = "gelu" in vision_config["hidden_act"].lower() - fout.add_bool("clip.use_gelu", use_gelu) + print("🟢 Vit tensors added !") if has_xgenmm_projector: - projector = torch.load(args.xgenmm_projector) - fout.add_uint32("clip.projector.input_dim", projector["input_dim"]) - fout.add_uint32("clip.projector.output_dim", projector["output_dim"]) - fout.add_uint32("clip.projector.num_heads", projector["num_heads"]) - fout.add_uint32("clip.projector.num_layers", projector["num_layers"]) - fout.add_uint32("clip.projector.hidden_dim", projector["hidden_dim"]) - fout.add_float32("clip.projector.dropout", projector["dropout"]) - fout.add_string("clip.projector.activation", projector["activation"]) - fout.add_string("clip.projector.norm", projector["norm"]) - fout.add_string("clip.projector.pooling", projector["pooling"]) - fout.add_string("clip.projector.pooling_norm", projector["pooling_norm"]) - fout.add_string("clip.projector.pooling_activation", projector["pooling_activation + with print_time("Loading projector and converting to gguf"): + projector_ckpt = torch.load(args.xgenmm_projector) + projector = replace_tensor_name_xgenmm_projector(projector_ckpt) + if args.use_f32: + ftype = 0 + else: + ftype = 1 + ftype_cur = ftype + for name, tensor in projector.items(): + tensor = tensor.squeeze().numpy() + if ftype_cur == 1: + if 'ln.bias' in name or 'ln.weight' in name: + tensor = tensor.astype(np.float32) + ftype_cur = 0 + print(f'❗ {name} is set to np.float32') + else: + tensor = tensor.astype(np.float16) + ftype_cur = 1 + print(f'❗ {name} is set to np.float16') + else: + if tensor.dtype != np.float32: + tensor = tensor.astype(np.float32) + print(f'❗ {name} is set to np.float32') + ftype_cur = 0 + + print(f"{name} - {ftype_str[ftype_cur]} - shape = {tensor.shape}") + fout.add_tensor(name, tensor) + print("🟢 Projector tensors added\n") - fout.write_header_to_file() - fout.write_kv_data_to_file() - fout.write_tensors_to_file() - fout.close() - print("Done. 
Output file: " + fname_out) \ No newline at end of file + with print_time("write to gguf file"): + fout.write_header_to_file() + fout.write_kv_data_to_file() + fout.write_tensors_to_file() + fout.close() + print("🟢 Done. Output file: " + fname_out) \ No newline at end of file