From 6c1f137ba55adfffd42165abd269526ff9ed84ca Mon Sep 17 00:00:00 2001
From: Yutong Dai
Date: Wed, 28 Aug 2024 23:36:09 +0000
Subject: [PATCH] another push

---
 examples/xgenmm/clip.cpp                       | 11 ++---------
 examples/xgenmm/test_anyres_handle_patches.cpp | 12 ++++++++++++
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/examples/xgenmm/clip.cpp b/examples/xgenmm/clip.cpp
index fdd489004..6a806963b 100644
--- a/examples/xgenmm/clip.cpp
+++ b/examples/xgenmm/clip.cpp
@@ -629,7 +629,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     if (ctx->has_xgenmm_projector)
     {
         //TODO: implement something for example, image masks
-        printf("use has_xgenmm_projector\n");
+        printf(" use has_xgenmm_projector\n");
     }
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
@@ -667,18 +667,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     struct ggml_tensor * embeddings = inp;
     struct ggml_tensor * pos_embed = nullptr;
     if (ctx->has_llava_projector) {
-        printf(" use has_llava_projector\n");
         // concat class_embeddings and patch_embeddings
         if (ctx->has_class_embedding) {
             embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-            ggml_set_name(embeddings, "embeddings");
             ggml_set_input(embeddings);
             embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
                     embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-            printf(" first acc worked\n");
             embeddings = ggml_acc(ctx0, embeddings, inp,
                     embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-            printf(" second acc worked\n");
         }
     }
     // printf(" after ctx->has_llava_projector\n");
@@ -2499,7 +2495,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     // build the inference graph
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
-    printf(" build graph done\n");
     ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
 
     // set inputs
     const auto & model = ctx->vision_model;
@@ -2546,7 +2541,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
         free(data);
     }
-    printf(" before ctx->has_minicpmv_projector\n");
     if (ctx->has_minicpmv_projector) {
         {
             // inspired from siglip:
@@ -2617,6 +2611,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
         free(positions_data);
     }
+    // FIXME: this is a hack
     // {
     //     std::cout << __LINE__ << std::endl;
     //     struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
@@ -2639,9 +2634,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
     }
 #endif
-    printf(" before ggml_backend_graph_compute\n");
     ggml_backend_graph_compute(ctx->backend, gf);
-    printf(" after ggml_backend_graph_compute\n");
     // the last node is the embedding tensor
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
     // copy the embeddings to the location passed by the user
diff --git a/examples/xgenmm/test_anyres_handle_patches.cpp b/examples/xgenmm/test_anyres_handle_patches.cpp
index ace927b1f..d6bbad46a 100644
--- a/examples/xgenmm/test_anyres_handle_patches.cpp
+++ b/examples/xgenmm/test_anyres_handle_patches.cpp
@@ -13,6 +13,7 @@
 #ifndef _MSC_VER
 #include
 #endif
+#include <chrono>
 #include
 #include
 #include
@@ -620,6 +621,11 @@ int main(){
 
     for (size_t i = 0; i < img_res_v.size; i++)
     {
         printf("encode patch %d\n", i);
+        const int nx = img_res_v.data[i].nx;
+        const int ny = img_res_v.data[i].ny;
+        const int vec_len = img_res_v.data[i].buf.size();
+        printf(" i:%d | nx:%d | ny:%d | vec len:%d\n", (int)i, nx, ny, vec_len);  // 384^2 * 3(channel) = 442368
+        auto start = std::chrono::high_resolution_clock::now();
         image_embd_v[i] = (float*)malloc(clip_embd_nbytes(ctx_clip));  // 576 patches * 4096 embeddings * 4 bytes = 9437184
         const bool encoded = clip_image_encode(
@@ -630,8 +636,14 @@ int main(){
             LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int)i + 1, (int)img_res_v.size);
             return false;
         }
+        auto end = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double> duration = end - start;
+        std::cout << " Wall time: " << duration.count() << " seconds" << std::endl;
     }
 
+    // TODO: handle patches here
+
+
 
     return 0;
 }