From 6c1f137ba55adfffd42165abd269526ff9ed84ca Mon Sep 17 00:00:00 2001
From: Yutong Dai
Date: Wed, 28 Aug 2024 23:36:09 +0000
Subject: [PATCH] another push

---
 examples/xgenmm/clip.cpp                       | 11 ++---------
 examples/xgenmm/test_anyres_handle_patches.cpp | 12 ++++++++++++
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/examples/xgenmm/clip.cpp b/examples/xgenmm/clip.cpp
index fdd489004..6a806963b 100644
--- a/examples/xgenmm/clip.cpp
+++ b/examples/xgenmm/clip.cpp
@@ -629,7 +629,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     if (ctx->has_xgenmm_projector)
     {
         //TODO: implement something for example, image masks
-        printf("use has_xgenmm_projector\n");
+        printf(" use has_xgenmm_projector\n");
     }
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
@@ -667,18 +667,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     struct ggml_tensor * embeddings = inp;
     struct ggml_tensor * pos_embed = nullptr;
     if (ctx->has_llava_projector) {
-        printf(" use has_llava_projector\n");
         // concat class_embeddings and patch_embeddings
         if (ctx->has_class_embedding) {
             embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-            ggml_set_name(embeddings, "embeddings");
             ggml_set_input(embeddings);
             embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
                     embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-            printf(" first acc worked\n");
             embeddings = ggml_acc(ctx0, embeddings, inp,
                     embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-            printf(" second acc worked\n");
         }
     }
     // printf(" after ctx->has_llava_projector\n");
@@ -2499,7 +2495,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     // build the inference graph
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
-    printf(" build graph done\n");
     ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
 
     // set inputs
     const auto & model = ctx->vision_model;
@@ -2546,7 +2541,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
         free(data);
     }
-    printf(" before ctx->has_minicpmv_projector\n");
     if (ctx->has_minicpmv_projector) {
         {
             // inspired from siglip:
@@ -2617,6 +2611,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
         free(positions_data);
     }
+    // FIXME: this is a hack
     // {
     //     std::cout << __LINE__ << std::endl;
     //     struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
@@ -2639,9 +2634,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
     }
 #endif
-    printf(" before ggml_backend_graph_compute\n");
     ggml_backend_graph_compute(ctx->backend, gf);
-    printf(" after ggml_backend_graph_compute\n");
     // the last node is the embedding tensor
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
     // copy the embeddings to the location passed by the user
diff --git a/examples/xgenmm/test_anyres_handle_patches.cpp b/examples/xgenmm/test_anyres_handle_patches.cpp
index ace927b1f..d6bbad46a 100644
--- a/examples/xgenmm/test_anyres_handle_patches.cpp
+++ b/examples/xgenmm/test_anyres_handle_patches.cpp
@@ -13,6 +13,7 @@
 #ifndef _MSC_VER
 #include
 #endif
+#include <chrono>
 #include
 #include
 #include
@@ -620,6 +621,11 @@ int main(){
 
     for (size_t i = 0; i < img_res_v.size; i++)
     {
         printf("encode patch %d\n", i);
+        const int nx = img_res_v.data[i].nx;
+        const int ny = img_res_v.data[i].ny;
+        const int vec_len = img_res_v.data[i].buf.size();
+        printf(" i:%d | nx:%d | ny:%d | vec len:%d\n", (int)i, nx, ny, vec_len);  // 384^2 * 3(channel) = 442368
+        auto start = std::chrono::high_resolution_clock::now();
         image_embd_v[i] = (float*)malloc(clip_embd_nbytes(ctx_clip));  // 576 patches * 4096 embeddings * 4 bytes = 9437184
         const bool encoded = clip_image_encode(
@@ -630,8 +636,14 @@ int main(){
             LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int)i + 1, (int)img_res_v.size);
             return false;
         }
+        auto end = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double> duration = end - start;
+        std::cout << " Wall time: " << duration.count() << " seconds" << std::endl;
     }
 
+    // TODO: handle patches here
+
+
 
     return 0;
 }