another push

2024-08-28 23:36:09 +00:00 · 2024-08-28 23:36:09 +00:00 · 6c1f137ba5
commit 6c1f137ba5
parent f70fdf5a86
2 changed files with 14 additions and 9 deletions
--- a/examples/xgenmm/clip.cpp
+++ b/examples/xgenmm/clip.cpp
@ -629,7 +629,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

    if (ctx->has_xgenmm_projector) {
        //TODO: implement something for example, image masks
-        printf("use has_xgenmm_projector\n");
+        printf("    use has_xgenmm_projector\n");
    }
    const int patch_size           = hparams.patch_size;
    const int num_patches          = ((image_size_width / patch_size) * (image_size_height / patch_size));
@ -667,18 +667,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    struct ggml_tensor * embeddings = inp;
    struct ggml_tensor * pos_embed = nullptr;
    if (ctx->has_llava_projector) {
-        printf("    use has_llava_projector\n");
        // concat class_embeddings and patch_embeddings
        if (ctx->has_class_embedding) {
            embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-            ggml_set_name(embeddings, "embeddings");
            ggml_set_input(embeddings);
            embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-            printf("    first acc worked\n");
            embeddings = ggml_acc(ctx0, embeddings, inp,
                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
-            printf("    second acc worked\n");
        }
    }
    // printf("    after ctx->has_llava_projector\n");
@ -2499,7 +2495,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

    // build the inference graph
    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
-    printf("    build graph done\n");
    ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
    // set inputs
    const auto & model = ctx->vision_model;
@ -2546,7 +2541,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
        free(data);
    }
-    printf("    before ctx->has_minicpmv_projector\n");
    if (ctx->has_minicpmv_projector) {
        {
            // inspired from siglip:
@ -2617,6 +2611,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
            free(positions_data);
        }
+        // FIXME: this is a hack;
        // {
        //            std::cout << __LINE__ << std::endl;
        //     struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
@ -2639,9 +2634,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
    }
 #endif
-    printf("    before ggml_backend_graph_compute\n");
    ggml_backend_graph_compute(ctx->backend, gf);
-    printf("    after ggml_backend_graph_compute\n");
    // the last node is the embedding tensor
    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
    // copy the embeddings to the location passed by the user
--- a/examples/xgenmm/test_anyres_handle_patches.cpp
+++ b/examples/xgenmm/test_anyres_handle_patches.cpp
@ -13,6 +13,7 @@
 #ifndef _MSC_VER
 #include <cxxabi.h>
 #endif
+#include <chrono>
 #include <cstdlib>
 #include <memory>
 #include <string>
@ -620,6 +621,11 @@ int main(){
    for (size_t i = 0; i < img_res_v.size; i++)
    {
        printf("encode patch %d\n", i);
+        const int nx = img_res_v.data[i].nx;
+        const int ny = img_res_v.data[i].ny;
+        const int vec_len = img_res_v.data[i].buf.size();
+        printf("    i:%d | nx:%d | ny:%d | vec len:%d\n", i, nx, ny, vec_len); // 384^2 * 3(channel) = 442368
+        auto start = std::chrono::high_resolution_clock::now();
        image_embd_v[i] =
            (float*)malloc(clip_embd_nbytes(ctx_clip));  // 576 patches * 4096 embeddings * 4 bytes = 9437184
        const bool encoded = clip_image_encode(
@ -630,8 +636,14 @@ int main(){
            LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int)i + 1, (int)img_res_v.size);
            return false;
        }
+        auto end = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double> duration = end - start;
+        std::cout << "  Wall time: " << duration.count() << " seconds" << std::endl;
    }

+    // patch handling goes here
+    
+
    return 0;
 }