Fix ViT & patch merging
This commit is contained in:
parent
56e149d627
commit
aa23425236
2 changed files with 40 additions and 65 deletions
|
@ -3168,8 +3168,8 @@ bool clip_image_batch_encode_vit(clip_ctx * ctx, const int n_threads, const clip
|
|||
int image_size_width = image_size;
|
||||
int image_size_height = image_size;
|
||||
const int patch_size = hparams.patch_size;
|
||||
// const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||
// const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
|
||||
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
|
||||
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
|
||||
if(ctx->load_image_size==nullptr){
|
||||
ctx->load_image_size= clip_image_size_init();
|
||||
}
|
||||
|
@ -3206,28 +3206,16 @@ bool clip_image_batch_encode_vit(clip_ctx * ctx, const int n_threads, const clip
|
|||
free(data);
|
||||
}
|
||||
|
||||
// copied from the MiniCPM implementation for positional embedding.
|
||||
// inspired from siglip:
|
||||
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
|
||||
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
|
||||
{
|
||||
// compute positions
|
||||
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
|
||||
int* positions_data = (int*)malloc(ggml_nbytes(positions));
|
||||
int bucket_coords_h[70];
|
||||
int bucket_coords_w[70];
|
||||
for (int i = 0; i < pos_h; i++){
|
||||
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
|
||||
}
|
||||
for (int i = 0; i < pos_w; i++){
|
||||
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
|
||||
}
|
||||
for (int i = 0, id = 0; i < pos_h; i++){
|
||||
for (int j = 0; j < pos_w; j++){
|
||||
positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
|
||||
}
|
||||
for (int i = 0; i < num_patches; i++){
|
||||
positions_data[i] = i;
|
||||
}
|
||||
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
|
||||
free(positions_data);
|
||||
|
||||
}
|
||||
|
||||
|
||||
if (ggml_backend_is_cpu(ctx->backend)) {
|
||||
|
|
|
@ -438,34 +438,21 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
|
|||
float* base_image_feature_data = (float*)base_image_feature->data;
|
||||
|
||||
for (int i=0; i < dim0; i++)
|
||||
{
|
||||
if (i==0)
|
||||
{
|
||||
for (int j=0; j < dim1; j++)
|
||||
{
|
||||
// base_image_feature_data
|
||||
float* image_embd = image_embd_v[i];
|
||||
for (int j=0; j < dim1; j++)
|
||||
for (int k=0; k < dim2; k++)
|
||||
{
|
||||
for (int k=0; k < dim2; k++)
|
||||
image_features_data[i * dim1 * dim2 + j * dim2 + k] =
|
||||
image_embd_v[i+1][j * dim2 + k];
|
||||
if (i == 0)
|
||||
{
|
||||
base_image_feature_data[j * dim2 + k] = image_embd[j * dim2 + k];
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// other sub-images
|
||||
float* image_embd = image_embd_v[i+1];
|
||||
for (int j=0; j < dim1; j++)
|
||||
{
|
||||
for (int k=0; k < dim2; k++)
|
||||
{
|
||||
image_features_data[i * dim1 * dim2 + j * dim2 + k] = image_embd[j * dim2 + k];
|
||||
base_image_feature_data[j * dim2 + k] = image_embd_v[i][j * dim2 + k];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor* image_features_patchview = ggml_view_4d(
|
||||
model.ctx, image_features, num_patches_per_side * hidden_size, num_patches_per_side,
|
||||
num_patches_width, num_patches_height, size_ele * num_patches_per_side * hidden_size,
|
||||
|
@ -575,31 +562,31 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
|
|||
attention_mask = gf->nodes[gf->n_nodes - 1];
|
||||
// memcpy(image_embd_v_m_mask_out, (float *)attention_mask->data, ggml_nbytes(attention_mask));
|
||||
|
||||
{
|
||||
printf((" ========================= DEBUG =========================\n"));
|
||||
printf("Load pre-computed image embeddings and attention_mask\n");
|
||||
std::string filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_features.gguf";
|
||||
tensor_from_gguf tensor;
|
||||
bool is_successful = load_tensor_from_file(filename.c_str(), tensor);
|
||||
if (!is_successful)
|
||||
{
|
||||
fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
result = tensor.data;
|
||||
// print_tensor(result, "result", 1);
|
||||
filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_attn_masks.gguf";
|
||||
is_successful = load_tensor_from_file(filename.c_str(), tensor);
|
||||
if (!is_successful)
|
||||
{
|
||||
fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
attention_mask = tensor.data;
|
||||
// print_tensor(attention_mask, "attention_mask", 1);
|
||||
num_patches_width = 2;
|
||||
num_patches_height = 2;
|
||||
}
|
||||
// {
|
||||
// printf((" ========================= DEBUG =========================\n"));
|
||||
// printf("Load pre-computed image embeddings and attention_mask\n");
|
||||
// std::string filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_features.gguf";
|
||||
// tensor_from_gguf tensor;
|
||||
// bool is_successful = load_tensor_from_file(filename.c_str(), tensor);
|
||||
// if (!is_successful)
|
||||
// {
|
||||
// fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
|
||||
// return 1;
|
||||
// }
|
||||
// result = tensor.data;
|
||||
// // print_tensor(result, "result", 1);
|
||||
// filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_attn_masks.gguf";
|
||||
// is_successful = load_tensor_from_file(filename.c_str(), tensor);
|
||||
// if (!is_successful)
|
||||
// {
|
||||
// fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
|
||||
// return 1;
|
||||
// }
|
||||
// attention_mask = tensor.data;
|
||||
// // print_tensor(attention_mask, "attention_mask", 1);
|
||||
// num_patches_width = 2;
|
||||
// num_patches_height = 2;
|
||||
// }
|
||||
|
||||
|
||||
// compute attention masks outside of the graph
|
||||
|
@ -1126,4 +1113,4 @@ void llava_image_embed_free(struct llava_image_embed *embed)
|
|||
{
|
||||
free(embed->embed);
|
||||
free(embed);
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue