From 51e60c996f5cdce71c21eaf53da0f6afee87acd1 Mon Sep 17 00:00:00 2001
From: John
Date: Mon, 12 Feb 2024 04:02:54 +0100
Subject: [PATCH] Tensors are now properly permuted. Before, the embeddings
 were inserted 1:1; now they are split into 24x24 patches as in the reference
 implementation.

---
 examples/llava/clip.cpp  |   4 +-
 examples/llava/llava.cpp | 136 +++++++++++++--------------------------
 2 files changed, 46 insertions(+), 94 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 56d3fd0af..60d8e8e80 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1,6 +1,7 @@
 // NOTE: This is modified from clip.cpp only for LLaVA,
 // so there might be still unnecessary artifacts hanging around
 // I'll gradually clean and extend it
+// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()), we still see a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -1622,7 +1623,6 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std
         possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
     }
     std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
-    // fprintf(stderr, "%s - Working with resolution: %d %d\n", __func__, best_resolution.first, best_resolution.second);
     // clip_image_save_to_bmp(*img, "input.bmp");
     resize_and_pad_image(*img, *temp, best_resolution);  // we do not pad with mean-bg color anymore in llava-1.6
     // clip_image_save_to_bmp(*temp, "resized.bmp");
@@ -1646,7 +1646,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std
         res_tensor.clear();
         for (auto& patch : patches) {
             clip_image_f32 *temp_image_f32 = clip_image_f32_init();
-            normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, true);
+            normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, false); // set to true for pytorch fp16 value replication
             res_tensor.push_back(temp_image_f32);
         }

diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 5ba9d072d..42d00082b 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -34,44 +34,40 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_emb
         /*.mem_buffer =*/ NULL,
         /*.no_alloc   =*/ false, // NOTE: this should be false when using the legacy API
     };
+    // Python reference code for full unpad:
+    /*
+        base_image_feature = image_feature[0]
+        image_feature = image_feature[1:]
+        image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
+        image_feature = image_feature.flatten(1, 2).flatten(2, 3)
+        image_feature = unpad_image(image_feature, image_sizes[image_idx])
+        image_feature = torch.cat((
+            image_feature,
+            self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
+        ), dim=-1)
+        image_feature = image_feature.flatten(1, 2).transpose(0, 1)
+        image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+    */
+    // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval.
+    // In terms of result quality it appears to make no difference, so we start with the easier
+    // approach, given that 5D tensors are not supported in ggml yet.
+    // Without unpad we have to split the sub-image embeddings into their 24x24 patch grid and permute them.
+    // Once all sub-images are processed, the base_image_features are prepended without any changes.
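+    // Worked example with illustrative numbers (the 2x2 grid case used in the pytorch comment below):
+    // each sub-image yields clip_n_patches = 24*24 = 576 embeddings of clip_n_mmproj_embd = 4096
+    // features, so four sub-images contribute 4*576 = 2304 grid tokens, plus 576 for the prepended base image.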
-    // Python reference for full unpad:
-    //     base_image_feature = image_feature[0]
-    //     image_feature = image_feature[1:]
-    //     image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
-    //     image_feature = image_feature.flatten(1, 2).flatten(2, 3)
-    //     image_feature = unpad_image(image_feature, image_sizes[image_idx])
-    //     image_feature = torch.cat((
-    //         image_feature,
-    //         self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
-    //     ), dim=-1)
-    //     image_feature = image_feature.flatten(1, 2).transpose(0, 1)
-    //     image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+    // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling))
+    /*
+        image_feature = image_feature.view(2, 2, 24, 24, 4096)
+        image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
+        image_feature = image_feature.view(2, 24, 2, 24, 4096)
+        image_feature = image_feature.flatten(0, 3)
+
-    // embeddings -> tokens -> 24 x 24
-    /**
-     * We now have two options: unpad or no unpad - unpad removes tokens for faster llm eval
-     * In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet
-     * Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them.
-     * Once all images are processed to prepended the base_image_features without any changes.
-     */
-    /**
-     Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling))
-     # image_feature = image_feature.view(2, 2, 24, 24, 4096)
-     # image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
-     # image_feature = image_feature.view(2, 24, 2, 24, 4096)
-     # image_feature = image_feature.flatten(0, 3)
-
-     # Reshape to 4D tensor by merging the last two dimensions
+        // Reshape to 4D tensor by merging the last two dimensions
         image_feature = image_feature.view(2, 2, 24, 24*4096)
         image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
         image_feature = image_feature.view(-1, 4096)
-     *
-     */
-    model.ctx = ggml_init(params);
+    */
+    model.ctx = ggml_init(params);
     ggml_context *ctx_noalloc = ggml_init({2048, NULL, true});
-    // struct ggml_tensor * image_features = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (image_embd_v.size() - 1));

     ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip);
     model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
@@ -88,83 +84,39 @@ static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_emb
         }
     }

-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, image_embd_v.size() - 1, clip_n_patches(ctx_clip), clip_n_mmproj_embd(ctx_clip));
-    // fill it with the image embeddings, ignoring the first
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), image_embd_v.size() - 1); // example: 4096 x 576 x 4
+    // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
+    // fill it with the image embeddings, ignoring the base
     for (int i = 1; i < image_embd_v.size(); i++) {
-        // printf("Copying image_embd_v[%d] to image_features tensor\n", i);
         size_t offset = (i-1) * clip_embd_nbytes(ctx_clip);
-
-        // for debugging we now try and set the entire tensor row to 0.0001f,0.0002f,0.0003f,0.0004f etc:
-        // float *floatPtr = static_cast<float *>(image_embd_v[i]);
-        // for (int j = 0; j < clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip); j++)
-        // {
-        //     // floatPtr[j] = (j + 1) / 10000.0f;
-        //     int feature = j % clip_n_mmproj_embd(ctx_clip) + 1;
-        //     floatPtr[j] = i + feature / 10000.0f;
-        // }
         memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
     }
-    // printf("image_features size = %d\n", clip_embd_nbytes(ctx_clip) * (image_embd_v.size() - 1));

     struct ggml_cgraph * gf = ggml_new_graph(model.ctx);
-    // image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
     size_t size_ele = ggml_type_size(GGML_TYPE_F32);
-    // struct ggml_tensor *dummy = ggml_new_tensor_4d(ctx_noalloc, GGML_TYPE_F32, num_patches_height, num_patches_width, num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip));
-
-    struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features,
-                                                           num_patches_height,
-                                                           num_patches_width,
-                                                           num_patches_per_side * num_patches_per_side,
-                                                           clip_n_mmproj_embd(ctx_clip),
-
-                                                           size_ele * num_patches_height,
-                                                           size_ele * num_patches_height * num_patches_width,
-                                                           size_ele * num_patches_height * num_patches_width * num_patches_per_side,
-                                                           0);
     struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features,
-                                                                num_patches_height,
-                                                                num_patches_width,
-                                                                num_patches_per_side,
                                                                 num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
-
-                                                                size_ele * num_patches_height,
-                                                                size_ele * num_patches_height * num_patches_width,
-                                                                size_ele * num_patches_height * num_patches_width * num_patches_per_side,
-                                                                0);
-
+                                                                num_patches_per_side,
+                                                                num_patches_width,
+                                                                num_patches_height,
+                                                                size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
+                                                                size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side,
+                                                                size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
+    // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
     struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
-    permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, permuted_cont, 0, 2, 1, 3)); // permute back to before - todo: fix bug
-
-    struct ggml_tensor *prepared = ggml_view_2d(model.ctx, permuted_cont, num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, 0);
-    struct ggml_tensor *prepared_cont = ggml_cont(model.ctx, prepared); // not needed
-    // struct ggml_tensor *prepared_cont = prepared; // the view only flattens
-
-    ggml_build_forward_expand(gf, prepared_cont);
-
+    // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
+    struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
+    // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
+    ggml_build_forward_expand(gf, flatten);
     ggml_graph_compute_with_ctx(model.ctx, gf, 1);
     struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
-    // ggml_tensor_printf(image_features,"image_features",__LINE__,false,true);
-    // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,true);
-    // ggml_tensor_printf(prepared_cont,"prepared_cont",__LINE__,false,true);
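+    // Note on axis order: ggml lists dimensions innermost-first (ne[0] is the merged 24*4096
+    // patch/feature dim here), so the ggml_permute(ctx, t, 0, 2, 1, 3) above swaps the two middle
+    // dimensions - the same operation as permute(0, 2, 1, 3) in the pytorch reference, just with
+    // reversed indexing.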

     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
-    // append without newline tokens:
-    // memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches
-    // append with newline tokens:
-    for (size_t i = 0; i < image_embd_v.size() - 1; ++i) {
-        // we append with +1 offset (base image is prepended)
-        memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+1) + model.newline->ne[0] * i,
-               (float*)prepared_cont->data + i * clip_n_mmproj_embd(ctx_clip) * clip_n_patches(ctx_clip),
-               clip_embd_nbytes(ctx_clip));
-        memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+2) + model.newline->ne[0] * i,
-               (float*)model.newline->data,
-               ggml_nbytes(model.newline));
-    }
-
-    size_t newline_tokens = image_embd_v.size()-1;
-    *n_img_pos_out = prepared_cont->ne[0]+clip_n_patches(ctx_clip) + newline_tokens;
+    // append without newline tokens (default behavior in llava_arch when not using unpad):
+    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches
+    *n_img_pos_out = result->ne[1] + clip_n_patches(ctx_clip);

     // Debug: Test single segments
     // Current findings: sending base image, sending a segment embedding all works similar to python
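
The simplified 4D reshape used in the comment block above can be sanity-checked against the full
5D reference in a few lines of pytorch. A minimal sketch, assuming a 2x2 grid of 24x24 patches
with 4096 features (variable names are illustrative, not part of the patch):

    import torch

    grid_h, grid_w, side, feat = 2, 2, 24, 4096
    x = torch.randn(grid_h * grid_w, side, side, feat)   # one 24x24x4096 block per sub-image

    # full 5D reference
    a = x.view(grid_h, grid_w, side, side, feat)
    a = a.permute(0, 2, 1, 3, 4).contiguous()            # swap grid-column and patch-row dims
    a = a.view(grid_h, side, grid_w, side, feat).flatten(0, 3)

    # simplified 4D version (the form the ggml code implements)
    b = x.view(grid_h, grid_w, side, side * feat)        # merge patch-column and feature dims
    b = b.permute(0, 2, 1, 3).contiguous()
    b = b.view(-1, feat)

    assert torch.equal(a, b)                             # same 2304 x 4096 token layout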