diff --git a/examples/xgenmm/clip.cpp b/examples/xgenmm/clip.cpp
index 5d73ef7a0..fdd489004 100644
--- a/examples/xgenmm/clip.cpp
+++ b/examples/xgenmm/clip.cpp
@@ -667,22 +667,21 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     struct ggml_tensor * embeddings = inp;
     struct ggml_tensor * pos_embed = nullptr;
 
     if (ctx->has_llava_projector) {
-        printf("use has_llava_projector\n");
+        printf(" use has_llava_projector\n");
         // concat class_embeddings and patch_embeddings
         if (ctx->has_class_embedding) {
-            printf("I am in!\n");
             embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-            printf("created embeddings new 3d tensors\n");
             ggml_set_name(embeddings, "embeddings");
             ggml_set_input(embeddings);
-            printf("ggml_set_input\n");
             embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+            printf(" first acc worked\n");
             embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+            printf(" second acc worked\n");
         }
     }
-    // printf("hi1!");
+    // printf(" after ctx->has_llava_projector\n");
 
     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
     ggml_set_input(positions);
@@ -2500,6 +2499,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     // build the inference graph
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
+    printf(" build graph done\n");
     ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
 
     // set inputs
     const auto & model = ctx->vision_model;
@@ -2546,6 +2546,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
         free(data);
     }
+    printf(" before ctx->has_minicpmv_projector\n");
     if (ctx->has_minicpmv_projector) {
         {
             // inspired from siglip:
@@ -2638,7 +2639,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
     }
 #endif
+    printf(" before ggml_backend_graph_compute\n");
     ggml_backend_graph_compute(ctx->backend, gf);
+    printf(" after ggml_backend_graph_compute\n");
     // the last node is the embedding tensor
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
     // copy the embeddings to the location passed by the user
diff --git a/examples/xgenmm/convert.sh b/examples/xgenmm/convert.sh
index e62e71a5f..3c05c73f4 100644
--- a/examples/xgenmm/convert.sh
+++ b/examples/xgenmm/convert.sh
@@ -1,12 +1,12 @@
-# source /export/share/yutong/miniconda3/bin/activate
-# conda activate xgenmm-flamingo
+source /export/share/yutong/miniconda3/bin/activate
+conda activate xgenmm-flamingo
 # which python
 
 # # step 1: surgery
 # python xgenmm_surgery.py
 
 # step 2: convert to gguf (vit + projector)
-python examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py \
+python xgenmm_convert_image_encoder_to_gguf.py \
     --surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
     --output_dirname gguf_test \
     --version siglip_kosmos_phi3_4k_instruct \
diff --git a/examples/xgenmm/test_anyres_handle_patches.cpp b/examples/xgenmm/test_anyres_handle_patches.cpp
index 53fc5164d..ace927b1f 100644
--- a/examples/xgenmm/test_anyres_handle_patches.cpp
+++ b/examples/xgenmm/test_anyres_handle_patches.cpp
@@ -502,33 +502,34 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 int main(){
-    const char* clip_path = "/export/share/yutong/xgenmm/llamacpp_wd/llava-1.6/vit/mmproj-model-f16.gguf";
+    // const char* clip_path = "/export/share/yutong/xgenmm/llamacpp_wd/llava-1.6/vit/mmproj-model-f16.gguf";
+    const char* clip_path = "/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf";
     struct clip_ctx * ctx = clip_model_load(clip_path, /*verbosity=*/2);
-    printf("Model loaded\n");
-    for (int i=0; i < 3; i++){
-        ctx->image_mean[i] = 0.5;
-        ctx->image_std[i] = 0.5;
-    }
-    LOG_TEE("v_image_mean %f %f %f\n", ctx->image_mean[0], ctx->image_mean[1], ctx->image_mean[2]);
-    LOG_TEE("v_image_std %f %f %f\n", ctx->image_std[0], ctx->image_std[1], ctx->image_std[2]);
-    // [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
-    ctx->vision_model.hparams.image_grid_pinpoints[0] = 384;
-    ctx->vision_model.hparams.image_grid_pinpoints[1] = 768;
-    ctx->vision_model.hparams.image_grid_pinpoints[2] = 768;
-    ctx->vision_model.hparams.image_grid_pinpoints[3] = 384;
-    ctx->vision_model.hparams.image_grid_pinpoints[4] = 768;
-    ctx->vision_model.hparams.image_grid_pinpoints[5] = 768;
-    ctx->vision_model.hparams.image_grid_pinpoints[6] = 1152;
-    ctx->vision_model.hparams.image_grid_pinpoints[7] = 384;
-    ctx->vision_model.hparams.image_grid_pinpoints[8] = 384;
-    ctx->vision_model.hparams.image_grid_pinpoints[9] = 1152;
-    for (int i = 0; i < 10; i++)
-    {
-        printf("grid[%d]:%d ", i, ctx->vision_model.hparams.image_grid_pinpoints[i]);
-    }
-    printf("\n");
-    ctx->vision_model.hparams.image_size = 384;
-    printf("in test_anyres: params.image_size:%d\n", ctx->vision_model.hparams.image_size);
+    // printf("Model loaded\n");
+    // for (int i=0; i < 3; i++){
+    //     ctx->image_mean[i] = 0.5;
+    //     ctx->image_std[i] = 0.5;
+    // }
+    // LOG_TEE("v_image_mean %f %f %f\n", ctx->image_mean[0], ctx->image_mean[1], ctx->image_mean[2]);
+    // LOG_TEE("v_image_std %f %f %f\n", ctx->image_std[0], ctx->image_std[1], ctx->image_std[2]);
+    // // [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
+    // ctx->vision_model.hparams.image_grid_pinpoints[0] = 384;
+    // ctx->vision_model.hparams.image_grid_pinpoints[1] = 768;
+    // ctx->vision_model.hparams.image_grid_pinpoints[2] = 768;
+    // ctx->vision_model.hparams.image_grid_pinpoints[3] = 384;
+    // ctx->vision_model.hparams.image_grid_pinpoints[4] = 768;
+    // ctx->vision_model.hparams.image_grid_pinpoints[5] = 768;
+    // ctx->vision_model.hparams.image_grid_pinpoints[6] = 1152;
+    // ctx->vision_model.hparams.image_grid_pinpoints[7] = 384;
+    // ctx->vision_model.hparams.image_grid_pinpoints[8] = 384;
+    // ctx->vision_model.hparams.image_grid_pinpoints[9] = 1152;
+    // for (int i = 0; i < 10; i++)
+    // {
+    //     printf("grid[%d]:%d ", i, ctx->vision_model.hparams.image_grid_pinpoints[i]);
+    // }
+    // printf("\n");
+    // ctx->vision_model.hparams.image_size = 384;
+    // printf("in test_anyres: params.image_size:%d\n", ctx->vision_model.hparams.image_size);
 
     /*
         part of:
         llava_image_embed_make_with_filename
@@ -618,6 +619,7 @@ int main(){
     printf("image_embd_v.size():%d\n", image_embd_v.size());
     for (size_t i = 0; i < img_res_v.size; i++)
     {
+        printf("encode patch %zu\n", i);
         image_embd_v[i] = (float*)malloc(clip_embd_nbytes(ctx_clip));  // 576 patches * 4096 embeddings * 4 bytes = 9437184
         const bool encoded = clip_image_encode(
diff --git a/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py b/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py
index 2699ab131..a5b3026cf 100644
--- a/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py
+++ b/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py
@@ -267,6 +267,8 @@ if __name__ == "__main__":
     # ggml implements gelu_with_tanh approximation
     use_gelu = "gelu" in vision_config["hidden_act"].lower()
     fout.add_bool("clip.use_gelu", use_gelu)
+    fout.add_string("clip.vision.mm_patch_merge_type", 'spatial_unpad')
+    print("hard coded mm_patch_merge_type as spatial_unpad")
 
     # for VIT model
     with print_time("Loading vision encoder and converting to gguf"):
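
For context on the last hunk: the converter writes the new clip.vision.mm_patch_merge_type key through gguf-py's GGUFWriter. Below is a minimal standalone sketch of that write path, assuming the gguf Python package from llama.cpp's gguf-py is installed; the --mm-patch-merge-type flag and the output filename are hypothetical illustrations, not part of this patch, which simply hard-codes 'spatial_unpad'.

import argparse

import gguf

# parse an optional merge-type flag instead of hard-coding the value
parser = argparse.ArgumentParser()
parser.add_argument("--mm-patch-merge-type", default="spatial_unpad")
parser.add_argument("--output", default="mmproj-kv-only.gguf")
args = parser.parse_args()

# open a gguf file with the 'clip' architecture and write the metadata key
fout = gguf.GGUFWriter(args.output, arch="clip")
fout.add_string("clip.vision.mm_patch_merge_type", args.mm_patch_merge_type)

# standard GGUFWriter flush sequence: header, kv data, then (empty) tensor data
fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()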