the difference is from resize

commit ba0861e384 (parent b31dc0b5ed)
11 changed files with 4200 additions and 0 deletions
examples/CMakeLists.txt
@@ -39,6 +39,7 @@ else()
     add_subdirectory(quantize-stats)
     add_subdirectory(quantize)
     add_subdirectory(retrieval)
+    add_subdirectory(xgenmm)
     if (GGML_RPC)
         add_subdirectory(rpc)
     endif()
examples/xgenmm/CMakeLists.txt (new file, 51 lines)
@@ -0,0 +1,51 @@
add_library(xgenmm OBJECT
            xgenmm.cpp
            xgenmm.h
            clip.cpp
            clip.h
            )

target_link_libraries(xgenmm PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})

target_include_directories(xgenmm PUBLIC .)
target_include_directories(xgenmm PUBLIC ../..)
target_include_directories(xgenmm PUBLIC ../../common)

target_compile_features(xgenmm PRIVATE cxx_std_11)

add_library(xgenmm_static STATIC $<TARGET_OBJECTS:xgenmm>)
if (BUILD_SHARED_LIBS)
    set_target_properties(xgenmm PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(xgenmm PRIVATE LLAMA_SHARED LLAMA_BUILD)
    add_library(xgenmm_shared SHARED $<TARGET_OBJECTS:xgenmm>)
    target_link_libraries(xgenmm_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
    install(TARGETS xgenmm_shared LIBRARY)
endif()

if (NOT MSVC)
    target_compile_options(xgenmm PRIVATE -Wno-cast-qual)  # stb_image.h
endif()

if(TARGET BUILD_INFO)
    add_dependencies(xgenmm BUILD_INFO)
endif()


set(TARGET test_anyres_img)
add_executable(test_anyres_img test_anyres_img.cpp)
install(TARGETS test_anyres_img RUNTIME)
target_link_libraries(test_anyres_img PRIVATE common xgenmm ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(test_anyres_img PRIVATE cxx_std_11)


# not implemented yet
# set(TARGET xgenmm-cli)
# add_executable(xgenmm-cli xgenmm-cli.cpp)
# install(TARGETS xgenmm-cli RUNTIME)
# target_link_libraries(xgenmm-cli PRIVATE common xgenmm_io xgenmm ${CMAKE_THREAD_LIBS_INIT})
# target_compile_features(xgenmm PRIVATE cxx_std_11)

# add_library(xgenmm_io OBJECT
#             xgenmm_io.cpp
# )
# target_link_libraries(xgenmm_io PRIVATE xgenmm ${CMAKE_THREAD_LIBS_INIT})
examples/xgenmm/clip.cpp (new file, 2618 lines)
File diff suppressed because it is too large.

examples/xgenmm/clip.h (new file, 98 lines)
@@ -0,0 +1,98 @@
/*
    08/18/2024 - Yutong - This file is adapted from examples/llava/clip.h in the llama.cpp repository.
*/

#ifndef CLIP_H
#define CLIP_H

#include <stddef.h>
#include <stdint.h>

#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
#            define CLIP_API __declspec(dllexport)
#        else
#            define CLIP_API __declspec(dllimport)
#        endif
#    else
#        define CLIP_API __attribute__ ((visibility ("default")))
#    endif
#else
#    define CLIP_API
#endif

#ifdef __cplusplus
extern "C" {
#endif

struct clip_ctx;

struct clip_image_size {
    int width;
    int height;
};

struct clip_image_u8_batch {
    struct clip_image_u8 * data;
    size_t size;
};

struct clip_image_f32_batch {
    struct clip_image_f32 * data;
    size_t size;
};

CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);

CLIP_API void clip_free(struct clip_ctx * ctx);

CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);

CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);

// TODO: should be enum, not string
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);

CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);

CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);

CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);

CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8  * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();

CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);

/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs);

CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);

CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);

#ifdef __cplusplus
}
#endif

#endif // CLIP_H
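As a quick orientation, this is how the declarations above are meant to compose. A minimal sketch, not part of the commit; the model path, image path, and thread count are placeholders, and error handling is trimmed to the essentials:

#include <cstdio>
#include <cstdlib>
#include "clip.h"

int main() {
    // Hypothetical file names, for illustration only.
    struct clip_ctx * ctx = clip_model_load("mmproj-model-f16.gguf", /*verbosity=*/1);
    if (!ctx) { fprintf(stderr, "failed to load model\n"); return 1; }

    struct clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_file("input.jpg", img)) { return 1; }

    // Preprocess into one or more f32 tiles (anyres models yield several).
    struct clip_image_f32_batch batch = {/*data=*/nullptr, /*size=*/0};
    if (!clip_image_preprocess(ctx, img, &batch)) { return 1; }

    // One embedding buffer per tile; clip_embd_nbytes gives the per-tile size.
    float * embd = (float *)malloc(clip_embd_nbytes(ctx));
    clip_image_encode(ctx, /*n_threads=*/4, &batch.data[0], embd);  // first tile only

    free(embd);
    clip_image_f32_batch_free(&batch);
    clip_image_u8_free(img);
    clip_free(ctx);
    return 0;
}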
examples/xgenmm/debug.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from torchvision.transforms import Resize
from torchvision.transforms import InterpolationMode
from PIL import Image
import numpy as np

n_px = 384
resize_func = Resize((n_px, n_px), interpolation=InterpolationMode.BICUBIC, antialias=True)

img_dir = "./imgs"
image_path_1 = f'{img_dir}/image-1d100e9-1.jpg'
image_path_2 = f'{img_dir}/image-1d100e9.jpg'
image_1 = Image.open(image_path_1).convert('RGB')
image_2 = Image.open(image_path_2).convert('RGB')

print(np.asarray(resize_func(image_2))[:5, :10, 0])
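debug.py prints the torchvision reference values for the resized image; per the commit title, this is the slice where the C++ output diverges. A sketch of the equivalent spot check on the C++ side, assuming the clip_image_u8 struct and bicubic_resize() defined in test_anyres_img.cpp further down are in scope:

#include <cstdio>

// Same slice as np.asarray(resize_func(image_2))[:5, :10, 0] in debug.py.
static void print_red_corner(const clip_image_u8 & img)
{
    clip_image_u8 resized;
    bicubic_resize(img, resized, 384, 384);
    for (int y = 0; y < 5; y++)
    {
        for (int x = 0; x < 10; x++)
        {
            printf("%d ", resized.buf[3 * (y * resized.nx + x) + 0]);  // channel 0 = R
        }
        printf("\n");
    }
}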
examples/xgenmm/imgs/image-1d100e9-1.jpg (new binary file, 52 KiB; not shown)
examples/xgenmm/imgs/image-1d100e9.jpg (new binary file, 63 KiB; not shown)
@@ -1,5 +1,242 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Image Resize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from omegaconf import OmegaConf\n",
    "from open_flamingo.train.any_res_data_utils import process_images\n",
    "from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, Lambda\n",
    "from torchvision.transforms import InterpolationMode\n",
    "BICUBIC = InterpolationMode.BICUBIC\n",
    "from PIL import Image\n",
    "from functools import partial\n",
    "\n",
    "cfg = dict(\n",
    "    model_family = 'kosmos',\n",
    "    lm_path = 'microsoft/Phi-3-mini-4k-instruct',\n",
    "    # vision_encoder_path = 'ViT-H-14-378-quickgelu',\n",
    "    # vision_encoder_pretrained = 'dfn5b',\n",
    "    vision_encoder_path = 'google/siglip-so400m-patch14-384',\n",
    "    vision_encoder_pretrained = 'google',\n",
    "    num_vision_tokens = 128,\n",
    "    image_aspect_ratio = 'anyres',\n",
    "    anyres_patch_sampling = True,\n",
    "    anyres_grids=[[1,2],[2,1],[2,2],[3,1],[1,3]],\n",
    "    ckpt_pth = '/export/share/manli_shu/models/open-flamingo-dev/anyres_ablation_HFSiglip_patch128-kosmos_non_instruct-phi3_4k_instruct_nq128_pre_V3_5-llava_1p6_ocrmathmix_v4-8x8-ckpt2/checkpoint_0.pt',\n",
    ")\n",
    "cfg = OmegaConf.create(cfg)\n",
    "n_px = 384\n",
    "image_processor = Compose([\n",
    "    Resize((n_px, n_px), interpolation=InterpolationMode.BICUBIC, antialias=True),\n",
    "    Lambda(lambda x: x.convert('RGB') if x.mode != 'RGB' else x),\n",
    "    ToTensor(),\n",
    "    Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))\n",
    "    ])\n",
    "image_proc = partial(process_images, image_processor=image_processor, model_cfg=cfg)\n",
    "base_img_size = image_processor.transforms[0].size[0]\n",
    "anyres_grids = []\n",
    "for (m,n) in cfg.anyres_grids:\n",
    "    anyres_grids.append([base_img_size*m, base_img_size*n])\n",
    "cfg.anyres_grids = anyres_grids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "image_aspect_ratio: anyres\n",
      "anyres_grids: [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]\n"
     ]
    }
   ],
   "source": [
    "image_aspect_ratio = cfg.image_aspect_ratio\n",
    "print(f\"image_aspect_ratio: {image_aspect_ratio}\")\n",
    "anyres_grids = cfg.anyres_grids\n",
    "print(f\"anyres_grids: {anyres_grids}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "img_dir = \"./imgs\"\n",
    "image_path_1 = f'{img_dir}/image-1d100e9-1.jpg'\n",
    "image_path_2 = f'{img_dir}/image-1d100e9.jpg'\n",
    "image_1 = Image.open(image_path_1).convert('RGB')\n",
    "image_2 = Image.open(image_path_2).convert('RGB')\n",
    "images = [image_1, image_2]\n",
    "image_size = [image_1.size, image_2.size]\n",
    "image_size = [image_size]\n",
    "vision_x = [image_proc([img]) for img in images]\n",
    "vision_x = [vision_x]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'torchvision.transforms.transforms.Resize'>\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[Resize(size=(384, 384), interpolation=bicubic, max_size=None, antialias=True),\n",
       " Lambda(),\n",
       " ToTensor(),\n",
       " Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(type(image_processor.transforms[0]))\n",
    "image_processor.transforms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[217, 212, 211, 213, 213, 210, 210, 210, 213, 214],\n",
       "       [213, 211, 212, 212, 209, 212, 211, 210, 210, 211],\n",
       "       [213, 211, 211, 212, 210, 213, 212, 211, 210, 210],\n",
       "       [215, 211, 209, 212, 212, 211, 210, 210, 210, 210],\n",
       "       [211, 208, 209, 211, 210, 211, 211, 211, 211, 211]], dtype=uint8)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "np.asarray(image_processor.transforms[0](image_2))[:5, :10, 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cubic_interpolate(p, x):\n",
    "    return (\n",
    "        p[1] +\n",
    "        0.5 * x * (p[2] - p[0] + \n",
    "        x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + \n",
    "        x * (3.0 * (p[1] - p[2]) + p[3] - p[0])))\n",
    "    )\n",
    "\n",
    "def bicubic_interpolate(p, x, y):\n",
    "    arr = np.array([cubic_interpolate(p[i], y) for i in range(4)])\n",
    "    return cubic_interpolate(arr, x)\n",
    "\n",
    "def resize_bicubic_pil(image, new_width, new_height):\n",
    "    # Convert the PIL image to a NumPy array\n",
    "    image_np = np.array(image)\n",
    "\n",
    "    height, width, channels = image_np.shape\n",
    "    resized_image = np.zeros((new_height, new_width, channels))\n",
    "\n",
    "    x_ratio = width / new_width\n",
    "    y_ratio = height / new_height\n",
    "\n",
    "    for i in range(new_height):\n",
    "        for j in range(new_width):\n",
    "            x = j * x_ratio\n",
    "            y = i * y_ratio\n",
    "\n",
    "            x_int = int(x)\n",
    "            y_int = int(y)\n",
    "\n",
    "            x_diff = x - x_int\n",
    "            y_diff = y - y_int\n",
    "\n",
    "            p = np.zeros((4, 4, channels))\n",
    "\n",
    "            for m in range(-1, 3):\n",
    "                for n in range(-1, 3):\n",
    "                    xm = min(max(x_int + m, 0), width - 1)\n",
    "                    yn = min(max(y_int + n, 0), height - 1)\n",
    "                    p[m + 1, n + 1] = image_np[yn, xm]\n",
    "\n",
    "            for c in range(channels):\n",
    "                resized_image[i, j, c] = bicubic_interpolate(p[:, :, c], x_diff, y_diff)\n",
    "\n",
    "    # Convert the NumPy array back to a PIL image\n",
    "    resized_image = np.clip(resized_image, 0, 255).astype(np.uint8)\n",
    "    return Image.fromarray(resized_image)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[222, 217, 214, 216, 218, 213, 212, 214, 216, 218],\n",
       "       [213, 209, 209, 211, 209, 209, 209, 209, 208, 210],\n",
       "       [212, 210, 211, 212, 209, 213, 212, 209, 209, 210],\n",
       "       [217, 212, 211, 212, 212, 212, 211, 210, 210, 211],\n",
       "       [212, 208, 208, 210, 210, 210, 211, 211, 211, 210]], dtype=uint8)"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res = resize_bicubic_pil(image_2, base_img_size, base_img_size)\n",
    "np.asarray(res)[:5, :10, 0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model surgery"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
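The notebook derives its anyres pinpoints from the base tile size and the (rows, cols) grid list; the C++ test below hard-codes the same numbers into image_grid_pinpoints. A minimal sketch of that derivation, assuming the base size of 384 and the grids from the config cell above:

#include <cstdio>

int main() {
    // Mirrors the notebook loop: anyres_grids = [[1,2],[2,1],[2,2],[3,1],[1,3]] * 384.
    const int base_img_size = 384;
    const int grids[5][2] = {{1, 2}, {2, 1}, {2, 2}, {3, 1}, {1, 3}};
    for (const auto &g : grids) {
        printf("[%d, %d] ", base_img_size * g[0], base_img_size * g[1]);
    }
    printf("\n");  // [384, 768] [768, 384] [768, 768] [1152, 384] [384, 1152]
    return 0;
}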
examples/xgenmm/test_anyres_img.cpp (new file, 530 lines)
@@ -0,0 +1,530 @@
#include "ggml.h"
#include "common.h"
#include "clip.h"
#include "xgenmm.h"
#include "llama.h"

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <map>
#include <string>
#include <vector>



struct clip_image_u8
{
    int nx;
    int ny;

    std::vector<uint8_t> buf;
};

struct clip_image_f32
{
    int nx;
    int ny;

    std::vector<float> buf;
};

inline int clip(int x, int lower, int upper) { return std::max(lower, std::min(x, upper)); }

static bool bicubic_resize(const clip_image_u8& img, clip_image_u8& dst, int target_width, int target_height)
{
    const int nx = img.nx;
    const int ny = img.ny;

    dst.nx = target_width;
    dst.ny = target_height;
    dst.buf.resize(3 * target_width * target_height);

    float Cc;
    float C[5];
    float d0, d2, d3, a0, a1, a2, a3;
    int   i, j, k, jj;
    int   x, y;
    float dx, dy;
    float tx, ty;

    tx = (float)nx / (float)target_width;
    ty = (float)ny / (float)target_height;

    // Bicubic interpolation; adapted from ViT.cpp, inspired from:
    //  -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
    //  -> https://en.wikipedia.org/wiki/Bicubic_interpolation

    for (i = 0; i < target_height; i++)
    {
        for (j = 0; j < target_width; j++)
        {
            x = (int)(tx * j);
            y = (int)(ty * i);

            dx = tx * j - x;
            dy = ty * i - y;

            for (k = 0; k < 3; k++)
            {
                for (jj = 0; jj <= 3; jj++)
                {
                    d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] -
                         img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
                    d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] -
                         img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
                    d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] -
                         img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
                    a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];

                    a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
                    a2 =  1.0 / 2 * d0 + 1.0 / 2 * d2;
                    a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;

                    C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;

                    d0 = C[0] - C[1];
                    d2 = C[2] - C[1];
                    d3 = C[3] - C[1];
                    a0 = C[1];
                    a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
                    a2 =  1.0 / 2 * d0 + 1.0 / 2 * d2;
                    a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
                    Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;

                    const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
                    dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
                }
            }
        }
    }

    return true;
}

enum projector_type
{
    PROJECTOR_TYPE_MLP,
    PROJECTOR_TYPE_MLP_NORM,
    PROJECTOR_TYPE_LDP,
    PROJECTOR_TYPE_LDPV2,
    PROJECTOR_TYPE_RESAMPLER,
    PROJECTOR_TYPE_UNKNOWN,
};

static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    {PROJECTOR_TYPE_MLP, "mlp"},
    {PROJECTOR_TYPE_LDP, "ldp"},
    {PROJECTOR_TYPE_LDPV2, "ldpv2"},
    {PROJECTOR_TYPE_RESAMPLER, "resampler"},
};



struct clip_hparams
{
    int32_t image_size;
    int32_t patch_size;
    int32_t hidden_size;
    int32_t n_intermediate;
    int32_t projection_dim;
    int32_t n_head;
    int32_t n_layer;

    float eps;

    char mm_patch_merge_type[32] = "flat";  // spatial_unpad or flat (default)

    int32_t image_grid_pinpoints[32];
    int32_t image_crop_resolution;
};

struct clip_layer
{
    // attention
    struct ggml_tensor* k_w;
    struct ggml_tensor* k_b;
    struct ggml_tensor* q_w;
    struct ggml_tensor* q_b;
    struct ggml_tensor* v_w;
    struct ggml_tensor* v_b;

    struct ggml_tensor* o_w;
    struct ggml_tensor* o_b;

    // layernorm 1
    struct ggml_tensor* ln_1_w;
    struct ggml_tensor* ln_1_b;

    // ff
    struct ggml_tensor* ff_i_w;
    struct ggml_tensor* ff_i_b;

    struct ggml_tensor* ff_o_w;
    struct ggml_tensor* ff_o_b;

    // layernorm 2
    struct ggml_tensor* ln_2_w;
    struct ggml_tensor* ln_2_b;
};

struct clip_vision_model
{
    struct clip_hparams hparams;

    // embeddings
    struct ggml_tensor* class_embedding;
    struct ggml_tensor* patch_embeddings;
    struct ggml_tensor* patch_bias;
    struct ggml_tensor* position_embeddings;

    struct ggml_tensor* pre_ln_w;
    struct ggml_tensor* pre_ln_b;

    std::vector<clip_layer> layers;

    struct ggml_tensor* post_ln_w;
    struct ggml_tensor* post_ln_b;

    struct ggml_tensor* projection;

    // LLaVA projection
    struct ggml_tensor* mm_0_w = NULL;
    struct ggml_tensor* mm_0_b = NULL;
    struct ggml_tensor* mm_2_w = NULL;
    struct ggml_tensor* mm_2_b = NULL;

    struct ggml_tensor* image_newline = NULL;

    // Yi type models with mlp+normalization projection
    struct ggml_tensor* mm_1_w = NULL;  // Yi type models have 0, 1, 3, 4
    struct ggml_tensor* mm_1_b = NULL;
    struct ggml_tensor* mm_3_w = NULL;
    struct ggml_tensor* mm_3_b = NULL;
    struct ggml_tensor* mm_4_w = NULL;
    struct ggml_tensor* mm_4_b = NULL;

    // MobileVLM projection
    struct ggml_tensor* mm_model_mlp_1_w;
    struct ggml_tensor* mm_model_mlp_1_b;
    struct ggml_tensor* mm_model_mlp_3_w;
    struct ggml_tensor* mm_model_mlp_3_b;
    struct ggml_tensor* mm_model_block_1_block_0_0_w;
    struct ggml_tensor* mm_model_block_1_block_0_1_w;
    struct ggml_tensor* mm_model_block_1_block_0_1_b;
    struct ggml_tensor* mm_model_block_1_block_1_fc1_w;
    struct ggml_tensor* mm_model_block_1_block_1_fc1_b;
    struct ggml_tensor* mm_model_block_1_block_1_fc2_w;
    struct ggml_tensor* mm_model_block_1_block_1_fc2_b;
    struct ggml_tensor* mm_model_block_1_block_2_0_w;
    struct ggml_tensor* mm_model_block_1_block_2_1_w;
    struct ggml_tensor* mm_model_block_1_block_2_1_b;
    struct ggml_tensor* mm_model_block_2_block_0_0_w;
    struct ggml_tensor* mm_model_block_2_block_0_1_w;
    struct ggml_tensor* mm_model_block_2_block_0_1_b;
    struct ggml_tensor* mm_model_block_2_block_1_fc1_w;
    struct ggml_tensor* mm_model_block_2_block_1_fc1_b;
    struct ggml_tensor* mm_model_block_2_block_1_fc2_w;
    struct ggml_tensor* mm_model_block_2_block_1_fc2_b;
    struct ggml_tensor* mm_model_block_2_block_2_0_w;
    struct ggml_tensor* mm_model_block_2_block_2_1_w;
    struct ggml_tensor* mm_model_block_2_block_2_1_b;

    // MobileVLM_V2 projection
    struct ggml_tensor* mm_model_mlp_0_w;
    struct ggml_tensor* mm_model_mlp_0_b;
    struct ggml_tensor* mm_model_mlp_2_w;
    struct ggml_tensor* mm_model_mlp_2_b;
    struct ggml_tensor* mm_model_peg_0_w;
    struct ggml_tensor* mm_model_peg_0_b;

    // MINICPMV projection
    struct ggml_tensor* mm_model_pos_embed_k;
    struct ggml_tensor* mm_model_query;
    struct ggml_tensor* mm_model_proj;
    struct ggml_tensor* mm_model_kv_proj;
    struct ggml_tensor* mm_model_attn_q_w;
    struct ggml_tensor* mm_model_attn_q_b;
    struct ggml_tensor* mm_model_attn_k_w;
    struct ggml_tensor* mm_model_attn_k_b;
    struct ggml_tensor* mm_model_attn_v_w;
    struct ggml_tensor* mm_model_attn_v_b;
    struct ggml_tensor* mm_model_attn_o_w;
    struct ggml_tensor* mm_model_attn_o_b;
    struct ggml_tensor* mm_model_ln_q_w;
    struct ggml_tensor* mm_model_ln_q_b;
    struct ggml_tensor* mm_model_ln_kv_w;
    struct ggml_tensor* mm_model_ln_kv_b;
    struct ggml_tensor* mm_model_ln_post_w;
    struct ggml_tensor* mm_model_ln_post_b;
};

struct clip_ctx {
    bool has_text_encoder = false;
    bool has_vision_encoder = false;
    bool has_llava_projector = false;
    bool has_minicpmv_projector = false;
    int minicpmv_version = 2;

    struct clip_vision_model vision_model;
    projector_type proj_type = PROJECTOR_TYPE_MLP;

    float image_mean[3];
    float image_std[3];
    bool use_gelu = false;
    int32_t ftype = 1;

    bool has_class_embedding = true;
    bool has_pre_norm = true;
    bool has_post_norm = false;
    bool has_patch_bias = false;

    struct gguf_context * ctx_gguf;
    struct ggml_context * ctx_data;

    std::vector<uint8_t> buf_compute_meta;

    // memory buffers to evaluate the model
    ggml_backend_buffer_t params_buffer = NULL;

    ggml_backend_t backend = NULL;
    ggml_gallocr_t compute_alloc = NULL;

    struct clip_image_size * load_image_size;
};

static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long* sizeOut)
{
    auto file = fopen(path, "rb");
    if (file == NULL)
    {
        LOG_TEE("%s: can't read file %s\n", __func__, path);
        return false;
    }

    fseek(file, 0, SEEK_END);
    auto fileSize = ftell(file);
    fseek(file, 0, SEEK_SET);

    auto buffer = (unsigned char*)malloc(fileSize);  // Allocate memory to hold the file data
    if (buffer == NULL)
    {
        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
        perror("Memory allocation error");
        fclose(file);
        return false;
    }
    errno = 0;
    size_t ret = fread(buffer, 1, fileSize, file);  // Read the file into the buffer
    if (ferror(file))
    {
        die_fmt("read error: %s", strerror(errno));
    }
    if (ret != (size_t)fileSize)
    {
        die("unexpectedly reached end of file");
    }
    fclose(file);  // Close the file

    *bytesOut = buffer;
    *sizeOut  = fileSize;
    return true;
}

void print_img(clip_image_u8* img)
{
    const int nx = img->nx;
    const int ny = img->ny;
    printf("num pixels: %zu\n", img->buf.size());
    printf("raw img: nx:%d | ny:%d\n", nx, ny);

    const int n = nx * ny;
    for (int k = 0; k < 3; k++)
    {
        for (int y = 0; y < 5; y++)
        {
            for (int x = 0; x < 10; x++)
            {
                // data[(i * 3 * n) + k * n + y * nx + x] = imgs->data[i].buf[3 * (y * nx + x) + k];
                printf("%d ", img->buf[3 * (y * nx + x) + k]);
            }
            printf("\n");
        }
        printf("\n");
    }
}

int main(){
    /*
        Pytorch Image Processing Pipeline
        n_px = hf_processor.image_processor.size['height']
        image_processor = Compose([
            Resize((n_px, n_px), interpolation=InterpolationMode.BICUBIC, antialias=True),
            Lambda(lambda x: x.convert('RGB') if x.mode != 'RGB' else x),
            ToTensor(),
            Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ])
        anyres_grids = [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
        grid_pinpoints = anyres_grids
        best_resolution = select_best_resolution(image.size, possible_resolutions)
        image_padded = resize_and_pad_image(image, best_resolution)
        processor_size = processor.transforms[0].size
        patches = divide_to_patches(image_padded, processor_size[0])
        image_original_resize = image.resize((processor_size[0], processor_size[0]))
        image_patches = [image_original_resize] + patches
        image_patches = [processor(image_patch) for image_patch in image_patches]
        return torch.stack(image_patches, dim=0)

        This part is already implemented in the clip_image_preprocess function in clip.cpp.
    */

    const char* clip_path = "/export/share/yutong/xgenmm/llamacpp_wd/llava-1.6/vit/mmproj-model-f16.gguf";
    // struct ggml_context* meta = NULL;

    // struct gguf_init_params params = {
    //     /*.no_alloc = */ true,
    //     /*.ctx      = */ &meta,
    // };

    // struct gguf_context* ctx = gguf_init_from_file(clip_path, params);
    // if (!ctx)
    // {
    //     throw std::runtime_error(
    //         format("%s: failed to load CLIP model from %s. Does this file exist?\n", __func__, clip_path));
    // }
    struct clip_ctx * ctx = clip_model_load(clip_path, /*verbosity=*/2);
    printf("Model loaded\n");
    for (int i = 0; i < 3; i++){
        ctx->image_mean[i] = 0.5;
        ctx->image_std[i]  = 0.5;
    }
    LOG_TEE("v_image_mean %f %f %f\n", ctx->image_mean[0], ctx->image_mean[1], ctx->image_mean[2]);
    LOG_TEE("v_image_std  %f %f %f\n", ctx->image_std[0], ctx->image_std[1], ctx->image_std[2]);
    // [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
    ctx->vision_model.hparams.image_grid_pinpoints[0] = 384;
    ctx->vision_model.hparams.image_grid_pinpoints[1] = 768;
    ctx->vision_model.hparams.image_grid_pinpoints[2] = 768;
    ctx->vision_model.hparams.image_grid_pinpoints[3] = 384;
    ctx->vision_model.hparams.image_grid_pinpoints[4] = 768;
    ctx->vision_model.hparams.image_grid_pinpoints[5] = 768;
    ctx->vision_model.hparams.image_grid_pinpoints[6] = 1152;
    ctx->vision_model.hparams.image_grid_pinpoints[7] = 384;
    ctx->vision_model.hparams.image_grid_pinpoints[8] = 384;
    ctx->vision_model.hparams.image_grid_pinpoints[9] = 1152;
    for (int i = 0; i < 10; i++)
    {
        printf("grid[%d]:%d ", i, ctx->vision_model.hparams.image_grid_pinpoints[i]);
    }
    printf("\n");
    ctx->vision_model.hparams.image_size = 384;
    printf("params.image_size:%d\n", ctx->vision_model.hparams.image_size);
    /*
        part of:
        llava_image_embed_make_with_filename
    */
    const char* image_path = "/export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg";  // Porcelain
    // const char* image_path = "/export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9-1.jpg";
    unsigned char* image_bytes;
    long image_bytes_length;
    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
    if (!loaded)
    {
        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
        return 1;
    }

    /*
        part of:
        llava_image_embed_make_with_bytes
    */
    clip_image_u8* img = clip_image_u8_init();
    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img))
    {
        clip_image_u8_free(img);
        LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
        return 1;
    }

    print_img(img);

    clip_image_u8* image_original_resize = clip_image_u8_init();
    bicubic_resize(*img, *image_original_resize, 384, 384);

    print_img(image_original_resize);

    // printf("num pixels: %d\n", image_original_resize->buf.size());
    // printf("raw img: nx:%d | ny:%d\n", image_original_resize->nx, image_original_resize->ny);

    // /*
    //     part of:
    //     encode_image_with_clip
    // */
    // clip_image_f32_batch img_res_v;
    // img_res_v.size = 0;
    // img_res_v.data = nullptr;

    // if (!clip_image_preprocess(ctx, img, &img_res_v))
    // {
    //     LOG_TEE("%s: unable to preprocess image\n", __func__);
    //     delete[] img_res_v.data;
    //     return false;
    // }
    // printf("img->nx:%ld | img->ny:%ld\n", img->nx, img->ny);
    // // printf("img_res_v.size:%ld\n", img_res_v.size);
    // printf("img_res_v->nx:%ld | img_res_v->ny:%ld\n", img_res_v.data->nx, img_res_v.data->ny);
    // // std::cout << img_res_v.data->nx << " | " << img_res_v.data->ny << std::endl;
    // // std::cout << img_res_v.data->buf.size() << std::endl;

    // const char* mm_patch_merge_type = clip_patch_merge_type(ctx);
    // printf("mm_patch_merge_type:%s\n", mm_patch_merge_type);


    // for (size_t i = 0; i < img_res_v.size; i++) {
    //     const int nx = img_res_v.data[i].nx;
    //     const int ny = img_res_v.data[i].ny;
    //     printf("i:%d | nx:%d | ny:%d\n", i, nx, ny);

    //     const int n = nx * ny;

    //     for (int k = 0; k < 1; k++) {
    //         for (int y = 0; y < 5; y++) {
    //             for (int x = 0; x < 10; x++) {
    //                 // data[(i * 3 * n) + k * n + y * nx + x] = imgs->data[i].buf[3 * (y * nx + x) + k];
    //                 printf("%.4f ", img_res_v.data[i].buf[3 * (y * nx + x) + k]);
    //             }
    //             printf("\n");
    //         }
    //         printf("\n");
    //     }

    // }


    // /*
    //     part of:
    //     clip_image_encode
    // */
    // clip_image_f32_batch imgs{};
    // imgs.size = 1;
    // imgs.data = &img_res_v.data[0];


    // /*
    //     part of:
    //     clip_image_batch_encode
    // */
    // const clip_image_f32_batch * imgs_f32_const = &imgs;
    // int batch_size = imgs_f32_const->size;
    // if (ctx->has_llava_projector) {
    //     GGML_ASSERT(batch_size == 1);  // TODO: support multiple images
    // }
    // if (ctx->has_minicpmv_projector) {
    //     GGML_ASSERT(batch_size == 1);
    // }




    return 0;
}


// make test_anyres_img && ./bin/test_anyres_img
examples/xgenmm/xgenmm.cpp (new file, 597 lines)
@@ -0,0 +1,597 @@
/*
    08/18/2024 - Yutong - This file is adapted from examples/llava/llava.cpp in the llama.cpp repository.
*/


#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <numeric>
#include <vector>

#include "base64.hpp"
#include "clip.h"
#include "common.h"
#include "llama.h"
#include "xgenmm.h"

// RGB uint8 image
struct clip_image_u8
{
    int nx;
    int ny;

    std::vector<uint8_t> buf;
};

// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32
{
    int nx;
    int ny;

    std::vector<float> buf;
};

struct clip_image_grid_shape
{
    int first;
    int second;
};

/**
 * Selects the best resolution from a list of possible resolutions based on the original size.
 *
 * @param original_size The original size of the image in the format (width, height).
 * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
 * @return The best fit resolution in the format (width, height).
 */
static std::pair<int, int> select_best_resolution(const std::pair<int, int> &original_size,
                                                  const std::vector<std::pair<int, int>> &possible_resolutions)
{
    int original_width  = original_size.first;
    int original_height = original_size.second;

    std::pair<int, int> best_fit;
    int max_effective_resolution = 0;
    int min_wasted_resolution    = std::numeric_limits<int>::max();

    for (const auto &resolution : possible_resolutions)
    {
        int width  = resolution.first;
        int height = resolution.second;
        float scale =
            std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
        int downscaled_width  = static_cast<int>(original_width * scale);
        int downscaled_height = static_cast<int>(original_height * scale);
        int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        int wasted_resolution = (width * height) - effective_resolution;
        // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale,
        //         downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution ||
            (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution))
        {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution    = wasted_resolution;
            best_fit = resolution;
        }
    }

    return best_fit;
}
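To make the selection concrete, here is a small self-contained sketch of the same scoring rule applied to a hypothetical 800x600 input with the pinpoints used throughout this commit; 768x768 wins because it preserves the most effective resolution, and dividing by the 384 image_size then gives a 2x2 grid:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Same scoring as select_best_resolution, applied to a hypothetical 800x600 image.
int main() {
    std::pair<int, int> original = {800, 600};
    std::vector<std::pair<int, int>> pinpoints = {
        {384, 768}, {768, 384}, {768, 768}, {1152, 384}, {384, 1152}};
    std::pair<int, int> best; int max_eff = 0, min_wasted = 1 << 30;
    for (const auto &r : pinpoints) {
        float scale = std::min((float)r.first / original.first, (float)r.second / original.second);
        int eff = std::min((int)(original.first * scale) * (int)(original.second * scale),
                           original.first * original.second);
        int wasted = r.first * r.second - eff;
        if (eff > max_eff || (eff == max_eff && wasted < min_wasted)) {
            max_eff = eff; min_wasted = wasted; best = r;
        }
    }
    // Prints: best: 768x768 -> grid 2x2   (with image_size = 384)
    printf("best: %dx%d -> grid %dx%d\n", best.first, best.second, best.first / 384, best.second / 384);
    return 0;
}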

/**
 * @brief Get the anyres image grid shape object
 *
 * @param image_size
 * @param grid_pinpoints
 * @param image_patch_size
 * @return <int, int>
 */
static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int> &image_size,
                                                                const std::vector<std::pair<int, int>> &grid_pinpoints,
                                                                int image_patch_size)
{
    /**
        Conversion from gguf flat array to vector:
        std::vector<std::pair<int, int>> possible_resolutions;
        for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
            possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
        }
     */
    auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
    return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
}

// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into
// preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx *ctx_clip, std::vector<float *> &image_embd_v,
                                      struct clip_image_grid_shape grid_shape, float *image_embd_out,
                                      int *n_img_pos_out)
{
    struct
    {
        struct ggml_context *ctx;
    } model;

    const int32_t image_size = clip_image_size(ctx_clip);
    const int32_t patch_size = clip_patch_size(ctx_clip);

    int32_t num_patches_per_side =
        image_size / patch_size;  // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)

    int num_patches_width  = grid_shape.first;   // grid 1-4
    int num_patches_height = grid_shape.second;  // grid 1-4

    const size_t num_images = num_patches_width * num_patches_height + 1;

    // TODO: size calculation is not calculated - it's only tens of MB
    size_t ctx_size = 0;

    {
        ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8;  // image_features
        ctx_size += 1024 * 1024 * ggml_type_size(GGML_TYPE_F32);
    }

    struct ggml_init_params params
    {
        /*.mem_size   =*/ctx_size,
        /*.mem_buffer =*/NULL,
        /*.no_alloc   =*/false,  // NOTE: this should be false when using the legacy API
    };

    // Python reference code for full unpad:
    /*
        base_image_feature = image_feature[0]
        image_feature = image_feature[1:]
        image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
        image_feature = image_feature.flatten(1, 2).flatten(2, 3)
        image_feature = unpad_image(image_feature, image_sizes[image_idx])
        image_feature = torch.cat((
            image_feature,
            self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
        ), dim=-1)
        image_feature = image_feature.flatten(1, 2).transpose(0, 1)
        image_feature = torch.cat((base_image_feature, image_feature), dim=0)
    */
    // We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval.
    // In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D
    // tensors are not supported in ggml yet. Without unpad we have to split the sub-image embeddings into patches of 24
    // features each and permute them. Once all images are processed, we prepend the base_image_features without any
    // changes.

    // Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2
    // grid image (676x676 scaling))
    /*
        image_feature = image_feature.view(2, 2, 24, 24, 4096)
        image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
        image_feature = image_feature.view(2, 24, 2, 24, 4096)
        image_feature = image_feature.flatten(0, 3)

        // Reshape to 4D tensor by merging the last two dimensions
        image_feature = image_feature.view(2, 2, 24, 24*4096)
        image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
        image_feature = image_feature.view(-1, 4096)
    */

    model.ctx = ggml_init(params);

    struct ggml_tensor *image_features =
        ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip),
                           num_images - 1);  // example: 4096 x 576 x 4
    // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
    // fill it with the image embeddings, ignoring the base
    for (size_t i = 1; i < num_images; i++)
    {
        size_t offset = (i - 1) * clip_embd_nbytes(ctx_clip);
        memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
    }

    struct ggml_cgraph *gf = ggml_new_graph(model.ctx);
    size_t size_ele = ggml_type_size(GGML_TYPE_F32);

    struct ggml_tensor *image_features_patchview = ggml_view_4d(
        model.ctx, image_features, num_patches_per_side * clip_n_mmproj_embd(ctx_clip), num_patches_per_side,
        num_patches_width, num_patches_height, size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
        size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side,
        size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
    // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
    struct ggml_tensor *permuted_cont =
        ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
    /**
        At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings
        image_feature = torch.cat((
            image_feature,
            self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
        ), dim=-1)
     */

    // ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
    struct ggml_tensor *flatten =
        ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip),
                     num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side,
                     size_ele * clip_n_mmproj_embd(ctx_clip), 0);
    // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
    ggml_build_forward_expand(gf, flatten);
    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
    struct ggml_tensor *result = gf->nodes[gf->n_nodes - 1];

    memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip));  // main image as global context
    // append without newline tokens (default behavior in llava_arch when not using unpad):
    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float *)result->data,
           clip_embd_nbytes(ctx_clip) * (num_images - 1));  // grid patches
    *n_img_pos_out = static_cast<int>(result->ne[1] + clip_n_patches(ctx_clip));

    // Debug: Test single segments
    // Current findings: sending base image, sending a segment embedding all works similar to python
    // However, permuted embeddings do not work yet (stride issue?)
    // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
    // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
    // *n_img_pos_out=576;

    ggml_free(model.ctx);
    return true;
}
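The stride arithmetic above is easier to follow with numbers. A sketch of the shape bookkeeping for the 2x2-grid case described in the comments; the 24 patches per side and 4096-dim projector output are assumptions taken from those comments, not read from a model:

#include <cstdio>

int main() {
    // Shape bookkeeping for clip_llava_handle_patches, 2x2 grid example.
    const long embd = 4096;  // clip_n_mmproj_embd
    const long side = 24;    // num_patches_per_side (336 / 14)
    const long gw = 2, gh = 2;

    // image_features: [embd, side*side, gw*gh] = [4096, 576, 4]
    printf("image_features: %ld x %ld x %ld\n", embd, side * side, gw * gh);
    // 4D patch view: [side*embd, side, gw, gh]
    printf("patchview:      %ld x %ld x %ld x %ld\n", side * embd, side, gw, gh);
    // after permute(0, 2, 1, 3) + cont, flattened back to [embd, gh*gw*side*side] = [4096, 2304]
    printf("flatten:        %ld x %ld\n", embd, gh * gw * side * side);
    // positions written out: base image (576) + grid patches (2304) = 2880
    printf("n_img_pos:      %ld\n", side * side + gh * gw * side * side);
    return 0;
}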

static clip_image_f32 *only_v2_5_reshape_by_patch(clip_image_f32 *image, int patch_size)
{
    int width  = image->nx;
    int height = image->ny;
    int num_patches = (height / patch_size) * (width / patch_size);
    clip_image_f32 *patch = clip_image_f32_init();
    patch->nx = patch_size * num_patches;
    patch->ny = patch_size;
    patch->buf.resize(3 * patch->nx * patch->ny);

    int patch_index = 0;

    for (int i = 0; i < height; i += patch_size)
    {
        for (int j = 0; j < width; j += patch_size)
        {
            for (int pi = 0; pi < patch_size; ++pi)
            {
                for (int pj = 0; pj < patch_size; ++pj)
                {
                    int input_index  = ((i + pi) * width + (j + pj)) * 3;
                    int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
                    patch->buf[output_index]     = image->buf[input_index];
                    patch->buf[output_index + 1] = image->buf[input_index + 1];
                    patch->buf[output_index + 2] = image->buf[input_index + 2];
                }
            }
            patch_index++;
        }
    }
    return patch;
}

static bool encode_image_with_clip(clip_ctx *ctx_clip, int n_threads, const clip_image_u8 *img, float *image_embd,
                                   int *n_img_pos)
{
    // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB -
    // different to the python implementation which is N x 3 x 336 x 336
    clip_image_f32_batch img_res_v;
    img_res_v.size = 0;
    img_res_v.data = nullptr;
    if (!clip_image_preprocess(ctx_clip, img, &img_res_v))
    {
        LOG_TEE("%s: unable to preprocess image\n", __func__);
        delete[] img_res_v.data;
        return false;
    }

    const int64_t t_img_enc_start_us = ggml_time_us();

    const char *mm_patch_merge_type = clip_patch_merge_type(ctx_clip);

    if (clip_is_minicpmv(ctx_clip))
    {
        std::vector<float *> image_embd_v;
        image_embd_v.resize(img_res_v.size);
        struct clip_image_size *load_image_size = clip_image_size_init();
        for (size_t i = 0; i < img_res_v.size; i++)
        {
            const int64_t t_img_enc_step_start_us = ggml_time_us();
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
            int patch_size = 14;
            load_image_size->width  = img_res_v.data[i].nx;
            load_image_size->height = img_res_v.data[i].ny;
            clip_add_load_image_size(ctx_clip, load_image_size);
            bool encoded = false;
            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
            if (has_minicpmv_projector == 2)
            {
                encoded = clip_image_encode(
                    ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
            }
            else if (has_minicpmv_projector == 3)
            {
                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
            }
            if (!encoded)
            {
                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int)i + 1,
                        (int)img_res_v.size);
                return false;
            }
            const int64_t t_img_enc_step_end_us = ggml_time_us();
            LOG_TEE("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i + 1, (int)img_res_v.size,
                    (t_img_enc_step_end_us - t_img_enc_step_start_us) / 1000.0);
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
        LOG_TEE("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size,
                (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        int n_img_pos_out = 0;
        for (size_t i = 0; i < image_embd_v.size(); i++)
        {
            std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i],
                        clip_embd_nbytes(ctx_clip));
            n_img_pos_out += clip_n_patches(ctx_clip);
        }
        *n_img_pos = n_img_pos_out;
        for (size_t i = 0; i < image_embd_v.size(); i++)
        {
            free(image_embd_v[i]);
        }
        image_embd_v.clear();
        load_image_size->width  = img->nx;
        load_image_size->height = img->ny;
        clip_add_load_image_size(ctx_clip, load_image_size);
        LOG_TEE("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
    }
    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0)
    {
        // flat / default llava-1.5 type embedding
        *n_img_pos = clip_n_patches(ctx_clip);
        bool encoded =
            clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);  // image_embd shape is 576 x 4096
        delete[] img_res_v.data;
        if (!encoded)
        {
            LOG_TEE("Unable to encode image\n");

            return false;
        }
    }
    else
    {
        // spatial_unpad llava-1.6 type embedding
        // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a
        // solution to quickly get batching working
        std::vector<float *> image_embd_v;
        image_embd_v.resize(img_res_v.size);
        for (size_t i = 0; i < img_res_v.size; i++)
        {
            image_embd_v[i] =
                (float *)malloc(clip_embd_nbytes(ctx_clip));  // 576 patches * 4096 embeddings * 4 bytes = 9437184
            const bool encoded = clip_image_encode(
                ctx_clip, n_threads, &img_res_v.data[i],
                image_embd_v[i]);  // image data is in 3x336x336 format and will be converted to 336x336x3 inside
            if (!encoded)
            {
                LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int)i + 1,
                        (int)img_res_v.size);
                return false;
            }
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
        LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size,
                (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        const int32_t *image_grid = clip_image_grid(ctx_clip);

        std::vector<std::pair<int, int>> grid_pinpoints;
        for (int i = 0; i < 32 && image_grid[i] != 0; i += 2)
        {
            grid_pinpoints.push_back({image_grid[i], image_grid[i + 1]});
        }

        // free all img_res_v - not needed anymore
        delete[] img_res_v.data;
        img_res_v.size = 0;
        img_res_v.data = nullptr;

        const int32_t image_size = clip_image_size(ctx_clip);

        struct clip_image_grid_shape grid_shape =
            get_anyres_image_grid_shape({img->nx, img->ny}, grid_pinpoints, image_size);

        int n_img_pos_out;
        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
        *n_img_pos = n_img_pos_out;

        for (size_t i = 0; i < image_embd_v.size(); i++)
        {
            free(image_embd_v[i]);
        }
        image_embd_v.clear();

        // debug image/segment/normalization content:
        // clip_image_u8 * tmp = clip_image_u8_init();
        // clip_image_convert_f32_to_u8(*image_feature, *tmp);
        // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
    }

    LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

    const int64_t t_img_enc_end_us = ggml_time_us();
    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;

    LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms,
            t_img_enc_ms / *n_img_pos);

    return true;
}

bool llava_validate_embed_size(const llama_context *ctx_llama, const clip_ctx *ctx_clip)
{
    // make sure that the correct mmproj was used, i.e., compare apples to apples
    int n_llama_embd  = llama_n_embd(llama_get_model(ctx_llama));
    auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
    if (n_image_embd != n_llama_embd)
    {
        LOG_TEE(
            "%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you "
            "use the correct mmproj file.\n",
            __func__, n_image_embd, n_llama_embd);
        return false;
    }
    return true;
}

bool llava_image_embed_make_with_clip_img(clip_ctx *ctx_clip, int n_threads, const clip_image_u8 *img,
                                          float **image_embd_out, int *n_img_pos_out)
{
    int num_max_patches = 6;
    if (clip_is_minicpmv(ctx_clip))
    {
        num_max_patches = 10;
    }
    float *image_embd =
        (float *)malloc(clip_embd_nbytes(ctx_clip) * num_max_patches);  // TODO: base on gridsize/llava model
    if (!image_embd)
    {
        LOG_TEE("Unable to allocate memory for image embeddings\n");
        return false;
    }

    int n_img_pos;
    if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos))
    {
        LOG_TEE("%s: cannot encode image, aborting\n", __func__);
        free(image_embd);
        return false;
    }
    *image_embd_out = image_embd;
    *n_img_pos_out  = n_img_pos;

    return true;
}

bool llava_eval_image_embed(llama_context *ctx_llama, const struct llava_image_embed *image_embed, int n_batch,
                            int *n_past)
{
    int n_embd = llama_n_embd(llama_get_model(ctx_llama));

    for (int i = 0; i < image_embed->n_image_pos; i += n_batch)
    {
        int n_eval = image_embed->n_image_pos - i;
        if (n_eval > n_batch)
        {
            n_eval = n_batch;
        }
        llama_batch batch = {
            int32_t(n_eval),
            nullptr,
            (image_embed->embed + i * n_embd),
            nullptr,
            nullptr,
            nullptr,
            nullptr,
            *n_past,
            1,
            0,
        };
        if (llama_decode(ctx_llama, batch))
        {
            LOG_TEE("%s : failed to eval\n", __func__);
            return false;
        }
        *n_past += n_eval;
    }
    return true;
}

struct llava_image_embed *llava_image_embed_make_with_bytes(struct clip_ctx *ctx_clip, int n_threads,
                                                            const unsigned char *image_bytes, int image_bytes_length)
{
    clip_image_u8 *img = clip_image_u8_init();
    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img))
    {
        clip_image_u8_free(img);
        LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
        return NULL;
    }

    float *image_embed = NULL;
    int n_image_pos = 0;
    bool image_embed_result =
        llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
    if (!image_embed_result)
    {
        clip_image_u8_free(img);
        LOG_TEE("%s: couldn't embed the image\n", __func__);
        return NULL;
    }

    clip_image_u8_free(img);
    auto result = (llava_image_embed *)malloc(sizeof(llava_image_embed));
    result->embed = image_embed;
    result->n_image_pos = n_image_pos;
    return result;
}

static bool load_file_to_bytes(const char *path, unsigned char **bytesOut, long *sizeOut)
{
    auto file = fopen(path, "rb");
    if (file == NULL)
    {
        LOG_TEE("%s: can't read file %s\n", __func__, path);
        return false;
    }

    fseek(file, 0, SEEK_END);
    auto fileSize = ftell(file);
    fseek(file, 0, SEEK_SET);

    auto buffer = (unsigned char *)malloc(fileSize);  // Allocate memory to hold the file data
    if (buffer == NULL)
    {
        LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
        perror("Memory allocation error");
        fclose(file);
        return false;
    }
    errno = 0;
    size_t ret = fread(buffer, 1, fileSize, file);  // Read the file into the buffer
    if (ferror(file))
    {
        die_fmt("read error: %s", strerror(errno));
    }
    if (ret != (size_t)fileSize)
    {
        die("unexpectedly reached end of file");
    }
    fclose(file);  // Close the file

    *bytesOut = buffer;
    *sizeOut  = fileSize;
    return true;
}

struct llava_image_embed *llava_image_embed_make_with_filename(struct clip_ctx *ctx_clip, int n_threads,
                                                               const char *image_path)
{
    unsigned char *image_bytes;
    long image_bytes_length;
    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
    if (!loaded)
    {
        LOG_TEE("%s: failed to load %s\n", __func__, image_path);
        return NULL;
    }

    llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
    free(image_bytes);

    return embed;
}

void llava_image_embed_free(struct llava_image_embed *embed)
{
    free(embed->embed);
    free(embed);
}
examples/xgenmm/xgenmm.h (new file, 53 lines)
@@ -0,0 +1,53 @@
/*
    08/18/2024 - Yutong - This file is adapted from examples/llava/llava.h in the llama.cpp repository.
*/

#ifndef LLAVA_H
#define LLAVA_H

#include "ggml.h"

#ifdef LLAMA_SHARED
#    if defined(_WIN32) && !defined(__MINGW32__)
#        ifdef LLAMA_BUILD
#            define XGENMM_API __declspec(dllexport)
#        else
#            define XGENMM_API __declspec(dllimport)
#        endif
#    else
#        define XGENMM_API __attribute__ ((visibility ("default")))
#    endif
#else
#    define XGENMM_API
#endif

#ifdef __cplusplus
extern "C" {
#endif

struct clip_ctx;
struct llava_image_embed {
    float * embed;
    int n_image_pos;
};

/** sanity check for clip <-> llava embed size match */
XGENMM_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);

XGENMM_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);

/** build an image embed from image file bytes */
XGENMM_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
/** build an image embed from a path to an image filename */
XGENMM_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
/** free an embedding made with llava_image_embed_make_* */
XGENMM_API void llava_image_embed_free(struct llava_image_embed * embed);

/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
XGENMM_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);

#ifdef __cplusplus
}
#endif

#endif
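Taken together, the intended call sequence mirrors the llava example. A minimal sketch, assuming a llama_context and a clip_ctx have already been created; the image path and thread count are placeholders:

#include "llama.h"
#include "xgenmm.h"

// Sketch: embed one image and write it into the llama context at *n_past.
bool eval_one_image(struct llama_context * ctx_llama, struct clip_ctx * ctx_clip,
                    const char * image_path, int n_batch, int * n_past)
{
    if (!llava_validate_embed_size(ctx_llama, ctx_clip))
    {
        return false;  // wrong mmproj for this language model
    }
    struct llava_image_embed * embed =
        llava_image_embed_make_with_filename(ctx_clip, /*n_threads=*/4, image_path);
    if (!embed)
    {
        return false;
    }
    // Decodes the embedding in n_batch-sized chunks and advances *n_past.
    bool ok = llava_eval_image_embed(ctx_llama, embed, n_batch, n_past);
    llava_image_embed_free(embed);
    return ok;
}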