From 37a147ebf9c492af646bba349ee0d26e76bd6035 Mon Sep 17 00:00:00 2001
From: John <78893154+cmp-nct@users.noreply.github.com>
Date: Thu, 8 Feb 2024 07:42:49 +0100
Subject: [PATCH] Clip: Bugfix for normalization (it did not load the 3 std and
 mean values) Clip: bicubic resize function Clip: added save-to-bmp/pil for
 debugging and conversion from/to 32/8 images Clip: added normalization with
 FP16 precision simulation (image tensors match HF implementation, can be
 switched off, only used for llava-1.6) Clip: added newline tensor, mergetype
 kv, image-grid kv, new resize-pad function with resolution from gridpoints
 Clip: clip_image_preprocess now returns a float * vector instead of float,
 this way llava 1.5 and 1.6 is supported llava: added ggml cpu graph for
 embedding patching, added spatial_unpad preliminary support, added a lot of
 comments that need to be cleaned when all is final convert-image-encoder:
 fixed image-grid flattening

---
 examples/llava/clip.cpp                       | 581 ++++++++++++++++--
 examples/llava/clip.h                         |  41 +-
 .../llava/convert-image-encoder-to-gguf.py    |   3 +-
 examples/llava/llava-surgery-v2.py            |   5 +-
 examples/llava/llava.cpp                      | 264 +++++++-
 examples/server/server.cpp                    |   5 +-
 6 files changed, 841 insertions(+), 58 deletions(-)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 9129052a2..8193945ee 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -71,6 +71,11 @@ static std::string format(const char * fmt, ...) {
 #define KEY_IMAGE_STD "clip.vision.image_std"
 #define KEY_PROJ_TYPE "clip.projector_type"
 
+#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
+#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
+#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
+
+
 //
 // tensor name constants
 //
@@ -94,6 +99,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_LLAVA_PROJ "mm.%d.%s"
 #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+#define TN_IMAGE_NEWLINE "model.image_newline"
 
 
 enum projector_type {
@@ -233,26 +239,6 @@ static projector_type clip_projector_type_from_string(const std::string & name)
     return PROJECTOR_TYPE_UNKNOWN;
 }
 
-//
-// image data
-//
-
-// RGB uint8 image
-struct clip_image_u8 {
-    int nx;
-    int ny;
-
-    std::vector<uint8_t> buf;
-};
-
-// RGB float32 image (NHWC)
-// Memory layout: RGBRGBRGB...
-struct clip_image_f32 {
-    int nx;
-    int ny;
-
-    std::vector<float> buf;
-};
 
 //
 // clip layers
@@ -309,6 +295,7 @@ struct clip_vision_model {
     struct ggml_tensor * mm_0_b = NULL;
     struct ggml_tensor * mm_2_w = NULL;
     struct ggml_tensor * mm_2_b = NULL;
+    struct ggml_tensor * image_newline = NULL;
 
     // Yi type models with mlp+normalization projection
     struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
@@ -370,6 +357,10 @@ struct clip_ctx {
     ggml_allocr * compute_alloc = NULL;
 };
 
+const struct clip_vision_hparams clip_get_vision_hparams(const struct clip_ctx * ctx) { // returns a copy of the hparams held by ctx's vision model
+    return ctx->vision_model.hparams;
+}
+
 static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
@@ -382,6 +373,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int image_size = hparams.image_size;
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
+    const int num_patches_per_side = image_size / patch_size;
     const int num_positions = num_patches + 1;
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
@@ -582,7 +574,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
 
             embeddings = ggml_gelu(ctx0, embeddings);
-
             embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
 
@@ -966,12 +957,37 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         hparams.patch_size     = get_u32(ctx, KEY_PATCH_SIZE);
         hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
         hparams.eps            = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
+        try {
+            const int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
+            const int n = gguf_get_arr_n(ctx, idx);
+            const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
+            int n_copied = 0; // bounded by the stored length (the array is not guaranteed to be 0-terminated) and by the 32 slots
+            for (; n_copied < n && n_copied < 32 && pinpoints[n_copied] != 0; ++n_copied) {
+                hparams.image_grid_pinpoints[n_copied] = pinpoints[n_copied];
+            }
+            if (n_copied < 32) { hparams.image_grid_pinpoints[n_copied] = 0; } // 0 ends the list; a full list needs no terminator
+        } catch (std::runtime_error & e) {
+            hparams.image_grid_pinpoints[0]=0;
+        }
+        try {
+            const int idx = get_key_idx(ctx, KEY_MM_PATCH_MERGE_TYPE);
+            snprintf(hparams.mm_patch_merge_type, sizeof(hparams.mm_patch_merge_type), "%s", gguf_get_val_str(ctx, idx)); // bounded copy into the fixed-size array
+        } catch (std::runtime_error & e) {
+            strcpy(hparams.mm_patch_merge_type, "flat");
+        }
+        try {
+            hparams.image_crop_resolution = get_u32(ctx, KEY_IMAGE_CROP_RESOLUTION); // llava-1.6
+        } catch(const std::exception& e) {
+            hparams.image_crop_resolution = hparams.image_size;
+        }
 
         int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
         int idx_std  = get_key_idx(ctx, KEY_IMAGE_STD);
+        const float * mean_data = (const float *)gguf_get_arr_data(ctx, idx_mean);
+        const float * std_data  = (const float *)gguf_get_arr_data(ctx, idx_std);
         for (int i = 0; i < 3; ++i) {
-            new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
-            new_clip->image_std[i]  = *((const float *)gguf_get_arr_data(ctx, idx_std));
+            new_clip->image_mean[i] = mean_data[i];
+            new_clip->image_std[i]  = std_data[i];
         }
 
         if (verbosity >= 2) {
@@ -983,14 +999,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             printf("v_projection_dim   %d\n", hparams.projection_dim);
             printf("v_n_head           %d\n", hparams.n_head);
             printf("v_n_layer          %d\n", hparams.n_layer);
+            printf("v_eps              %f\n", hparams.eps);
+            printf("v_image_mean       %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
+            printf("v_image_std        %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
+            printf("v_image_grid_pinpoints: ");
+            for (int i = 0; i < 32 && hparams.image_grid_pinpoints[i] != 0; ++i) { // && short-circuits, so index 32 is never read
+                printf("%d ", hparams.image_grid_pinpoints[i]);
+            }
+            printf("\n");
+            printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
+
         }
-
-        vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
-        vision_model.class_embedding     = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
-        vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
-        vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-        vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-
+        try
+        {
+            vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+            vision_model.class_embedding     = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
+            vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+            vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+            vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
+        }
+        catch(const std::exception& e)
+        {
+            fprintf(stderr, "%s: failed to load vision model tensors\n", __func__);
+        }
+        
         // LLaVA projection
         if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
             vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
@@ -1015,6 +1047,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                 vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
                 vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
             } catch (std::runtime_error & e) {  }
+            try {
+                vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
+                // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__);
+            } catch (std::runtime_error & e) {  }
         }
         else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
             // MobileVLM projection
@@ -1134,13 +1170,423 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
     return true;
 }
 
+
+// Debug helper: writes img to filename as a binary PPM (P6); logs to std::cerr and returns if the file cannot be opened.
+void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
+    std::ofstream file(filename, std::ios::binary);
+    if (!file.is_open()) {
+        std::cerr << "Failed to open file for writing: " << filename << std::endl;
+        return;
+    }
+
+    // PPM header: P6 format, width, height, and max color value
+    file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
+
+    // The buffer is already interleaved RGB, i.e. exactly the P6 payload, so it is written in one
+    // call. Unlike writing 3 bytes per step, this cannot read past the end of a buffer whose
+    // length is not a multiple of 3.
+    file.write(reinterpret_cast<const char*>(img.buf.data()), static_cast<std::streamsize>(img.buf.size()));
+
+    file.close();
+}
+// Debug helper: writes img to filename as an uncompressed 24-bit BMP.
+// Rows are written bottom-to-top in BGR order, each padded to a multiple of 4 bytes.
+// Logs to std::cerr and returns without writing if the file cannot be opened.
+void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
+    std::ofstream file(filename, std::ios::binary);
+    if (!file.is_open()) {
+        std::cerr << "Failed to open file for writing: " << filename << std::endl;
+        return;
+    }
+
+    const int bytesPerPixel = 3;
+    const int widthInBytes = img.nx * bytesPerPixel;
+    const int paddingAmount = (4 - (widthInBytes % 4)) % 4; // every BMP row is padded to a multiple of 4 bytes
+    const int stride = widthInBytes + paddingAmount;
+
+    // Total file size: file header (14) + info header (40) + padded pixel rows.
+    // Computed once from the padded stride (the earlier unpadded estimate was always overwritten).
+    const int fileSize = 54 + (stride * img.ny);
+
+    // Stores a 32-bit value in little-endian byte order, as used by both BMP headers
+    const auto put_le32 = [](unsigned char * out, int value) {
+        out[0] = (unsigned char)(value);
+        out[1] = (unsigned char)(value >> 8);
+        out[2] = (unsigned char)(value >> 16);
+        out[3] = (unsigned char)(value >> 24);
+    };
+
+    // Bitmap file header
+    unsigned char fileHeader[14] = {
+        'B','M',     // Signature
+        0,0,0,0,    // Image file size in bytes
+        0,0,0,0,    // Reserved
+        54,0,0,0    // Start of pixel array
+    };
+    put_le32(fileHeader + 2, fileSize);
+
+    // Bitmap information header (BITMAPINFOHEADER)
+    unsigned char infoHeader[40] = {
+        40,0,0,0,   // Size of this header (40 bytes)
+        0,0,0,0,    // Image width
+        0,0,0,0,    // Image height
+        1,0,        // Number of color planes
+        24,0,       // Bits per pixel
+        0,0,0,0,    // No compression
+        0,0,0,0,    // Image size (can be 0 for no compression)
+        0,0,0,0,    // X pixels per meter (not specified)
+        0,0,0,0,    // Y pixels per meter (not specified)
+        0,0,0,0,    // Total colors (color table not used)
+        0,0,0,0     // Important colors (all are important)
+    };
+    put_le32(infoHeader + 4, img.nx); // width
+    put_le32(infoHeader + 8, img.ny); // height
+
+    // Write file headers
+    file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
+    file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
+
+    // Pixel data
+    const unsigned char padding[3] = {0, 0, 0}; // a row needs at most 3 padding bytes
+    for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
+        for (int x = 0; x < img.nx; ++x) {
+            // Each pixel
+            const size_t pixelIndex = (y * img.nx + x) * 3;
+            const unsigned char pixel[3] = {
+                img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
+                img.buf[pixelIndex + 1],
+                img.buf[pixelIndex]
+            };
+            file.write(reinterpret_cast<const char*>(pixel), 3);
+        }
+        // Write padding for the row
+        file.write(reinterpret_cast<const char*>(padding), paddingAmount);
+    }
+
+    file.close();
+}
+
+// Linear blend between two values: yields s at t == 0 and e at t == 1 (t is not clamped)
+inline float lerp(float s, float e, float t) {
+    return s + t * (e - s);
+}
+// Bilinear resize of an interleaved RGB image: dst is resized to target_width x target_height and overwritten.
+// Neighbour indices are clamped to the last column/row so 1-pixel-wide or 1-pixel-high sources stay in bounds.
+// Uses lerp() for the two horizontal blends and then for the vertical blend.
+void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) {
+    dst.nx = target_width;
+    dst.ny = target_height;
+    dst.buf.resize(3 * target_width * target_height);
+
+    // source step per destination pixel; (n - 1) keeps the sampled position below the last source index
+    const float x_ratio = static_cast<float>(src.nx - 1) / target_width;
+    const float y_ratio = static_cast<float>(src.ny - 1) / target_height;
+
+    for (int y = 0; y < target_height; y++) {
+        const float py = y_ratio * y;
+        const int y0 = static_cast<int>(py);
+        const int y1 = (y0 + 1 < src.ny) ? y0 + 1 : y0; // clamp to the last row
+        const float y_lerp = py - y0;
+
+        for (int x = 0; x < target_width; x++) {
+            const float px = x_ratio * x;
+            const int x0 = static_cast<int>(px);
+            const int x1 = (x0 + 1 < src.nx) ? x0 + 1 : x0; // clamp to the last column
+            const float x_lerp = px - x0;
+
+            for (int c = 0; c < 3; c++) {
+                const float top    = lerp(static_cast<float>(src.buf[3 * (y0 * src.nx + x0) + c]),
+                                          static_cast<float>(src.buf[3 * (y0 * src.nx + x1) + c]), x_lerp);
+                const float bottom = lerp(static_cast<float>(src.buf[3 * (y1 * src.nx + x0) + c]),
+                                          static_cast<float>(src.buf[3 * (y1 * src.nx + x1) + c]), x_lerp);
+                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
+            }
+        }
+    }
+}
+
+// for replication purposes `.to(model.device, dtype=torch.float16)`
+// converts a float to half precision and back to float
+// Mantissa is truncated (no rounding); below the smallest normal half -> signed zero; above 65504 or NaN -> signed infinity.
+float simulateFloat16Precision(float value) {
+    // Convert float32 to float16 (memcpy reads the bit pattern without violating strict aliasing)
+    uint32_t f32;
+    memcpy(&f32, &value, sizeof(f32));
+    uint32_t sign = (f32 >> 16) & 0x8000; // Top bit (sign bit)
+    uint32_t exponent = ((f32 >> 23) & 0xFF) - 112; // re-bias: 127 (float32 bias) - 15 (float16 bias) = 112
+    uint32_t mantissa = (f32 >> 13) & 0x3FF; // Keep top 10 bits (10 bits of precision in float16, 23 in float32)
+
+    // Handle overflow/underflow
+    if ((f32 & 0x7FFFFFFF) > 0x477FE000) { // Not representable
+        exponent = 0x1F;
+        mantissa = 0;
+    } else if ((f32 & 0x7FFFFFFF) < 0x38800000) { // Too small for normal half precision
+        exponent = 0;
+        mantissa = 0;
+    }
+
+    uint16_t f16 = sign | (exponent << 10) | mantissa;
+
+    // Convert back to float32
+    uint32_t sign32 = (uint32_t)(f16 & 0x8000) << 16;
+    uint32_t exponent32 = ((f16 >> 10) & 0x1F);
+    uint32_t mantissa32 = (uint32_t)(f16 & 0x3FF) << 13;
+
+    // Adjust bias back: zero stays zero, the all-ones half exponent (infinity) maps to the all-ones float exponent
+    exponent32 = exponent32 == 0 ? 0 : (exponent32 == 0x1F ? 0xFF : exponent32 + 112);
+    uint32_t f32Result = sign32 | (exponent32 << 23) | mantissa32;
+    float result;
+    memcpy(&result, &f32Result, sizeof(result));
+    return result;
+}
+// Converts an interleaved RGB u8 image to float32 as (x / 255 - mean[c]) / std[c], per channel c.
+// replicate_float16 additionally passes each value through simulateFloat16Precision (mimics pytorch's float16 cast).
+void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3], bool replicate_float16) {
+    const size_t n_values = src->buf.size();
+    dst->nx = src->nx;
+    dst->ny = src->ny;
+    dst->buf.resize(n_values);
+
+    for (size_t pos = 0; pos < n_values; ++pos) {
+        const size_t channel = pos % 3; // interleaved layout: R, G, B, R, G, B, ...
+        const float scaled = static_cast<float>(src->buf[pos]) / 255.0f;
+        const float normalized = (scaled - mean[channel]) / std[channel];
+        dst->buf[pos] = replicate_float16 ? simulateFloat16Precision(normalized) : normalized;
+    }
+}
+inline float clip(float x, float lower, float upper) { // clamps x into [lower, upper]
+    const float capped = std::min(x, upper);
+    return std::max(lower, capped);
+}
+// Bicubic resize of an interleaved RGB u8 image; dst is resized to target_width x target_height and overwritten.
+// Runs a separable cubic: four horizontal interpolations followed by one vertical interpolation per output value.
+// Source taps outside the image are clamped to the nearest edge pixel. Always returns true.
+bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height)
+{
+    const int nx = img.nx;
+    const int ny = img.ny;
+
+    dst.nx = target_width;
+    dst.ny = target_height;
+    dst.buf.resize(3 * target_width * target_height);
+
+    // source pixels advanced per destination pixel
+    const float tx = (float)nx / (float)target_width;
+    const float ty = (float)ny / (float)target_height;
+
+    // Bicubic interpolation; adapted from ViT.cpp, inspired from :
+    //    -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
+    //    -> https://en.wikipedia.org/wiki/Bicubic_interpolation
+
+    // Integer clamp: indices are computed in int/size_t rather than float, because float cannot
+    // represent buffer offsets above 2^24 exactly and would address the wrong pixel on large images.
+    const auto clamp_idx = [](int v, int hi) { return v < 0 ? 0 : (v > hi ? hi : v); };
+
+    // Channel k of the source pixel at (xx, yy), with both coordinates clamped to the image.
+    const auto sample = [&](int yy, int xx, int k) -> float {
+        const size_t row = (size_t) clamp_idx(yy, ny - 1);
+        const size_t col = (size_t) clamp_idx(xx, nx - 1);
+        return img.buf[(row * nx + col) * 3 + k];
+    };
+
+    // Cubic through four equally spaced samples (pm1, p0, p1, p2), evaluated at offset t from p0.
+    const auto cubic = [](float pm1, float p0, float p1, float p2, float t) -> float {
+        const float d0 = pm1 - p0;
+        const float d2 = p1 - p0;
+        const float d3 = p2 - p0;
+        const float a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
+        const float a2 = 1.0 / 2 * d0 + 1.0 / 2 * d2;
+        const float a3 = -1.0 / 6 * d0 - 1.0 / 2 * d2 + 1.0 / 6 * d3;
+        return p0 + a1 * t + a2 * t * t + a3 * t * t * t;
+    };
+
+    for (int i = 0; i < target_height; i++)
+    {
+        const int y = (int)(ty * i);
+        const float dy = ty * i - y;
+
+        for (int j = 0; j < target_width; j++)
+        {
+            const int x = (int)(tx * j);
+            const float dx = tx * j - x;
+
+            for (int k = 0; k < 3; k++)
+            {
+                // horizontal pass: interpolate each of the four neighbouring rows at dx
+                float C[4];
+                for (int jj = 0; jj <= 3; jj++)
+                {
+                    const int yy = y - 1 + jj;
+                    C[jj] = cubic(sample(yy, x - 1, k), sample(yy, x, k), sample(yy, x + 1, k), sample(yy, x + 2, k), dx);
+                }
+
+                // vertical pass: runs once all four rows are known (it used to run inside the
+                // row loop, reading not-yet-written entries of C on the first iterations)
+                const float Cc = cubic(C[0], C[1], C[2], C[3], dy);
+
+                // round to nearest and clamp to the u8 range
+                dst.buf[((size_t) i * target_width + j) * 3 + k] = (uint8_t) std::min(std::max(std::round(Cc), 0.0f), 255.0f);
+            }
+        }
+    }
+
+    return true;
+}
+
+// llava-1.6 type of resize_and_pad (black)
+// Fits image inside target_resolution (width, height) keeping its aspect ratio, then centers it on a black canvas of that size.
+void resize_and_pad_image(const clip_image_u8& image, clip_image_u8 &image_output, const std::pair<int, int>& target_resolution) {
+    const int target_width = target_resolution.first;
+    const int target_height = target_resolution.second;
+
+    const float scale_w = static_cast<float>(target_width) / image.nx;
+    const float scale_h = static_cast<float>(target_height) / image.ny;
+
+    // the smaller scale factor decides which side fills the target completely
+    int new_width = target_width;
+    int new_height = target_height;
+    if (scale_w < scale_h) {
+        new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height);
+    } else {
+        new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
+    }
+
+    clip_image_u8 resized_image;
+    // bilinear_resize(image, resized_image, new_width, new_height);
+    bicubic_resize(image, resized_image, new_width, new_height);
+
+    clip_image_u8 padded_image;
+    padded_image.nx = target_width;
+    padded_image.ny = target_height;
+    padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black
+
+    // Calculate padding offsets
+    const int pad_x = (target_width - new_width) / 2;
+    const int pad_y = (target_height - new_height) / 2;
+
+    // Copy the resized image row by row into the center of the padded buffer
+    for (int row = 0; row < new_height; ++row) {
+        const int src_base = 3 * (row * new_width);
+        const int dst_base = 3 * ((row + pad_y) * target_width + pad_x);
+        for (int off = 0; off < 3 * new_width; ++off) {
+            padded_image.buf[dst_base + off] = resized_image.buf[src_base + off];
+        }
+    }
+
+    image_output = std::move(padded_image);
+}
+
+
+/**
+ * Selects the best resolution from a list of possible resolutions based on the original size.
+ * Preference: most image pixels kept after an aspect-preserving fit, then least unused target area.
+ * @param original_size The original size of the image in the format (width, height).
+ * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
+ * @return The best fit resolution in the format (width, height).
+ */
+static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size, const std::vector<std::pair<int, int>>& possible_resolutions) {
+    const int src_w = original_size.first;
+    const int src_h = original_size.second;
+    const int src_area = src_w * src_h;
+    std::pair<int, int> winner;
+    int best_effective = 0;
+    int best_wasted = std::numeric_limits<int>::max();
+
+    for (const auto& candidate : possible_resolutions) {
+        const int cand_w = candidate.first;
+        const int cand_h = candidate.second;
+        const float fit = std::min(static_cast<float>(cand_w) / src_w, static_cast<float>(cand_h) / src_h);
+        const int fit_area = static_cast<int>(src_w * fit) * static_cast<int>(src_h * fit);
+        const int effective = std::min(fit_area, src_area);
+        const int wasted = (cand_w * cand_h) - effective;
+        // a candidate wins on more effective pixels, or on equal pixels with less wasted area
+        if (effective > best_effective || (effective == best_effective && wasted < best_wasted)) {
+            best_effective = effective;
+            best_wasted = wasted;
+            winner = candidate;
+        }
+    }
+
+    return winner;
+}
+
+
+// Splits image into patch_size x patch_size tiles in row-major order (tiles on the right/bottom edge may be smaller).
+// Each tile is allocated with clip_image_u8_init() and is not freed here; the caller has to release it.
+std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8& image, int patch_size) {
+    std::vector<clip_image_u8*> tiles;
+    for (int top = 0; top < image.ny; top += patch_size) {
+        for (int left = 0; left < image.nx; left += patch_size) {
+            clip_image_u8 * tile = clip_image_u8_init();
+            tile->nx = std::min(patch_size, image.nx - left);
+            tile->ny = std::min(patch_size, image.ny - top);
+            tile->buf.resize(3 * tile->nx * tile->ny);
+            for (int row = 0; row < tile->ny; ++row) {
+                const int src_base = 3 * ((top + row) * image.nx + left);
+                const int dst_base = 3 * (row * tile->nx);
+                for (int off = 0; off < 3 * tile->nx; ++off) {
+                    tile->buf[dst_base + off] = image.buf[src_base + off];
+                }
+            }
+            tiles.push_back(tile);
+        }
+    }
+    return tiles;
+}
+
+
+// debug function to convert f32 to u8 (each value is scaled by 255, truncated to int and clamped to [0, 255])
+void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
+    dst.nx = src.nx;
+    dst.ny = src.ny;
+    dst.buf.resize(3 * src.nx * src.ny);
+    for (size_t pos = 0; pos < src.buf.size(); ++pos) {
+        dst.buf[pos] = static_cast<uint8_t>(std::max(0, std::min(255, int(src.buf[pos] * 255.0f))));
+    }
+}
+
+/**
+ * @brief Get the anyres image grid shape object
+ *
+ * @param image_size        original image size as (width, height)
+ * @param grid_pinpoints    candidate resolutions as (width, height) pairs
+ * @param image_patch_size  edge length the chosen resolution is divided by
+ * @return the resolution picked by select_best_resolution, integer-divided by image_patch_size per axis
+ */
+struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int>& image_size, const std::vector<std::pair<int, int>>& grid_pinpoints, int image_patch_size) {
+    // Conversion from gguf flat array to vector:
+    //   std::vector<std::pair<int, int>> possible_resolutions;
+    //   for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
+    //       possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
+    //   }
+    const std::pair<int, int> best = select_best_resolution(image_size, grid_pinpoints);
+    const int n_across = best.first / image_patch_size;
+    const int n_down = best.second / image_patch_size;
+    return {n_across, n_down};
+}
+
+
 // normalize: x = (x - mean) / std
 // TODO: implement bicubic interpolation instead of linear.
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
+// returns the normalized float tensor for llava-1.5; for spatial_unpad with anyres processing (llava-1.6) it returns the normalized image patch tensors as a vector
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std::vector<clip_image_f32*>& res_tensor, bool pad2square) {
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
         return false;
     }
+    auto & params = ctx->vision_model.hparams;
+    // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
+    if (strcmp(params.mm_patch_merge_type, "spatial_unpad") == 0) {
+        pad2square = false;
+    } else {
+        // pad2square = true; // todo: consider automatic decisions on that options for all models
+    }
+    // free the previous res_tensor
+    if (res_tensor.size() > 0) {
+        for (size_t i = 0; i < res_tensor.size(); i++) {
+            clip_image_f32_free(res_tensor[i]);
+        }
+        res_tensor.clear();
+    }
 
     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
     // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
@@ -1151,7 +1597,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         temp->nx = longer_side;
         temp->ny = longer_side;
         temp->buf.resize(3 * longer_side * longer_side);
-        const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
+        const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255)
 
         // fill with background color
         for (size_t i = 0; i < temp->buf.size(); i++) {
@@ -1169,18 +1615,65 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
             }
         }
     } else {
-        temp->nx = img->nx;
-        temp->ny = img->ny;
-        temp->buf.resize(img->buf.size());
-        memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
+        if (params.image_grid_pinpoints[0] != 0)
+        {
+            // "spatial_unpad" with "anyres" processing for llava-1.6
+            std::vector<std::pair<int, int>> possible_resolutions;
+            for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
+                possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
+            }
+            std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
+            // fprintf(stderr, "%s - Working with resolution: %d %d\n", __func__, best_resolution.first, best_resolution.second);
+            // clip_image_save_to_bmp(*img, "input.bmp");
+            resize_and_pad_image(*img, *temp, best_resolution);  // we do not pad with mean-bg color anymore in llava-1.6
+            // clip_image_save_to_bmp(*temp, "resized.bmp");
+            // visually verify normalized image:
+            // normalize_image_u8_to_f32(*temp, *res, ctx->image_mean, ctx->image_std); 
+            // {
+            //     clip_image_u8 * temp2 = clip_image_u8_init();
+            //     clip_image_convert_f32_to_u8(*res, *temp2);
+            //     clip_image_save_to_bmp(*temp2, "resized_normalized_f32.bmp");
+            //     clip_image_u8_free(temp2);
+            // }
+
+            std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
+            // fprintf(stderr, "patches: %d, %d\n", patches.size(), params.image_size);
+        
+            clip_image_u8 *image_original_resize = clip_image_u8_init();
+            // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ?
+            bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square ?
+            patches.insert(patches.begin(), image_original_resize);
+
+            res_tensor.clear();
+            for (auto& patch : patches) {
+                clip_image_f32 *temp_image_f32 = clip_image_f32_init();
+                normalize_image_u8_to_f32(patch, temp_image_f32, ctx->image_mean, ctx->image_std, true);
+                res_tensor.push_back(temp_image_f32);
+            }
+
+            for (size_t i = 0; i < patches.size(); i++) {
+                // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+                clip_image_u8_free(patches[i]);
+            }
+
+            clip_image_u8_free(temp);
+            
+            return true;
+        } else {
+            temp->nx = img->nx;
+            temp->ny = img->ny;
+            temp->buf.resize(img->buf.size());
+            memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
+        }
     }
 
     const int nx = temp->nx;
     const int ny = temp->ny;
+    // clip_image_save_to_bmp(*temp, "resized_vanilla.bmp");
 
     const int nx2 = ctx->vision_model.hparams.image_size;
     const int ny2 = ctx->vision_model.hparams.image_size;
-
+    clip_image_f32 * res = clip_image_f32_init();
     res->nx = nx2;
     res->ny = ny2;
     res->buf.resize(3 * nx2 * ny2);
@@ -1234,6 +1727,13 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
     }
     clip_image_u8_free(temp);
 
+    // {
+    //     clip_image_u8 * temp2 = clip_image_u8_init();
+    //     clip_image_convert_f32_to_u8(*res, *temp2);
+    //     clip_image_save_to_bmp(*temp2, "resized_normalized_f32_vanilla.bmp");
+    //     clip_image_u8_free(temp2);
+    // }
+    res_tensor.push_back(res);
     return true;
 }
 
@@ -1302,6 +1802,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
     type = static_cast<ggml_type>(itype);
 
     auto * ctx_clip = clip_model_load(fname_inp, 2);
+    
 
     const auto & ctx_src = ctx_clip->ctx_gguf;
     const auto & ctx_data = ctx_clip->ctx_data;
@@ -1495,6 +1996,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     }
 }
 
+ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx) { // accessor: hands back the image_newline tensor pointer stored in ctx->vision_model
+    return ctx->vision_model.image_newline;
+}
+
 int clip_n_patches(const struct clip_ctx * ctx) {
     auto & params = ctx->vision_model.hparams;
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
@@ -1506,4 +2011,4 @@ int clip_n_patches(const struct clip_ctx * ctx) {
 
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
     return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
-}
+}
\ No newline at end of file
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index 458a256a1..09346b603 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -3,6 +3,8 @@
 
 #include <stddef.h>
 #include <stdint.h>
+#include <string>
+#include <vector>
 
 #ifdef LLAMA_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
@@ -32,10 +34,20 @@ struct clip_vision_hparams {
     int32_t projection_dim;
     int32_t n_head;
     int32_t n_layer;
+
     float eps;
+
+    char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default)
+    int32_t image_grid_pinpoints[32]; 
+    int32_t image_crop_resolution;
+
 };
 
+struct clip_ctx;
+CLIP_API const struct clip_vision_hparams clip_get_vision_hparams(const struct clip_ctx * ctx);
+
 CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
+CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
 
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
@@ -44,6 +56,24 @@ CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
 CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
 CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
+// RGB uint8 image
+struct clip_image_u8 { // no CLIP_API: an export attribute placed before `struct` in a type definition is ignored (and warned about) by compilers
+    int nx; // presumably the image width in pixels - TODO confirm
+    int ny; // presumably the image height in pixels - TODO confirm
+
+    std::vector<uint8_t> buf; // pixel data
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 { // no CLIP_API: an export attribute placed before `struct` in a type definition is ignored (and warned about) by compilers
+    int nx; // presumably the image width in pixels - TODO confirm
+    int ny; // presumably the image height in pixels - TODO confirm
+
+    std::vector<float> buf; // clip_image_preprocess sizes this to 3 * nx * ny floats
+};
+
+
 struct clip_image_u8_batch {
     struct clip_image_u8 * data;
     size_t size;
@@ -53,6 +83,10 @@ struct clip_image_f32_batch {
     struct clip_image_f32 * data;
     size_t size;
 };
+struct clip_image_grid_shape { // grid layout of the image segments; no CLIP_API because an export attribute before `struct` in a type definition is ignored by compilers
+    int first;  // read by llava.cpp as the number of grid cells along the width
+    int second; // read by llava.cpp as the number of grid cells along the height
+};
 
 CLIP_API struct clip_image_u8  * clip_image_u8_init ();
 CLIP_API struct clip_image_f32 * clip_image_f32_init();
@@ -61,11 +95,16 @@ CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 
 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
+CLIP_API void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename);
+CLIP_API void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst);
 
 /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
+/** preprocess img and store the result in res_tensor, pad2square may be overridden to false depending on model configuration */
+CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, std::vector<clip_image_f32*>& res_tensor, bool pad2square);
+CLIP_API struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int>& image_size, const std::vector<std::pair<int, int>>& grid_pinpoints, int image_patch_size);
+CLIP_API struct ggml_tensor *clip_get_newline_tensor(const struct clip_ctx * ctx);
 
-CLIP_API bool clip_image_preprocess  (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square);
 CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
 CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
 
diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py
index 115b6b35b..ea331f2fe 100644
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -240,7 +240,8 @@ if has_vision_encoder:
         # flatten it
         image_grid_pinpoints = []
         for pinpoint in v_hparams["image_grid_pinpoints"]:
-            image_grid_pinpoints.extend(pinpoint)
+            for p in pinpoint:
+                image_grid_pinpoints.append(p)
         fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
     if "image_crop_resolution" in v_hparams:
         fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"])
diff --git a/examples/llava/llava-surgery-v2.py b/examples/llava/llava-surgery-v2.py
index a5850b96e..6b4fac80d 100644
--- a/examples/llava/llava-surgery-v2.py
+++ b/examples/llava/llava-surgery-v2.py
@@ -13,11 +13,12 @@ def is_safetensor_file(file_path):
 # Unified loading function
 def load_model(file_path):
     if is_safetensor_file(file_path):
-        # return safe_load(file_path,framework="pt", device="cpu"), 'safetensor'
         tensors = {}
         with safe_open(file_path, framework="pt", device="cpu") as f:
             for key in f.keys():
                 tensors[key] = f.get_tensor(key).clone()
+                # print each tensor's name and shape as it is loaded
+                print(f"{key} : {tensors[key].shape}")
         return tensors, 'safetensor'
     else:
         return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch'
@@ -156,4 +157,4 @@ if len(first_mm_tensors) > 0:
 
 print("Done!")
 print(f"Now you can convert {args.model} to a a regular LLaMA GGUF file.")
-print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
+print(f"Also, use {args.model}/llava.projector to prepare a llava-encoder.gguf file.")
\ No newline at end of file
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index d42e7582e..3a0c4a8a4 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -6,27 +6,261 @@
 #include <cstdio>
 #include <cstdlib>
 #include <vector>
+#include <numeric>
 
 #include "base64.hpp"
 
-static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
-    clip_image_f32 * img_res = clip_image_f32_init();
-    if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
-        fprintf(stderr, "%s: unable to preprocess image\n", __func__);
-        clip_image_f32_free(img_res);
-        return false;
+// Merge the per-segment embeddings in image_embd_v (index 0 = base image, 1.. = grid segments) into the preallocated image_embd_out and report the resulting number of positions via n_img_pos_out; returns false when image_embd_v is empty or the scratch ggml context cannot be created
+static bool handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
+    struct temp_model {
+        struct ggml_tensor *newline;
+        struct ggml_context * ctx;
+    } model;
+
+    const auto & vparams = clip_get_vision_hparams(ctx_clip); // hparams are returned by value; the const reference keeps the temporary alive
+    auto num_patches_per_side = vparams.image_size / vparams.patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
+    int num_patches_width = grid_shape.first; // grid 1-4
+    int num_patches_height = grid_shape.second; // grid 1-4
+    if (image_embd_v.empty()) { return false; } // nothing to merge; also keeps the size() - 1 arithmetic below from wrapping around
+    // TODO: the context size is a rough over-estimate rather than an exact calculation - it's only tens of MB
+    size_t ctx_size = 0;
+    {
+        ctx_size += clip_embd_nbytes(ctx_clip) * image_embd_v.size() * 8; // image_features
+        ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32); // extra headroom on top of the embedding copies
+    }
+
+    struct ggml_init_params params {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false, // NOTE: this should be false when using the legacy API
+    };
+
+        // Python reference for full unpad:
+        // base_image_feature = image_feature[0]
+        // image_feature = image_feature[1:]
+        // image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
+        // image_feature = image_feature.flatten(1, 2).flatten(2, 3)
+        // image_feature = unpad_image(image_feature, image_sizes[image_idx])
+        // image_feature = torch.cat((
+        //     image_feature,
+        //     self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
+        // ), dim=-1)
+        // image_feature = image_feature.flatten(1, 2).transpose(0, 1)
+        // image_feature = torch.cat((base_image_feature, image_feature), dim=0)
+
+        // embeddings -> tokens -> 24 x 24
+        /**
+         * We now have two options: unpad or no unpad - unpad removes tokens for faster llm eval
+         * In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet
+         * Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them.
+         * Once all images are processed, the base_image_features are prepended without any changes.
+         */
+    /**
+        Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling))
+        # image_feature = image_feature.view(2, 2, 24, 24, 4096)
+        # image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
+        # image_feature = image_feature.view(2, 24, 2, 24, 4096)
+        # image_feature = image_feature.flatten(0, 3)
+
+        # Reshape to 4D tensor by merging the last two dimensions
+        image_feature = image_feature.view(2, 2, 24, 24*4096)
+        image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
+        image_feature = image_feature.view(-1, 4096)
+     * 
+     */
+    model.ctx = ggml_init(params);
+    if (model.ctx == NULL) { return false; } // the scratch context could not be created
+    // (the no-alloc helper context that used to be created here was never used or freed, so it is no longer created)
+    // struct ggml_tensor * image_features = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (image_embd_v.size() - 1));
+
+    ggml_tensor *newline_tmp = clip_get_newline_tensor(ctx_clip);
+    model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
+    if (newline_tmp->backend != GGML_BACKEND_CPU) {
+        if (newline_tmp->buffer == NULL) {
+            printf("newline_tmp tensor buffer is NULL\n");
+        }
+        ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
+    } else
+    {
+        model.newline->data = newline_tmp->data;
+        if (model.newline->data == NULL) {
+            printf("newline_tmp tensor data is NULL\n");
+        }
     }
 
-    *n_img_pos = clip_n_patches(ctx_clip);
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, image_embd_v.size() - 1, clip_n_patches(ctx_clip), clip_n_mmproj_embd(ctx_clip));
+    // fill it with the image embeddings, ignoring the first
+    for (size_t i = 1; i < image_embd_v.size(); i++)
+    {
+        // printf("Copying image_embd_v[%d] to image_features tensor\n", i);
+        size_t offset = (i-1) * clip_embd_nbytes(ctx_clip);
+
+        // for debugging we now try and set the entire tensor row to 0.0001f,0.0002f,0.0003f,0.0004f etc:
+        // float *floatPtr = static_cast<float*>(image_embd_v[i]);
+        // for (int j = 0; j < clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip); j++)
+        // {
+        //     // floatPtr[j] = (j + 1) / 10000.0f;
+        //     int feature = j % clip_n_mmproj_embd(ctx_clip) + 1;
+        //     floatPtr[j] = i + feature / 10000.0f;
+        // }
+        memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
+    }
+    // printf("image_features size = %d\n", clip_embd_nbytes(ctx_clip) * (image_embd_v.size() - 1));
+
+    struct ggml_cgraph  * gf = ggml_new_graph(model.ctx);
+    // image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
+    size_t size_ele = ggml_type_size(GGML_TYPE_F32);
+    // struct ggml_tensor *dummy = ggml_new_tensor_4d(ctx_noalloc, GGML_TYPE_F32, num_patches_height, num_patches_width, num_patches_per_side, num_patches_per_side * clip_n_mmproj_embd(ctx_clip));
+
+    struct ggml_tensor *image_features_view = ggml_view_4d(model.ctx, image_features,
+                                                                    num_patches_height, // original note: nb0 = 4 bytes per element
+                                                                    num_patches_width, 
+                                                                    num_patches_per_side * num_patches_per_side, 
+                                                                    clip_n_mmproj_embd(ctx_clip), 
+
+                                                                    size_ele * num_patches_height,
+                                                                    size_ele * num_patches_height * num_patches_width,
+                                                                    size_ele * num_patches_height * num_patches_width * num_patches_per_side,
+                                                                    0);
+
+    struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features, 
+                                                                num_patches_height, 
+                                                                num_patches_width, 
+                                                                num_patches_per_side, 
+                                                                num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
+                                                                
+                                                                size_ele * num_patches_height,
+                                                                size_ele * num_patches_height * num_patches_width,
+                                                                size_ele * num_patches_height * num_patches_width * num_patches_per_side, 0);
+
+    struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
+    permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, permuted_cont, 0, 2, 1, 3)); // permute back to before - todo: fix bug
+
+    struct ggml_tensor *prepared = ggml_view_2d(model.ctx, permuted_cont, num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, clip_n_mmproj_embd(ctx_clip), size_ele * num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, 0);
+    struct ggml_tensor *prepared_cont = ggml_cont(model.ctx, prepared); // not needed
+    // struct ggml_tensor *prepared_cont = prepared; // the view only flattens
+
+    ggml_build_forward_expand(gf, prepared_cont);
+
+    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
+
+    struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
+    //  ggml_tensor_printf(image_features,"image_features",__LINE__,false,true);
+    // ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,true);
+    // ggml_tensor_printf(prepared_cont,"prepared_cont",__LINE__,false,true);
+
+    memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
+    // append without newline tokens:
+    // memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip) * (image_embd_v.size()-1)); // grid patches
+    // append with newline tokens:
+    for (size_t i = 0; i < image_embd_v.size() - 1; ++i) {
+        // we append with +1 offset (base image is prepended)
+        memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+1) + model.newline->ne[0] * i,
+            (float*)prepared_cont->data + i * clip_n_mmproj_embd(ctx_clip) * clip_n_patches(ctx_clip),
+            clip_embd_nbytes(ctx_clip));
+        memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip) * (i+2) + model.newline->ne[0] * i ,
+            (float*)model.newline->data,
+            ggml_nbytes(model.newline));
+    }
+
+    size_t newline_tokens = image_embd_v.size()-1;
+    *n_img_pos_out = prepared_cont->ne[0]+clip_n_patches(ctx_clip) + newline_tokens;
+
+    // Debug: Test single segments
+    // Current findings: sending base image, sending a segment embedding all works similar to python
+    // However, permuted embeddings do not work yet (stride issue?)
+    // memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
+    // memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
+    // *n_img_pos_out=576;
+
+    ggml_free(model.ctx);
+
+    return true;
+}
+
+// Preprocess img, encode every resulting image with CLIP and write the final embeddings to image_embd; *n_img_pos receives the number of embedding positions. Returns false when preprocessing, encoding or segment merging fails.
+static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
+    std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
+    if (!clip_image_preprocess(ctx_clip, img, img_res_v, /*pad2square =*/ true)) {
+        fprintf(stderr, "%s: unable to preprocess image\n", __func__);
+        for (auto img_res : img_res_v) {
+            clip_image_f32_free(img_res);
+        }
+        return false;
+    }
 
     const int64_t t_img_enc_start_us = ggml_time_us();
-    bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
-    clip_image_f32_free(img_res);
-    if (!encoded) {
-        fprintf(stderr, "Unable to encode image\n");
+    const auto & vparams = clip_get_vision_hparams(ctx_clip); // hparams are returned by value; the const reference keeps the temporary alive
+    // DEBUG print the "shape" and the first 10 rows and 10 cols of img_res_v in exp format
+    // for (int i = 0; i < img_res_v.size(); i++)
+    // {
+    //     printf("img_res_v[%d] shape: %d x %d\n", i, img_res_v[i]->nx, img_res_v[i]->ny);
+    //     for (int j = 0; j < 10; j++)
+    //     {
+    //         for (int k = 0; k < 10; k++)
+    //         {
+    //             printf("%e ", img_res_v[i]->buf[j*img_res_v[i]->ny + k]);
+    //         }
+    //         printf("\n");
+    //     }
+    // }
+    if (img_res_v.empty()) { fprintf(stderr, "%s: preprocessing produced no images\n", __func__); return false; } // the code below indexes img_res_v[0]
+    if (strcmp(vparams.mm_patch_merge_type, "spatial_unpad") != 0)
+    {
+        // flat / default llava-1.5 type embedding
+        *n_img_pos = clip_n_patches(ctx_clip);
+        bool encoded = clip_image_encode(ctx_clip, n_threads, img_res_v[0], image_embd); // image_embd shape is 576 x 4096
+        for (auto img_res : img_res_v) { clip_image_f32_free(img_res); } // release every preprocessed image, not only the one that was encoded
+        if (!encoded) {
+            fprintf(stderr, "Unable to encode image\n");
+
+            return false;
+        }
+    } else
+    {
+        // spatial_unpad llava-1.6 type embedding
+        // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
+        std::vector<float *> image_embd_v(img_res_v.size(), nullptr); // null-initialized so the error path can free every slot
+        for (size_t i = 0; i < img_res_v.size(); i++) {
+            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
+            bool encoded = image_embd_v[i] != NULL && clip_image_encode(ctx_clip, n_threads, img_res_v[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
+            clip_image_f32_free(img_res_v[i]);
+            if (!encoded) {
+                fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) (i + 1), (int) img_res_v.size());
+                for (size_t j = i + 1; j < img_res_v.size(); j++) { clip_image_f32_free(img_res_v[j]); } // segments that were not reached yet
+                for (float * embd : image_embd_v) { free(embd); } // free(NULL) is a no-op for the slots never allocated
+                return false;
+            }
+        }
+        const int64_t t_img_enc_batch_us = ggml_time_us();
+        printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int) img_res_v.size(), (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+
+
+        std::vector<std::pair<int, int>> grid_pinpoints;
+        for (int i = 0; i < 32 && vparams.image_grid_pinpoints[i] != 0; i+=2) {
+            grid_pinpoints.push_back({vparams.image_grid_pinpoints[i], vparams.image_grid_pinpoints[i+1]});
+        }
+        img_res_v.clear();
+        struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, vparams.image_size);
+
+        int n_img_pos_out = 0;
+        const bool merged = handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
+        *n_img_pos = n_img_pos_out;
+
+        for (float * embd : image_embd_v)
+        {
+            free(embd);
+        }
+        image_embd_v.clear();
+        if (!merged) { fprintf(stderr, "%s: unable to merge the grid segment embeddings\n", __func__); return false; }
+        // debug image/segment/normalization content:
+        // clip_image_u8 * tmp = clip_image_u8_init();
+        // clip_image_convert_f32_to_u8(*image_feature, *tmp);
+        // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
 
-        return false;
     }
+    printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
+
 
     const int64_t t_img_enc_end_us = ggml_time_us();
     float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
@@ -36,6 +270,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     return true;
 }
 
+
+
 bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
         // make sure that the correct mmproj was used, i.e., compare apples to apples
     int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
@@ -48,7 +284,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
 }
 
 static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
-    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
+    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
     if (!image_embd) {
         fprintf(stderr, "Unable to allocate memory for image embeddings\n");
         free(image_embd);
@@ -151,7 +387,7 @@ LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct
         return NULL;
     }
 
-    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
+    llava_image_embed *embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
     free(image_bytes);
 
     return embed;
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index ea77125ea..353bd8976 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -943,13 +943,14 @@ struct llama_server_context
             {
                 continue;
             }
-            clip_image_f32 * img_res = clip_image_f32_init();
-            if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/ true))
+            std::vector<clip_image_f32*> img_res_v;
+            if (!clip_image_preprocess(clp_ctx, img.img_data, img_res_v, /*pad2square =*/ true))
             {
                 LOG_TEE("Error processing the given image");
                 clip_free(clp_ctx);
                 return false;
             }
+            clip_image_f32 * img_res = img_res_v[0];
             img.image_tokens = clip_n_patches(clp_ctx);
             img.image_embedding = (float *)malloc(clip_embd_nbytes(clp_ctx));
             if (!img.image_embedding)