add returned string type (const char*) for nexa-omni-audio

This commit is contained in:
李为 2024-11-07 11:19:50 +08:00
parent 5edadffd88
commit 3dfac7817f
17 changed files with 4112 additions and 0 deletions

View file

@ -53,6 +53,7 @@ else()
# add_subdirectory(speculative)
# add_subdirectory(tokenize)
add_subdirectory(omni-vlm)
add_subdirectory(omni-vlm-v2)
add_subdirectory(nexa-omni-audio)
add_subdirectory(qwen2-audio)
endif()

View file

@ -0,0 +1,50 @@
add_library(omni_vlm_v2 OBJECT
omni-vlm-v2.cpp
omni-vlm-v2.h
clip-v2.cpp
clip-v2.h
)
target_link_libraries(omni_vlm_v2 PRIVATE ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(omni_vlm_v2 PUBLIC .)
target_include_directories(omni_vlm_v2 PUBLIC ../..)
target_include_directories(omni_vlm_v2 PUBLIC ../../common)
target_compile_features(omni_vlm_v2 PRIVATE cxx_std_11)
add_library(omni_vlm_v2_static STATIC $<TARGET_OBJECTS:omni_vlm_v2>)
if (BUILD_SHARED_LIBS)
set_target_properties(omni_vlm_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(omni_vlm_v2 PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(omni_vlm_v2_shared SHARED $<TARGET_OBJECTS:omni_vlm_v2>)
target_link_libraries(omni_vlm_v2_shared PRIVATE ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS omni_vlm_v2_shared LIBRARY)
endif()
if (NOT MSVC)
target_compile_options(omni_vlm_v2 PRIVATE -Wno-cast-qual) # stb_image.h
endif()
if(TARGET BUILD_INFO)
add_dependencies(omni_vlm_v2 BUILD_INFO)
endif()
set(TARGET omni-vlm-v2-cli)
add_executable(${TARGET} omni-vlm-v2-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME omni-vlm-v2-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common omni_vlm_v2 ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
#=== for omni-vlm-wrapper
add_library(omni_vlm_v2_wrapper_shared SHARED omni-vlm-v2-wrapper.cpp $<TARGET_OBJECTS:omni_vlm_v2>)
target_link_libraries(omni_vlm_v2_wrapper_shared PRIVATE common ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS omni_vlm_v2_wrapper_shared LIBRARY)
# set(TARGET omni-vlm-wrapper-cli)
# add_executable(${TARGET} omni-vlm-wrapper-cli.cpp)
# set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME omni-vlm-wrapper-cli)
# install(TARGETS ${TARGET} RUNTIME)
# target_link_libraries(${TARGET} PRIVATE omni_vlm_v2_wrapper_shared ${CMAKE_THREAD_LIBS_INIT})
# target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -0,0 +1,110 @@
# omni-vlm
Currently this implementation supports [vlm-81-ocr](https://huggingface.co/NexaAIDev/vlm-81-ocr) variants.
Once the API is confirmed, more models will be supported and uploaded.
## Usage
Build with cmake in the `llama.cpp` folder:
```bash
cmake -S . -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
cmake --build build --verbose -j
```
After building, run: `./omni-vlm-v2-cli` to see the usage. For example:
```bash
./omni-vlm-v2-cli \
-m Nano-Llm-v2-494M-F16.gguf \
--mmproj mmproj-omni-vlm-v2-f16.gguf \
--image example/omni-vlm-v2/latex.png \
--prompt "Describe this image for me"
```
See the next section for converting the original safetensors files into GGUF.
[comment]: # (TODO:
**note**: A lower temperature such as 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
**note**: For GPU offloading, use the `-ngl` flag as usual.
)
## Omni-vlm GGUF conversion
1) First, clone the omni-vlm model:
```console
git clone https://huggingface.co/NexaAIDev/vlm-81-ocr
```
2) Install the required Python packages:
```sh
pip install -r examples/omni-vlm/requirements.txt
```
3) Run `omni_vlm_surgery.py`:
```console
python omni_vlm_surgery.py \
--clean-vision-tower \
--model <PATH TO vlm-81-ocr>
```
- You will find an `omni_vlm.projector` and an `omni_vlm.clip` file in the `vlm-81-ocr/` directory.
4) Create a symbolic link named `pytorch_model.bin` pointing to `omni_vlm.clip`:
```bash
# in vlm-81-ocr/ folder
ln -s omni_vlm.clip pytorch_model.bin
```
5) Go back to the `llama.cpp` project folder and create the vision GGUF model.
First, clone the `nano-vlm-processor` model directory (you may need to obtain authorization to access the NexaAIDev space):
```console
git clone https://huggingface.co/NexaAIDev/nano-vlm-processor
```
```console
python ./examples/omni-vlm/convert_image_encoder_to_gguf.py \
-m <PATH TO vlm-81-ocr> \
--output-dir <PATH TO vlm-81-ocr> \
-p <PATH TO nano-vlm-processor>
```
- You will get the pure vision (CLIP) part of the model as `<PATH TO vlm-81-ocr>/mmproj-omni-vlm-v2-f16.gguf`.
6) Then convert the LLM portion to GGUF format:
* Run the Python snippet below to extract the LLM portion from the original omni-vlm model.
```python
from safetensors import safe_open
from safetensors.torch import save_file
tensors = {}
with safe_open("<PATH TO vlm-81-ocr>/model.safetensors", framework="pt", device=0) as f:
for k in f.keys():
if k.startswith('language_model'):
k2 = k.replace('language_model.', '')
tensors[k2] = f.get_tensor(k)
save_file(tensors, "<PATH TO nano-vlm-processor>/model.safetensors")
```
```console
python convert_hf_to_gguf.py <PATH TO nano-vlm-processor>
```
Finally, we get the LLM GGUF model: `<PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf`.
7) Finally, we can run the C++ version of the omni-vlm demo:
```console
./build/bin/omni-vlm-v2-cli \
-m <PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf \
    --mmproj <PATH TO vlm-81-ocr>/mmproj-omni-vlm-v2-f16.gguf \
--image example/omni-vlm/cat.png
```
The result will be printed to the screen:
> The image depicts a grey and white cat with its head pressed against the camera, appearing as if it is staring directly into the lens. The cat is surrounded by black and white stripes, adding a unique touch to its appearance. The black background creates a strong contrast and highlights the cat's features, making it a captivating scene.
8) Python interface:
After successfully building the `omni_vlm_v2_wrapper_shared` dynamic library, run:
```console
python omni_vlm_v2_demo.py \
--model <PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf \
    --mmproj <PATH TO vlm-81-ocr>/mmproj-omni-vlm-v2-f16.gguf \
--prompt="Describe this image for me" \
--image-path latex.png
```
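For programmatic use, the same shared library can be driven through the ctypes bindings added in this commit (the `omni_vlm_cpp` bindings and the `NexaOmniVlmInference` class in `omni_vlm_v2_demo.py`). A minimal sketch, assuming the library has been built so the bindings can locate it and using placeholder GGUF/image paths:
```python
from omni_vlm_v2_demo import NexaOmniVlmInference

# placeholder paths; use the GGUF files produced in the steps above
vlm = NexaOmniVlmInference(
    "Nano-Llm-494M-F16.gguf",
    "mmproj-omni-vlm-v2-f16.gguf",
)

# omnivlm_inference() returns a C string (const char*), exposed to Python as bytes
response = vlm.inference("Describe this image for me", "cat.png")
print(response.decode("utf-8"))
```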

File diff suppressed because it is too large.

View file

@ -0,0 +1,94 @@
#ifndef CLIP_H
#define CLIP_H
#include <stddef.h>
#include <stdint.h>
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define CLIP_API __declspec(dllexport)
# else
# define CLIP_API __declspec(dllimport)
# endif
# else
# define CLIP_API __attribute__ ((visibility ("default")))
# endif
#else
# define CLIP_API
#endif
#ifdef __cplusplus
extern "C" {
#endif
struct clip_ctx;
struct clip_image_size {
int width;
int height;
};
struct clip_image_u8_batch {
struct clip_image_u8 * data;
size_t size;
};
struct clip_image_f32_batch {
struct clip_image_f32 * data;
size_t size;
};
CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
CLIP_API void clip_free(struct clip_ctx * ctx);
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);
// TODO: should be enum, not string
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
CLIP_API int clip_n_patches (const struct clip_ctx * ctx);
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
CLIP_API struct clip_image_size * clip_image_size_init();
CLIP_API struct clip_image_u8 * clip_image_u8_init ();
CLIP_API struct clip_image_f32 * clip_image_f32_init();
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
#ifdef __cplusplus
}
#endif
#endif // CLIP_H
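For orientation, a minimal usage sketch of this CLIP API (not part of the commit; the file names, thread count, and error handling here are placeholder choices):
```c
#include "clip-v2.h"
#include <stdlib.h>

int main(void) {
    // load the vision encoder from an mmproj GGUF file (placeholder name)
    struct clip_ctx * ctx = clip_model_load("mmproj-omni-vlm-v2-f16.gguf", /*verbosity=*/1);
    if (!ctx) return 1;

    // read an image from disk into an RGB uint8 buffer
    struct clip_image_u8 * img = clip_image_u8_init();
    if (!clip_image_load_from_file("cat.png", img)) return 1;

    // preprocess (resize/normalize) into one or more f32 images
    struct clip_image_f32_batch batch = {0};
    if (!clip_image_preprocess(ctx, img, &batch)) return 1;

    // encode the first preprocessed image into a projector-sized embedding
    float * embd = malloc(clip_embd_nbytes(ctx));
    clip_image_encode(ctx, /*n_threads=*/4, &batch.data[0], embd);

    free(embd);
    clip_image_f32_batch_free(&batch); // assumed to release the preprocessed data
    clip_image_u8_free(img);
    clip_free(ctx);
    return 0;
}
```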

View file

@ -0,0 +1,208 @@
import argparse
import os
import json
import re
import torch
import numpy as np
from gguf import *
# from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
VISION = "siglip.vision"
def k(raw_key: str, arch: str) -> str:
return raw_key.format(arch=arch)
def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_omni_vlm: bool) -> bool:
if name in (
"logit_scale",
"text_model.embeddings.position_ids",
"vision_model.embeddings.position_ids",
):
return True
# if name.startswith("vision_model.post_layernorm") or name.startswith("vision_model.head"):
# return True
if name.startswith("v") and not has_vision:
return True
if name.startswith("t") and not has_text:
return True
return False
def get_tensor_name(name: str) -> str:
if "projection" in name:
return name
if "multi_modal_projector" in name:
name = name.replace("multi_modal_projector", "mm")
name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
return name
return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model-dir", help="Path to model directory cloned from HF Hub", required=True)
ap.add_argument("-p", "--processor-dir", help="Path to vlm-processor directory cloned from HF Hub", required=True)
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
ap.add_argument("--use-f32", action="store_true", default=False, help="Use f32 instead of f16")
# TODO: whether update this info?
# default_image_mean = [0.48145466, 0.4578275, 0.40821073]
# default_image_std = [0.26862954, 0.26130258, 0.27577711]
default_image_mean = [0.5, 0.5, 0.5]
default_image_std = [0.5, 0.5, 0.5]
# with proper
args = ap.parse_args()
if args.use_f32:
print("WARNING: Weights for the convolution op is always saved in f16, as the convolution op in GGML does not support 32-bit kernel weights yet.")
# output in the same directory as the model if output_dir is None
dir_model = args.model_dir
dir_processor = args.processor_dir
with open(dir_processor + "/preprocessor_config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if args.use_f32:
ftype = 0
has_omni_vlm_projector = True
fname_middle = "mmproj-"
output_dir = args.output_dir if args.output_dir is not None else dir_model
os.makedirs(output_dir, exist_ok=True)
fname_out = os.path.join(output_dir, f"{fname_middle}omni-vlm-v2-{ftype_str[ftype]}.gguf")
fout = GGUFWriter(path=fname_out, arch="siglip")
fout.add_bool("siglip.has_omni_vlm_projector", has_omni_vlm_projector)
fout.add_file_type(ftype)
fout.add_name("omni-vlm")
fout.add_description("image encoder for omni-vlm")
fout.add_uint32("siglip.vision.image_size", 384)
fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), 1152)
fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), 16) #TODO: to be confirmed
fout.add_uint32("siglip.vision.patch_size", 14)
# block_count = (27 - 1)
block_count = 27
fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)
fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
fout.add_uint32("siglip.vision.projection_dim", 4096)
# fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), v_hparams["layer_norm_eps"])
# if "image_grid_pinpoints" in v_hparams:
# # flatten it
# image_grid_pinpoints = []
# for pinpoint in v_hparams["image_grid_pinpoints"]:
# for p in pinpoint:
# image_grid_pinpoints.append(p)
# fout.add_array("clip.vision.image_grid_pinpoints", image_grid_pinpoints)
# if "image_crop_resolution" in v_hparams:
# fout.add_uint32("clip.vision.image_crop_resolution", v_hparams["image_crop_resolution"])
# if "image_aspect_ratio" in v_hparams:
# fout.add_string("clip.vision.image_aspect_ratio", v_hparams["image_aspect_ratio"])
# if "image_split_resolution" in v_hparams:
# fout.add_uint32("clip.vision.image_split_resolution", v_hparams["image_split_resolution"])
# if "mm_patch_merge_type" in v_hparams:
# fout.add_string("clip.vision.mm_patch_merge_type", v_hparams["mm_patch_merge_type"])
# if "mm_projector_type" in v_hparams:
# fout.add_string("clip.vision.mm_projector_type", v_hparams["mm_projector_type"])
#
#
# if processor is not None:
# image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean # pyright: ignore[reportAttributeAccessIssue]
# image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std # pyright: ignore[reportAttributeAccessIssue]
# else:
# image_mean = args.image_mean if args.image_mean is not None else default_image_mean
# image_std = args.image_std if args.image_std is not None else default_image_std
# fout.add_array("clip.vision.image_mean", image_mean)
# fout.add_array("clip.vision.image_std", image_std)
#
fout.add_array("siglip.vision.image_mean", default_image_mean)
fout.add_array("siglip.vision.image_std", default_image_std)
# use_gelu = v_hparams["hidden_act"] == "gelu"
# fout.add_bool("clip.use_gelu", use_gelu)
model = torch.load(os.path.join(dir_model, "omni_vlm.clip"), map_location='cpu')
# model.vision_model.encoder.layers.pop(-1)
projector = torch.load(os.path.join(dir_model, "omni_vlm.projector"), map_location='cpu')
for name, data in projector.items():
name = get_tensor_name(name)
# pw and dw conv ndim==4
if data.ndim == 2 or data.ndim == 4:
data = data.squeeze().cpu().numpy().astype(np.float16)
else:
data = data.squeeze().cpu().numpy().astype(np.float32)
fout.add_tensor(name, data)
print("Projector tensors added\n")
# state_dict = model.state_dict()
state_dict = dict(model)
for name, data in state_dict.items():
if should_skip_tensor(name, False, True, True):
# we don't need this
print(f"skipping parameter: {name}")
continue
# if name.startswith(f"vision_model.encoder.layers.{block_count}"):
# continue
name = get_tensor_name(name)
# data = data.astype(np.float16)
# print(data)
data = data.squeeze().float().numpy()
n_dims = len(data.shape)
# ftype == 0 -> float32, ftype == 1 -> float16
ftype_cur = 0
if n_dims == 4:
print(f"tensor {name} is always saved in f16")
data = data.astype(np.float16)
ftype_cur = 1
elif ftype == 1:
if name[-7:] == ".weight" and n_dims == 2:
print(" Converting to float16")
data = data.astype(np.float16)
ftype_cur = 1
else:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
else:
if data.dtype != np.float32:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
print(f"{name} - {ftype_str[ftype_cur]} - shape = {data.shape}")
fout.add_tensor(name, data)
fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()
print("Done. Output file: " + fname_out)

Binary file not shown.


View file

@ -0,0 +1,299 @@
// #include "arg.h"
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip-v2.h"
#include "omni-vlm-v2.h"
#include "llama.h"
#include "ggml.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
}
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
return true;
}
static const char * sample(struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_llama,
int * n_past) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past);
return ret.c_str();
}
static const std::string IMG_PAD = "<|image_pad|>";
static void find_image_tag_in_prompt(const std::string& prompt, size_t& idx) {
// begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
// end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
idx = prompt.find(IMG_PAD);
}
static bool prompt_contains_image(const std::string& prompt) {
size_t begin;
find_image_tag_in_prompt(prompt, begin);
return (begin != std::string::npos);
}
// replaces the base64 image tag in the prompt with `replacement`
static omni_image_embed * omnivlm_image_embed_make_with_prompt(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
size_t idx;
find_image_tag_in_prompt(prompt, idx);
if (idx == std::string::npos) {
LOG_TEE("%s: invalid base64 image tag. must be %s\n", __func__, IMG_PAD.c_str());
return NULL;
}
auto base64_str = prompt.substr(idx, IMG_PAD.size());
auto required_bytes = base64::required_encode_size(base64_str.size());
auto img_bytes = std::vector<unsigned char>(required_bytes);
base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
auto embed = omnivlm_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
if (!embed) {
LOG_TEE("%s: could not load image from base64 string.\n", __func__);
return NULL;
}
return embed;
}
static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
size_t begin;
find_image_tag_in_prompt(prompt, begin);
if (begin == std::string::npos) {
return prompt;
}
auto pre = prompt.substr(0, begin);
auto post = prompt.substr(begin + IMG_PAD.size());
return pre + replacement + post;
}
struct omnivlm_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\n example usage:\n");
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct omni_image_embed * load_image(omnivlm_context * ctx_omnivlm, gpt_params * params, const std::string & fname) {
// load and preprocess the image
omni_image_embed * embed = NULL;
embed = omnivlm_image_embed_make_with_filename(ctx_omnivlm->ctx_clip, params->n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
}
return embed;
}
static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
int n_past = 0;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
std::string system_prompt, user_prompt;
size_t image_pos = prompt.find("<|image_pad|>");
    // new templating mode: provide the full prompt including the system message and use <|image_pad|> as a placeholder for the image
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<|image_pad|>").length());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str());
}
}
LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str());
}
}
eval_string(ctx_omnivlm->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
omnivlm_eval_image_embed(ctx_omnivlm->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_omnivlm->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
// generate the response
LOG("\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
if (!ctx_sampling) {
LOG_TEE("%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
std::string response = "";
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_omnivlm->ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "<|im_end|>") == 0) break;
if (strcmp(tmp, "</s>") == 0) break;
// if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);
// if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
// if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
// if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
fflush(stdout);
}
llama_sampling_free(ctx_sampling);
printf("\n");
}
static struct llama_model * omnivlm_init(gpt_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = llama_model_params_from_gpt_params(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_TEE("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
}
static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10);
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_TEE("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
auto * ctx_omnivlm = (struct omnivlm_context *)malloc(sizeof(omnivlm_context));
ctx_omnivlm->ctx_llama = ctx_llama;
ctx_omnivlm->ctx_clip = ctx_clip;
ctx_omnivlm->model = model;
return ctx_omnivlm;
}
static void omnivlm_free(struct omnivlm_context * ctx_omnivlm) {
if (ctx_omnivlm->ctx_clip) {
clip_free(ctx_omnivlm->ctx_clip);
ctx_omnivlm->ctx_clip = NULL;
}
llama_free(ctx_omnivlm->ctx_llama);
llama_free_model(ctx_omnivlm->model);
llama_backend_free();
}
int main(int argc, char ** argv) {
ggml_time_init();
gpt_params params;
// if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
// return 1;
// }
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
print_usage(argc, argv, {});
return 1;
}
if(params.prompt.empty()) {
LOG_TEE("%s: prompt is empty. Terminating\n\n", __func__);
print_usage(argc, argv, {});
return 1;
}
params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
auto * model = omnivlm_init(&params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init omnivlm model\n", __func__);
return 1;
}
auto * ctx_omnivlm = omnivlm_init_context(&params, model);
for (auto & image : params.image) {
auto * image_embed = load_image(ctx_omnivlm, &params, image);
if (!image_embed) {
LOG_TEE("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
return 1;
}
// process the prompt
process_prompt(ctx_omnivlm, image_embed, &params, params.prompt);
llama_print_timings(ctx_omnivlm->ctx_llama);
omnivlm_image_embed_free(image_embed);
}
ctx_omnivlm->model = NULL;
omnivlm_free(ctx_omnivlm);
llama_free_model(model);
return 0;
}

View file

@ -0,0 +1,16 @@
// WARNING: this .cpp file is only for debugging. Do not use it directly.
#include "omni-vlm-v2-wrapper.h"
int main(int argc, char ** argv) {
const char* llm_model = "<path to llm gguf.>";
const char* mmproj_model = "<path to mm projector gguf>";
const char* image_path = "<path where image is located.>";
const char* prompt = "";
omnivlm_init(llm_model, mmproj_model);
omnivlm_inference(prompt, image_path);
omnivlm_inference(prompt, image_path);
omnivlm_free();
return 0;
}

View file

@ -0,0 +1,259 @@
#include "base64.hpp"
#include "log.h"
#include "common.h"
#include "sampling.h"
#include "clip-v2.h"
#include "omni-vlm-v2.h"
#include "llama.h"
#include "ggml.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <stdexcept>
#include <vector>
#include <string>
#include <iostream>
#include "omni-vlm-v2-wrapper.h"
struct omnivlm_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
void* internal_chars = nullptr;
static struct gpt_params params;
static struct llama_model* model;
static struct omnivlm_context* ctx_omnivlm;
static struct omni_image_embed * load_image(omnivlm_context * ctx_omnivlm, gpt_params * params, const std::string & fname) {
// load and preprocess the image
omni_image_embed * embed = NULL;
embed = omnivlm_image_embed_make_with_filename(ctx_omnivlm->ctx_clip, params->n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
}
return embed;
}
static struct llama_model * omnivlm_init(gpt_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = llama_model_params_from_gpt_params(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_TEE("%s: unable to load model\n" , __func__);
return NULL;
}
return model;
}
static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_model * model) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10);
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_TEE("%s: failed to create the llama_context\n" , __func__);
return NULL;
}
ctx_omnivlm = (struct omnivlm_context *)malloc(sizeof(omnivlm_context));
ctx_omnivlm->ctx_llama = ctx_llama;
ctx_omnivlm->ctx_clip = ctx_clip;
ctx_omnivlm->model = model;
return ctx_omnivlm;
}
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
*n_past += n_eval;
}
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
return true;
}
static const char * sample(struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_llama,
int * n_past) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past);
return ret.c_str();
}
static const char* process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
int n_past = 0;
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
std::string full_prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" \
+ prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
size_t image_pos = full_prompt.find("<|image_pad|>");
std::string system_prompt, user_prompt;
    // new templating mode: provide the full prompt including the system message and use <|image_pad|> as a placeholder for the image
system_prompt = full_prompt.substr(0, image_pos);
user_prompt = full_prompt.substr(image_pos + std::string("<|image_pad|>").length());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, system_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str());
}
}
// LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str());
}
}
eval_string(ctx_omnivlm->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
omnivlm_eval_image_embed(ctx_omnivlm->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_omnivlm->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
// generate the response
LOG("\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
if (!ctx_sampling) {
LOG_TEE("%s: failed to initialize sampling subsystem\n", __func__);
exit(1);
}
std::string response = "";
for (int i = 0; i < max_tgt_len; i++) {
const char * tmp = sample(ctx_sampling, ctx_omnivlm->ctx_llama, &n_past);
if (strcmp(tmp, "<|im_end|>") == 0) break;
if (strcmp(tmp, "</s>") == 0) break;
// if (strstr(tmp, "###")) break; // Yi-VL behavior
// printf("%s", tmp);
response += tmp;
// if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
// if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
// if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
fflush(stdout);
}
llama_sampling_free(ctx_sampling);
printf("\n");
// const char* ret_char_ptr = (const char*)(malloc(sizeof(char)*response.size()));
if(internal_chars != nullptr) { free(internal_chars); }
internal_chars = malloc(sizeof(char)*(response.size()+1));
strncpy((char*)(internal_chars), response.c_str(), response.size());
((char*)(internal_chars))[response.size()] = '\0';
return (const char*)(internal_chars);
}
static void omnivlm_free(struct omnivlm_context * ctx_omnivlm) {
if (ctx_omnivlm->ctx_clip) {
clip_free(ctx_omnivlm->ctx_clip);
ctx_omnivlm->ctx_clip = NULL;
}
llama_free(ctx_omnivlm->ctx_llama);
llama_free_model(ctx_omnivlm->model);
llama_backend_free();
}
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\n example usage:\n");
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
// inference interface definition
void omnivlm_init(const char* llm_model_path, const char* projector_model_path) {
const char* argv = "hello-omni-vlm-wrapper-cli";
char* nc_argv = const_cast<char*>(argv);
if (!gpt_params_parse(1, &nc_argv, params)) {
print_usage(1, &nc_argv, {});
throw std::runtime_error("init params error.");
}
params.model = llm_model_path;
params.mmproj = projector_model_path;
model = omnivlm_init(&params);
if (model == nullptr) {
fprintf(stderr, "%s: error: failed to init omnivlm model\n", __func__);
throw std::runtime_error("Failed to init omnivlm model");
}
ctx_omnivlm = omnivlm_init_context(&params, model);
}
const char* omnivlm_inference(const char *prompt, const char *imag_path) {
std::string image = imag_path;
params.prompt = prompt;
auto * image_embed = load_image(ctx_omnivlm, &params, image);
if (!image_embed) {
LOG_TEE("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
throw std::runtime_error("failed to load image " + image);
}
// process the prompt
const char* ret_chars = process_prompt(ctx_omnivlm, image_embed, &params, params.prompt);
// llama_perf_print(ctx_omnivlm->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
omnivlm_image_embed_free(image_embed);
return ret_chars;
}
void omnivlm_free() {
if(internal_chars != nullptr) { free(internal_chars); }
ctx_omnivlm->model = NULL;
omnivlm_free(ctx_omnivlm);
llama_free_model(model);
}

View file

@ -0,0 +1,33 @@
#ifndef OMNIVLMWRAPPER_H
#define OMNIVLMWRAPPER_H
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define OMNIVLM_API __declspec(dllexport)
# else
# define OMNIVLM_API __declspec(dllimport)
# endif
# else
# define OMNIVLM_API __attribute__ ((visibility ("default")))
# endif
#else
# define OMNIVLM_API
#endif
#ifdef __cplusplus
extern "C" {
#endif
OMNIVLM_API void omnivlm_init(const char* llm_model_path, const char* projector_model_path);
OMNIVLM_API const char* omnivlm_inference(const char* prompt, const char* imag_path);
OMNIVLM_API void omnivlm_free();
#ifdef __cplusplus
}
#endif
#endif
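A minimal consumer sketch for this wrapper API (paths are placeholders). Based on the implementation in omni-vlm-v2-wrapper.cpp above, the returned `const char*` is owned by the wrapper and remains valid until the next `omnivlm_inference()` or `omnivlm_free()` call:
```c
#include <stdio.h>
#include "omni-vlm-v2-wrapper.h"

int main(void) {
    // placeholder paths for the LLM and projector GGUF files
    omnivlm_init("Nano-Llm-494M-F16.gguf", "mmproj-omni-vlm-v2-f16.gguf");

    // the new const char* return value carries the generated text
    const char * out = omnivlm_inference("Describe this image for me", "cat.png");
    printf("%s\n", out);

    omnivlm_free();
    return 0;
}
```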

View file

@ -0,0 +1,434 @@
#include "clip-v2.h"
#include "omni-vlm-v2.h"
#include "llama.h"
#include <algorithm>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <vector>
#include <common/log.h>
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
// #define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
// #define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
// #define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
// #define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
std::vector<uint8_t> buf;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
std::vector<float> buf;
};
struct clip_image_grid_shape {
int first;
int second;
};
/**
* Selects the best resolution from a list of possible resolutions based on the original size.
*
* @param original_size The original size of the image in the format (width, height).
* @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
* @return The best fit resolution in the format (width, height).
*/
static std::pair<int, int> select_best_resolution(const std::pair<int, int>& original_size, const std::vector<std::pair<int, int>>& possible_resolutions) {
int original_width = original_size.first;
int original_height = original_size.second;
std::pair<int, int> best_fit;
int max_effective_resolution = 0;
int min_wasted_resolution = std::numeric_limits<int>::max();
for (const auto& resolution : possible_resolutions) {
int width = resolution.first;
int height = resolution.second;
float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
int downscaled_width = static_cast<int>(original_width * scale);
int downscaled_height = static_cast<int>(original_height * scale);
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
int wasted_resolution = (width * height) - effective_resolution;
// LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
max_effective_resolution = effective_resolution;
min_wasted_resolution = wasted_resolution;
best_fit = resolution;
}
}
return best_fit;
}
/**
* @brief Get the anyres image grid shape object
*
* @param image_size
* @param grid_pinpoints
* @param image_patch_size
* @return <int, int>
*/
static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<int, int> & image_size, const std::vector<std::pair<int, int>> & grid_pinpoints, int image_patch_size) {
/**
Conversion from gguf flat array to vector:
std::vector<std::pair<int, int>> possible_resolutions;
for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i+=2) {
possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
}
*/
auto best_resolution = select_best_resolution(image_size, grid_pinpoints);
return {best_resolution.first / image_patch_size, best_resolution.second / image_patch_size};
}
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
static bool clip_omnivlm_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
struct {
struct ggml_context * ctx;
} model;
const int32_t image_size = clip_image_size(ctx_clip);
const int32_t patch_size = clip_patch_size(ctx_clip);
int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)
int num_patches_width = grid_shape.first; // grid 1-4
int num_patches_height = grid_shape.second; // grid 1-4
const size_t num_images = num_patches_width * num_patches_height + 1;
// TODO: size calculation is not calculated - it's only tens of MB
size_t ctx_size = 0;
{
ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
}
struct ggml_init_params params {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API
};
// Python reference code for full unpad:
/*
base_image_feature = image_feature[0]
image_feature = image_feature[1:]
image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
image_feature = image_feature.flatten(1, 2).flatten(2, 3)
image_feature = unpad_image(image_feature, image_sizes[image_idx])
image_feature = torch.cat((
image_feature,
self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1)
), dim=-1)
image_feature = image_feature.flatten(1, 2).transpose(0, 1)
image_feature = torch.cat((base_image_feature, image_feature), dim=0)
*/
// We now have two options: unpad or no unpad. Unpad removes tokens for faster llm eval.
// In terms of result quality it appears to make no difference, so we'll start with the easier approach given 5D tensors are not supported in ggml yet.
// Without unpad we have to split the sub-image embeddings into patches of 24 features each and permute them.
    // Once all images are processed, we prepend the base_image_features without any changes.
// Pytorch reference simplified, modified for ggml compatibility - confirmed identical output in python (for a 2x2 grid image (676x676 scaling))
/*
image_feature = image_feature.view(2, 2, 24, 24, 4096)
image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
image_feature = image_feature.view(2, 24, 2, 24, 4096)
image_feature = image_feature.flatten(0, 3)
// Reshape to 4D tensor by merging the last two dimensions
image_feature = image_feature.view(2, 2, 24, 24*4096)
image_feature = image_feature.permute(0, 2, 1, 3).contiguous()
image_feature = image_feature.view(-1, 4096)
*/
model.ctx = ggml_init(params);
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
// fill it with the image embeddings, ignoring the base
for (size_t i = 1; i < num_images; i++) {
size_t offset = (i-1) * clip_embd_nbytes(ctx_clip);
memcpy((uint8_t *)(image_features->data) + offset, image_embd_v[i], clip_embd_nbytes(ctx_clip));
}
struct ggml_cgraph * gf = ggml_new_graph(model.ctx);
size_t size_ele = ggml_type_size(GGML_TYPE_F32);
struct ggml_tensor *image_features_patchview = ggml_view_4d(model.ctx, image_features,
num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
num_patches_per_side,
num_patches_width,
num_patches_height,
size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip),
size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side,
size_ele * num_patches_per_side * clip_n_mmproj_embd(ctx_clip) * num_patches_per_side * num_patches_width, 0);
// ggml_tensor_printf(image_features_patchview,"image_features_patchview",__LINE__,false,false);
struct ggml_tensor *permuted_cont = ggml_cont(model.ctx, ggml_permute(model.ctx, image_features_patchview, 0, 2, 1, 3));
/**
At the end of each row we have to add the row_end embeddings, which are the same as the newline embeddings
image_feature = torch.cat((
image_feature,
self.model.image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
), dim=-1)
*
*/
// ggml_tensor_printf(permuted_cont,"permuted_cont",__LINE__,false,false);
struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
// ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
ggml_build_forward_expand(gf, flatten);
ggml_graph_compute_with_ctx(model.ctx, gf, 1);
struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
// append without newline tokens (default behavior in llava_arch when not using unpad ):
memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
*n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
// Debug: Test single segments
// Current findings: sending base image, sending a segment embedding all works similar to python
// However, permuted embeddings do not work yet (stride issue?)
// memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as context
// memcpy(image_embd_out, (float*)prepared_cont->data, clip_embd_nbytes(ctx_clip)); // main image as context
// *n_img_pos_out=576;
ggml_free(model.ctx);
return true;
}
static clip_image_f32 * only_v2_5_reshape_by_patch(clip_image_f32 * image, int patch_size) {
int width = image->nx;
int height = image->ny;
int num_patches = (height / patch_size) * (width / patch_size);
clip_image_f32 * patch = clip_image_f32_init();
patch->nx = patch_size * num_patches;
patch->ny = patch_size;
patch->buf.resize(3 * patch->nx * patch->ny);
int patch_index = 0;
for (int i = 0; i < height; i += patch_size) {
for (int j = 0; j < width; j += patch_size) {
for (int pi = 0; pi < patch_size; ++pi) {
for (int pj = 0; pj < patch_size; ++pj) {
int input_index = ((i + pi) * width + (j + pj)) * 3;
int output_index = (pi * patch_size * num_patches + patch_index * patch_size + pj) * 3;
patch->buf[output_index] = image->buf[input_index];
patch->buf[output_index+1] = image->buf[input_index+1];
patch->buf[output_index+2] = image->buf[input_index+2];
}
}
patch_index++;
}
}
return patch;
}
static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
// std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
clip_image_f32_batch img_res_v;
img_res_v.size = 0;
img_res_v.data = nullptr;
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
LOG_TEE("%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data;
return false;
}
const int64_t t_img_enc_start_us = ggml_time_us();
const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
LOG("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
const int64_t t_img_enc_end_us = ggml_time_us();
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
LOG("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
return true;
}
bool omnivlm_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
// make sure that the correct mmproj was used, i.e., compare apples to apples
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
if (n_image_embd != n_llama_embd) {
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
return false;
}
return true;
}
bool omnivlm_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
int num_max_patches = 6;
if (clip_is_minicpmv(ctx_clip)) {
num_max_patches = 10;
}
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*num_max_patches); // TODO: base on gridsize/llava model
if (!image_embd) {
LOG_TEE("Unable to allocate memory for image embeddings\n");
return false;
}
int n_img_pos;
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
LOG_TEE("%s: cannot encode image, aborting\n", __func__);
free(image_embd);
return false;
}
*image_embd_out = image_embd;
*n_img_pos_out = n_img_pos;
return true;
}
struct omnivlm_embd_batch {
std::vector<llama_pos> pos;
std::vector<int32_t> n_seq_id;
std::vector<llama_seq_id> seq_id_0;
std::vector<llama_seq_id *> seq_ids;
std::vector<int8_t> logits;
llama_batch batch;
omnivlm_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
pos .resize(n_tokens);
n_seq_id.resize(n_tokens);
seq_ids .resize(n_tokens + 1);
logits .resize(n_tokens);
seq_id_0.resize(1);
seq_id_0[0] = seq_id;
seq_ids [n_tokens] = nullptr;
batch = {
/*n_tokens =*/ n_tokens,
/*tokens =*/ nullptr,
/*embd =*/ embd,
/*pos =*/ pos.data(),
/*n_seq_id =*/ n_seq_id.data(),
/*seq_id =*/ seq_ids.data(),
/*logits =*/ logits.data(),
};
for (int i = 0; i < n_tokens; i++) {
batch.pos [i] = pos_0 + i;
batch.n_seq_id[i] = 1;
batch.seq_id [i] = seq_id_0.data();
batch.logits [i] = false;
}
}
};
bool omnivlm_eval_image_embed(llama_context * ctx_llama, const struct omni_image_embed * image_embed, int n_batch, int * n_past) {
int n_embd = llama_n_embd(llama_get_model(ctx_llama));
for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
int n_eval = image_embed->n_image_pos - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
float * embd = image_embed->embed+i*n_embd;
llama_batch batch = {int32_t(n_eval), nullptr, embd, nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
if (llama_decode(ctx_llama, batch)) {
LOG_TEE("%s : failed to eval\n", __func__);
return false;
}
*n_past += n_eval;
}
return true;
}
struct omni_image_embed * omnivlm_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
clip_image_u8 * img = clip_image_u8_init();
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
clip_image_u8_free(img);
LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
return NULL;
}
float* image_embed = NULL;
int n_image_pos = 0;
bool image_embed_result = omnivlm_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
if (!image_embed_result) {
clip_image_u8_free(img);
LOG_TEE("%s: couldn't embed the image\n", __func__);
return NULL;
}
clip_image_u8_free(img);
auto result = (omni_image_embed*)malloc(sizeof(omni_image_embed));
result->embed = image_embed;
result->n_image_pos = n_image_pos;
return result;
}
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
auto* file = fopen(path, "rb");
if (file == NULL) {
LOG_TEE("%s: can't read file %s\n", __func__, path);
return false;
}
fseek(file, 0, SEEK_END);
auto fileSize = ftell(file);
fseek(file, 0, SEEK_SET);
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
if (buffer == NULL) {
LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
perror("Memory allocation error");
fclose(file);
return false;
}
errno = 0;
size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
if (ferror(file)) {
die_fmt("read error: %s", strerror(errno));
}
if (ret != (size_t) fileSize) {
die("unexpectedly reached end of file");
}
fclose(file); // Close the file
*bytesOut = buffer;
*sizeOut = fileSize;
return true;
}
struct omni_image_embed * omnivlm_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
unsigned char* image_bytes;
long image_bytes_length;
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
if (!loaded) {
LOG_TEE("%s: failed to load %s\n", __func__, image_path);
return NULL;
}
omni_image_embed *embed = omnivlm_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
free(image_bytes);
return embed;
}
void omnivlm_image_embed_free(struct omni_image_embed * embed) {
free(embed->embed);
free(embed);
}

View file

@ -0,0 +1,46 @@
#ifndef OMNIVLM_H
#define OMNIVLM_H
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define OMNIVLM_API __declspec(dllexport)
# else
# define OMNIVLM_API __declspec(dllimport)
# endif
# else
# define OMNIVLM_API __attribute__ ((visibility ("default")))
# endif
#else
# define OMNIVLM_API
#endif
#ifdef __cplusplus
extern "C" {
#endif
struct clip_ctx;
struct omni_image_embed {
float * embed;
int n_image_pos;
};
OMNIVLM_API bool omnivlm_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
OMNIVLM_API bool omnivlm_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
/** build an image embed from image file bytes */
OMNIVLM_API struct omni_image_embed * omnivlm_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
/** build an image embed from a path to an image filename */
OMNIVLM_API struct omni_image_embed * omnivlm_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
/** free an embedding made with OMNIVLM_image_embed_make_* */
OMNIVLM_API void omnivlm_image_embed_free(struct omni_image_embed * embed);
/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
OMNIVLM_API bool omnivlm_eval_image_embed(struct llama_context * ctx_llama, const struct omni_image_embed * embed, int n_batch, int * n_past);
#ifdef __cplusplus
}
#endif
#endif
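A condensed sketch of the call sequence these declarations are meant for (a hypothetical helper; `ctx_clip`, `ctx_llama`, and the batch size would come from an existing omnivlm context, as in omni-vlm-v2-cli.cpp above):
```cpp
#include "omni-vlm-v2.h"

// sketch: embed one image file and append it to an existing llama context
static bool append_image(struct clip_ctx * ctx_clip, struct llama_context * ctx_llama,
                         const char * path, int n_batch, int * n_past) {
    struct omni_image_embed * embed =
        omnivlm_image_embed_make_with_filename(ctx_clip, /*n_threads=*/4, path);
    if (!embed) {
        return false;
    }
    // writes the image tokens into the context and advances *n_past past them
    const bool ok = omnivlm_eval_image_embed(ctx_llama, embed, n_batch, n_past);
    omnivlm_image_embed_free(embed);
    return ok;
}
```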

View file

@ -0,0 +1,84 @@
import ctypes
import os
import sys
from pathlib import Path
# Load the library
def _load_shared_library(lib_base_name: str, base_path: Path = None):
# Determine the file extension based on the platform
if sys.platform.startswith("linux"):
lib_ext = ".so"
elif sys.platform == "darwin":
lib_ext = ".dylib"
elif sys.platform == "win32":
lib_ext = ".dll"
else:
raise RuntimeError("Unsupported platform")
# Construct the paths to the possible shared library names
if base_path is None:
_base_path = Path(__file__).parent.parent.resolve()
else:
print(f"Using base path: {base_path}")
_base_path = base_path
_lib_paths = [
_base_path / f"lib{lib_base_name}{lib_ext}",
_base_path / f"{lib_base_name}{lib_ext}",
]
# Add the library directory to the DLL search path on Windows (if needed)
if sys.platform == "win32" and sys.version_info >= (3, 8):
os.add_dll_directory(str(_base_path))
# Try to load the shared library, handling potential errors
for _lib_path in _lib_paths:
print(f"Trying to load shared library '{_lib_path}'")
if _lib_path.exists():
try:
return ctypes.CDLL(str(_lib_path))
except Exception as e:
print(f"Failed to load shared library '{_lib_path}': {e}")
raise FileNotFoundError(
f"Shared library with base name '{lib_base_name}' not found"
)
# Specify the base name of the shared library to load
_lib_base_name = "omni_vlm_v2_wrapper_shared"
base_path = (
Path(__file__).parent.parent.parent.resolve()
/ "build"
/ "examples"
/ "omni-vlm-v2"
)
# Load the library
_lib = _load_shared_library(_lib_base_name, base_path)
omni_char_p = ctypes.c_char_p
def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p):
return _lib.omnivlm_init(llm_model_path, mmproj_model_path)
_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p]
_lib.omnivlm_init.restype = None
def omnivlm_inference(prompt: omni_char_p, image_path: omni_char_p):
return _lib.omnivlm_inference(prompt, image_path)
_lib.omnivlm_inference.argtypes = [omni_char_p, omni_char_p]
_lib.omnivlm_inference.restype = omni_char_p
def omnivlm_free():
return _lib.omnivlm_free()
_lib.omnivlm_free.argtypes = []
_lib.omnivlm_free.restype = None

View file

@ -0,0 +1,57 @@
import ctypes
import logging
import os

import omni_vlm_cpp


class NexaOmniVlmInference:
    """
    A class used for vision language model inference.
    """

    def __init__(self, llm_model_path: str, mmproj_model_path: str):
        self.llm_model = ctypes.c_char_p(llm_model_path.encode("utf-8"))
        self.mmproj_model = ctypes.c_char_p(mmproj_model_path.encode("utf-8"))
        omni_vlm_cpp.omnivlm_init(self.llm_model, self.mmproj_model)

    def inference(self, prompt: str, image_path: str):
        prompt = ctypes.c_char_p(prompt.encode("utf-8"))
        image_path = ctypes.c_char_p(image_path.encode("utf-8"))
        return omni_vlm_cpp.omnivlm_inference(prompt, image_path)

    def __del__(self):
        omni_vlm_cpp.omnivlm_free()


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Run omni vision language model generation"
    )
    parser.add_argument("--model", type=str, help="Path to the llm model file")
    parser.add_argument("--mmproj", type=str, help="Path to the mmproj file")
    # parser.add_argument("--prompt", type=str, help="prompt string.")
    # parser.add_argument("--image-path", type=str, help="Path to the image.")
    args = parser.parse_args()

    omni_vlm_obj = NexaOmniVlmInference(args.model, args.mmproj)
    # omni_vlm_obj.inference(args.prompt, args.image_path)
    while True:
        print("Input your prompt:")
        prompt = input()
        if prompt == "":
            print("ERROR: you entered an empty prompt, try again.")
            continue
        print("Input your image path:")
        image_path = input()
        while not os.path.exists(image_path):
            print("ERROR: cannot find an image at that path, please check and input again.")
            image_path = input()
        response = omni_vlm_obj.inference(prompt, image_path)
        print("\tresponse:")
        print(response.decode('utf-8'))
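
For scripted (non-interactive) runs, the same class can be driven in a loop over files; the sketch below is a hypothetical batch variant with placeholder model and image paths.

```python
# Hypothetical batch use of NexaOmniVlmInference; paths are placeholders.
import glob

vlm = NexaOmniVlmInference("model-f16.gguf", "mmproj-f16.gguf")
for image_path in sorted(glob.glob("images/*.png")):
    answer = vlm.inference("Describe this image for me", image_path)
    print(image_path, "->", answer.decode("utf-8"))
```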

View file

@ -0,0 +1,161 @@
import argparse
import glob
import os
import torch
from safetensors import safe_open
from safetensors.torch import save_file
from typing import Any, ContextManager, cast


# Function to determine if file is a SafeTensor file
def is_safetensor_file(file_path):
    return file_path.endswith('.safetensors')


# Unified loading function
def load_model(file_path):
    if is_safetensor_file(file_path):
        tensors = {}
        with cast(ContextManager[Any], safe_open(file_path, framework="pt", device="cpu")) as f:
            for key in f.keys():
                tensors[key] = f.get_tensor(key).clone()
                # output shape
                print(f"{key} : {tensors[key].shape}")
        return tensors, 'safetensor'
    else:
        return torch.load(file_path, map_location=torch.device('cpu')), 'pytorch'


# Unified saving function
def save_model(model, file_path, file_type):
    if file_type == 'safetensor':
        # safe_save(model, file_path)
        save_file(model, file_path)
    else:
        torch.save(model, file_path)


# Adapted function to clean vision tower from checkpoint
def clean_vision_tower_from_checkpoint(checkpoint_path):
    checkpoint, file_type = load_model(checkpoint_path)
    # file_type = 'pytorch'
    model_path = os.path.dirname(checkpoint_path)
    print(f"Searching for vision tower tensors in {checkpoint_path}")
    clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vision_tower.vision_model")]

    if len(clip_tensors) > 0:
        print(f"Found {len(clip_tensors)} tensors to extract from {checkpoint_path}")
        # Adapted for file type
        clip_path = os.path.join(model_path, "omni_vlm.clip")

        if os.path.exists(clip_path):
            print(f"Loading existing omni_vlm.clip from {clip_path}")
            existing_clip, _ = load_model(clip_path)
        else:
            print(f"Creating new omni_vlm.clip at {clip_path}")
            existing_clip = {}

        # Update existing_clip with new tensors, avoid duplicates
        for name in clip_tensors:
            simple_name = name[name.index('vision_model.'):] if 'vision_model.' in name else name
            print(f"Adding {simple_name} to omni_vlm.clip")
            if simple_name not in existing_clip:
                existing_clip[simple_name] = checkpoint[name]

        # Save the updated clip tensors back to omni_vlm.clip
        save_model(existing_clip, clip_path, 'pytorch')

        # Remove the tensors from the original checkpoint
        for name in clip_tensors:
            del checkpoint[name]

        checkpoint_path = checkpoint_path
        return True
    return False


def find_relevant_checkpoints(checkpoint_paths, newline_criteria, projector):
    newline_checkpoint_path = None
    projector_checkpoint_path = None

    for path in checkpoint_paths:
        checkpoint, _ = load_model(path)
        if newline_criteria(checkpoint) and newline_checkpoint_path is None:
            newline_checkpoint_path = path
        if projector(checkpoint):
            projector_checkpoint_path = path

    return newline_checkpoint_path, projector_checkpoint_path


def newline_criteria(checkpoint):
    return any(k.startswith("model.image_newline") for k in checkpoint.keys())


def proj_criteria(checkpoint):
    # return any(k.startswith("multi_modal_projector.") or k.startswith("vision_proj.") for k in checkpoint.keys())
    return any(k.startswith("multi_modal_projector.") for k in checkpoint.keys())


# Command-line interface setup
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", required=True, help="Path to omni-vlm model")
ap.add_argument("-C", "--clean-vision-tower", action="store_true", help="Remove any vision tower from the model files")
args = ap.parse_args()

if args.clean_vision_tower:
    # Generalized to handle both PyTorch and SafeTensors models
    model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
    # checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and path.startswith('pytorch')) or (path.endswith('.safetensors') and path.startswith('model'))]
    checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]
    for projector_checkpoint_path in checkpoint_paths:
        print(f"Cleaning {projector_checkpoint_path}")
        if not clean_vision_tower_from_checkpoint(projector_checkpoint_path):
            print(f"No vision tower found in {projector_checkpoint_path}")
            # we break once none is found, so far all models append them at the end
            # break
    print("Done! All vision tower tensors are removed from the model files and stored in the omni_vlm.clip file.")

# Now we look for the projector in the last checkpoint
model_files = sorted(glob.glob(f"{args.model}/*"), key=os.path.getmtime, reverse=True)
checkpoint_paths = [path for path in model_files if (path.endswith('.bin') and 'pytorch' in path.split('/')[-1].split('\\')[-1]) or (path.endswith('.safetensors') and 'model' in path.split('/')[-1].split('\\')[-1])]

# last_checkpoint_path = checkpoint_paths[0]
# first_checkpoint_path = checkpoint_paths[-1]
newline_checkpoint_path, projector_checkpoint_path = find_relevant_checkpoints(checkpoint_paths, newline_criteria, proj_criteria)

print(f"Taking projector from {projector_checkpoint_path}")
first_mm_tensors = []
first_checkpoint = None
if newline_checkpoint_path is not None:
    print(f"Taking newline from {newline_checkpoint_path}")
    first_checkpoint, file_type = load_model(newline_checkpoint_path)
    first_mm_tensors = [k for k, v in first_checkpoint.items() if k.startswith("model.image_newline")]

# Load the checkpoint
mm_tensors = []
last_checkpoint = None
if projector_checkpoint_path is not None:
    last_checkpoint, file_type = load_model(projector_checkpoint_path)
    # mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("multi_modal_projector.") or k.startswith("vision_proj.")]
    mm_tensors = [k for k, v in last_checkpoint.items() if k.startswith("multi_modal_projector.")]

if len(mm_tensors) == 0:
    if last_checkpoint is not None:
        for k, v in last_checkpoint.items():
            print(k)
    print(f"Found {len(mm_tensors)} tensors to extract out of {len(last_checkpoint) if last_checkpoint is not None else 0} tensors.")
    print("No tensors found. Is this an omni-vlm model?")
    exit()

print(f"Found {len(mm_tensors)} tensors to extract.")
print(f"Found additional {len(first_mm_tensors)} tensors to extract.")

# projector = {name: checkpoint.[name].float() for name in mm_tensors}
projector = {}
for name in mm_tensors:
    assert last_checkpoint is not None
    projector[name] = last_checkpoint[name].float()
for name in first_mm_tensors:
    assert first_checkpoint is not None
    projector[name] = first_checkpoint[name].float()

if len(projector) > 0:
    save_model(projector, f"{args.model}/omni_vlm.projector", 'pytorch')

print("Done!")
print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")
print(f"Also, use {args.model}/omni_vlm.projector to prepare an omni-vlm-encoder.gguf file.")

View file

@ -0,0 +1,5 @@
-r ../../requirements/requirements-convert_legacy_llama.txt
# --extra-index-url https://download.pytorch.org/whl/cpu
pillow
torch
torchvision