diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 000000000..c9d0fbaef --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +@zhiyuan8 @alexchen4ai diff --git a/common/common-nexa.cpp b/common/common-nexa.cpp index e8a54ba04..c41f91384 100644 --- a/common/common-nexa.cpp +++ b/common/common-nexa.cpp @@ -150,6 +150,7 @@ bool load_hparams_and_tensors_from_gguf(const std::string &fname, NexaBaseModel } ggml_free(meta); + gguf_free(ctx_gguf); return true; } @@ -314,4 +315,4 @@ struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) { GGML_ASSERT(i < cgraph->n_nodes); return cgraph->nodes[i]; -} \ No newline at end of file +} diff --git a/common/common.cpp b/common/common.cpp index 715adf946..e85c498c9 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1442,6 +1442,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa // End of Parse args for logging parameters #endif // LOG_DISABLE_LOGS + if (arg == "--omni-vlm-version") { + CHECK_ARG + params.omni_vlm_version = argv[i]; + return true; + } return false; } @@ -1688,6 +1693,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "layer range to apply the control vector(s) to, start and end inclusive" }); options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n" "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH }); + options.push_back({ "*", " --omni-vlm-version VERSION_STRING", "omni vlm string version(one of 'vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n" "(default: 'vlm-81-ocr')"}); options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" }); options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" }); options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" }); diff --git a/common/common.h b/common/common.h index f603ba2be..73dab55ca 100644 --- a/common/common.h +++ b/common/common.h @@ -265,6 +265,8 @@ struct gpt_params { bool spm_infill = false; // suffix/prefix/middle pattern for infill std::string lora_outfile = "ggml-lora-merged-f16.gguf"; + + std::string omni_vlm_version = "vlm-81-ocr"; }; void gpt_params_parse_from_env(gpt_params & params); diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt index 9da04f7d3..0b9539e56 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt +++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt @@ -72,19 +72,14 @@ class MainActivity( val models = listOf( Downloadable( - "Phi-2 7B (Q4_0, 1.6 GiB)", - Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"), - File(extFilesDir, "phi-2-q4_0.gguf"), + "Llama3.2-1B-Instruct (Q4_0, 735 MB)", + Uri.parse("https://public-storage.nexa4ai.com/Llama3.2-1B-Instruct/q4_0.gguf"), + File(extFilesDir, "Llama3.2-1B-Instruct-q4_0.gguf"), ), Downloadable( - "TinyLlama 1.1B (f16, 2.2 GiB)", - Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"), - File(extFilesDir, "tinyllama-1.1-f16.gguf"), - ), - Downloadable( - "Phi 2 DPO (Q3_K_M, 1.48 GiB)", - Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"), - File(extFilesDir, 
"phi-2-dpo.Q3_K_M.gguf") + "octopus", + Uri.parse("https://public-storage.nexa4ai.com/Octopus-v2/q4_0.gguf"), + File(extFilesDir, "octopus-q4_0.gguf") ), ) diff --git a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt index 2de496574..9b1a436c8 100644 --- a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt @@ -33,6 +33,7 @@ project("llama-android") #load local llama.cpp add_subdirectory(../../../../../../ build-llama) +add_subdirectory(../../../../../../examples/llava build-llava) # In order to load a library into your app from Java/Kotlin, you must call # System.loadLibrary() and pass the name of the library defined here; @@ -50,4 +51,5 @@ target_link_libraries(${CMAKE_PROJECT_NAME} llama common android - log) + log + llava) diff --git a/examples/llama.android/llama/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp index 2aafe2316..297583c13 100644 --- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp @@ -6,6 +6,7 @@ #include #include "llama.h" #include "common.h" +#include "llava.h" // Write C++ code here. // diff --git a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt index 6c63e54e0..866cbaf89 100644 --- a/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt +++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt @@ -36,7 +36,7 @@ class LLamaAndroid { } }.asCoroutineDispatcher() - private val nlen: Int = 64 + private val nlen: Int = 256 private external fun log_to_android() private external fun load_model(filename: String): Long diff --git a/examples/nexa-omni-audio/omni.cpp b/examples/nexa-omni-audio/omni.cpp index f55dc3d5c..b236fae57 100644 --- a/examples/nexa-omni-audio/omni.cpp +++ b/examples/nexa-omni-audio/omni.cpp @@ -23,6 +23,8 @@ // Constants // +void* internal_chars = nullptr; + static const char *AUDIO_TOKEN = "<|AUDIO|>"; // @@ -570,7 +572,7 @@ static omni_params get_omni_params_from_context_params(omni_context_params ¶ all_params.gpt.n_gpu_layers = params.n_gpu_layers; all_params.gpt.model = params.model; all_params.gpt.prompt = params.prompt; - + // Initialize whisper params all_params.whisper.model = params.mmproj; all_params.whisper.fname_inp = {params.file}; @@ -703,6 +705,10 @@ struct omni_context *omni_init_context(omni_context_params ¶ms) void omni_free(struct omni_context *ctx_omni) { + if(internal_chars != nullptr) + { + free(internal_chars); + } if (ctx_omni->ctx_whisper) { whisper_free(ctx_omni->ctx_whisper); @@ -792,7 +798,7 @@ ggml_tensor *omni_process_audio(struct omni_context *ctx_omni, omni_params ¶ return embed_proj; } -void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params ¶ms, const std::string &prompt) +const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params ¶ms, const std::string &prompt) { int n_past = 0; @@ -833,12 +839,11 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed for (int i = 0; i < max_tgt_len; i++) { const char * tmp = sample(ctx_sampling, ctx_omni->ctx_llama, &n_past); - response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior - printf("%s", tmp); + // printf("%s", tmp); 
if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) if (strstr(response.c_str(), "<|im_start|>")) @@ -847,16 +852,22 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed break; // mistral llava-1.6 fflush(stdout); + response += tmp; } llama_sampling_free(ctx_sampling); printf("\n"); + if(internal_chars != nullptr) { free(internal_chars); } + internal_chars = malloc(sizeof(char)*(response.size()+1)); + strncpy((char*)(internal_chars), response.c_str(), response.size()); + ((char*)(internal_chars))[response.size()] = '\0'; + return (const char*)(internal_chars); } -void omni_process_full(struct omni_context *ctx_omni, omni_context_params ¶ms) +const char* omni_process_full(struct omni_context *ctx_omni, omni_context_params ¶ms) { omni_params all_params = get_omni_params_from_context_params(params); ggml_tensor *audio_embed = omni_process_audio(ctx_omni, all_params); - omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt); -} \ No newline at end of file + return omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt); +} diff --git a/examples/nexa-omni-audio/omni.h b/examples/nexa-omni-audio/omni.h index 5cbbd52ed..dcadb4288 100644 --- a/examples/nexa-omni-audio/omni.h +++ b/examples/nexa-omni-audio/omni.h @@ -54,11 +54,11 @@ OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params ¶m OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni); -OMNI_AUDIO_API void omni_process_full( +OMNI_AUDIO_API const char* omni_process_full( struct omni_context *ctx_omni, omni_context_params ¶ms ); #ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/examples/omni-vlm/README.md b/examples/omni-vlm/README.md index d6cfc7f37..07bbb3423 100644 --- a/examples/omni-vlm/README.md +++ b/examples/omni-vlm/README.md @@ -1,22 +1,30 @@ # omni-vlm -Currently this implementation supports [omni-vlm](https://huggingface.co/NexaAIDev/nano-vlm-instruct) variants, +Currently this implementation supports: -After API is confirmed, more models will be supported / uploaded. +* [nano-vlm-instruct](https://huggingface.co/NexaAIDev/nano-vlm-instruct/tree/main) ([gguf](https://huggingface.co/NexaAIDev/nano-vlm-instruct-gguf/tree/main)) +* [vlm-81-ocr](https://huggingface.co/NexaAIDev/vlm-81-ocr/tree/main) ([gguf](https://huggingface.co/NexaAIDev/vlm-81-ocr-gguf/tree/main)) +* [vlm-81-instruct](https://huggingface.co/NexaAIDev/vlm-81-instruct/tree/main) ([gguf](https://huggingface.co/NexaAIDev/vlm-81-instruct-gguf/tree/main)) + +After API is stable, more models will be supported. ## Usage -Build with cmake in the `llama-cpp-experiments` folder: -```bash + +Build with cmake in the `llama.cpp` folder: + +```console cmake -S . -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo cmake --build build --verbose -j ``` + After building, run: `./omni-vlm-cli` to see the usage. For example: -```bash +```console ./omni-vlm-cli \ - -m Nano-Llm-494M-F16.gguf \ - --mmproj mmproj-omni-vlm-f16.gguf \ - --image example/omni-vlm/cat.png + -m \ + --mmproj \ + --image example/omni-vlm/cat.png \ + --omni-vlm-version ``` See next section to convert gguf files from original safetensors. @@ -27,6 +35,7 @@ See next section to convert gguf files from original safetensors. 
) ## Omni-vlm gguf conversion + 1) First clone omni-vlm model: ```console git clone https://huggingface.co/NexaAIDev/nano-vlm-instruct @@ -34,7 +43,7 @@ git clone https://huggingface.co/NexaAIDev/nano-vlm-instruct 2) Install the required Python packages: -```sh +```console pip install -r examples/omni-vlm/requirements.txt ``` @@ -104,6 +113,5 @@ After successfully compiling omni_vlm_wrapper_shared dynamic library, run: python omni_vlm_demo.py \ --model /Nano-Llm-494M-F16.gguf \ --mmproj /mmproj-omni-vlm-f16.gguf \ - --prompt="Describe this image for me" \ - --image-path cat.png + --omni-vlm-version ``` diff --git a/examples/omni-vlm/clip.cpp b/examples/omni-vlm/clip.cpp index 45764f9f3..618067aba 100644 --- a/examples/omni-vlm/clip.cpp +++ b/examples/omni-vlm/clip.cpp @@ -6,6 +6,7 @@ #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" +#include "common.h" #ifdef GGML_USE_CUDA #include "ggml-cuda.h" @@ -167,7 +168,11 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_RESAMPLER, "resampler"}, }; - +enum omni_vlm_version_type { + VLM_81_OCR, + VLM_81_INSTRUCT, + NANO_VLM_INSTRUCT, +}; // // utilities to get data from a gguf file // @@ -294,115 +299,6 @@ static projector_type clip_projector_type_from_string(const std::string & name) return PROJECTOR_TYPE_UNKNOWN; } -#ifdef CLIP_DEBUG_FUNCTIONS -static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); - return; - } - - // PPM header: P6 format, width, height, and max color value - file << "P6\n" << img.nx << " " << img.ny << "\n255\n"; - - // Write pixel data - for (size_t i = 0; i < img.buf.size(); i += 3) { - // PPM expects binary data in RGB format, which matches our image buffer - file.write(reinterpret_cast(&img.buf[i]), 3); - } - - file.close(); -} - -static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { - std::ofstream file(filename, std::ios::binary); - if (!file.is_open()) { - LOG_ERR("Failed to open file for writing: %s\n", filename.c_str()); - return; - } - - int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data - int bytesPerPixel = 3; - int widthInBytes = img.nx * bytesPerPixel; - int paddingAmount = (4 - (widthInBytes % 4)) % 4; - int stride = widthInBytes + paddingAmount; - - // Bitmap file header - unsigned char fileHeader[14] = { - 'B','M', // Signature - 0,0,0,0, // Image file size in bytes - 0,0,0,0, // Reserved - 54,0,0,0 // Start of pixel array - }; - - // Total file size - fileSize = 54 + (stride * img.ny); - fileHeader[2] = (unsigned char)(fileSize); - fileHeader[3] = (unsigned char)(fileSize >> 8); - fileHeader[4] = (unsigned char)(fileSize >> 16); - fileHeader[5] = (unsigned char)(fileSize >> 24); - - // Bitmap information header (BITMAPINFOHEADER) - unsigned char infoHeader[40] = { - 40,0,0,0, // Size of this header (40 bytes) - 0,0,0,0, // Image width - 0,0,0,0, // Image height - 1,0, // Number of color planes - 24,0, // Bits per pixel - 0,0,0,0, // No compression - 0,0,0,0, // Image size (can be 0 for no compression) - 0,0,0,0, // X pixels per meter (not specified) - 0,0,0,0, // Y pixels per meter (not specified) - 0,0,0,0, // Total colors (color table not used) - 0,0,0,0 // Important colors (all are important) - }; - - // Width and height in the information header - infoHeader[4] = (unsigned char)(img.nx); - infoHeader[5] = (unsigned 
char)(img.nx >> 8); - infoHeader[6] = (unsigned char)(img.nx >> 16); - infoHeader[7] = (unsigned char)(img.nx >> 24); - infoHeader[8] = (unsigned char)(img.ny); - infoHeader[9] = (unsigned char)(img.ny >> 8); - infoHeader[10] = (unsigned char)(img.ny >> 16); - infoHeader[11] = (unsigned char)(img.ny >> 24); - - // Write file headers - file.write(reinterpret_cast(fileHeader), sizeof(fileHeader)); - file.write(reinterpret_cast(infoHeader), sizeof(infoHeader)); - - // Pixel data - std::vector padding(3, 0); // Max padding size to be added to each row - for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top - for (int x = 0; x < img.nx; ++x) { - // Each pixel - size_t pixelIndex = (y * img.nx + x) * 3; - unsigned char pixel[3] = { - img.buf[pixelIndex + 2], // BMP stores pixels in BGR format - img.buf[pixelIndex + 1], - img.buf[pixelIndex] - }; - file.write(reinterpret_cast(pixel), 3); - } - // Write padding for the row - file.write(reinterpret_cast(padding.data()), paddingAmount); - } - - file.close(); -} - -// debug function to convert f32 to u8 -static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) { - dst.nx = src.nx; - dst.ny = src.ny; - dst.buf.resize(3 * src.nx * src.ny); - for (size_t i = 0; i < src.buf.size(); ++i) { - dst.buf[i] = static_cast(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255)); - } -} -#endif - - // // clip layers // @@ -564,6 +460,7 @@ struct clip_ctx { struct clip_vision_model vision_model; projector_type proj_type = PROJECTOR_TYPE_MLP; + omni_vlm_version_type omni_vlm_ver_type; float image_mean[3]; float image_std[3]; @@ -785,6 +682,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); } + if(ctx->omni_vlm_ver_type == omni_vlm_version_type::VLM_81_OCR || ctx->omni_vlm_ver_type == omni_vlm_version_type::VLM_81_INSTRUCT) { + embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0]*9, embeddings->ne[1]/9, 1); + } + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); @@ -800,7 +701,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // read and create ggml_context containing the tensors and their data -struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { +struct clip_ctx * clip_model_load(const char * fname, const char * omni_vlm_version, const int verbosity = 1) { struct ggml_context * meta = NULL; struct gguf_init_params params = { @@ -895,6 +796,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } clip_ctx * new_clip = new clip_ctx{}; + if (std::string(omni_vlm_version) == "vlm-81-ocr") { + new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR; + } else if (std::string(omni_vlm_version) == "vlm-81-instruct") { + new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT; + } else if (std::string(omni_vlm_version) == "nano-vlm-instruct") { + new_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT; + } else { + throw std::runtime_error(std::string("error vlm version info: ") + omni_vlm_version); + } // update projector type { @@ -1308,6 +1218,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { return new_clip; } +// void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params) { +// if (params->omni_vlm_version == 
"vlm-81-ocr") { +// ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR; +// } else if (params->omni_vlm_version == "vlm-81-instruct") { +// ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT; +// } else if (params->omni_vlm_version == "nano-vlm-instruct") { +// ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT; +// } else { +// throw std::runtime_error(std::string("error vlm version info: ") + params->omni_vlm_version); +// } +// } + void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) { ctx_clip->load_image_size = load_image_size; } @@ -2294,13 +2216,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima return true; } -bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) { +bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype, const char* omni_vlm_version) { ggml_type type = GGML_TYPE_Q4_1; assert(itype < GGML_TYPE_COUNT); type = static_cast(itype); - auto * ctx_clip = clip_model_load(fname_inp, 2); + auto * ctx_clip = clip_model_load(fname_inp, omni_vlm_version, 2); const auto & ctx_src = ctx_clip->ctx_gguf; const auto & ctx_data = ctx_clip->ctx_data; diff --git a/examples/omni-vlm/clip.h b/examples/omni-vlm/clip.h index 78588bdf1..cd4007a9e 100644 --- a/examples/omni-vlm/clip.h +++ b/examples/omni-vlm/clip.h @@ -39,9 +39,12 @@ struct clip_image_f32_batch { size_t size; }; -CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity); +CLIP_API struct clip_ctx * clip_model_load (const char * fname, const char * omni_vlm_version, int verbosity); CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity); +// struct gpt_params; +// CLIP_API void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params); + CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); @@ -83,7 +86,7 @@ CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ct CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); -CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype); +CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype, const char * omni_vlm_version); CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx); diff --git a/examples/omni-vlm/latex.png b/examples/omni-vlm/latex.png new file mode 100644 index 000000000..b97318fc0 Binary files /dev/null and b/examples/omni-vlm/latex.png differ diff --git a/examples/omni-vlm/omni-vlm-cli.cpp b/examples/omni-vlm/omni-vlm-cli.cpp index 68e833182..d24634fe8 100644 --- a/examples/omni-vlm/omni-vlm-cli.cpp +++ b/examples/omni-vlm/omni-vlm-cli.cpp @@ -12,6 +12,10 @@ #include #include #include +// #include +// +// using std::cout; +// using std::endl; static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { int N = (int) tokens.size(); @@ -149,7 +153,7 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str()); } } - LOG_TEE("user_prompt: %s\n", user_prompt.c_str()); + // LOG_TEE("user_prompt: 
%s\n", user_prompt.c_str()); if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { @@ -165,6 +169,9 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima LOG("\n"); + params->sparams.temp = 0.0f; + params->sparams.top_k = 1; + params->sparams.top_p = 1.0f; struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); if (!ctx_sampling) { LOG_TEE("%s: failed to initialize sampling subsystem\n", __func__); @@ -177,8 +184,8 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima response += tmp; if (strcmp(tmp, "<|im_end|>") == 0) break; if (strcmp(tmp, "") == 0) break; - // if (strstr(tmp, "###")) break; // Yi-VL behavior printf("%s", tmp); + // LOG("%s", tmp); // if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) // if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 // if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 @@ -212,8 +219,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_ prompt = "describe the image in detail."; } - auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10); - + auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0); + // clip_set_omni_vlm_version(ctx_clip, params); llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings @@ -249,9 +256,6 @@ int main(int argc, char ** argv) { gpt_params params; - // if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { - // return 1; - // } if (!gpt_params_parse(argc, argv, params)) { print_usage(argc, argv, params); return 1; @@ -261,8 +265,21 @@ int main(int argc, char ** argv) { print_usage(argc, argv, {}); return 1; } + if (params.omni_vlm_version != "vlm-81-ocr" && params.prompt.empty()) { + LOG_TEE("%s : prompt is empty.\n", __func__); + print_usage(argc, argv, {}); + return 1; + } - params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nDescribe this image for me\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>"; + if (params.omni_vlm_version == "vlm-81-ocr") { + params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|vision_start|><|image_pad|><|vision_end|><|im_end|>"; + } else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") { + params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>"; + } else { + LOG_TEE("%s : error: you set wrong vlm version info:'%s'.\n", __func__, params.omni_vlm_version.c_str()); + print_usage(argc, argv, {}); + return 1; + } auto * model = omnivlm_init(¶ms); if (model == NULL) { @@ -271,8 +288,8 @@ int main(int argc, char ** argv) { } - auto * ctx_omnivlm = omnivlm_init_context(¶ms, model); for (auto & image : params.image) { + auto * ctx_omnivlm = omnivlm_init_context(¶ms, model); auto * image_embed = load_image(ctx_omnivlm, ¶ms, image); if (!image_embed) { LOG_TEE("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str()); @@ -283,9 +300,9 @@ int main(int argc, char ** argv) { llama_print_timings(ctx_omnivlm->ctx_llama); omnivlm_image_embed_free(image_embed); + ctx_omnivlm->model = NULL; + omnivlm_free(ctx_omnivlm); } - ctx_omnivlm->model = NULL; - omnivlm_free(ctx_omnivlm); llama_free_model(model); diff --git a/examples/omni-vlm/omni-vlm-wrapper-cli.cpp b/examples/omni-vlm/omni-vlm-wrapper-cli.cpp index 731b7791e..6a65b7643 100644 --- a/examples/omni-vlm/omni-vlm-wrapper-cli.cpp +++ b/examples/omni-vlm/omni-vlm-wrapper-cli.cpp @@ -1,15 +1,24 @@ // WARNING: this .cpp file is only for debugging. do not user directly. #include "omni-vlm-wrapper.h" +#include + + +using std::cout; +using std::endl; int main(int argc, char ** argv) { - const char* llm_model = ""; - const char* mmproj_model = ""; - const char* image_path = ""; + const char* llm_model = ""; + const char* mmproj_model = ""; + const char* image_path = ""; const char* prompt = ""; - omnivlm_init(llm_model, mmproj_model); - omnivlm_inference(prompt, image_path); - omnivlm_inference(prompt, image_path); + omnivlm_init(llm_model, mmproj_model, "vlm-81-ocr"); + + const char* res; + res = omnivlm_inference(prompt, image_path); + cout << "RES: " << res << endl; + res = omnivlm_inference(prompt, image_path); + cout << "RES: " << res << endl; omnivlm_free(); return 0; diff --git a/examples/omni-vlm/omni-vlm-wrapper.cpp b/examples/omni-vlm/omni-vlm-wrapper.cpp index 81178205e..ba0749d06 100644 --- a/examples/omni-vlm/omni-vlm-wrapper.cpp +++ b/examples/omni-vlm/omni-vlm-wrapper.cpp @@ -24,6 +24,8 @@ struct omnivlm_context { struct llama_model * model = NULL; }; +void* internal_chars = nullptr; + static struct gpt_params params; static struct llama_model* model; static struct omnivlm_context* ctx_omnivlm; @@ -63,7 +65,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_ prompt = "describe the image in detail."; } - auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10); + auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0); + // clip_set_omni_vlm_version(ctx_clip, params); llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); @@ -128,19 +131,19 @@ static const char * sample(struct llama_sampling_context * ctx_sampling, return ret.c_str(); } -static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_image_embed * image_embed, gpt_params * params, const std::string & prompt) { +static const char* process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_image_embed * image_embed, gpt_params * params, const std::string & prompt) { int n_past = 0; const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; - std::string full_prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" \ - + prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>"; - size_t image_pos = full_prompt.find("<|image_pad|>"); + // std::string full_prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" \ + // + prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>"; + size_t image_pos = params->prompt.find("<|image_pad|>"); std::string system_prompt, user_prompt; // new templating mode: Provide the full prompt including system message and use as a placeholder for the image - system_prompt = full_prompt.substr(0, image_pos); - user_prompt = full_prompt.substr(image_pos + std::string("<|image_pad|>").length()); + system_prompt = params->prompt.substr(0, image_pos); + user_prompt = params->prompt.substr(image_pos + std::string("<|image_pad|>").length()); if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, system_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { @@ -155,6 +158,9 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima } } + params->sparams.top_k = 1; + params->sparams.top_p = 1.0f; + eval_string(ctx_omnivlm->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true); omnivlm_eval_image_embed(ctx_omnivlm->ctx_llama, image_embed, params->n_batch, &n_past); eval_string(ctx_omnivlm->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); @@ -172,11 +178,11 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima std::string response = ""; for (int i = 0; i < max_tgt_len; i++) { const char * tmp = sample(ctx_sampling, ctx_omnivlm->ctx_llama, &n_past); - response += tmp; if (strcmp(tmp, "<|im_end|>") == 0) break; if (strcmp(tmp, "") == 0) break; // if (strstr(tmp, "###")) break; // Yi-VL behavior - printf("%s", tmp); + // printf("%s", tmp); + response += tmp; // if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) // if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 // if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 @@ -186,6 +192,13 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima llama_sampling_free(ctx_sampling); printf("\n"); + + // const char* ret_char_ptr = (const char*)(malloc(sizeof(char)*response.size())); + if(internal_chars != nullptr) { free(internal_chars); } + internal_chars = malloc(sizeof(char)*(response.size()+1)); + strncpy((char*)(internal_chars), response.c_str(), response.size()); + ((char*)(internal_chars))[response.size()] = '\0'; + return (const char*)(internal_chars); } static void omnivlm_free(struct omnivlm_context * ctx_omnivlm) { @@ -208,8 +221,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) { } // inference interface definition -void omnivlm_init(const char* llm_model_path, const char* projector_model_path) { - const char* argv = "hello-omni-vlm-wrapper-cli"; +void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version) { + const char* argv = "omni-wrapper-py"; char* nc_argv = const_cast(argv); if (!gpt_params_parse(1, &nc_argv, params)) { print_usage(1, &nc_argv, {}); @@ -217,31 +230,60 @@ void omnivlm_init(const char* llm_model_path, const char* projector_model_path) } params.model = llm_model_path; params.mmproj = projector_model_path; + 
params.omni_vlm_version = omni_vlm_version; + + std::string omni_vlm_ver = params.omni_vlm_version; + if(omni_vlm_ver != "vlm-81-ocr" && omni_vlm_ver != "vlm-81-instruct" && omni_vlm_ver != "nano-vlm-instruct") { + fprintf(stderr, "%s: error: you set wrong omni_vlm_string: %s\n", __func__, omni_vlm_version); + fprintf(stderr, "%s: Valid omni_vlm_version set is ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n", __func__); + throw std::runtime_error("You set wrong vlm_version info strings."); + } + model = omnivlm_init(¶ms); if (model == nullptr) { fprintf(stderr, "%s: error: failed to init omnivlm model\n", __func__); throw std::runtime_error("Failed to init omnivlm model"); } - ctx_omnivlm = omnivlm_init_context(¶ms, model); } -void omnivlm_inference(const char *prompt, const char *imag_path) { +const char* omnivlm_inference(const char *prompt, const char *imag_path) { + ctx_omnivlm = omnivlm_init_context(¶ms, model); + std::string image = imag_path; params.prompt = prompt; + + if (params.omni_vlm_version == "vlm-81-ocr") { + params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|ocr_start|><|vision_start|><|image_pad|><|vision_end|><|ocr_end|><|im_end|>"; + } else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") { + params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>"; + } else { + LOG_TEE("%s : error: you set wrong vlm version info:'%s'.\n", __func__, params.omni_vlm_version.c_str()); + throw std::runtime_error("You set wrong vlm_version info strings."); + } + auto * image_embed = load_image(ctx_omnivlm, ¶ms, image); if (!image_embed) { LOG_TEE("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str()); throw std::runtime_error("failed to load image " + image); } // process the prompt - process_prompt(ctx_omnivlm, image_embed, ¶ms, params.prompt); + const char* ret_chars = process_prompt(ctx_omnivlm, image_embed, ¶ms, params.prompt); // llama_perf_print(ctx_omnivlm->ctx_llama, LLAMA_PERF_TYPE_CONTEXT); omnivlm_image_embed_free(image_embed); + ctx_omnivlm->model = nullptr; + omnivlm_free(ctx_omnivlm); + ctx_omnivlm = nullptr; + + return ret_chars; } void omnivlm_free() { - ctx_omnivlm->model = NULL; - omnivlm_free(ctx_omnivlm); + if(internal_chars != nullptr) { free(internal_chars); } + if(ctx_omnivlm != nullptr) { + // this snipet should never be run! 
+ ctx_omnivlm->model = nullptr; + omnivlm_free(ctx_omnivlm); + } llama_free_model(model); } diff --git a/examples/omni-vlm/omni-vlm-wrapper.h b/examples/omni-vlm/omni-vlm-wrapper.h index 4ab2c234c..22cc40533 100644 --- a/examples/omni-vlm/omni-vlm-wrapper.h +++ b/examples/omni-vlm/omni-vlm-wrapper.h @@ -20,9 +20,9 @@ extern "C" { #endif -OMNIVLM_API void omnivlm_init(const char* llm_model_path, const char* projector_model_path); +OMNIVLM_API void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version); -OMNIVLM_API void omnivlm_inference(const char* prompt, const char* imag_path); +OMNIVLM_API const char* omnivlm_inference(const char* prompt, const char* imag_path); OMNIVLM_API void omnivlm_free(); diff --git a/examples/omni-vlm/omni-vlm.cpp b/examples/omni-vlm/omni-vlm.cpp index 539b300bf..339b6ffbe 100644 --- a/examples/omni-vlm/omni-vlm.cpp +++ b/examples/omni-vlm/omni-vlm.cpp @@ -258,111 +258,6 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli *n_img_pos = clip_n_patches(ctx_clip); bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); - // cout << "\t\t A NICE START" << endl; - // cout << "\t\t" << *n_img_pos << endl; - /* - if (clip_is_minicpmv(ctx_clip)) { - std::vector image_embd_v; - image_embd_v.resize(img_res_v.size); - struct clip_image_size * load_image_size = clip_image_size_init(); - for (size_t i = 0; i < img_res_v.size; i++) { - const int64_t t_img_enc_step_start_us = ggml_time_us(); - image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); - int patch_size=14; - load_image_size->width = img_res_v.data[i].nx; - load_image_size->height = img_res_v.data[i].ny; - clip_add_load_image_size(ctx_clip, load_image_size); - bool encoded = false; - int has_minicpmv_projector = clip_is_minicpmv(ctx_clip); - if (has_minicpmv_projector == 2) { - encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]); - } - else if (has_minicpmv_projector == 3) { - encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); - } - if (!encoded) { - LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); - return false; - } - const int64_t t_img_enc_steop_batch_us = ggml_time_us(); - LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0); - } - const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); - - int n_img_pos_out = 0; - for (size_t i = 0; i < image_embd_v.size(); i++) { - std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip)); - n_img_pos_out += clip_n_patches(ctx_clip); - } - *n_img_pos = n_img_pos_out; - for (size_t i = 0; i < image_embd_v.size(); i++) { - free(image_embd_v[i]); - } - image_embd_v.clear(); - load_image_size->width = img->nx; - load_image_size->height = img->ny; - clip_add_load_image_size(ctx_clip, load_image_size); - LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height); - } - else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { - // flat / default llava-1.5 type embedding - *n_img_pos = clip_n_patches(ctx_clip); - bool encoded = clip_image_encode(ctx_clip, n_threads, 
&img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 - delete[] img_res_v.data; - if (!encoded) { - LOG_ERR("Unable to encode image\n"); - - return false; - } - } - else { - // spatial_unpad llava-1.6 type embedding - // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working - std::vector image_embd_v; - image_embd_v.resize(img_res_v.size); - for (size_t i = 0; i < img_res_v.size; i++) { - image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 - const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside - if (!encoded) { - LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); - return false; - } - } - const int64_t t_img_enc_batch_us = ggml_time_us(); - LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); - - const int32_t * image_grid = clip_image_grid(ctx_clip); - - std::vector> grid_pinpoints; - for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) { - grid_pinpoints.push_back({image_grid[i], image_grid[i+1]}); - } - - // free all img_res_v - not needed anymore - delete[] img_res_v.data; - img_res_v.size = 0; - img_res_v.data = nullptr; - - const int32_t image_size = clip_image_size(ctx_clip); - - struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); - - int n_img_pos_out; - clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); - *n_img_pos = n_img_pos_out; - - for (size_t i = 0; i < image_embd_v.size(); i++) { - free(image_embd_v[i]); - } - image_embd_v.clear(); - - // debug image/segment/normalization content: - // clip_image_u8 * tmp = clip_image_u8_init(); - // clip_image_convert_f32_to_u8(*image_feature, *tmp); - // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); - } - */ LOG("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); diff --git a/examples/omni-vlm/omni_vlm_cpp.py b/examples/omni-vlm/omni_vlm_cpp.py index 6f23f7c4c..81edb6f1d 100644 --- a/examples/omni-vlm/omni_vlm_cpp.py +++ b/examples/omni-vlm/omni_vlm_cpp.py @@ -60,11 +60,11 @@ _lib = _load_shared_library(_lib_base_name, base_path) omni_char_p = ctypes.c_char_p -def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p): - return _lib.omnivlm_init(llm_model_path, mmproj_model_path) +def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p, vlm_version: omni_char_p): + return _lib.omnivlm_init(llm_model_path, mmproj_model_path, vlm_version) -_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p] +_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p, omni_char_p] _lib.omnivlm_init.restype = None @@ -73,7 +73,7 @@ def omnivlm_inference(prompt: omni_char_p, image_path: omni_char_p): _lib.omnivlm_inference.argtypes = [omni_char_p, omni_char_p] -_lib.omnivlm_inference.restype = None +_lib.omnivlm_inference.restype = omni_char_p def omnivlm_free(): diff --git a/examples/omni-vlm/omni_vlm_demo.py b/examples/omni-vlm/omni_vlm_demo.py index 4f8c5998f..fbed2758f 100644 --- a/examples/omni-vlm/omni_vlm_demo.py +++ b/examples/omni-vlm/omni_vlm_demo.py @@ -11,16 +11,17 @@ class NexaOmniVlmInference: A class used for vision language model inference. 
""" - def __init__(self, llm_model_path: str, mmproj_model_path: str): + def __init__(self, llm_model_path: str, mmproj_model_path: str, omni_vlm_version: str): self.llm_model = ctypes.c_char_p(llm_model_path.encode("utf-8")) self.mmproj_model = ctypes.c_char_p(mmproj_model_path.encode("utf-8")) + self.omni_vlm_version = ctypes.c_char_p(omni_vlm_version.encode("utf-8")) - omni_vlm_cpp.omnivlm_init(self.llm_model, self.mmproj_model) + omni_vlm_cpp.omnivlm_init(self.llm_model, self.mmproj_model, self.omni_vlm_version) def inference(self, prompt: str, image_path: str): prompt = ctypes.c_char_p(prompt.encode("utf-8")) image_path = ctypes.c_char_p(image_path.encode("utf-8")) - omni_vlm_cpp.omnivlm_inference(prompt, image_path) + return omni_vlm_cpp.omnivlm_inference(prompt, image_path) def __del__(self): omni_vlm_cpp.omnivlm_free() @@ -34,22 +35,30 @@ if __name__ == "__main__": ) parser.add_argument("--model", type=str, help="Path to the llm model file") parser.add_argument("--mmproj", type=str, help="Path to the mmproj file") + parser.add_argument("--omni-vlm-version", type=str, help="omni-vlm-version info ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')") # parser.add_argument("--prompt", type=str, help="prompt string.") # parser.add_argument("--image-path", type=str, help="Path to the image.") args = parser.parse_args() - omni_vlm_obj = NexaOmniVlmInference(args.model, args.mmproj) + print("DEBUG") + print(args.omni_vlm_version) + omni_vlm_obj = NexaOmniVlmInference(args.model, args.mmproj, args.omni_vlm_version) # omni_vlm_obj.inference(args.prompt, args.image_path) while True: - print("Input your prompt:") - prompt = input() - if prompt == "": - print("ERROR: you input an empty prompt, try again.") - continue + if args.omni_vlm_version != "vlm-81-ocr": + print("Input your prompt:") + prompt = input() + if prompt == "": + print("ERROR: you input an empty prompt, try again.") + continue + else: + prompt = "" print("Input your image path:") image_path = input() while not os.path.exists(image_path): print("ERROR: can not find image in your input path, please check and input agian.") image_path = input() - omni_vlm_obj.inference(prompt, image_path) + response = omni_vlm_obj.inference(prompt, image_path) + print("\tresponse:") + print(response.decode('utf-8')) diff --git a/examples/qwen2-audio/qwen2.cpp b/examples/qwen2-audio/qwen2.cpp index be7d74d6d..8a08a7ac6 100644 --- a/examples/qwen2-audio/qwen2.cpp +++ b/examples/qwen2-audio/qwen2.cpp @@ -18,10 +18,12 @@ #include #include #include +#include // // Constants // +void* internal_chars = nullptr; static const char *AUDIO_TOKEN = "<|AUDIO|>"; @@ -565,16 +567,16 @@ bool omni_params_parse(int argc, char **argv, omni_params ¶ms) static omni_params get_omni_params_from_context_params(omni_context_params ¶ms) { omni_params all_params; - + // Initialize gpt params all_params.gpt.n_gpu_layers = params.n_gpu_layers; all_params.gpt.model = params.model; all_params.gpt.prompt = params.prompt; - + // Initialize whisper params all_params.whisper.model = params.mmproj; all_params.whisper.fname_inp = {params.file}; - + if (all_params.gpt.n_threads <= 0) { all_params.gpt.n_threads = std::thread::hardware_concurrency(); @@ -703,6 +705,12 @@ struct omni_context *omni_init_context(omni_context_params ¶ms) void omni_free(struct omni_context *ctx_omni) { + + if(internal_chars != nullptr) + { + free(internal_chars); + internal_chars = nullptr; + } if (ctx_omni->ctx_whisper) { whisper_free(ctx_omni->ctx_whisper); @@ -710,12 +718,13 @@ void omni_free(struct 
omni_context *ctx_omni) } if (ctx_omni->projector) { - ctx_omni->projector->free(); + delete ctx_omni->projector; } llama_free(ctx_omni->ctx_llama); llama_free_model(ctx_omni->model); llama_backend_free(); + free(ctx_omni); } static bool omni_eval_audio_embed(llama_context *ctx_llama, ggml_tensor *audio_embed, int n_batch, int *n_past) @@ -755,6 +764,7 @@ static bool omni_eval_audio_embed(llama_context *ctx_llama, ggml_tensor *audio_e } *n_past += n_eval; } + free(audio_embed_data); return true; } @@ -792,7 +802,7 @@ ggml_tensor *omni_process_audio(struct omni_context *ctx_omni, omni_params ¶ return embed_proj; } -void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params ¶ms, const std::string &prompt) +const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params ¶ms, const std::string &prompt) { int n_past = 0; @@ -841,12 +851,11 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed for (int i = 0; i < max_tgt_len; i++) { const char * tmp = sample(ctx_sampling, ctx_omni->ctx_llama, &n_past); - response += tmp; if (strcmp(tmp, "") == 0) break; if (strstr(tmp, "###")) break; // Yi-VL behavior - printf("%s", tmp); + // printf("%s", tmp); if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) if (strstr(response.c_str(), "<|im_start|>")) @@ -855,16 +864,23 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed break; // mistral llava-1.6 fflush(stdout); + response += tmp; } llama_sampling_free(ctx_sampling); printf("\n"); + + if(internal_chars != nullptr) { free(internal_chars); } + internal_chars = malloc(sizeof(char)*(response.size()+1)); + strncpy((char*)(internal_chars), response.c_str(), response.size()); + ((char*)(internal_chars))[response.size()] = '\0'; + return (const char*)(internal_chars); } -void omni_process_full(struct omni_context *ctx_omni, omni_context_params ¶ms) +const char* omni_process_full(struct omni_context *ctx_omni, omni_context_params ¶ms) { omni_params all_params = get_omni_params_from_context_params(params); ggml_tensor *audio_embed = omni_process_audio(ctx_omni, all_params); - omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt); -} \ No newline at end of file + return omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt); +} diff --git a/examples/qwen2-audio/qwen2.h b/examples/qwen2-audio/qwen2.h index 5cbbd52ed..dcadb4288 100644 --- a/examples/qwen2-audio/qwen2.h +++ b/examples/qwen2-audio/qwen2.h @@ -54,11 +54,11 @@ OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params ¶m OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni); -OMNI_AUDIO_API void omni_process_full( +OMNI_AUDIO_API const char* omni_process_full( struct omni_context *ctx_omni, omni_context_params ¶ms ); #ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/examples/qwen2-audio/whisper.cpp b/examples/qwen2-audio/whisper.cpp index 6da9d268d..b2ce58475 100644 --- a/examples/qwen2-audio/whisper.cpp +++ b/examples/qwen2-audio/whisper.cpp @@ -9467,6 +9467,8 @@ static bool whisper_encoder_load(struct whisper_model_loader *loader, whisper_co wctx.t_load_us = ggml_time_us() - t_start_us; + gguf_free(gguf_ctx); + return true; } diff --git a/ggml_llama/src/vulkan-shaders/CMakeLists.txt b/ggml_llama/src/vulkan-shaders/CMakeLists.txt index bdb6038a1..10075db33 100644 --- 
a/ggml_llama/src/vulkan-shaders/CMakeLists.txt +++ b/ggml_llama/src/vulkan-shaders/CMakeLists.txt @@ -1,6 +1,6 @@ find_package (Threads REQUIRED) -set(TARGET llama-vulkan-shaders-gen) +set(TARGET vulkan-shaders-gen) add_executable(${TARGET} vulkan-shaders-gen.cpp) install(TARGETS ${TARGET} RUNTIME) target_compile_features(${TARGET} PRIVATE cxx_std_11)
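
Since the patch changes the omni-vlm wrapper API (`omnivlm_init` gains an `omni_vlm_version` argument and `omnivlm_inference` now returns the generated text instead of printing it), a short caller sketch may help reviewers. This is not part of the patch: the gguf paths, image path, and prompt below are placeholders, and only the three functions declared in `omni-vlm-wrapper.h` are used.

```cpp
// Sketch only: exercising the updated omni-vlm wrapper API from this patch.
// Paths and prompts are placeholders.
#include <cstdio>
#include <string>

#include "omni-vlm-wrapper.h"

int main() {
    // The version string must be one of "vlm-81-ocr", "vlm-81-instruct",
    // "nano-vlm-instruct"; an unknown value makes omnivlm_init() throw.
    omnivlm_init("nano-vlm-instruct.gguf", "mmproj.gguf", "nano-vlm-instruct");

    // The returned pointer aliases an internal buffer that is reused on the
    // next call and released by omnivlm_free(), so copy the text if you need
    // to keep it across calls.
    std::string first  = omnivlm_inference("Describe this image for me", "cat.png");
    std::string second = omnivlm_inference("What colors do you see?", "cat.png");

    printf("%s\n%s\n", first.c_str(), second.c_str());

    omnivlm_free();
    return 0;
}
```

For the "vlm-81-ocr" version the wrapper substitutes its own OCR prompt template, so the `prompt` argument is effectively ignored in that mode.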
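
The audio examples change in the same way: `omni_process_full` in `examples/nexa-omni-audio/omni.h` and `examples/qwen2-audio/qwen2.h` now returns a `const char*` instead of printing. The sketch below is an assumption-laden illustration, not patch code: the field names come from `get_omni_params_from_context_params` in this diff, the values are placeholders, and `omni_context_params` may require additional initialization not shown in the hunks.

```cpp
// Sketch only: consuming the new const char* return value of
// omni_process_full(). Field values are placeholders; the struct may have
// more members than the ones set here.
#include <cstdio>

#include "omni.h"  // qwen2.h exposes the same API in this patch

int main() {
    omni_context_params params = {};
    params.model        = "omni-audio-llm.gguf";      // placeholder path
    params.mmproj       = "omni-audio-encoder.gguf";  // placeholder path
    params.file         = "sample.wav";               // placeholder path
    params.prompt       = "Transcribe this audio.";
    params.n_gpu_layers = 0;

    omni_context *ctx = omni_init_context(params);
    if (!ctx) return 1;

    // The string lives in a buffer owned by the library; it is reused on the
    // next generation and released inside omni_free().
    const char *response = omni_process_full(ctx, params);
    printf("%s\n", response ? response : "(null)");

    omni_free(ctx);
    return 0;
}
```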