Merge remote-tracking branch 'origin' into zack/vlm
Commit bbf1aaa7ed
25 changed files with 258 additions and 316 deletions
.github/CODEOWNERS (vendored, new file, 1 addition)
@@ -0,0 +1 @@
+@zhiyuan8 @alexchen4ai
@@ -150,6 +150,7 @@ bool load_hparams_and_tensors_from_gguf(const std::string &fname, NexaBaseModel
    }

    ggml_free(meta);
+   gguf_free(ctx_gguf);
    return true;
}
@@ -1442,6 +1442,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
    // End of Parse args for logging parameters
#endif // LOG_DISABLE_LOGS

+   if (arg == "--omni-vlm-version") {
+       CHECK_ARG
+       params.omni_vlm_version = argv[i];
+       return true;
+   }
    return false;
}
@@ -1688,6 +1693,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
        "layer range to apply the control vector(s) to, start and end inclusive" });
    options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
        "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
+   options.push_back({ "*", " --omni-vlm-version VERSION_STRING", "omni vlm string version(one of 'vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n" "(default: 'vlm-81-ocr')"});
    options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
    options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
    options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
@@ -265,6 +265,8 @@ struct gpt_params {
    bool spm_infill = false; // suffix/prefix/middle pattern for infill

    std::string lora_outfile = "ggml-lora-merged-f16.gguf";
+
+   std::string omni_vlm_version = "vlm-81-ocr";
};

void gpt_params_parse_from_env(gpt_params & params);
@@ -72,19 +72,14 @@ class MainActivity(

    val models = listOf(
        Downloadable(
-           "Phi-2 7B (Q4_0, 1.6 GiB)",
-           Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
-           File(extFilesDir, "phi-2-q4_0.gguf"),
+           "Llama3.2-1B-Instruct (Q4_0, 735 MB)",
+           Uri.parse("https://public-storage.nexa4ai.com/Llama3.2-1B-Instruct/q4_0.gguf"),
+           File(extFilesDir, "Llama3.2-1B-Instruct-q4_0.gguf"),
        ),
-       Downloadable(
-           "TinyLlama 1.1B (f16, 2.2 GiB)",
-           Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
-           File(extFilesDir, "tinyllama-1.1-f16.gguf"),
-       ),
        Downloadable(
-           "Phi 2 DPO (Q3_K_M, 1.48 GiB)",
-           Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
-           File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
+           "octopus",
+           Uri.parse("https://public-storage.nexa4ai.com/Octopus-v2/q4_0.gguf"),
+           File(extFilesDir, "octopus-q4_0.gguf")
        ),
    )
@@ -33,6 +33,7 @@ project("llama-android")

#load local llama.cpp
add_subdirectory(../../../../../../ build-llama)
+add_subdirectory(../../../../../../examples/llava build-llava)

# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
@@ -50,4 +51,5 @@ target_link_libraries(${CMAKE_PROJECT_NAME}
        llama
        common
        android
-       log)
+       log
+       llava)
@@ -6,6 +6,7 @@
#include <unistd.h>
#include "llama.h"
#include "common.h"
+#include "llava.h"

// Write C++ code here.
//
@@ -36,7 +36,7 @@ class LLamaAndroid {
        }
    }.asCoroutineDispatcher()

-   private val nlen: Int = 64
+   private val nlen: Int = 256

    private external fun log_to_android()
    private external fun load_model(filename: String): Long
@@ -23,6 +23,8 @@
// Constants
//

+void* internal_chars = nullptr;
+
static const char *AUDIO_TOKEN = "<|AUDIO|>";

//
@@ -703,6 +705,10 @@ struct omni_context *omni_init_context(omni_context_params &params)

void omni_free(struct omni_context *ctx_omni)
{
+   if(internal_chars != nullptr)
+   {
+       free(internal_chars);
+   }
    if (ctx_omni->ctx_whisper)
    {
        whisper_free(ctx_omni->ctx_whisper);
@@ -792,7 +798,7 @@ ggml_tensor *omni_process_audio(struct omni_context *ctx_omni, omni_params &params)
    return embed_proj;
}

-void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
+const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
{
    int n_past = 0;
@@ -833,12 +839,11 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
    for (int i = 0; i < max_tgt_len; i++)
    {
        const char * tmp = sample(ctx_sampling, ctx_omni->ctx_llama, &n_past);
-       response += tmp;
        if (strcmp(tmp, "</s>") == 0)
            break;
        if (strstr(tmp, "###"))
            break; // Yi-VL behavior
-       printf("%s", tmp);
+       // printf("%s", tmp);
        if (strstr(response.c_str(), "<|im_end|>"))
            break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
        if (strstr(response.c_str(), "<|im_start|>"))
@@ -847,16 +852,22 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
            break; // mistral llava-1.6

        fflush(stdout);
+       response += tmp;
    }

    llama_sampling_free(ctx_sampling);
    printf("\n");
+   if(internal_chars != nullptr) { free(internal_chars); }
+   internal_chars = malloc(sizeof(char)*(response.size()+1));
+   strncpy((char*)(internal_chars), response.c_str(), response.size());
+   ((char*)(internal_chars))[response.size()] = '\0';
+   return (const char*)(internal_chars);
}

-void omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
+const char* omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
{
    omni_params all_params = get_omni_params_from_context_params(params);

    ggml_tensor *audio_embed = omni_process_audio(ctx_omni, all_params);
-   omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
+   return omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
}
@@ -54,7 +54,7 @@ OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params &params);

OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni);

-OMNI_AUDIO_API void omni_process_full(
+OMNI_AUDIO_API const char* omni_process_full(
    struct omni_context *ctx_omni,
    omni_context_params &params
);
@@ -1,22 +1,30 @@
# omni-vlm

-Currently this implementation supports [omni-vlm](https://huggingface.co/NexaAIDev/nano-vlm-instruct) variants,
+Currently this implementation supports:

-After API is confirmed, more models will be supported / uploaded.
+* [nano-vlm-instruct](https://huggingface.co/NexaAIDev/nano-vlm-instruct/tree/main) ([gguf](https://huggingface.co/NexaAIDev/nano-vlm-instruct-gguf/tree/main))
+* [vlm-81-ocr](https://huggingface.co/NexaAIDev/vlm-81-ocr/tree/main) ([gguf](https://huggingface.co/NexaAIDev/vlm-81-ocr-gguf/tree/main))
+* [vlm-81-instruct](https://huggingface.co/NexaAIDev/vlm-81-instruct/tree/main) ([gguf](https://huggingface.co/NexaAIDev/vlm-81-instruct-gguf/tree/main))
+
+After API is stable, more models will be supported.

## Usage
-Build with cmake in the `llama-cpp-experiments` folder:
-```bash
+
+Build with cmake in the `llama.cpp` folder:
+
+```console
cmake -S . -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
cmake --build build --verbose -j
```

After building, run: `./omni-vlm-cli` to see the usage. For example:

-```bash
+```console
./omni-vlm-cli \
-    -m Nano-Llm-494M-F16.gguf \
-    --mmproj mmproj-omni-vlm-f16.gguf \
-    --image example/omni-vlm/cat.png
+    -m <llm-F16.gguf> \
+    --mmproj <mmproj-F16.gguf> \
+    --image example/omni-vlm/cat.png \
+    --omni-vlm-version <vlm-81-ocr | vlm-81-instruct | nano-vlm-instruct>
```

See next section to convert gguf files from original safetensors.
@@ -27,6 +35,7 @@ See next section to convert gguf files from original safetensors.
)

## Omni-vlm gguf conversion

1) First clone omni-vlm model:
```console
git clone https://huggingface.co/NexaAIDev/nano-vlm-instruct
@@ -34,7 +43,7 @@ git clone https://huggingface.co/NexaAIDev/nano-vlm-instruct

2) Install the required Python packages:

-```sh
+```console
pip install -r examples/omni-vlm/requirements.txt
```
@@ -104,6 +113,5 @@ After successfully compiling omni_vlm_wrapper_shared dynamic library, run:
python omni_vlm_demo.py \
    --model <PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf \
    --mmproj <PATH TO nano-vlm-instruct>/mmproj-omni-vlm-f16.gguf \
-    --prompt="Describe this image for me" \
-    --image-path cat.png
+    --omni-vlm-version <vlm-81-ocr | vlm-81-instruct | nano-vlm-instruct>
```
@@ -6,6 +6,7 @@
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
+#include "common.h"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
@@ -167,7 +168,11 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_RESAMPLER, "resampler"},
};

+enum omni_vlm_version_type {
+    VLM_81_OCR,
+    VLM_81_INSTRUCT,
+    NANO_VLM_INSTRUCT,
+};
//
// utilities to get data from a gguf file
//
@@ -294,115 +299,6 @@ static projector_type clip_projector_type_from_string(const std::string & name)
    return PROJECTOR_TYPE_UNKNOWN;
}

-#ifdef CLIP_DEBUG_FUNCTIONS
-static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
-    std::ofstream file(filename, std::ios::binary);
-    if (!file.is_open()) {
-        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
-        return;
-    }
-
-    // PPM header: P6 format, width, height, and max color value
-    file << "P6\n" << img.nx << " " << img.ny << "\n255\n";
-
-    // Write pixel data
-    for (size_t i = 0; i < img.buf.size(); i += 3) {
-        // PPM expects binary data in RGB format, which matches our image buffer
-        file.write(reinterpret_cast<const char*>(&img.buf[i]), 3);
-    }
-
-    file.close();
-}
-
-static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
-    std::ofstream file(filename, std::ios::binary);
-    if (!file.is_open()) {
-        LOG_ERR("Failed to open file for writing: %s\n", filename.c_str());
-        return;
-    }
-
-    int fileSize = 54 + 3 * img.nx * img.ny; // File header + info header + pixel data
-    int bytesPerPixel = 3;
-    int widthInBytes = img.nx * bytesPerPixel;
-    int paddingAmount = (4 - (widthInBytes % 4)) % 4;
-    int stride = widthInBytes + paddingAmount;
-
-    // Bitmap file header
-    unsigned char fileHeader[14] = {
-        'B','M',     // Signature
-        0,0,0,0,     // Image file size in bytes
-        0,0,0,0,     // Reserved
-        54,0,0,0     // Start of pixel array
-    };
-
-    // Total file size
-    fileSize = 54 + (stride * img.ny);
-    fileHeader[2] = (unsigned char)(fileSize);
-    fileHeader[3] = (unsigned char)(fileSize >> 8);
-    fileHeader[4] = (unsigned char)(fileSize >> 16);
-    fileHeader[5] = (unsigned char)(fileSize >> 24);
-
-    // Bitmap information header (BITMAPINFOHEADER)
-    unsigned char infoHeader[40] = {
-        40,0,0,0,    // Size of this header (40 bytes)
-        0,0,0,0,     // Image width
-        0,0,0,0,     // Image height
-        1,0,         // Number of color planes
-        24,0,        // Bits per pixel
-        0,0,0,0,     // No compression
-        0,0,0,0,     // Image size (can be 0 for no compression)
-        0,0,0,0,     // X pixels per meter (not specified)
-        0,0,0,0,     // Y pixels per meter (not specified)
-        0,0,0,0,     // Total colors (color table not used)
-        0,0,0,0      // Important colors (all are important)
-    };
-
-    // Width and height in the information header
-    infoHeader[4] = (unsigned char)(img.nx);
-    infoHeader[5] = (unsigned char)(img.nx >> 8);
-    infoHeader[6] = (unsigned char)(img.nx >> 16);
-    infoHeader[7] = (unsigned char)(img.nx >> 24);
-    infoHeader[8] = (unsigned char)(img.ny);
-    infoHeader[9] = (unsigned char)(img.ny >> 8);
-    infoHeader[10] = (unsigned char)(img.ny >> 16);
-    infoHeader[11] = (unsigned char)(img.ny >> 24);
-
-    // Write file headers
-    file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
-    file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));
-
-    // Pixel data
-    std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
-    for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
-        for (int x = 0; x < img.nx; ++x) {
-            // Each pixel
-            size_t pixelIndex = (y * img.nx + x) * 3;
-            unsigned char pixel[3] = {
-                img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
-                img.buf[pixelIndex + 1],
-                img.buf[pixelIndex]
-            };
-            file.write(reinterpret_cast<char*>(pixel), 3);
-        }
-        // Write padding for the row
-        file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
-    }
-
-    file.close();
-}
-
-// debug function to convert f32 to u8
-static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
-    dst.nx = src.nx;
-    dst.ny = src.ny;
-    dst.buf.resize(3 * src.nx * src.ny);
-    for (size_t i = 0; i < src.buf.size(); ++i) {
-        dst.buf[i] = static_cast<uint8_t>(std::min(std::max(int(src.buf[i] * 255.0f), 0), 255));
-    }
-}
-#endif

//
// clip layers
//
@@ -564,6 +460,7 @@ struct clip_ctx {

    struct clip_vision_model vision_model;
    projector_type proj_type = PROJECTOR_TYPE_MLP;
+   omni_vlm_version_type omni_vlm_ver_type;

    float image_mean[3];
    float image_std[3];
@@ -785,6 +682,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
    }

+   if(ctx->omni_vlm_ver_type == omni_vlm_version_type::VLM_81_OCR || ctx->omni_vlm_ver_type == omni_vlm_version_type::VLM_81_INSTRUCT) {
+       embeddings = ggml_reshape_3d(ctx0, embeddings, embeddings->ne[0]*9, embeddings->ne[1]/9, 1);
+   }
+
    embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
    embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);

@@ -800,7 +701,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}

// read and create ggml_context containing the tensors and their data
-struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+struct clip_ctx * clip_model_load(const char * fname, const char * omni_vlm_version, const int verbosity = 1) {
    struct ggml_context * meta = NULL;

    struct gguf_init_params params = {
@@ -895,6 +796,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    }

    clip_ctx * new_clip = new clip_ctx{};
+   if (std::string(omni_vlm_version) == "vlm-81-ocr") {
+       new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
+   } else if (std::string(omni_vlm_version) == "vlm-81-instruct") {
+       new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
+   } else if (std::string(omni_vlm_version) == "nano-vlm-instruct") {
+       new_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
+   } else {
+       throw std::runtime_error(std::string("error vlm version info: ") + omni_vlm_version);
+   }

    // update projector type
    {
@@ -1308,6 +1218,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    return new_clip;
}

+// void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params) {
+//     if (params->omni_vlm_version == "vlm-81-ocr") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
+//     } else if (params->omni_vlm_version == "vlm-81-instruct") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
+//     } else if (params->omni_vlm_version == "nano-vlm-instruct") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
+//     } else {
+//         throw std::runtime_error(std::string("error vlm version info: ") + params->omni_vlm_version);
+//     }
+// }
+
void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
    ctx_clip->load_image_size = load_image_size;
}
@@ -2294,13 +2216,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    return true;
}

-bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
+bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype, const char* omni_vlm_version) {
    ggml_type type = GGML_TYPE_Q4_1;

    assert(itype < GGML_TYPE_COUNT);
    type = static_cast<ggml_type>(itype);

-   auto * ctx_clip = clip_model_load(fname_inp, 2);
+   auto * ctx_clip = clip_model_load(fname_inp, omni_vlm_version, 2);

    const auto & ctx_src = ctx_clip->ctx_gguf;
    const auto & ctx_data = ctx_clip->ctx_data;
@@ -39,9 +39,12 @@ struct clip_image_f32_batch {
    size_t size;
};

-CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
+CLIP_API struct clip_ctx * clip_model_load (const char * fname, const char * omni_vlm_version, int verbosity);
CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);

+// struct gpt_params;
+// CLIP_API void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params);
+
CLIP_API void clip_free(struct clip_ctx * ctx);

CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);

@@ -83,7 +86,7 @@ CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ct
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);

-CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
+CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype, const char * omni_vlm_version);

CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
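A minimal sketch (not taken from the diff itself) of calling the updated `clip_model_load` signature, mirroring the call sites changed elsewhere in this commit; the mmproj path below is a placeholder:

```cpp
// Sketch: load the CLIP/mmproj model for a given omni-vlm version.
// "mmproj-F16.gguf" is a placeholder path; valid version strings are
// "vlm-81-ocr", "vlm-81-instruct" and "nano-vlm-instruct" (anything else throws).
struct clip_ctx * ctx_clip = clip_model_load("mmproj-F16.gguf", "nano-vlm-instruct", /*verbosity=*/1);
// ... use ctx_clip ...
clip_free(ctx_clip);
```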
examples/omni-vlm/latex.png (new binary file, not shown; 5.8 KiB)
@@ -12,6 +12,10 @@
#include <cstdlib>
#include <cstring>
#include <vector>
+// #include <iostream>
+//
+// using std::cout;
+// using std::endl;

static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
    int N = (int) tokens.size();
@@ -149,7 +153,7 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
            LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str());
        }
    }
-   LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
+   // LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
    if (params->verbose_prompt) {
        auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, user_prompt, true, true);
        for (int i = 0; i < (int) tmp.size(); i++) {
@@ -165,6 +169,9 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima

    LOG("\n");

+   params->sparams.temp = 0.0f;
+   params->sparams.top_k = 1;
+   params->sparams.top_p = 1.0f;
    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
    if (!ctx_sampling) {
        LOG_TEE("%s: failed to initialize sampling subsystem\n", __func__);
@@ -177,8 +184,8 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
        response += tmp;
        if (strcmp(tmp, "<|im_end|>") == 0) break;
        if (strcmp(tmp, "</s>") == 0) break;
        // if (strstr(tmp, "###")) break; // Yi-VL behavior
-       printf("%s", tmp);
+       // LOG("%s", tmp);
        // if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
        // if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
        // if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@@ -212,8 +219,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
        prompt = "describe the image in detail.";
    }

-   auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10);
-
+   auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0);
+   // clip_set_omni_vlm_version(ctx_clip, params);

    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
    ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@@ -249,9 +256,6 @@ int main(int argc, char ** argv) {

    gpt_params params;

-   // if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
-   //     return 1;
-   // }
    if (!gpt_params_parse(argc, argv, params)) {
        print_usage(argc, argv, params);
        return 1;
@@ -261,8 +265,21 @@ int main(int argc, char ** argv) {
        print_usage(argc, argv, {});
        return 1;
    }
+   if (params.omni_vlm_version != "vlm-81-ocr" && params.prompt.empty()) {
+       LOG_TEE("%s : prompt is empty.\n", __func__);
+       print_usage(argc, argv, {});
+       return 1;
+   }

-   params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nDescribe this image for me\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
+   if (params.omni_vlm_version == "vlm-81-ocr") {
+       params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|vision_start|><|image_pad|><|vision_end|><|im_end|>";
+   } else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") {
+       params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
+   } else {
+       LOG_TEE("%s : error: you set wrong vlm version info:'%s'.\n", __func__, params.omni_vlm_version.c_str());
+       print_usage(argc, argv, {});
+       return 1;
+   }

    auto * model = omnivlm_init(&params);
    if (model == NULL) {
@@ -271,8 +288,8 @@ int main(int argc, char ** argv) {
    }


-   auto * ctx_omnivlm = omnivlm_init_context(&params, model);
    for (auto & image : params.image) {
+       auto * ctx_omnivlm = omnivlm_init_context(&params, model);
        auto * image_embed = load_image(ctx_omnivlm, &params, image);
        if (!image_embed) {
            LOG_TEE("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
@@ -283,9 +300,9 @@ int main(int argc, char ** argv) {

        llama_print_timings(ctx_omnivlm->ctx_llama);
        omnivlm_image_embed_free(image_embed);
-   }
        ctx_omnivlm->model = NULL;
        omnivlm_free(ctx_omnivlm);
+   }

    llama_free_model(model);
@@ -1,15 +1,24 @@
// WARNING: this .cpp file is only for debugging. do not user directly.
#include "omni-vlm-wrapper.h"
+#include <iostream>

+using std::cout;
+using std::endl;

int main(int argc, char ** argv) {
-   const char* llm_model = "<path to llm gguf.>";
-   const char* mmproj_model = "<path to mm projector gguf>";
-   const char* image_path = "<path where image is located.>";
+   const char* llm_model = "";
+   const char* mmproj_model = "";
+   const char* image_path = "";
    const char* prompt = "";

-   omnivlm_init(llm_model, mmproj_model);
-   omnivlm_inference(prompt, image_path);
-   omnivlm_inference(prompt, image_path);
+   omnivlm_init(llm_model, mmproj_model, "vlm-81-ocr");
+
+   const char* res;
+   res = omnivlm_inference(prompt, image_path);
+   cout << "RES: " << res << endl;
+   res = omnivlm_inference(prompt, image_path);
+   cout << "RES: " << res << endl;
    omnivlm_free();

    return 0;
@@ -24,6 +24,8 @@ struct omnivlm_context {
    struct llama_model * model = NULL;
};

+void* internal_chars = nullptr;
+
static struct gpt_params params;
static struct llama_model* model;
static struct omnivlm_context* ctx_omnivlm;
@@ -63,7 +65,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
        prompt = "describe the image in detail.";
    }

-   auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 10);
+   auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0);
+   // clip_set_omni_vlm_version(ctx_clip, params);


    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
@@ -128,19 +131,19 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
    return ret.c_str();
}

-static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
+static const char* process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
    int n_past = 0;

    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;

-   std::string full_prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" \
-       + prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
-   size_t image_pos = full_prompt.find("<|image_pad|>");
+   // std::string full_prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" \
+   //     + prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
+   size_t image_pos = params->prompt.find("<|image_pad|>");
    std::string system_prompt, user_prompt;

    // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
-   system_prompt = full_prompt.substr(0, image_pos);
-   user_prompt = full_prompt.substr(image_pos + std::string("<|image_pad|>").length());
+   system_prompt = params->prompt.substr(0, image_pos);
+   user_prompt = params->prompt.substr(image_pos + std::string("<|image_pad|>").length());
    if (params->verbose_prompt) {
        auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, system_prompt, true, true);
        for (int i = 0; i < (int) tmp.size(); i++) {
@@ -155,6 +158,9 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
        }
    }

+   params->sparams.top_k = 1;
+   params->sparams.top_p = 1.0f;
+
    eval_string(ctx_omnivlm->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
    omnivlm_eval_image_embed(ctx_omnivlm->ctx_llama, image_embed, params->n_batch, &n_past);
    eval_string(ctx_omnivlm->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
@@ -172,11 +178,11 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
    std::string response = "";
    for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(ctx_sampling, ctx_omnivlm->ctx_llama, &n_past);
-       response += tmp;
        if (strcmp(tmp, "<|im_end|>") == 0) break;
        if (strcmp(tmp, "</s>") == 0) break;
        // if (strstr(tmp, "###")) break; // Yi-VL behavior
-       printf("%s", tmp);
+       // printf("%s", tmp);
+       response += tmp;
        // if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
        // if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
        // if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@@ -186,6 +192,13 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima

    llama_sampling_free(ctx_sampling);
    printf("\n");
+
+   // const char* ret_char_ptr = (const char*)(malloc(sizeof(char)*response.size()));
+   if(internal_chars != nullptr) { free(internal_chars); }
+   internal_chars = malloc(sizeof(char)*(response.size()+1));
+   strncpy((char*)(internal_chars), response.c_str(), response.size());
+   ((char*)(internal_chars))[response.size()] = '\0';
+   return (const char*)(internal_chars);
}

static void omnivlm_free(struct omnivlm_context * ctx_omnivlm) {
@@ -208,8 +221,8 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
}

// inference interface definition
-void omnivlm_init(const char* llm_model_path, const char* projector_model_path) {
-   const char* argv = "hello-omni-vlm-wrapper-cli";
+void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version) {
+   const char* argv = "omni-wrapper-py";
    char* nc_argv = const_cast<char*>(argv);
    if (!gpt_params_parse(1, &nc_argv, params)) {
        print_usage(1, &nc_argv, {});
@@ -217,31 +230,60 @@ void omnivlm_init(const char* llm_model_path, const char* projector_model_path)
    }
    params.model = llm_model_path;
    params.mmproj = projector_model_path;
+   params.omni_vlm_version = omni_vlm_version;
+
+   std::string omni_vlm_ver = params.omni_vlm_version;
+   if(omni_vlm_ver != "vlm-81-ocr" && omni_vlm_ver != "vlm-81-instruct" && omni_vlm_ver != "nano-vlm-instruct") {
+       fprintf(stderr, "%s: error: you set wrong omni_vlm_string: %s\n", __func__, omni_vlm_version);
+       fprintf(stderr, "%s: Valid omni_vlm_version set is ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n", __func__);
+       throw std::runtime_error("You set wrong vlm_version info strings.");
+   }
+
    model = omnivlm_init(&params);
    if (model == nullptr) {
        fprintf(stderr, "%s: error: failed to init omnivlm model\n", __func__);
        throw std::runtime_error("Failed to init omnivlm model");
    }
-   ctx_omnivlm = omnivlm_init_context(&params, model);
}

-void omnivlm_inference(const char *prompt, const char *imag_path) {
+const char* omnivlm_inference(const char *prompt, const char *imag_path) {
+   ctx_omnivlm = omnivlm_init_context(&params, model);
+
    std::string image = imag_path;
    params.prompt = prompt;

+   if (params.omni_vlm_version == "vlm-81-ocr") {
+       params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|ocr_start|><|vision_start|><|image_pad|><|vision_end|><|ocr_end|><|im_end|>";
+   } else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") {
+       params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
+   } else {
+       LOG_TEE("%s : error: you set wrong vlm version info:'%s'.\n", __func__, params.omni_vlm_version.c_str());
+       throw std::runtime_error("You set wrong vlm_version info strings.");
+   }
+
    auto * image_embed = load_image(ctx_omnivlm, &params, image);
    if (!image_embed) {
        LOG_TEE("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
        throw std::runtime_error("failed to load image " + image);
    }
    // process the prompt
-   process_prompt(ctx_omnivlm, image_embed, &params, params.prompt);
+   const char* ret_chars = process_prompt(ctx_omnivlm, image_embed, &params, params.prompt);

    // llama_perf_print(ctx_omnivlm->ctx_llama, LLAMA_PERF_TYPE_CONTEXT);
    omnivlm_image_embed_free(image_embed);
+   ctx_omnivlm->model = nullptr;
+   omnivlm_free(ctx_omnivlm);
+   ctx_omnivlm = nullptr;
+
+   return ret_chars;
}

void omnivlm_free() {
-   ctx_omnivlm->model = NULL;
+   if(internal_chars != nullptr) { free(internal_chars); }
+   if(ctx_omnivlm != nullptr) {
+       // this snipet should never be run!
+       ctx_omnivlm->model = nullptr;
+       omnivlm_free(ctx_omnivlm);
+   }
    llama_free_model(model);
}
@@ -20,9 +20,9 @@
extern "C" {
#endif

-OMNIVLM_API void omnivlm_init(const char* llm_model_path, const char* projector_model_path);
+OMNIVLM_API void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version);

-OMNIVLM_API void omnivlm_inference(const char* prompt, const char* imag_path);
+OMNIVLM_API const char* omnivlm_inference(const char* prompt, const char* imag_path);

OMNIVLM_API void omnivlm_free();
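A minimal usage sketch of the updated wrapper API, mirroring the debugging CLI earlier in this diff (assumptions: the model paths are placeholders; the returned string is owned by the wrapper and is released on the next inference call or on `omnivlm_free`, so copy it if it must persist):

```cpp
#include <cstdio>
#include "omni-vlm-wrapper.h"

int main() {
    // Placeholder paths; the version must be "vlm-81-ocr", "vlm-81-instruct" or "nano-vlm-instruct".
    omnivlm_init("<llm-F16.gguf>", "<mmproj-F16.gguf>", "nano-vlm-instruct");

    // omnivlm_inference() now returns the generated text instead of only printing it.
    const char* res = omnivlm_inference("Describe this image for me", "cat.png");
    printf("RES: %s\n", res);

    omnivlm_free();
    return 0;
}
```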
@@ -258,111 +258,6 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli

    *n_img_pos = clip_n_patches(ctx_clip);
    bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
-   // cout << "\t\t A NICE START" << endl;
-   // cout << "\t\t" << *n_img_pos << endl;
-   /*
-   if (clip_is_minicpmv(ctx_clip)) {
-       std::vector<float *> image_embd_v;
-       image_embd_v.resize(img_res_v.size);
-       struct clip_image_size * load_image_size = clip_image_size_init();
-       for (size_t i = 0; i < img_res_v.size; i++) {
-           const int64_t t_img_enc_step_start_us = ggml_time_us();
-           image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
-           int patch_size=14;
-           load_image_size->width = img_res_v.data[i].nx;
-           load_image_size->height = img_res_v.data[i].ny;
-           clip_add_load_image_size(ctx_clip, load_image_size);
-           bool encoded = false;
-           int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
-           if (has_minicpmv_projector == 2) {
-               encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
-           }
-           else if (has_minicpmv_projector == 3) {
-               encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
-           }
-           if (!encoded) {
-               LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
-               return false;
-           }
-           const int64_t t_img_enc_steop_batch_us = ggml_time_us();
-           LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
-       }
-       const int64_t t_img_enc_batch_us = ggml_time_us();
-       LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
-
-       int n_img_pos_out = 0;
-       for (size_t i = 0; i < image_embd_v.size(); i++) {
-           std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
-           n_img_pos_out += clip_n_patches(ctx_clip);
-       }
-       *n_img_pos = n_img_pos_out;
-       for (size_t i = 0; i < image_embd_v.size(); i++) {
-           free(image_embd_v[i]);
-       }
-       image_embd_v.clear();
-       load_image_size->width = img->nx;
-       load_image_size->height = img->ny;
-       clip_add_load_image_size(ctx_clip, load_image_size);
-       LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
-   }
-   else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
-       // flat / default llava-1.5 type embedding
-       *n_img_pos = clip_n_patches(ctx_clip);
-       bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
-       delete[] img_res_v.data;
-       if (!encoded) {
-           LOG_ERR("Unable to encode image\n");
-
-           return false;
-       }
-   }
-   else {
-       // spatial_unpad llava-1.6 type embedding
-       // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
-       std::vector<float *> image_embd_v;
-       image_embd_v.resize(img_res_v.size);
-       for (size_t i = 0; i < img_res_v.size; i++) {
-           image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
-           const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
-           if (!encoded) {
-               LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
-               return false;
-           }
-       }
-       const int64_t t_img_enc_batch_us = ggml_time_us();
-       LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
-
-       const int32_t * image_grid = clip_image_grid(ctx_clip);
-
-       std::vector<std::pair<int, int>> grid_pinpoints;
-       for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
-           grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
-       }
-
-       // free all img_res_v - not needed anymore
-       delete[] img_res_v.data;
-       img_res_v.size = 0;
-       img_res_v.data = nullptr;
-
-       const int32_t image_size = clip_image_size(ctx_clip);
-
-       struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
-
-       int n_img_pos_out;
-       clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
-       *n_img_pos = n_img_pos_out;
-
-       for (size_t i = 0; i < image_embd_v.size(); i++) {
-           free(image_embd_v[i]);
-       }
-       image_embd_v.clear();
-
-       // debug image/segment/normalization content:
-       // clip_image_u8 * tmp = clip_image_u8_init();
-       // clip_image_convert_f32_to_u8(*image_feature, *tmp);
-       // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
-   }
-   */

    LOG("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
@@ -60,11 +60,11 @@ _lib = _load_shared_library(_lib_base_name, base_path)
omni_char_p = ctypes.c_char_p


-def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p):
-    return _lib.omnivlm_init(llm_model_path, mmproj_model_path)
+def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p, vlm_version: omni_char_p):
+    return _lib.omnivlm_init(llm_model_path, mmproj_model_path, vlm_version)


-_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p]
+_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p, omni_char_p]
_lib.omnivlm_init.restype = None


@@ -73,7 +73,7 @@ def omnivlm_inference(prompt: omni_char_p, image_path: omni_char_p):


_lib.omnivlm_inference.argtypes = [omni_char_p, omni_char_p]
-_lib.omnivlm_inference.restype = None
+_lib.omnivlm_inference.restype = omni_char_p


def omnivlm_free():
@@ -11,16 +11,17 @@ class NexaOmniVlmInference:
    A class used for vision language model inference.
    """

-   def __init__(self, llm_model_path: str, mmproj_model_path: str):
+   def __init__(self, llm_model_path: str, mmproj_model_path: str, omni_vlm_version: str):
        self.llm_model = ctypes.c_char_p(llm_model_path.encode("utf-8"))
        self.mmproj_model = ctypes.c_char_p(mmproj_model_path.encode("utf-8"))
+       self.omni_vlm_version = ctypes.c_char_p(omni_vlm_version.encode("utf-8"))

-       omni_vlm_cpp.omnivlm_init(self.llm_model, self.mmproj_model)
+       omni_vlm_cpp.omnivlm_init(self.llm_model, self.mmproj_model, self.omni_vlm_version)

    def inference(self, prompt: str, image_path: str):
        prompt = ctypes.c_char_p(prompt.encode("utf-8"))
        image_path = ctypes.c_char_p(image_path.encode("utf-8"))
-       omni_vlm_cpp.omnivlm_inference(prompt, image_path)
+       return omni_vlm_cpp.omnivlm_inference(prompt, image_path)

    def __del__(self):
        omni_vlm_cpp.omnivlm_free()

@@ -34,22 +35,30 @@ if __name__ == "__main__":
    )
    parser.add_argument("--model", type=str, help="Path to the llm model file")
    parser.add_argument("--mmproj", type=str, help="Path to the mmproj file")
+   parser.add_argument("--omni-vlm-version", type=str, help="omni-vlm-version info ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')")
    # parser.add_argument("--prompt", type=str, help="prompt string.")
    # parser.add_argument("--image-path", type=str, help="Path to the image.")

    args = parser.parse_args()

-   omni_vlm_obj = NexaOmniVlmInference(args.model, args.mmproj)
+   print("DEBUG")
+   print(args.omni_vlm_version)
+   omni_vlm_obj = NexaOmniVlmInference(args.model, args.mmproj, args.omni_vlm_version)
    # omni_vlm_obj.inference(args.prompt, args.image_path)
    while True:
+       if args.omni_vlm_version != "vlm-81-ocr":
            print("Input your prompt:")
            prompt = input()
            if prompt == "":
                print("ERROR: you input an empty prompt, try again.")
                continue
+       else:
+           prompt = ""
        print("Input your image path:")
        image_path = input()
        while not os.path.exists(image_path):
            print("ERROR: can not find image in your input path, please check and input agian.")
            image_path = input()
-       omni_vlm_obj.inference(prompt, image_path)
+       response = omni_vlm_obj.inference(prompt, image_path)
+       print("\tresponse:")
+       print(response.decode('utf-8'))
@@ -18,10 +18,12 @@
#include <thread>
#include <vector>
#include <cstring>
+#include <iostream>

//
// Constants
//
+void* internal_chars = nullptr;

static const char *AUDIO_TOKEN = "<|AUDIO|>";

@@ -703,6 +705,12 @@ struct omni_context *omni_init_context(omni_context_params &params)

void omni_free(struct omni_context *ctx_omni)
{
+
+   if(internal_chars != nullptr)
+   {
+       free(internal_chars);
+       internal_chars = nullptr;
+   }
    if (ctx_omni->ctx_whisper)
    {
        whisper_free(ctx_omni->ctx_whisper);
@@ -710,12 +718,13 @@ void omni_free(struct omni_context *ctx_omni)
    }
    if (ctx_omni->projector)
    {
        ctx_omni->projector->free();
+       delete ctx_omni->projector;
    }

    llama_free(ctx_omni->ctx_llama);
    llama_free_model(ctx_omni->model);
    llama_backend_free();
    free(ctx_omni);
}

static bool omni_eval_audio_embed(llama_context *ctx_llama, ggml_tensor *audio_embed, int n_batch, int *n_past)
@@ -755,6 +764,7 @@ static bool omni_eval_audio_embed(llama_context *ctx_llama, ggml_tensor *audio_e
        }
        *n_past += n_eval;
    }
+   free(audio_embed_data);
    return true;
}
@@ -792,7 +802,7 @@ ggml_tensor *omni_process_audio(struct omni_context *ctx_omni, omni_params &params)
    return embed_proj;
}

-void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
+const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
{
    int n_past = 0;
@@ -841,12 +851,11 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
    for (int i = 0; i < max_tgt_len; i++)
    {
        const char * tmp = sample(ctx_sampling, ctx_omni->ctx_llama, &n_past);
-       response += tmp;
        if (strcmp(tmp, "</s>") == 0)
            break;
        if (strstr(tmp, "###"))
            break; // Yi-VL behavior
-       printf("%s", tmp);
+       // printf("%s", tmp);
        if (strstr(response.c_str(), "<|im_end|>"))
            break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
        if (strstr(response.c_str(), "<|im_start|>"))
@@ -855,16 +864,23 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
            break; // mistral llava-1.6

        fflush(stdout);
+       response += tmp;
    }

    llama_sampling_free(ctx_sampling);
    printf("\n");
+
+   if(internal_chars != nullptr) { free(internal_chars); }
+   internal_chars = malloc(sizeof(char)*(response.size()+1));
+   strncpy((char*)(internal_chars), response.c_str(), response.size());
+   ((char*)(internal_chars))[response.size()] = '\0';
+   return (const char*)(internal_chars);
}

-void omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
+const char* omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
{
    omni_params all_params = get_omni_params_from_context_params(params);

    ggml_tensor *audio_embed = omni_process_audio(ctx_omni, all_params);
-   omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
+   return omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
}
@@ -54,7 +54,7 @@ OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params &params);

OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni);

-OMNI_AUDIO_API void omni_process_full(
+OMNI_AUDIO_API const char* omni_process_full(
    struct omni_context *ctx_omni,
    omni_context_params &params
);
@@ -9467,6 +9467,8 @@ static bool whisper_encoder_load(struct whisper_model_loader *loader, whisper_co

    wctx.t_load_us = ggml_time_us() - t_start_us;

+   gguf_free(gguf_ctx);
+
    return true;
}
@@ -1,6 +1,6 @@
find_package (Threads REQUIRED)

-set(TARGET llama-vulkan-shaders-gen)
+set(TARGET vulkan-shaders-gen)
add_executable(${TARGET} vulkan-shaders-gen.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_compile_features(${TARGET} PRIVATE cxx_std_11)