Update to get some results; still need to check the ViT and the LLM

Yutong Dai 2024-09-16 17:11:12 +00:00
parent cc553a0ae0
commit 30b751ef06
12 changed files with 4840 additions and 341 deletions

.gitignore

@@ -155,3 +155,6 @@ examples/xgenmm copy/imgs/image-1d100e9-1.jpg
examples/xgenmm copy/imgs/image-1d100e9.jpg
examples/xgenmm/imgs/4patches_embeddings.pt
examples/xgenmm/imgs/attention_mask_4patchhes.pt
examples/xgenmm/models/tokenizers/*
models/*.inp
models/*.out


@@ -542,18 +542,12 @@ class Model:
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact"
if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
# ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
res = "command-r"
if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
# ref: https://huggingface.co/Qwen/Qwen1.5-7B
res = "qwen2"
if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
# ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
res = "olmo"
if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
# ref: https://huggingface.co/databricks/dbrx-base
res = "dbrx"
if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
res = "jina-v2-en"
@@ -572,15 +566,9 @@ class Model:
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
res = "jina-v2-code"
if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b":
# ref: https://huggingface.co/THUDM/glm-4-9b-chat
res = "chatglm-bpe"
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
# ref: https://huggingface.co/LumiOpen/Viking-7B
res = "viking"
if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
# ref: https://huggingface.co/core42/jais-13b
res = "jais"
if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f":
# ref: https://huggingface.co/WisdomShell/CodeShell-7B
res = "codeshell"
@@ -596,9 +584,6 @@ class Model:
if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21":
# ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small
res = "gpt3-finnish"
if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae":
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct
res = "exaone"
if res is None:
logger.warning("\n")


@@ -40,6 +40,139 @@
#include <cinttypes>
#include <limits>
void print_my_tensor(ggml_tensor *tensor, const char *name = "", int verbosity = 0)
{
if (tensor->ne[2] == 1)
{
printf("---> %s: (%ld, %ld)\n", name, tensor->ne[0], tensor->ne[1]);
}
else if (ggml_is_3d(tensor))
{
printf("---> %s: (%ld, %ld, %ld)\n", name, tensor->ne[0], tensor->ne[1], tensor->ne[2]);
}
else
{
printf("---> %s: (%ld, %ld, %ld, %ld)\n", name, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
}
if (verbosity == 1)
{
printf("*********************************************************************\n");
if (tensor->ne[2] == 1)
{
const float *mat = (float *)tensor->data;
int dim0 = tensor->ne[1];
int dim1 = tensor->ne[0];
if (dim0 < 6 && dim1 < 6)
{
for (int i = 0; i < dim0; i++)
{
for (int j = 0; j < dim1; j++)
{
printf("%+.4f ", mat[i * dim1 + j]);
}
printf("\n");
}
printf("\n");
}
else
{
for (int i = 0; i < std::min(dim0, 3); i++)
{
for (int j = 0; j < std::min(dim1, 3); j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("... ");
for (int j = dim1 - 3; j < dim1; j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("\n");
}
if (dim0 > 3)
{
printf("...................... omit ......................\n");
for (int i = dim0 - 3; i < dim0; i++)
{
for (int j = 0; j < std::min(dim1, 3); j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("... ");
for (int j = dim1 - 3; j < dim1; j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("\n");
}
}
}
}
else if (ggml_is_3d(tensor))
{
const float *data = (float *)tensor->data;
int dim0 = tensor->ne[2];
int dim1 = tensor->ne[1];
int dim2 = tensor->ne[0];
if (dim0 < 6 && dim1 < 6 && dim2 < 6)
{
for (int i = 0; i < dim0; i++)
{
printf("dim0 = %d\n", i);
for (int j = 0; j < dim1; j++)
{
for (int k = 0; k < dim2; k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("\n");
}
printf("\n");
}
else
{
for (int i = 0; i < std::min(dim0, 3); i++)
{
printf("dim0 = %d\n", i);
for (int j = 0; j < std::min(dim1, 3); j++)
{
for (int k = 0; k < std::min(dim2, 3); k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("... ");
for (int k = dim2 - 3; k < dim2; k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("........................\n");
for (int j = dim1 - 3; j < dim1; j++)
{
for (int k = 0; k < std::min(dim2, 3); k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("... ");
for (int k = dim2 - 3; k < dim2; k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("---------------------------------------------------\n");
}
printf("\n");
}
}
}
printf("*********************************************************************\n");
printf("\n");
}
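// Example usage of the debug helper above (a sketch: it assumes the tensor holds F32 data that is
// readable from host memory, e.g. on the CPU backend or after ggml_backend_tensor_get):
//   print_my_tensor(embeddings, "embeddings");     // shape only
//   print_my_tensor(embeddings, "embeddings", 1);  // shape plus a truncated dump of the values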
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
@@ -602,7 +735,7 @@ struct clip_ctx {
struct clip_image_size * load_image_size;
};
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false, ggml_tensor *attn_bias_input = nullptr) {
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n");
return nullptr;
@@ -1047,7 +1180,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}
static ggml_cgraph * clip_image_build_graph_vit(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false, ggml_tensor *attn_bias_input = nullptr) {
static ggml_cgraph * clip_image_build_graph_vit(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
if (!ctx->has_vision_encoder) {
LOG_TEE("This gguf file seems to have no vision encoder\n");
return nullptr;
@@ -1119,9 +1252,7 @@ static ggml_cgraph * clip_image_build_graph_vit(clip_ctx * ctx, const clip_image
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
}
// loop over layers
if (ctx->has_minicpmv_projector) {
n_layer += 1;
}
n_layer += 1;
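// the loop below runs n_layer - 1 iterations, so bumping the count by one makes it cover every
// encoder layer from hparams (previously this was done only for the minicpmv projector)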
for (int il = 0; il < n_layer - 1; il++) {
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
@@ -1218,7 +1349,7 @@ static ggml_cgraph * clip_image_build_graph_vit(clip_ctx * ctx, const clip_image
static ggml_cgraph *clip_build_graph_xgenmm_projector(clip_ctx *ctx, int batch_size, ggml_tensor *img_embeddings, ggml_tensor *attn_bias_input = nullptr)
{
const auto & model = ctx->vision_model;
const auto & hparams = model.hparams;
// const auto & hparams = model.hparams;
// const float eps = hparams.eps; // double check this value
const float eps = 1e-5;
@@ -2493,7 +2624,7 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
void clip_free(clip_ctx * ctx) {
ggml_free(ctx->ctx_data);
gguf_free(ctx->ctx_gguf);
ggml_backend_buffer_free(ctx->params_buffer);
ggml_backend_free(ctx->backend);
ggml_gallocr_free(ctx->compute_alloc);
@@ -2676,12 +2807,10 @@ bool clip_image_encode_tokenizer(struct clip_ctx * ctx, int batch_size, ggml_ten
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
ggml_backend_graph_compute(ctx->backend, gf);
struct ggml_tensor * llm_inputs = gf->nodes[gf->n_nodes - 1];
print_my_tensor(llm_inputs, "llm_inputs", 1);
// exit(0);
ggml_backend_tensor_get(llm_inputs, image_embd, 0, ggml_nbytes(llm_inputs));
clip_free(ctx);
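// NOTE: the clip context is freed right here, so the caller must not touch `ctx` again after
// clip_image_encode_tokenizer() returns; the projected tokens were already copied into the
// caller-provided image_embd buffer above.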
// ggml_free(tensor.ctx);
// if (ctx0){
// ggml_free(ctx0);
// }
return true;
}
@@ -3029,7 +3158,7 @@ bool clip_image_batch_encode_vit(clip_ctx * ctx, const int n_threads, const clip
GGML_ASSERT(batch_size == 1); // TODO: support multiple images
// build the inference graph
ggml_cgraph * gf = clip_image_build_graph_vit(ctx, imgs, ctx->load_image_size, true);
ggml_cgraph * gf = clip_image_build_graph_vit(ctx, imgs);
ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
// set inputs
@@ -3039,8 +3168,8 @@ bool clip_image_batch_encode_vit(clip_ctx * ctx, const int n_threads, const clip
int image_size_width = image_size;
int image_size_height = image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
// const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
// const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
if(ctx->load_image_size==nullptr){
ctx->load_image_size= clip_image_size_init();
}


@@ -4,10 +4,20 @@ conda activate xgenmm-flamingo
# # step 1: surgery
# python xgenmm_surgery.py
# step 2: convert to gguf (vit + projector)
# # step 2: convert vit + projector to gguf
python xgenmm_convert_image_encoder_to_gguf.py \
--surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
--output_dirname gguf_test \
--version siglip_kosmos_phi3_4k_instruct \
--use_f32
# python xgenmm_convert_image_encoder_to_gguf.py \
# --surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
# --output_dirname gguf_test \
# --version siglip_kosmos_phi3_4k_instruct \
# --use_f32
# step 3: convert llm to gguf
# https://github.com/ggerganov/llama.cpp/discussions/7927
HF_TOKEN=<your_huggingface_token>  # do not commit a real token here
LLM_PATH=/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/llm
# LLM_OUTPUT_FILE=/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf/phi3_.gguf
# downloads the tokenizer models of the specified models from Hugging Face and generates the get_vocab_base_pre() function for convert_hf_to_gguf.py
cd ../..
# python convert_hf_to_gguf_update.py $HF_TOKEN
python convert_hf_to_gguf.py $LLM_PATH
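# If an explicit output path / precision is needed, convert_hf_to_gguf.py also accepts --outfile and
# --outtype (double check with `python convert_hf_to_gguf.py --help` in this checkout), e.g.:
# python convert_hf_to_gguf.py $LLM_PATH --outfile $LLM_OUTPUT_FILE --outtype f16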

File diff suppressed because it is too large.

Binary file not shown.

New image added (3.1 MiB).


@@ -16,8 +16,26 @@ make xgenmm-cli
# -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image> Describe this image.<|end|>\n<|assistant|>\n"
./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
# ./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
# --mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
# -c 4096 --temp 0.01 --repeat-penalty 1.05 \
# --image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg\
# -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\n How many objects are there in this image?<|end|>\n<|assistant|>\n"
./xgenmm-cli --model /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
--mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
-c 4096 --temp 0.01 --repeat-penalty 1.05 \
--image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg\
-p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\n How many objects are there in this image?<|end|>\n<|assistant|>\n"
--image /export/home/llama.cpp/examples/xgenmm/imgs/receipt.jpg\
--prompt "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\n Describe this image.<|end|>\n<|assistant|>\n" \
--seed 42 --ctx-size 4096 --predict 1024 \
--temp 0 --verbose-prompt
#
# ./xgenmm-cli --model /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
# --mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
# --image /export/home/llama.cpp/examples/xgenmm/imgs/receipt.jpg\
# --prompt "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n<image>\n What is the address of this restaurant?<|end|>\n<|assistant|>\n" \
# --seed 42 --ctx-size 4096 --predict 1024 \
# --temp 0 --verbose-prompt


@@ -1,5 +1,3 @@
// refer to example/minicpmv-cli
#include "ggml.h"
#include "log.h"
#include "common.h"
@@ -11,97 +9,18 @@
#include <cstdlib>
#include <vector>
struct llava_context {
struct clip_ctx * ctx_clip = NULL;
struct llama_context * ctx_llama = NULL;
struct llama_model * model = NULL;
};
static void show_additional_info(int /*argc*/, char ** argv) {
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
LOG_TEE("%s", text);
}
static struct llama_model * llava_init(gpt_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
llama_model_params model_params = llama_model_params_from_gpt_params(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n" , __func__);
return NULL;
}
return model;
}
static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
if (params->n_ctx < 2048) {
// warn user here, "Image processing requires at least 2048 context, setting context to 2048"
LOG_TEE("%s: warn: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
ctx_params.n_ctx = 2048;
} else {
ctx_params.n_ctx = params->n_ctx;
}
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL) {
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
return NULL;
}
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->model = model;
return ctx_llava;
}
static void llava_free(struct llava_context * ctx_llava) {
if (ctx_llava->ctx_clip) {
clip_free(ctx_llava->ctx_clip);
ctx_llava->ctx_clip = NULL;
}
llama_free(ctx_llava->ctx_llama);
llama_free_model(ctx_llava->model);
llama_backend_free();
}
static struct clip_ctx * clip_init_context(gpt_params * params) {
const char * clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty()) {
prompt = "describe the image in detail.";
}
// std::cout << __LINE__ << std::endl;
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
return ctx_clip;
}
static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
int N = (int) tokens.size();
for (int i = 0; i < N; i += n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > n_batch) {
static bool eval_tokens(struct llama_context *ctx_llama, std::vector<llama_token> tokens, int n_batch, int *n_past)
{
int N = (int)tokens.size();
for (int i = 0; i < N; i += n_batch)
{
int n_eval = (int)tokens.size() - i;
if (n_eval > n_batch)
{
n_eval = n_batch;
}
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0)))
{
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
return false;
}
@@ -110,111 +29,170 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
return true;
}
static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
static bool eval_id(struct llama_context *ctx_llama, int id, int *n_past)
{
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(ctx_llama, tokens, 1, n_past);
}
static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
std::string str2 = str;
static bool eval_string(struct llama_context *ctx_llama, const char *str, int n_batch, int *n_past, bool add_bos)
{
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
printf("prompt: %s", str);
for (auto token : embd_inp){
printf("%6d, ", token);
}
printf("\n");
return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
}
static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
slice_embed->embed = image_embed;
slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
llava_image_embed_free(slice_embed);
}
static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
std::string system_prompt;
int idx = 0;
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (has_minicpmv_projector == 2) {
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
}
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (num_image_embeds > 1) {
size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
for (size_t j = 0; j < num_image_embeds_col; ++j) {
eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
if (j == num_image_embeds_col - 1) {
eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
}
}
}
eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
}
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
}
static const char * sample(struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_llama,
int * n_past) {
static const char *sample(struct llama_sampling_context *ctx_sampling, struct llama_context *ctx_llama, int *n_past)
{
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
static std::string ret;
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
if (llama_token_is_eog(llama_get_model(ctx_llama), id))
{
ret = "</s>";
} else {
}
else
{
ret = llama_token_to_piece(ctx_llama, id);
}
eval_id(ctx_llama, id, n_past);
return ret.c_str();
}
static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params);
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
if (!embeds) {
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
return NULL;
static const char *IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
static const char *IMG_BASE64_TAG_END = "\">";
static void find_image_tag_in_prompt(const std::string &prompt, size_t &begin_out, size_t &end_out)
{
begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
}
static bool prompt_contains_image(const std::string &prompt)
{
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
return (begin != std::string::npos);
}
// TODO: Implement this function llava_image_embed_make_with_prompt_base64 for xgenmm
// static llava_image_embed *llava_image_embed_make_with_prompt_base64(struct clip_ctx *ctx_clip, int n_threads,
// const std::string &prompt)
// {
// size_t img_base64_str_start, img_base64_str_end;
// find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
// if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos)
// {
// LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN,
// IMG_BASE64_TAG_END);
// return NULL;
// }
// auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
// auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
// auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count);
// auto required_bytes = base64::required_encode_size(base64_str.size());
// auto img_bytes = std::vector<unsigned char>(required_bytes);
// base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
// auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
// if (!embed)
// {
// LOG_TEE("%s: could not load image from base64 string.\n", __func__);
// return NULL;
// }
// return embed;
// }
static std::string remove_image_from_prompt(const std::string &prompt, const char *replacement = "")
{
size_t begin, end;
find_image_tag_in_prompt(prompt, begin, end);
if (begin == std::string::npos || end == std::string::npos)
{
return prompt;
}
auto pre = prompt.substr(0, begin);
auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
return pre + replacement + post;
}
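// Illustration (hypothetical prompt): an input such as
//   describe <img src="data:image/jpeg;base64,/9j/4AAQ..."> please
// is reduced by remove_image_from_prompt() to "describe  please"; decoding the base64 payload
// itself is left to the (still commented-out) llava_image_embed_make_with_prompt_base64().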
struct llava_context
{
struct clip_ctx *ctx_clip = NULL;
struct llama_context *ctx_llama = NULL;
struct llama_model *model = NULL;
};
// static void process_eval_image_embed(struct llava_context *ctx_llava, const struct llava_image_embed *embeds,
// int n_batch, int *n_past, int idx)
// {
// float *image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
// std::memcpy(image_embed,
// embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip),
// clip_embd_nbytes(ctx_llava->ctx_clip));
// auto slice_embed = (llava_image_embed *)malloc(sizeof(llava_image_embed));
// slice_embed->embed = image_embed;
// slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
// llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past);
// llava_image_embed_free(slice_embed);
// }
static void print_usage(int argc, char **argv, const gpt_params &params)
{
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\n example usage:\n");
LOG_TEE(
"\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image "
"<path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in "
"detail.\"]\n",
argv[0]);
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed *load_image(llava_context *ctx_llava, gpt_params *params, const std::string &fname)
{
// load and preprocess the image
llava_image_embed *embed = NULL;
auto prompt = params->prompt;
if (prompt_contains_image(prompt))
{
// if (!params->image.empty())
// {
// LOG_TEE("using base64 encoded image instead of command line image path\n");
// }
// embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
// if (!embed)
// {
// LOG_TEE("%s: can't load image from prompt\n", __func__);
// return NULL;
// }
// params->prompt = remove_image_from_prompt(prompt);
printf("not implemented\n");
exit(1);
}
else
{
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
if (!embed)
{
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;
}
}
// process the prompt
if (params->prompt.empty() && params->interactive == false) {
LOG_TEE("prompt should be given or interactive mode should be on");
return NULL;
}
auto model = llava_init(params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
return NULL;
}
const int64_t t_llava_init_start_us = ggml_time_us();
auto ctx_llava = llava_init_context(params, model);
ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
const int64_t t_process_image_start_us = ggml_time_us();
process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
llava_image_embed_free(embeds);
return ctx_llava;
return embed;
}
static void process_prompt(struct llava_context *ctx_llava, struct llava_image_embed *image_embed, gpt_params *params,
@@ -233,6 +211,7 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e
system_prompt = prompt.substr(0, image_pos);
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
// phi3-tokenizer https://github.com/ggerganov/llama.cpp/issues/7938
if (params->verbose_prompt)
{
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
@@ -267,7 +246,6 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e
}
}
}
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
@@ -286,9 +264,18 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e
std::string response = "";
for (int i = 0; i < max_tgt_len; i++)
{
// printf("i: %d\n", i);
const char *tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0) break;
// printf("%s", tmp);
if (strcmp(tmp, "<|end|>") == 0){
printf("\n STOP GENERATING because I saw <|end|>\n");
break;
}
if (strcmp(tmp, "</s>") == 0) {
printf("\n STOP GENERATING because I saw </s>\n");
break;
}
if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>"))
@@ -303,90 +290,207 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e
printf("\n");
}
static struct llava_context * xgenmm_init(gpt_params * params, const std::string & fname, int &n_past){
auto ctx_clip = clip_init_context(params);
std::cout << "clip model has been loaded \n\n";
auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
if (!embeds) {
std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
return NULL;
}
std::cout<< "Start Processing Prompt: " << std::endl;
// TODO:
// process the prompt
if (params->prompt.empty() && params->interactive == false) {
LOG_TEE("prompt should be given or interactive mode should be on");
return NULL;
}
static struct llama_model * llava_init(gpt_params * params) {
llama_backend_init();
llama_numa_init(params->numa);
auto model = llava_init(params);
llama_model_params model_params = llama_model_params_from_gpt_params(*params);
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
LOG_TEE("%s: error: unable to load model\n" , __func__);
return NULL;
}
const int64_t t_llava_init_start_us = ggml_time_us();
auto ctx_llava = llava_init_context(params, model);
return model;
}
static struct llava_context *llava_init_context(gpt_params *params, llama_model *model)
{
const char *clip_path = params->mmproj.c_str();
auto prompt = params->prompt;
if (prompt.empty())
{
prompt = "describe the image in detail.";
}
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/1);
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
ctx_params.n_ctx =
params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
llama_context *ctx_llama = llama_new_context_with_model(model, ctx_params);
if (ctx_llama == NULL)
{
LOG_TEE("%s: error: failed to create the llama_context\n", __func__);
return NULL;
}
auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
ctx_llava->ctx_llama = ctx_llama;
ctx_llava->ctx_clip = ctx_clip;
const int64_t t_llava_init_end_us = ggml_time_us();
float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
const int64_t t_process_image_start_us = ggml_time_us();
process_prompt(ctx_llava, embeds, params, params->prompt);
// process_image(ctx_llava, embeds, params, n_past);
const int64_t t_process_image_end_us = ggml_time_us();
float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
llava_image_embed_free(embeds);
ctx_llava->model = model;
return ctx_llava;
}
static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
std::string user_prompt = prompt;
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (!is_first) {
if (has_minicpmv_projector == 2) {
user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
}
else if (has_minicpmv_projector == 3) {
user_prompt = "<|im_start|>user\n" + prompt;
}
static void llava_free(struct llava_context * ctx_llava) {
if (ctx_llava->ctx_clip) {
printf(
"YD::: aborting before clip_free: it would segfault because header.n_kv is empty "
"(clip_free -> gguf_free(ctx->ctx_gguf) -> for (uint64_t i = 0; i < ctx->header.n_kv; ++i))\n");
exit(1);
clip_free(ctx_llava->ctx_clip);
ctx_llava->ctx_clip = NULL;
}
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
if (has_minicpmv_projector == 2) {
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
}
else if (has_minicpmv_projector == 3) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}
// generate the response
LOG_TEE("\n");
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
return ctx_sampling;
llama_free(ctx_llava->ctx_llama);
llama_free_model(ctx_llava->model);
llama_backend_free();
}
static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
// static struct clip_ctx * clip_init_context(gpt_params * params) {
// const char * clip_path = params->mmproj.c_str();
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
return tmp;
// auto prompt = params->prompt;
// if (prompt.empty()) {
// prompt = "describe the image in detail.";
// }
// // std::cout << __LINE__ << std::endl;
// auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
// return ctx_clip;
// }
// TODO: REMOVE THIS FUNCTION
// static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
// std::string system_prompt;
// int idx = 0;
// int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
// int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
// if (has_minicpmv_projector == 2) {
// system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
// }
// else if (has_minicpmv_projector == 3) {
// system_prompt = "<|im_start|>user\n";
// }
// LOG_TEE("%s: image token past: %d\n", __func__, n_past);
// eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
// process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
// eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
// if (num_image_embeds > 1) {
// size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
// eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
// for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) {
// for (size_t j = 0; j < num_image_embeds_col; ++j) {
// eval_string(ctx_llava->ctx_llama, std::string("<image>").c_str(), params->n_batch, &n_past, false);
// process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
// eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
// if (j == num_image_embeds_col - 1) {
// eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false);
// }
// }
// }
// eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
// }
// LOG_TEE("%s: image token past: %d\n", __func__, n_past);
// }
// static struct llava_context * xgenmm_init(gpt_params * params, const std::string & fname, int &n_past){
// auto ctx_clip = clip_init_context(params);
// std::cout << "clip model has been loaded \n\n";
// auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
// if (!embeds) {
// std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
// return NULL;
// }
// std::cout<< "Start Processing Prompt: " << std::endl;
// // TODO:
// // process the prompt
// if (params->prompt.empty() && params->interactive == false) {
// LOG_TEE("prompt should be given or interactive mode should be on");
// return NULL;
// }
// auto model = llava_init(params);
// if (model == NULL) {
// fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
// return NULL;
// }
// const int64_t t_llava_init_start_us = ggml_time_us();
// auto ctx_llava = llava_init_context(params, model);
// ctx_llava->ctx_clip = ctx_clip;
// const int64_t t_llava_init_end_us = ggml_time_us();
// float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
// LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
// const int64_t t_process_image_start_us = ggml_time_us();
// process_prompt(ctx_llava, embeds, params, params->prompt);
// // process_image(ctx_llava, embeds, params, n_past);
// const int64_t t_process_image_end_us = ggml_time_us();
// float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
// LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
// llava_image_embed_free(embeds);
// return ctx_llava;
// }
// static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
// std::string user_prompt = prompt;
// int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
// if (!is_first) {
// if (has_minicpmv_projector == 2) {
// user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
// }
// else if (has_minicpmv_projector == 3) {
// user_prompt = "<|im_start|>user\n" + prompt;
// }
// }
// eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
// if (has_minicpmv_projector == 2) {
// eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
// }
// else if (has_minicpmv_projector == 3) {
// eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
// }
// // generate the response
// LOG_TEE("\n");
// struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
// return ctx_sampling;
// }
// static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
// const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
// return tmp;
// }
static void llama_log_callback_logTee(ggml_log_level level, const char *text, void *user_data)
{
(void)level;
(void)user_data;
LOG_TEE("%s", text);
}
int main(int argc, char ** argv) {
ggml_time_init();
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
show_additional_info(argc, argv);
print_usage(argc, argv, params);
return 1;
}
@@ -399,67 +503,67 @@ int main(int argc, char ** argv) {
if (params.mmproj.empty() || (params.image.empty())) {
gpt_params_print_usage(argc, argv, params);
show_additional_info(argc, argv);
print_usage(argc, argv, params);
return 1;
}
for (auto & image : params.image) { // only single image for now
int n_past = 0;
// auto ctx_llava = minicpmv_init(&params, image, n_past);
auto ctx_llava = xgenmm_init(&params, image, n_past); // generate vision tokens
std::cout << "Start llava generation: " << std::endl;
auto model = llava_init(&params);
if (model == NULL)
{
fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
return 1;
}
if (prompt_contains_image(params.prompt))
{
auto ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, "");
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama);
// // TODO: integrate base llm
// if (!params.prompt.empty()) {
// LOG_TEE("<user>%s\n", params.prompt.c_str());
// LOG_TEE("<assistant>");
// auto ctx_sampling = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
// const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
// std::string response = "";
// bool have_tmp = false;
// for (int i = 0; i < max_tgt_len; i++) {
// auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
// response += tmp;
// if (strcmp(tmp, "</s>") == 0){
// if(!have_tmp)continue;
// else break;
// }
// if (strstr(tmp, "###")) break; // Yi-VL behavior
// have_tmp = true;
// printf("%s", tmp);
// if (strstr(response.c_str(), "<user>")) break; // minicpm-v
// fflush(stdout);
// }
// llama_sampling_free(ctx_sampling);
// }else {
// while (true) {
// LOG_TEE("<user>");
// std::string prompt;
// std::getline(std::cin, prompt);
// LOG_TEE("<assistant>");
// auto ctx_sampling = llama_init(ctx_llava, &params, prompt, n_past, true);
// const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
// std::string response = "";
// for (int i = 0; i < max_tgt_len; i++) {
// auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
// response += tmp;
// if (strcmp(tmp, "</s>") == 0) break;
// if (strstr(tmp, "###")) break; // Yi-VL behavior
// printf("%s", tmp);// mistral llava-1.6
// if (strstr(response.c_str(), "<user>")) break; // minicpm-v
// fflush(stdout);
// }
// llama_sampling_free(ctx_sampling);
// }
// }
// printf("\n");
// llama_print_timings(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
}
else
{
for (auto &image : params.image)
{
printf("image: %s\n", image.c_str());
auto ctx_llava = llava_init_context(&params, model);
auto image_embed = load_image(ctx_llava, &params, image);
printf("n_image_pos: %d\n", image_embed->n_image_pos);
if (!image_embed)
{
std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
return 1;
}
// process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt);
llama_print_timings(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed);
ctx_llava->model = NULL;
llava_free(ctx_llava);
}
}
llama_free_model(model);
// prompt_contains_image(params.prompt);
// for (auto & image : params.image) { // only single image for now
// int n_past = 0;
// auto ctx_llava = xgenmm_init(&params, image, n_past); // generate vision tokens
// std::cout << "Start llava generation: " << std::endl;
// llama_print_timings(ctx_llava->ctx_llama);
// ctx_llava->model = NULL;
// llava_free(ctx_llava);
// }
printf("Remember to remove print_tensor function in xgenmm.cpp and clip.cpp\n");
return 0;
}


@@ -14,6 +14,162 @@
#include "llama.h"
#include "xgenmm.h"
struct tensor_from_gguf
{
struct ggml_tensor *data;
struct ggml_context *ctx;
};
bool load_tensor_from_file(const char *filename, tensor_from_gguf &tensor)
{
struct gguf_init_params params = {
/*.no_alloc =*/false,
/*.ctx =*/&tensor.ctx,
};
gguf_context *ctx = gguf_init_from_file(filename, params);
if (!ctx)
{
fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__);
return false;
}
tensor.data = ggml_get_tensor(tensor.ctx, "data");
gguf_free(ctx);  // the gguf metadata context is no longer needed; the tensor data lives in tensor.ctx
return true;
}
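// Minimal usage sketch for the loader above (debug only; assumes the gguf file contains a tensor
// named "data" stored as F32 so that print_tensor below can read it):
//   tensor_from_gguf t;
//   if (load_tensor_from_file("/path/to/tensor.gguf", t))
//   {
//       print_tensor(t.data, "loaded", 1);
//       ggml_free(t.ctx);  // the caller owns the ggml context holding the tensor data
//   }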
void print_tensor(ggml_tensor *tensor, const char *name = "", int verbosity = 0)
{
if (tensor->ne[2] == 1)
{
printf("---> %s: (%ld, %ld)\n", name, tensor->ne[0], tensor->ne[1]);
}
else if (ggml_is_3d(tensor))
{
printf("---> %s: (%ld, %ld, %ld)\n", name, tensor->ne[0], tensor->ne[1], tensor->ne[2]);
}
else
{
printf("---> %s: (%ld, %ld, %ld, %ld)\n", name, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
}
if (verbosity == 1)
{
printf("*********************************************************************\n");
if (tensor->ne[2] == 1)
{
const float *mat = (float *)tensor->data;
int dim0 = tensor->ne[1];
int dim1 = tensor->ne[0];
if (dim0 < 6 && dim1 < 6)
{
for (int i = 0; i < dim0; i++)
{
for (int j = 0; j < dim1; j++)
{
printf("%+.4f ", mat[i * dim1 + j]);
}
printf("\n");
}
printf("\n");
}
else
{
for (int i = 0; i < std::min(dim0, 3); i++)
{
for (int j = 0; j < std::min(dim1, 3); j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("... ");
for (int j = dim1 - 3; j < dim1; j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("\n");
}
if (dim0 > 3)
{
printf("...................... omit ......................\n");
for (int i = dim0 - 3; i < dim0; i++)
{
for (int j = 0; j < std::min(dim1, 3); j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("... ");
for (int j = dim1 - 3; j < dim1; j++)
{
printf("%+.6f ", mat[i * dim1 + j]);
}
printf("\n");
}
}
}
}
else if (ggml_is_3d(tensor))
{
const float *data = (float *)tensor->data;
int dim0 = tensor->ne[2];
int dim1 = tensor->ne[1];
int dim2 = tensor->ne[0];
if (dim0 < 6 && dim1 < 6 && dim2 < 6)
{
for (int i = 0; i < dim0; i++)
{
printf("dim0 = %d\n", i);
for (int j = 0; j < dim1; j++)
{
for (int k = 0; k < dim2; k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("\n");
}
printf("\n");
}
else
{
for (int i = 0; i < std::min(dim0, 3); i++)
{
printf("dim0 = %d\n", i);
for (int j = 0; j < std::min(dim1, 3); j++)
{
for (int k = 0; k < std::min(dim2, 3); k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("... ");
for (int k = dim2 - 3; k < dim2; k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("........................\n");
for (int j = dim1 - 3; j < dim1; j++)
{
for (int k = 0; k < std::min(dim2, 3); k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("... ");
for (int k = dim2 - 3; k < dim2; k++)
{
printf("%+.6f ", data[i * dim1 * dim2 + j * dim2 + k]);
}
printf("\n");
}
printf("---------------------------------------------------\n");
}
printf("\n");
}
}
}
printf("*********************************************************************\n");
printf("\n");
}
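// NOTE: print_tensor (like print_my_tensor in clip.cpp) indexes the data as a dense, contiguous,
// row-major F32 buffer; for quantized, permuted, or non-contiguous tensors the printed values would
// be meaningless, so treat it purely as a debugging aid.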
// RGB uint8 image
struct clip_image_u8
{
@@ -418,6 +574,33 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
ggml_graph_compute_with_ctx(mask.ctx, gf, 1);
attention_mask = gf->nodes[gf->n_nodes - 1];
// memcpy(image_embd_v_m_mask_out, (float *)attention_mask->data, ggml_nbytes(attention_mask));
{
printf((" ========================= DEBUG =========================\n"));
printf("Load pre-computed image embeddings and attention_mask\n");
std::string filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_features.gguf";
tensor_from_gguf tensor;
bool is_successful = load_tensor_from_file(filename.c_str(), tensor);
if (!is_successful)
{
fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
return false;
}
result = tensor.data;
// print_tensor(result, "result", 1);
filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_attn_masks.gguf";
is_successful = load_tensor_from_file(filename.c_str(), tensor);
if (!is_successful)
{
fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
return false;
}
attention_mask = tensor.data;
// print_tensor(attention_mask, "attention_mask", 1);
num_patches_width = 2;
num_patches_height = 2;
}
// compute attention masks outside of the graph
struct ggml_tensor * attn_bias_input;
@@ -463,10 +646,19 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
ggml_build_forward_expand(gf_temp, attn_bias);
ggml_graph_compute_with_ctx(ctx0, gf_temp, 1);
attn_bias_input = attn_bias;
}else{
attn_bias_input = NULL;
}
int batch_size = num_patches_width * num_patches_height + 1;
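// presumably the grid of image crops plus the base image; with the hard-coded 2x2 debug grid above
// this gives the 5 entries matching the "5patches" reference tensors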
// print_tensor(attn_bias_input, "attn_bias_input", 1);
// print_tensor(result, "result", 1);
printf("batch_size: %d\n", batch_size);
const bool encoded = clip_image_encode_tokenizer(
ctx_clip, batch_size, result, attn_bias_input, image_embd);
if (!encoded){
LOG_TEE("%s: failed at image tokenizer (projector step failed)\n", __func__);
return false;
}
ggml_free(model.ctx);
ggml_free(mask.ctx);


@@ -1,5 +0,0 @@
python examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py\
--surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
--version siglip_kosmos_phi3_4k_instruct \
--xgenmm_projector /export/home/Projects/xgenmm-quantization/target_models/MiniCPM-Llama3-V-2_5/minicpmv.projector \
--use_f32


@@ -92,8 +92,9 @@ if __name__ == "__main__":
torch.save(projector_tensors, save_path)
# processors
tokenizer.save_pretrained(f"{save_dir}/tokenizer")
# will hard code the image_processor in the convert_image_encoder_to_gguf.py
# put the tokenizer in the same dir as the lang model
tokenizer.save_pretrained(f"{save_dir}/llm")
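# (convert_hf_to_gguf.py reads the tokenizer from the model directory it is pointed at, so the
# llm/ folder needs the tokenizer files next to the weights)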
end = time.time()
print(f"🟢 time used: [{end-start:.3f} s]")

Binary file not shown.