From 953bef9374041888416720ef4b0538734c792ee4 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 7 Oct 2024 21:36:10 +0000 Subject: [PATCH] fix memory error & clean the print statements --- examples/xgenmm/clip.cpp | 12 +++---- examples/xgenmm/xgenmm-cli.cpp | 62 +++++++++++++++------------------- examples/xgenmm/xgenmm.cpp | 32 ++---------------- 3 files changed, 36 insertions(+), 70 deletions(-) diff --git a/examples/xgenmm/clip.cpp b/examples/xgenmm/clip.cpp index 558f906f2..96e868351 100644 --- a/examples/xgenmm/clip.cpp +++ b/examples/xgenmm/clip.cpp @@ -1504,6 +1504,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { // kv const int n_kv = gguf_get_n_kv(ctx); + + // std::cout << "do I have n_kv here at clip.cpp "<< __LINE__ << "? : " << gguf_get_n_kv(ctx) <compute_alloc, 0); LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); } - return new_clip; } @@ -2424,7 +2426,6 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { - if(clip_is_minicpmv(ctx)){ int max_slice_nums = 9; std::vector> imgs = uhd_slice_image(img, max_slice_nums); @@ -2497,7 +2498,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]}); } std::pair best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions); - printf("best_resolution: %d %d\n", best_resolution.first, best_resolution.second); + // printf("best_resolution: %d %d\n", best_resolution.first, best_resolution.second); // 
clip_image_save_to_bmp(*img, "input.bmp"); resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6 // clip_image_save_to_bmp(*temp, "resized.bmp"); @@ -2624,7 +2625,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) { void clip_free(clip_ctx * ctx) { ggml_free(ctx->ctx_data); gguf_free(ctx->ctx_gguf); - ggml_backend_buffer_free(ctx->params_buffer); ggml_backend_free(ctx->backend); ggml_gallocr_free(ctx->compute_alloc); @@ -2807,10 +2807,8 @@ bool clip_image_encode_tokenizer(struct clip_ctx * ctx, int batch_size, ggml_ten ggml_gallocr_alloc_graph(ctx->compute_alloc, gf); ggml_backend_graph_compute(ctx->backend, gf); struct ggml_tensor * llm_inputs = gf->nodes[gf->n_nodes - 1]; - print_my_tensor(llm_inputs, "llm_inputs", 1); - // exit(0); ggml_backend_tensor_get(llm_inputs, image_embd, 0, ggml_nbytes(llm_inputs)); - clip_free(ctx); + // clip_free(ctx); // debug: llava_ctx was freed here, now free all the 'ctx' memory outside. 
return true; } diff --git a/examples/xgenmm/xgenmm-cli.cpp b/examples/xgenmm/xgenmm-cli.cpp index c2c0e94a3..4928038a8 100644 --- a/examples/xgenmm/xgenmm-cli.cpp +++ b/examples/xgenmm/xgenmm-cli.cpp @@ -44,12 +44,12 @@ static bool eval_string(struct llama_context *ctx_llama, const char *str, int n_ std::string str2 = str; std::vector embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true); - printf("!!prompt to eval!!: %s", str); - printf("----------------------\n"); - // for (auto token : embd_inp){ - // printf("%6d, ", token); - // } - printf("\n"); + // printf("!!prompt to eval!!: %s", str); + // printf("----------------------\n"); + // // for (auto token : embd_inp){ + // // printf("%6d, ", token); + // // } + // printf("\n"); eval_tokens(ctx_llama, embd_inp, n_batch, n_past); return true; } @@ -217,23 +217,23 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e user_prompt = prompt.substr(image_pos + std::string("").length()); LOG_TEE("system_prompt: %s\n", system_prompt.c_str()); // phi3-tokenizer https://github.com/ggerganov/llama.cpp/issues/7938 - if (params->verbose_prompt) - { - auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); - for (int i = 0; i < (int)tmp.size(); i++) - { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); - } - } + // if (params->verbose_prompt) + // { + // auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); + // for (int i = 0; i < (int)tmp.size(); i++) + // { + // LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + // } + // } LOG_TEE("user_prompt: %s\n", user_prompt.c_str()); - if (params->verbose_prompt) - { - auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); - for (int i = 0; i < (int)tmp.size(); i++) - { - LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); - } - } + // if 
(params->verbose_prompt) + // { + // auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); + // for (int i = 0; i < (int)tmp.size(); i++) + // { + // LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + // } + // } } else { @@ -280,11 +280,11 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e response += tmp; // printf("%s", tmp); if (strcmp(tmp, "<|end|>") == 0){ - printf("\n STOP GENERATING because I saw <|end|>\n"); + // printf("\n STOP GENERATING because I saw <|end|>\n"); break; } if (strcmp(tmp, "") == 0) { - printf("\n STOP GENERATING because I saw \n"); + // printf("\n STOP GENERATING because I saw \n"); break; } if (strstr(tmp, "###")) break; // Yi-VL behavior @@ -327,7 +327,6 @@ static struct llava_context *llava_init_context(gpt_params *params, llama_model } auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/1); - llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); ctx_params.n_ctx = params->n_ctx < 2048 ? 
2048 : params->n_ctx; // we need a longer context size to process image embeddings @@ -349,11 +348,7 @@ static struct llava_context *llava_init_context(gpt_params *params, llama_model } static void llava_free(struct llava_context * ctx_llava) { - if (ctx_llava->ctx_clip) { - printf( - "YD:::Segmentation fault here; Because header.n_kv is empty\n clip_free->gguf_free(ctx->ctx_gguf)-> for " - "(uint64_t i = 0; i < ctx->header.n_kv; ++i)\n"); - exit(1); + if (ctx_llava->ctx_clip) { clip_free(ctx_llava->ctx_clip); ctx_llava->ctx_clip = NULL; } @@ -528,7 +523,6 @@ int main(int argc, char ** argv) { if (prompt_contains_image(params.prompt)) { auto ctx_llava = llava_init_context(¶ms, model); - auto image_embed = load_image(ctx_llava, ¶ms, ""); // process the prompt @@ -543,11 +537,11 @@ int main(int argc, char ** argv) { { for (auto &image : params.image) { - printf("image: %s\n", image.c_str()); + // printf("image: %s\n", image.c_str()); auto ctx_llava = llava_init_context(¶ms, model); auto image_embed = load_image(ctx_llava, ¶ms, image); - printf("n_image_pos: %d\n", image_embed->n_image_pos); + // printf("n_image_pos: %d\n", image_embed->n_image_pos); if (!image_embed) { std::cerr << "error: failed to load image " << image << ". 
Terminating\n\n"; @@ -575,6 +569,6 @@ int main(int argc, char ** argv) { // ctx_llava->model = NULL; // llava_free(ctx_llava); // } - printf("Remember to remove print_tensor function in xgenmm.cpp and clip.cpp\n"); + // printf("Remember to remove print_tensor function in xgenmm.cpp and clip.cpp\n"); return 0; } diff --git a/examples/xgenmm/xgenmm.cpp b/examples/xgenmm/xgenmm.cpp index b82cfa728..dd6a81e33 100644 --- a/examples/xgenmm/xgenmm.cpp +++ b/examples/xgenmm/xgenmm.cpp @@ -534,7 +534,7 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image scale_factor = (float)current_height / (float)original_height; int new_width = int(original_width * scale_factor); int padding = (current_width - new_width) / 2; - printf("new_width: %d, padding: %d\n", new_width, padding); + // printf("new_width: %d, padding: %d\n", new_width, padding); for (int i = 0; i < current_height; i++){ for (int j = 0; j < current_width; j++){ if (j < padding || j >= current_width - padding) @@ -561,33 +561,6 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image ggml_graph_compute_with_ctx(mask.ctx, gf, 1); attention_mask = gf->nodes[gf->n_nodes - 1]; // memcpy(image_embd_v_m_mask_out, (float *)attention_mask->data, ggml_nbytes(attention_mask)); - - // { - // printf((" ========================= DEBUG =========================\n")); - // printf("Load pre-computed image embeddings and attention_mask\n"); - // std::string filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_features.gguf"; - // tensor_from_gguf tensor; - // bool is_successful = load_tensor_from_file(filename.c_str(), tensor); - // if (!is_successful) - // { - // fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__); - // return 1; - // } - // result = tensor.data; - // // print_tensor(result, "result", 1); - // filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_attn_masks.gguf"; - // is_successful = 
load_tensor_from_file(filename.c_str(), tensor); - // if (!is_successful) - // { - // fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__); - // return 1; - // } - // attention_mask = tensor.data; - // // print_tensor(attention_mask, "attention_mask", 1); - // num_patches_width = 2; - // num_patches_height = 2; - // } - // compute attnetion masks outside of the graph struct ggml_tensor * attn_bias_input; @@ -639,7 +612,7 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image int batch_size = num_patches_width * num_patches_height + 1; // print_tensor(attn_bias_input, "attn_bias_input", 1); // print_tensor(result, "result", 1); - printf("batch_size: %d\n", batch_size); + // printf("batch_size: %d\n", batch_size); const bool encoded = clip_image_encode_tokenizer( ctx_clip, batch_size, result, attn_bias_input, image_embd); if (!encoded){ @@ -982,6 +955,7 @@ bool llava_image_embed_make_with_clip_img(clip_ctx *ctx_clip, int n_threads, con free(image_embd); return false; } + *image_embd_out = image_embd; *n_img_pos_out = n_img_pos;