fix memory error & clean up the print statements
parent aa23425236
commit 953bef9374

3 changed files with 36 additions and 70 deletions
@@ -1504,6 +1504,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    // kv
    const int n_kv = gguf_get_n_kv(ctx);

    // std::cout << "do I have n_kv here at clip.cpp "<< __LINE__ << "? : " << gguf_get_n_kv(ctx) <<std::endl;

    LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
        __func__, n_kv, n_tensors, fname);
    {
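Note: the n_kv read above is the value whose corruption caused the crash this commit fixes; gguf_free iterates the header's key-value table, so it must only ever run on a live context. A minimal sketch of reading GGUF metadata counts with the public gguf API (the error-handling shape is an assumption, not code from this commit):

    // Sketch: open a GGUF file and report its metadata counts.
    // Uses only the public gguf API; not code from this commit.
    #include "ggml.h" // the gguf API lived here at the time (newer trees: gguf.h)
    #include <cstdio>

    static bool dump_gguf_counts(const char * fname) {
        struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ nullptr };
        struct gguf_context * ctx = gguf_init_from_file(fname, params);
        if (!ctx) {
            fprintf(stderr, "failed to open %s\n", fname);
            return false;
        }
        const int n_kv      = gguf_get_n_kv(ctx);      // key-value metadata pairs
        const int n_tensors = gguf_get_n_tensors(ctx); // weight tensors
        printf("%s: %d kv pairs, %d tensors\n", fname, n_kv, n_tensors);
        gguf_free(ctx); // must run exactly once per context
        return true;
    }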
@@ -1981,7 +1984,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
        LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
    }

    return new_clip;
}
@@ -2424,7 +2426,6 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {

// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
    if(clip_is_minicpmv(ctx)){
        int max_slice_nums = 9;
        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
@@ -2497,7 +2498,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
            possible_resolutions.push_back({params.image_grid_pinpoints[i], params.image_grid_pinpoints[i+1]});
        }
        std::pair<int, int> best_resolution = select_best_resolution({img->nx, img->ny}, possible_resolutions);
        printf("best_resolution: %d %d\n", best_resolution.first, best_resolution.second);
        // printf("best_resolution: %d %d\n", best_resolution.first, best_resolution.second);
        // clip_image_save_to_bmp(*img, "input.bmp");
        resize_and_pad_image(*img, *temp, best_resolution); // we do not pad with mean-bg color anymore in llava-1.6
        // clip_image_save_to_bmp(*temp, "resized.bmp");
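For context, select_best_resolution implements the llava-1.6 anyres rule: among the model's grid pinpoints, prefer the candidate that preserves the most downscaled image area, breaking ties by least padding waste. A minimal sketch of that selection (assumed from the upstream llava-1.6 logic; this diff only shows the call site):

    // Sketch of anyres resolution selection; upstream behavior is assumed.
    #include <algorithm>
    #include <climits>
    #include <utility>
    #include <vector>

    static std::pair<int, int> select_best_resolution_sketch(
            const std::pair<int, int> & original_size,
            const std::vector<std::pair<int, int>> & possible_resolutions) {
        const int orig_w = original_size.first;
        const int orig_h = original_size.second;
        std::pair<int, int> best = {0, 0};
        int max_effective = 0;
        int min_wasted    = INT_MAX;
        for (const auto & res : possible_resolutions) {
            // scale so the image fits inside the candidate canvas
            const float scale = std::min((float) res.first  / orig_w,
                                         (float) res.second / orig_h);
            const int down_w = (int) (orig_w * scale);
            const int down_h = (int) (orig_h * scale);
            // effective = real image pixels kept; wasted = padding pixels
            const int effective = std::min(down_w * down_h, orig_w * orig_h);
            const int wasted    = res.first * res.second - effective;
            if (effective > max_effective ||
                (effective == max_effective && wasted < min_wasted)) {
                max_effective = effective;
                min_wasted    = wasted;
                best          = res;
            }
        }
        return best;
    }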
@@ -2624,7 +2625,6 @@ ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {

void clip_free(clip_ctx * ctx) {
    ggml_free(ctx->ctx_data);
    gguf_free(ctx->ctx_gguf);

    ggml_backend_buffer_free(ctx->params_buffer);
    ggml_backend_free(ctx->backend);
    ggml_gallocr_free(ctx->compute_alloc);
@@ -2807,10 +2807,8 @@ bool clip_image_encode_tokenizer(struct clip_ctx * ctx, int batch_size, ggml_ten
    ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
    ggml_backend_graph_compute(ctx->backend, gf);
    struct ggml_tensor * llm_inputs = gf->nodes[gf->n_nodes - 1];
    print_my_tensor(llm_inputs, "llm_inputs", 1);
    // exit(0);
    ggml_backend_tensor_get(llm_inputs, image_embd, 0, ggml_nbytes(llm_inputs));
    clip_free(ctx);
    // clip_free(ctx); // debug: llava_ctx was freed here, now free all the 'ctx' memory outside.
    return true;
}
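The hunk above is the standard ggml inference sequence: allocate tensor memory for the graph, execute it on the backend, treat the last node as the output, and copy it back to host memory. A minimal sketch of that pattern, stripped of the xgenmm specifics (graph construction elided; sizing out_buf is the caller's job):

    // Sketch of the run-and-read-back pattern used above.
    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    static bool run_graph_and_read(ggml_backend_t backend, ggml_gallocr_t alloc,
                                   struct ggml_cgraph * gf, float * out_buf) {
        if (!ggml_gallocr_alloc_graph(alloc, gf)) { // reserve memory for all nodes
            return false;
        }
        ggml_backend_graph_compute(backend, gf);    // run on the chosen backend
        // by convention the last node of the graph holds the result
        struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1];
        ggml_backend_tensor_get(out, out_buf, 0, ggml_nbytes(out)); // device -> host
        return true;
    }

Note the actual fix in this hunk: the unconditional clip_free(ctx) is replaced by a comment, moving the free to the caller so the context is not released twice.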
@@ -44,12 +44,12 @@ static bool eval_string(struct llama_context *ctx_llama, const char *str, int n_
    std::string str2 = str;
    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
    printf("!!prompt to eval!!: %s", str);
    printf("----------------------\n");
    // for (auto token : embd_inp){
    //     printf("%6d, ", token);
    // }
    printf("\n");
    // printf("!!prompt to eval!!: %s", str);
    // printf("----------------------\n");
    // // for (auto token : embd_inp){
    // //     printf("%6d, ", token);
    // // }
    // printf("\n");
    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
    return true;
}
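eval_tokens, called at the end, pushes the token ids through llama_decode in n_batch-sized chunks. A minimal sketch of that loop (assumed from the upstream llava-cli helper of this era; the four-argument llama_batch_get_one signature shown is the one in use at the time):

    // Sketch of eval_tokens-style batched decoding; upstream shape assumed.
    #include <algorithm>
    #include <vector>

    static bool eval_tokens_sketch(struct llama_context * ctx_llama,
                                   std::vector<llama_token> tokens,
                                   int n_batch, int * n_past) {
        const int N = (int) tokens.size();
        for (int i = 0; i < N; i += n_batch) {
            const int n_eval = std::min(n_batch, N - i); // last chunk may be short
            if (llama_decode(ctx_llama,
                             llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
                return false; // decode failed
            }
            *n_past += n_eval; // advance the KV-cache position
        }
        return true;
    }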
@@ -217,23 +217,23 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e
        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
        LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
        // phi3-tokenizer https://github.com/ggerganov/llama.cpp/issues/7938
        if (params->verbose_prompt)
        {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
            for (int i = 0; i < (int)tmp.size(); i++)
            {
                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
        // if (params->verbose_prompt)
        // {
        //     auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
        //     for (int i = 0; i < (int)tmp.size(); i++)
        //     {
        //         LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
        //     }
        // }
        LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
        if (params->verbose_prompt)
        {
            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int)tmp.size(); i++)
            {
                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
        // if (params->verbose_prompt)
        // {
        //     auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
        //     for (int i = 0; i < (int)tmp.size(); i++)
        //     {
        //         LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
        //     }
        // }
    }
    else
    {
@@ -280,11 +280,11 @@ static void process_prompt(struct llava_context *ctx_llava, struct llava_image_e
        response += tmp;
        // printf("%s", tmp);
        if (strcmp(tmp, "<|end|>") == 0){
            printf("\n STOP GENERATING because I saw <|end|>\n");
            // printf("\n STOP GENERATING because I saw <|end|>\n");
            break;
        }
        if (strcmp(tmp, "</s>") == 0) {
            printf("\n STOP GENERATING because I saw </s>\n");
            // printf("\n STOP GENERATING because I saw </s>\n");
            break;
        }
        if (strstr(tmp, "###")) break; // Yi-VL behavior
@@ -327,7 +327,6 @@ static struct llava_context *llava_init_context(gpt_params *params, llama_model
    }

    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/1);

    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
    ctx_params.n_ctx =
        params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@@ -349,11 +348,7 @@ static struct llava_context *llava_init_context(gpt_params *params, llama_model
}

static void llava_free(struct llava_context * ctx_llava) {
    if (ctx_llava->ctx_clip) {
        printf(
            "YD:::Segmentation fault here; Because header.n_kv is empty\n clip_free->gguf_free(ctx->ctx_gguf)-> for "
            "(uint64_t i = 0; i < ctx->header.n_kv; ++i)\n");
        exit(1);
    if (ctx_llava->ctx_clip) {
        clip_free(ctx_llava->ctx_clip);
        ctx_llava->ctx_clip = NULL;
    }
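This is the heart of the memory fix. The deleted debug printf records the symptom: gguf_free walks for (uint64_t i = 0; i < ctx->header.n_kv; ++i), and when the clip_ctx was already freed inside clip_image_encode_tokenizer, header.n_kv is garbage and the loop reads freed memory. The replacement is the usual guard-and-null teardown; a minimal sketch of why nulling the pointer matters (hypothetical resource type, not the literal code):

    // Sketch: free-then-null makes teardown safe to reach twice.
    // Hypothetical type; the real code guards ctx_llava->ctx_clip.
    #include <cstdlib>

    struct resource { unsigned n_kv; /* ... */ };

    static void teardown(resource ** r) {
        if (*r) {           // a second call sees NULL and does nothing
            free(*r);
            *r = nullptr;   // without this, a later teardown would read
        }                   // (*r)->n_kv from freed memory, as in the bug
    }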
@@ -528,7 +523,6 @@ int main(int argc, char ** argv) {
    if (prompt_contains_image(params.prompt))
    {
        auto ctx_llava = llava_init_context(&params, model);

        auto image_embed = load_image(ctx_llava, &params, "");

        // process the prompt
@@ -543,11 +537,11 @@ int main(int argc, char ** argv) {
    {
        for (auto &image : params.image)
        {
            printf("image: %s\n", image.c_str());
            // printf("image: %s\n", image.c_str());
            auto ctx_llava = llava_init_context(&params, model);

            auto image_embed = load_image(ctx_llava, &params, image);
            printf("n_image_pos: %d\n", image_embed->n_image_pos);
            // printf("n_image_pos: %d\n", image_embed->n_image_pos);
            if (!image_embed)
            {
                std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
@@ -575,6 +569,6 @@ int main(int argc, char ** argv) {
    //     ctx_llava->model = NULL;
    //     llava_free(ctx_llava);
    // }
    printf("Remember to remove print_tensor function in xgenmm.cpp and clip.cpp\n");
    // printf("Remember to remove print_tensor function in xgenmm.cpp and clip.cpp\n");
    return 0;
}
@@ -534,7 +534,7 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
        scale_factor = (float)current_height / (float)original_height;
        int new_width = int(original_width * scale_factor);
        int padding = (current_width - new_width) / 2;
        printf("new_width: %d, padding: %d\n", new_width, padding);
        // printf("new_width: %d, padding: %d\n", new_width, padding);
        for (int i = 0; i < current_height; i++){
            for (int j = 0; j < current_width; j++){
                if (j < padding || j >= current_width - padding)
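The padding arithmetic above follows from the aspect-preserving resize: the image was scaled by height, so new_width = original_width * current_height / original_height, and (current_width - new_width) / 2 columns on each side are padding. A small worked example with invented numbers:

    // Worked example (invented numbers): original image 600x800,
    // padded canvas 336x336, scaled by height.
    float scale_factor = 336.0f / 800.0f;          // 0.42
    int   new_width    = int(600 * scale_factor);  // 252 real image columns
    int   padding      = (336 - 252) / 2;          // 42 pad columns per side
    // columns j < 42 or j >= 336 - 42 are padding and get masked out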
@@ -561,33 +561,6 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
    ggml_graph_compute_with_ctx(mask.ctx, gf, 1);
    attention_mask = gf->nodes[gf->n_nodes - 1];
    // memcpy(image_embd_v_m_mask_out, (float *)attention_mask->data, ggml_nbytes(attention_mask));

    // {
    //     printf((" ========================= DEBUG =========================\n"));
    //     printf("Load pre-computed image embeddings and attention_mask\n");
    //     std::string filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_features.gguf";
    //     tensor_from_gguf tensor;
    //     bool is_successful = load_tensor_from_file(filename.c_str(), tensor);
    //     if (!is_successful)
    //     {
    //         fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
    //         return 1;
    //     }
    //     result = tensor.data;
    //     // print_tensor(result, "result", 1);
    //     filename = "/export/home/ggml/examples/projectors/receipt_5patches_vision_attn_masks.gguf";
    //     is_successful = load_tensor_from_file(filename.c_str(), tensor);
    //     if (!is_successful)
    //     {
    //         fprintf(stderr, "%s: load_tensor_from_file() failed\n", __func__);
    //         return 1;
    //     }
    //     attention_mask = tensor.data;
    //     // print_tensor(attention_mask, "attention_mask", 1);
    //     num_patches_width = 2;
    //     num_patches_height = 2;
    // }

    // compute attention masks outside of the graph
    struct ggml_tensor * attn_bias_input;
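Computing the attention mask outside the graph means materializing a plain additive-bias tensor before graph build: kept positions contribute 0 to the pre-softmax logits, padded positions a large negative value so softmax zeroes them. A minimal sketch of that conversion (layout assumed; not the literal xgenmm code):

    // Sketch: turn a 0/1 keep-mask into an additive attention bias.
    #include <cmath>
    #include <vector>

    static std::vector<float> mask_to_attn_bias(const std::vector<float> & keep) {
        std::vector<float> bias(keep.size());
        for (size_t i = 0; i < keep.size(); ++i) {
            // 1.0 -> attend (bias 0); 0.0 -> padding (-inf before softmax)
            bias[i] = keep[i] > 0.5f ? 0.0f : -INFINITY;
        }
        return bias; // uploaded into attn_bias_input, e.g. via ggml_backend_tensor_set
    }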
@@ -639,7 +612,7 @@ static bool clip_xgenmm_handle_vit_patches(clip_ctx *ctx_clip , const clip_image
    int batch_size = num_patches_width * num_patches_height + 1;
    // print_tensor(attn_bias_input, "attn_bias_input", 1);
    // print_tensor(result, "result", 1);
    printf("batch_size: %d\n", batch_size);
    // printf("batch_size: %d\n", batch_size);
    const bool encoded = clip_image_encode_tokenizer(
        ctx_clip, batch_size, result, attn_bias_input, image_embd);
    if (!encoded){
@@ -982,6 +955,7 @@ bool llava_image_embed_make_with_clip_img(clip_ctx *ctx_clip, int n_threads, con
        free(image_embd);
        return false;
    }

    *image_embd_out = image_embd;
    *n_img_pos_out = n_img_pos;
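This tail shows the ownership contract that pairs with the double-free fix: on failure the function frees its own allocation and returns false; on success the buffer is handed to the caller through the out parameters, and only the caller frees it, exactly once. A minimal sketch of the contract (hypothetical names; the real function is llava_image_embed_make_with_clip_img):

    // Sketch of the out-parameter ownership contract.
    #include <cstdlib>

    static bool make_embedding(float ** embd_out, int * n_pos_out) {
        const int n = 1024;                            // hypothetical size
        float * embd = (float *) malloc(n * sizeof(float));
        if (!embd) return false;
        const bool ok = true;                          // stands in for the encode step
        if (!ok) {
            free(embd);    // failure: we still own the buffer, so we free it
            return false;  // out parameters are left untouched
        }
        *embd_out  = embd; // success: ownership transfers to the caller,
        *n_pos_out = n;    // who must free(*embd_out) exactly once
        return true;
    }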