From 2327897175f1738b50fd490c0c02e8950243b63b Mon Sep 17 00:00:00 2001
From: Alex-Brooks
Date: Mon, 10 Feb 2025 07:04:08 -0700
Subject: [PATCH] Cleanup logs

Signed-off-by: Alex-Brooks
---
 examples/llava/clip.cpp                      | 53 +++----------------
 .../llava/convert_image_encoder_to_gguf.py   |  1 -
 2 files changed, 8 insertions(+), 46 deletions(-)

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 44a65e40b..9d2922dd3 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -121,6 +121,7 @@ static std::string format(const char * fmt, ...) {
 #define KEY_IMAGE_STD "clip.vision.image_std"
 #define KEY_PROJ_TYPE "clip.projector_type"
 #define KEY_VISION_FEATURE_LAYER "clip.vision.feature_layer"
+
 #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
 #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
@@ -170,6 +171,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_GLM_BOI_W "adapter.boi"
 #define TN_GLM_EOI_W "adapter.eoi"
 
+
 enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
@@ -429,6 +431,7 @@ static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u
 //
 // clip layers
 //
+
 struct clip_hparams {
     int32_t image_size;
     int32_t patch_size;
@@ -614,7 +617,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         LOG_ERR("This gguf file seems to have no vision encoder\n");
         return nullptr;
     }
-    LOG_INF("In the graph builder...\n");
 
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
@@ -666,11 +668,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
         /*.no_alloc =*/ true,
     };
-    LOG_INF("Making the graph...\n");
 
     struct ggml_context * ctx0 = ggml_init(params);
     struct ggml_cgraph * gf = ggml_new_graph(ctx0);
-    LOG_INF("Graph made...\n");
 
     struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
     ggml_set_name(inp_raw, "inp_raw");
@@ -753,7 +753,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
     }
 
-    LOG_INF("About to iterate over layers...\n");
     // Check to see we have 1+ set vision feature layers set; otherwise it's the last layer
     std::vector<struct ggml_tensor *> embedding_stack;
 
@@ -771,7 +770,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     if(max_feature_layer < 0) {
         max_feature_layer = n_layer;
     }
-    LOG_INF("Number of feature layers: %d\n", max_feature_layer);
 
     // loop over layers
     for (int il = 0; il < max_feature_layer; il++) {
@@ -781,7 +779,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         // NOTE: 0 index here refers to the input to the encoder.
         for(int vf_layer_idx = 0; vf_layer_idx < 4; vf_layer_idx++) {
             if (il == ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx]) {
-                LOG_INF("Keeping vision feature layer: %d\n", il);
                 embedding_stack.push_back(embeddings);
                 break;
             }
@@ -878,7 +875,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
     // post-layernorm
     if (ctx->has_post_norm && max_feature_layer == n_layer) {
-        LOG_INF("POST NORMALIZING");
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "post_ln");
 
@@ -888,7 +884,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     // final layer is a vision feature layer
     for(int vf_layer_idx = 0; vf_layer_idx < 4; vf_layer_idx++) {
         if (n_layer == ctx->vision_model.hparams.vision_feature_layer[vf_layer_idx]) {
-            LOG_INF("Keeping vision feature layer : %d\n", n_layer);
             embedding_stack.push_back(embeddings);
             break;
         }
@@ -896,14 +891,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
     // If feature layers are explicitly set, stack them (if we have multiple)
     if(has_feature_layers && embedding_stack.size() > 0) {
-        LOG_INF("Stacking vision feature layers : %d\n", n_layer);
         embeddings = embedding_stack.at(0);
         for(unsigned long i=1; i < embedding_stack.size(); i++) {
             embeddings = ggml_concat(ctx0, embeddings, embedding_stack.at(i), 0);
         }
     }
 
-    LOG_INF("Layer loop over - trying to llava project...\n");
     // llava projector
     if (ctx->has_llava_projector) {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -920,10 +913,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
         // llava projector
         if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
-            LOG_INF("---- MLP projector ----");
-            LOG_INF("proj mlp: mm 0 shape: [%d, %d, %d, %d] | embedding shape: [%d, %d, %d, %d]\n", model.mm_0_w->ne[0], model.mm_0_w->ne[1], model.mm_0_w->ne[2], model.mm_0_w->ne[3], embeddings->ne[0], embeddings->ne[1], embeddings->ne[2], embeddings->ne[3]);
             embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-            LOG_INF("proj mlp - first mulmat done\n");
             embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
 
             embeddings = ggml_gelu(ctx0, embeddings);
@@ -931,7 +921,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
 
         } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
-            LOG_INF("proj mlp norm\n");
             embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
             embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
             // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
@@ -1203,14 +1192,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);
     }
-    LOG_INF("forward expanding\n");
 
     // build the graph
     ggml_build_forward_expand(gf, embeddings);
-    LOG_INF("forward expand done\n");
 
     ggml_free(ctx0);
-    LOG_INF("freeing it all\n");
 
     return gf;
 }
@@ -1478,7 +1464,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         }
         fin.close();
     }
-    LOG_INF("%s: We are up to the vision model\n", __func__);
+
     // vision model
     if (new_clip->has_vision_encoder) {
         // load vision model
@@ -1497,31 +1483,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            int idx = get_key_idx(ctx, KEY_IMAGE_GRID_PINPOINTS);
            int n = gguf_get_arr_n(ctx, idx);
            const int32_t * pinpoints = (const int32_t *)gguf_get_arr_data(ctx, idx);
-           LOG_INF("Grid pinpoints | max %d | actual %d ", 64, n);
            for (int i = 0; i < 64 && i < n && pinpoints[i] != 0; ++i) {
-               LOG_INF(" %d ", i);
                hparams.image_grid_pinpoints[i] = pinpoints[i];
            }
-           LOG_INF("\n");
            if (n < 64)
                hparams.image_grid_pinpoints[n] = 0;
        } catch (std::runtime_error & /*e*/) {
            hparams.image_grid_pinpoints[0]=0;
        }
 
-       // Load the vision feature layer indices; For most models, this will be
-       // an array of length one with value -1 (i.e., use last layer as visual features),
-       // but for IBM granite, we have multiple feature layers that get concatenated.
-       //
-       // Here, we should standardize all values to uint values so that we can use -1 as unset values.
+       // Load the vision feature layer indices if they are explicitly provided;
+       // if multiple vision feature layers are present, the values will be concatenated
+       // to form the final visual features.
        try {
-           LOG_INF("ABOUT TO GET VISION FEATURE LAYER KEYS\n");
            int idx = get_key_idx(ctx, KEY_VISION_FEATURE_LAYER);
-           LOG_INF("VISION FEATURE LAYER IDX %d\n", idx);
            int n = gguf_get_arr_n(ctx, idx);
-           LOG_INF("GETTING %d VISION FEATURE LAYERS \n", n);
-           // TODO - fix this
-           LOG_INF("n_layer in hparams is: %d\n", hparams.n_layer);
            const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
 
            // HACK - need to set a good invalid number here; or maybe not, I guess it could just
@@ -1529,12 +1505,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            // -1 is the sad one
            for (int i = 0; i < 4 && i < n && vision_feature_layer[i] != 0; ++i) {
                hparams.vision_feature_layer[i] = vision_feature_layer[i];
-               LOG_INF("feature layer %d - %d | ", i, vision_feature_layer[i]);
            }
            if (n < 4)
                hparams.vision_feature_layer[n] = -1;
        } catch (std::runtime_error & /*e*/) {
-           LOG_INF("VISION FEATURE LAYER RETRIEVAL FAILED");
            hparams.vision_feature_layer[0] = -1;
        }
 
@@ -1595,8 +1569,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            new_clip->has_class_embedding = false;
        }
 
-       LOG_INF("Has class embedding: %d", new_clip->has_class_embedding);
-
        try {
            vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
            vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
@@ -1631,7 +1603,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        } catch(const std::exception& /*e*/) {
            new_clip->has_qwen2vl_merger = false;
        }
-       LOG_INF("Loaded up to llava projection");
 
        // LLaVA projection
        if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
@@ -1769,7 +1740,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
 
     new_clip->ctx_gguf = ctx;
 
-    LOG_INF("About to measure memory and build graphs...\n");
     // measure mem requirement and allocate
     {
         new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
@@ -1777,7 +1747,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         clip_image_f32_batch batch;
         batch.size = 1;
         batch.data = nullptr;
-        LOG_INF("Entering graph...\n");
         ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch, nullptr, false);
         ggml_gallocr_reserve(new_clip->compute_alloc, gf);
         size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
@@ -2656,10 +2625,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
 
     // build the inference graph
-    LOG_INF("Doing a batch encode\n");
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
     ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
-    LOG_INF("did graph alloc\n");
 
     // set inputs
     const auto & model = ctx->vision_model;
@@ -2819,22 +2786,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             }
         }
     }
-    LOG_INF("about to do backend graph compute\n");
 
     if (ggml_backend_is_cpu(ctx->backend)) {
         ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
     }
-    LOG_INF("-----\n");
+
     ggml_backend_graph_compute(ctx->backend, gf);
-    LOG_INF("did backend graph compute\n");
 
     // the last node is the embedding tensor
     struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
-    LOG_INF("retrieved emb tensor\n");
 
     // copy the embeddings to the location passed by the user
     ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
-    LOG_INF("embeddings have been recopied\n");
 
     if (ctx->has_glm_projector) {
         //eoi
diff --git a/examples/llava/convert_image_encoder_to_gguf.py b/examples/llava/convert_image_encoder_to_gguf.py
index 414eb2838..df56bf789 100644
--- a/examples/llava/convert_image_encoder_to_gguf.py
+++ b/examples/llava/convert_image_encoder_to_gguf.py
@@ -342,7 +342,6 @@
 fout.add_bool("clip.use_gelu", use_gelu)
 
 if has_llava_projector:
-    # model.vision_model.encoder.layers.pop(-1)
     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
         name = get_tensor_name(name)
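Reviewer note, for context only (not part of the patch): the code these deleted logs were tracing selects the hidden states at the requested vision feature layers, pushes them onto embedding_stack, and finally concatenates the kept states along the feature dimension with ggml_concat. The sketch below is a minimal, stand-alone C++ illustration of that selection-and-stacking flow under simplifying assumptions: run_layer(), the hard-coded layer indices, and the plain float vectors are hypothetical stand-ins for the real encoder layers and ggml tensors in clip.cpp.

// Simplified sketch of the feature-layer selection/stacking idea (not part of the patch).
#include <cstdio>
#include <vector>

// Hypothetical stand-in for one encoder layer: produces the next hidden state.
static std::vector<float> run_layer(const std::vector<float> & in, int il) {
    std::vector<float> out(in.size());
    for (size_t i = 0; i < in.size(); ++i) {
        out[i] = in[i] + float(il); // dummy transform
    }
    return out;
}

int main() {
    // Requested feature layers (mirrors hparams.vision_feature_layer[4]); -1 would mark unused slots.
    const int vision_feature_layer[4] = {3, 7, 15, 26};
    const int n_layer = 27;

    // Find the deepest requested layer so deeper layers can be skipped,
    // falling back to the full depth when nothing is requested.
    int max_feature_layer = -1;
    for (int i = 0; i < 4; ++i) {
        if (vision_feature_layer[i] > max_feature_layer) {
            max_feature_layer = vision_feature_layer[i];
        }
    }
    if (max_feature_layer < 0) {
        max_feature_layer = n_layer;
    }

    std::vector<float> embeddings(8, 1.0f);          // stand-in for the patch embeddings
    std::vector<std::vector<float>> embedding_stack; // hidden states kept for stacking

    for (int il = 0; il < max_feature_layer; ++il) {
        // Keep the input to layer il if il is a requested feature layer
        // (index 0 refers to the encoder input, matching the NOTE in the patch).
        for (int i = 0; i < 4; ++i) {
            if (il == vision_feature_layer[i]) {
                embedding_stack.push_back(embeddings);
                break;
            }
        }
        embeddings = run_layer(embeddings, il);
    }

    // Stack the kept states by concatenating along the feature dimension,
    // the role ggml_concat(ctx0, ..., 0) plays in the graph builder.
    std::vector<float> stacked;
    for (const auto & e : embedding_stack) {
        stacked.insert(stacked.end(), e.begin(), e.end());
    }

    printf("kept %zu layers, stacked feature size = %zu\n",
           embedding_stack.size(), stacked.size());
    return 0;
}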