diff --git a/examples/xgenmm/clip.cpp b/examples/xgenmm/clip.cpp
index 5d73ef7a0..fdd489004 100644
--- a/examples/xgenmm/clip.cpp
+++ b/examples/xgenmm/clip.cpp
@@ -667,22 +667,21 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     struct ggml_tensor * embeddings = inp;
     struct ggml_tensor * pos_embed = nullptr;
 
     if (ctx->has_llava_projector) {
-        printf("use has_llava_projector\n");
+        printf(" use has_llava_projector\n");
         // concat class_embeddings and patch_embeddings
         if (ctx->has_class_embedding) {
-            printf("I am in!\n");
             embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-            printf("created embeddings new 3d tensors\n");
             ggml_set_name(embeddings, "embeddings");
             ggml_set_input(embeddings);
-            printf("ggml_set_input\n");
             embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+            printf(" first acc worked\n");
             embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+            printf(" second acc worked\n");
         }
     }
-    // printf("hi1!");
+    // printf(" after ctx->has_llava_projector\n");
 
     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
     ggml_set_input(positions);
@@ -2500,6 +2499,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     // build the inference graph
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs, ctx->load_image_size, true);
+    printf(" build graph done\n");
     ggml_gallocr_alloc_graph(ctx->compute_alloc, gf);
 
     // set inputs
     const auto & model = ctx->vision_model;
@@ -2546,6 +2546,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
         free(data);
     }
+    printf(" before ctx->has_minicpmv_projector\n");
     if (ctx->has_minicpmv_projector) {
         {
             // inspired from siglip:
@@ -2638,7 +2639,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
     }
 #endif
+    printf(" before ggml_backend_graph_compute\n");
     ggml_backend_graph_compute(ctx->backend, gf);
+    printf(" after ggml_backend_graph_compute\n");
     // the last node is the embedding tensor
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
     // copy the embeddings to the location passed by the user
diff --git a/examples/xgenmm/convert.sh b/examples/xgenmm/convert.sh
index e62e71a5f..3c05c73f4 100644
--- a/examples/xgenmm/convert.sh
+++ b/examples/xgenmm/convert.sh
@@ -1,12 +1,12 @@
-# source /export/share/yutong/miniconda3/bin/activate
-# conda activate xgenmm-flamingo
+source /export/share/yutong/miniconda3/bin/activate
+conda activate xgenmm-flamingo
 # which python
 
 # # step 1: surgery
 # python xgenmm_surgery.py
 
 # step 2: convert to gguf (vit + projector)
-python examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py \
+python xgenmm_convert_image_encoder_to_gguf.py \
     --surgery_dir /export/share/yutong/xgenmm/llamacpp_wd \
     --output_dirname gguf_test \
     --version siglip_kosmos_phi3_4k_instruct \
diff --git a/examples/xgenmm/test_anyres_handle_patches.cpp b/examples/xgenmm/test_anyres_handle_patches.cpp
index 53fc5164d..ace927b1f 100644
--- a/examples/xgenmm/test_anyres_handle_patches.cpp
+++ b/examples/xgenmm/test_anyres_handle_patches.cpp
@@ -502,33 +502,34 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 int main(){
-    const char* clip_path = "/export/share/yutong/xgenmm/llamacpp_wd/llava-1.6/vit/mmproj-model-f16.gguf";
+    // const char* clip_path = "/export/share/yutong/xgenmm/llamacpp_wd/llava-1.6/vit/mmproj-model-f16.gguf";
+    const char* clip_path = "/export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf";
     struct clip_ctx * ctx = clip_model_load(clip_path, /*verbosity=*/2);
-    printf("Model loaded\n");
-    for (int i=0; i < 3; i++){
-        ctx->image_mean[i] = 0.5;
-        ctx->image_std[i] = 0.5;
-    }
-    LOG_TEE("v_image_mean %f %f %f\n", ctx->image_mean[0], ctx->image_mean[1], ctx->image_mean[2]);
-    LOG_TEE("v_image_std %f %f %f\n", ctx->image_std[0], ctx->image_std[1], ctx->image_std[2]);
-    // [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
-    ctx->vision_model.hparams.image_grid_pinpoints[0] = 384;
-    ctx->vision_model.hparams.image_grid_pinpoints[1] = 768;
-    ctx->vision_model.hparams.image_grid_pinpoints[2] = 768;
-    ctx->vision_model.hparams.image_grid_pinpoints[3] = 384;
-    ctx->vision_model.hparams.image_grid_pinpoints[4] = 768;
-    ctx->vision_model.hparams.image_grid_pinpoints[5] = 768;
-    ctx->vision_model.hparams.image_grid_pinpoints[6] = 1152;
-    ctx->vision_model.hparams.image_grid_pinpoints[7] = 384;
-    ctx->vision_model.hparams.image_grid_pinpoints[8] = 384;
-    ctx->vision_model.hparams.image_grid_pinpoints[9] = 1152;
-    for (int i = 0; i < 10; i++)
-    {
-        printf("grid[%d]:%d ", i, ctx->vision_model.hparams.image_grid_pinpoints[i]);
-    }
-    printf("\n");
-    ctx->vision_model.hparams.image_size = 384;
-    printf("in test_anyres: params.image_size:%d\n", ctx->vision_model.hparams.image_size);
+    // printf("Model loaded\n");
+    // for (int i=0; i < 3; i++){
+    //     ctx->image_mean[i] = 0.5;
+    //     ctx->image_std[i] = 0.5;
+    // }
+    // LOG_TEE("v_image_mean %f %f %f\n", ctx->image_mean[0], ctx->image_mean[1], ctx->image_mean[2]);
+    // LOG_TEE("v_image_std %f %f %f\n", ctx->image_std[0], ctx->image_std[1], ctx->image_std[2]);
+    // // [[384, 768], [768, 384], [768, 768], [1152, 384], [384, 1152]]
+    // ctx->vision_model.hparams.image_grid_pinpoints[0] = 384;
+    // ctx->vision_model.hparams.image_grid_pinpoints[1] = 768;
+    // ctx->vision_model.hparams.image_grid_pinpoints[2] = 768;
+    // ctx->vision_model.hparams.image_grid_pinpoints[3] = 384;
+    // ctx->vision_model.hparams.image_grid_pinpoints[4] = 768;
+    // ctx->vision_model.hparams.image_grid_pinpoints[5] = 768;
+    // ctx->vision_model.hparams.image_grid_pinpoints[6] = 1152;
+    // ctx->vision_model.hparams.image_grid_pinpoints[7] = 384;
+    // ctx->vision_model.hparams.image_grid_pinpoints[8] = 384;
+    // ctx->vision_model.hparams.image_grid_pinpoints[9] = 1152;
+    // for (int i = 0; i < 10; i++)
+    // {
+    //     printf("grid[%d]:%d ", i, ctx->vision_model.hparams.image_grid_pinpoints[i]);
+    // }
+    // printf("\n");
+    // ctx->vision_model.hparams.image_size = 384;
+    // printf("in test_anyres: params.image_size:%d\n", ctx->vision_model.hparams.image_size);
 
     /*
         part of:
         llava_image_embed_make_with_filename
@@ -618,6 +619,7 @@ int main(){
     printf("image_embd_v.size():%d\n", image_embd_v.size());
     for (size_t i = 0; i < img_res_v.size; i++)
     {
+        printf("encode patch %zu\n", i);
         image_embd_v[i] = (float*)malloc(clip_embd_nbytes(ctx_clip));  // 576 patches * 4096 embeddings * 4 bytes = 9437184
         const bool encoded = clip_image_encode(
diff --git a/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py b/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py
index 2699ab131..a5b3026cf 100644
--- a/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py
+++ b/examples/xgenmm/xgenmm_convert_image_encoder_to_gguf.py
@@ -267,6 +267,8 @@ if __name__ == "__main__":
     # ggml implements gelu_with_tanh approximation
     use_gelu = "gelu" in vision_config["hidden_act"].lower()
     fout.add_bool("clip.use_gelu", use_gelu)
+    fout.add_string("clip.vision.mm_patch_merge_type", 'spatial_unpad')
+    print("hard coded mm_patch_merge_type as spatial_unpad")
 
     # for VIT model
     with print_time("Loading vision encoder and converting to gguf"):
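
For context on the last hunk: the converter writes the new clip.vision.mm_patch_merge_type key through gguf-py's GGUFWriter. Below is a minimal standalone sketch of that write path, assuming the gguf Python package from llama.cpp's gguf-py is installed; the --mm-patch-merge-type flag and the output filename are hypothetical illustrations, not part of this patch, which simply hard-codes 'spatial_unpad'.

import argparse

import gguf

# parse an optional merge-type flag instead of hard-coding the value
parser = argparse.ArgumentParser()
parser.add_argument("--mm-patch-merge-type", default="spatial_unpad")
parser.add_argument("--output", default="mmproj-kv-only.gguf")
args = parser.parse_args()

# open a gguf file with the 'clip' architecture and write the metadata key
fout = gguf.GGUFWriter(args.output, arch="clip")
fout.add_string("clip.vision.mm_patch_merge_type", args.mm_patch_merge_type)

# standard GGUFWriter flush sequence: header, kv data, then (empty) tensor data
fout.write_header_to_file()
fout.write_kv_data_to_file()
fout.write_tensors_to_file()
fout.close()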