diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 24073c5a9..1add69d43 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1308,6 +1308,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { LOG_INF("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); LOG_INF("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); LOG_INF("%s: minicpmv_projector: %d\n", __func__, new_clip->has_minicpmv_projector); + LOG_INF("%s: minicpmv_version: %d\n", __func__, new_clip->minicpmv_version); LOG_INF("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); } diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 53d902d61..23b3de4db 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -148,19 +148,34 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); if (num_image_embeds > 1) { - size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { - for (size_t j = 0; j < num_image_embeds_col; ++j) { - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (j == num_image_embeds_col - 1) { - eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); + if (has_minicpmv_projector == 2) { + size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { + for (size_t j = 0; j < num_image_embeds_col; ++j) { + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + if (j == num_image_embeds_col - 1) { + eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); + } + } + } + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + } + else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) { + size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); + for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { + for (size_t j = 0; j < num_image_embeds_col; ++j) { + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); + eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); + if (j == num_image_embeds_col - 1) { + eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); + } } } } - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); } LOG_INF("%s: image token past: %d\n", __func__, n_past); } diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py index 9b196757f..cfe0961f9 100644 --- a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py +++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py @@ -597,7 +597,6 @@ elif args.minicpmv_projector is not None: fname_middle = "mmproj-" has_text_encoder = False has_minicpmv_projector = True - minicpmv_version = 4 elif args.vision_only: fname_middle = "vision-" has_text_encoder = False