diff --git a/examples/omni-vlm/clip.cpp b/examples/omni-vlm/clip.cpp
index e8a04ad3a..618067aba 100644
--- a/examples/omni-vlm/clip.cpp
+++ b/examples/omni-vlm/clip.cpp
@@ -701,7 +701,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 }

 // read and create ggml_context containing the tensors and their data
-struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+struct clip_ctx * clip_model_load(const char * fname, const char * omni_vlm_version, const int verbosity = 1) {
     struct ggml_context * meta = NULL;

     struct gguf_init_params params = {
@@ -796,6 +796,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }

     clip_ctx * new_clip = new clip_ctx{};
+    if (std::string(omni_vlm_version) == "vlm-81-ocr") {
+        new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
+    } else if (std::string(omni_vlm_version) == "vlm-81-instruct") {
+        new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
+    } else if (std::string(omni_vlm_version) == "nano-vlm-instruct") {
+        new_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
+    } else {
+        throw std::runtime_error(std::string("error vlm version info: ") + omni_vlm_version);
+    }

     // update projector type
     {
@@ -1209,17 +1218,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     return new_clip;
 }

-void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params) {
-    if (params->omni_vlm_version == "vlm-81-ocr") {
-        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
-    } else if (params->omni_vlm_version == "vlm-81-instruct") {
-        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
-    } else if (params->omni_vlm_version == "nano-vlm-instruct") {
-        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
-    } else {
-        throw std::runtime_error(std::string("error vlm version info: ") + params->omni_vlm_version);
-    }
-}
+// void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params) {
+//     if (params->omni_vlm_version == "vlm-81-ocr") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
+//     } else if (params->omni_vlm_version == "vlm-81-instruct") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
+//     } else if (params->omni_vlm_version == "nano-vlm-instruct") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
+//     } else {
+//         throw std::runtime_error(std::string("error vlm version info: ") + params->omni_vlm_version);
+//     }
+// }

 void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
     ctx_clip->load_image_size = load_image_size;
@@ -2207,13 +2216,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     return true;
 }

-bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
+bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype, const char* omni_vlm_version) {
     ggml_type type = GGML_TYPE_Q4_1;

     assert(itype < GGML_TYPE_COUNT);
     type = static_cast<ggml_type>(itype);

-    auto * ctx_clip = clip_model_load(fname_inp, 2);
+    auto * ctx_clip = clip_model_load(fname_inp, omni_vlm_version, 2);

     const auto & ctx_src = ctx_clip->ctx_gguf;
     const auto & ctx_data = ctx_clip->ctx_data;
diff --git a/examples/omni-vlm/clip.h b/examples/omni-vlm/clip.h
index ae004b6df..cd4007a9e 100644
--- a/examples/omni-vlm/clip.h
+++ b/examples/omni-vlm/clip.h
@@ -39,11 +39,11 @@ struct clip_image_f32_batch {
     size_t size;
 };

-CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity);
+CLIP_API struct clip_ctx * clip_model_load    (const char * fname, const char * omni_vlm_version, int verbosity);
 CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);

-struct gpt_params;
-CLIP_API void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params);
+// struct gpt_params;
+// CLIP_API void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params);

 CLIP_API void clip_free(struct clip_ctx * ctx);

@@ -86,7 +86,7 @@ CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ct
 CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
 CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);

-CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
+CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype, const char * omni_vlm_version);

 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);

diff --git a/examples/omni-vlm/omni-vlm-cli.cpp b/examples/omni-vlm/omni-vlm-cli.cpp
index 40e65b339..d24634fe8 100644
--- a/examples/omni-vlm/omni-vlm-cli.cpp
+++ b/examples/omni-vlm/omni-vlm-cli.cpp
@@ -219,8 +219,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
         prompt = "describe the image in detail.";
     }

-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 0);
-    clip_set_omni_vlm_version(ctx_clip, params);
+    auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0);
+    // clip_set_omni_vlm_version(ctx_clip, params);

     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
     ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
diff --git a/examples/omni-vlm/omni-vlm-wrapper.cpp b/examples/omni-vlm/omni-vlm-wrapper.cpp
index 9a74e73d1..ba0749d06 100644
--- a/examples/omni-vlm/omni-vlm-wrapper.cpp
+++ b/examples/omni-vlm/omni-vlm-wrapper.cpp
@@ -65,8 +65,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
         prompt = "describe the image in detail.";
     }

-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 0);
-    clip_set_omni_vlm_version(ctx_clip, params);
+    auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0);
+    // clip_set_omni_vlm_version(ctx_clip, params);

     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);

@@ -280,7 +280,10 @@ const char* omnivlm_inference(const char *prompt, const char *imag_path) {

 void omnivlm_free() {
     if(internal_chars != nullptr) { free(internal_chars); }
-    ctx_omnivlm->model = NULL;
-    omnivlm_free(ctx_omnivlm);
+    if(ctx_omnivlm != nullptr) {
+        // this snippet should never be run!
+        ctx_omnivlm->model = nullptr;
+        omnivlm_free(ctx_omnivlm);
+    }
     llama_free_model(model);
 }
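
Note on usage (not part of the patch): with clip_set_omni_vlm_version() commented out, the version string is passed directly to clip_model_load() and validated there; the loader throws std::runtime_error for anything other than "vlm-81-ocr", "vlm-81-instruct", or "nano-vlm-instruct". Below is a minimal caller sketch under those assumptions; the helper name load_clip_or_null, the mmproj_path argument, and the try/catch wrapper are illustrative and do not appear in this diff.

#include <cstdio>
#include <stdexcept>
#include "clip.h"

// Hypothetical helper around the new clip_model_load() signature. "mmproj_path" and
// "version" are supplied by the caller (e.g. params->omni_vlm_version.c_str(), as in
// omnivlm_init_context()); valid versions in this patch are "vlm-81-ocr",
// "vlm-81-instruct", and "nano-vlm-instruct".
static clip_ctx * load_clip_or_null(const char * mmproj_path, const char * version) {
    try {
        // The version check now happens inside clip_model_load(); there is no separate
        // clip_set_omni_vlm_version() call to make afterwards.
        return clip_model_load(mmproj_path, version, /*verbosity=*/ 0);
    } catch (const std::runtime_error & e) {
        fprintf(stderr, "%s\n", e.what()); // "error vlm version info: <value>" for an unknown string
        return nullptr;
    }
}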