[omni-vlm] fixed the segmentation fault issue in nano-vlm-instruct (WIP, current solution is still not perfect)

李为 2024-11-12 14:17:42 +08:00
parent 362bdf3292
commit 55953d35a4
4 changed files with 36 additions and 24 deletions


@@ -701,7 +701,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 }
 // read and create ggml_context containing the tensors and their data
-struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+struct clip_ctx * clip_model_load(const char * fname, const char * omni_vlm_version, const int verbosity = 1) {
     struct ggml_context * meta = NULL;
     struct gguf_init_params params = {
@@ -796,6 +796,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }
     clip_ctx * new_clip = new clip_ctx{};
+    if (std::string(omni_vlm_version) == "vlm-81-ocr") {
+        new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
+    } else if (std::string(omni_vlm_version) == "vlm-81-instruct") {
+        new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
+    } else if (std::string(omni_vlm_version) == "nano-vlm-instruct") {
+        new_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
+    } else {
+        throw std::runtime_error(std::string("error vlm version info: ") + omni_vlm_version);
+    }
     // update projector type
     {
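
For orientation, the check just added maps the user-facing version string to an internal enum once, at load time, and fails fast on anything unrecognized. A self-contained sketch of that dispatch (the enum values and strings mirror this hunk; the helper parse_omni_vlm_version is illustrative, not part of the repo):

    #include <stdexcept>
    #include <string>

    enum class omni_vlm_version_type { VLM_81_OCR, VLM_81_INSTRUCT, NANO_VLM_INSTRUCT };

    // map a version string to the enum once, up front; throw on unknown
    // input instead of leaving the field indeterminate
    static omni_vlm_version_type parse_omni_vlm_version(const char * s) {
        const std::string v(s ? s : "");
        if (v == "vlm-81-ocr")        return omni_vlm_version_type::VLM_81_OCR;
        if (v == "vlm-81-instruct")   return omni_vlm_version_type::VLM_81_INSTRUCT;
        if (v == "nano-vlm-instruct") return omni_vlm_version_type::NANO_VLM_INSTRUCT;
        throw std::runtime_error("error vlm version info: " + v);
    }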
@@ -1209,17 +1218,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     return new_clip;
 }
-void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params) {
-    if (params->omni_vlm_version == "vlm-81-ocr") {
-        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
-    } else if (params->omni_vlm_version == "vlm-81-instruct") {
-        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
-    } else if (params->omni_vlm_version == "nano-vlm-instruct") {
-        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
-    } else {
-        throw std::runtime_error(std::string("error vlm version info: ") + params->omni_vlm_version);
-    }
-}
+// void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params) {
+//     if (params->omni_vlm_version == "vlm-81-ocr") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
+//     } else if (params->omni_vlm_version == "vlm-81-instruct") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
+//     } else if (params->omni_vlm_version == "nano-vlm-instruct") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
+//     } else {
+//         throw std::runtime_error(std::string("error vlm version info: ") + params->omni_vlm_version);
+//     }
+// }
 void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
     ctx_clip->load_image_size = load_image_size;
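
The deleted setter's body is exactly what moved into clip_model_load above: the version is now bound before a caller ever sees the context. Under the old two-step API there was a window between load and set in which omni_vlm_ver_type held only its default value. A toy illustration of that window (mock types, not the repo's code):

    #include <cassert>

    enum class ver_t { UNSET, OCR, INSTRUCT };
    struct ctx_t { ver_t ver = ver_t::UNSET; };

    int main() {
        ctx_t c{};                      // old style: "loaded" without a version
        assert(c.ver == ver_t::UNSET);  // the window the one-step API closes
        c.ver = ver_t::INSTRUCT;        // old style: setter only runs later
        assert(c.ver == ver_t::INSTRUCT);
        return 0;
    }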
@@ -2207,13 +2216,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     return true;
 }
-bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
+bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype, const char* omni_vlm_version) {
     ggml_type type = GGML_TYPE_Q4_1;
     assert(itype < GGML_TYPE_COUNT);
     type = static_cast<ggml_type>(itype);
-    auto * ctx_clip = clip_model_load(fname_inp, 2);
+    auto * ctx_clip = clip_model_load(fname_inp, omni_vlm_version, 2);
     const auto & ctx_src = ctx_clip->ctx_gguf;
     const auto & ctx_data = ctx_clip->ctx_data;
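
Because clip_model_quantize reloads the model through clip_model_load, it now has to forward a version string as well. A hypothetical call site under the new signature (file names are placeholders; GGML_TYPE_Q4_1 is ggml's Q4_1 type id):

    #include "clip.h"   // clip_model_quantize, from this repo
    #include "ggml.h"   // GGML_TYPE_Q4_1

    int main() {
        // quantize an f16 projector to Q4_1; the version string must be one
        // the loader accepts, or the internal clip_model_load will throw
        const bool ok = clip_model_quantize("mmproj-f16.gguf", "mmproj-q4_1.gguf",
                                            GGML_TYPE_Q4_1, "nano-vlm-instruct");
        return ok ? 0 : 1;
    }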


@@ -39,11 +39,11 @@ struct clip_image_f32_batch {
     size_t size;
 };
-CLIP_API struct clip_ctx * clip_model_load    (const char * fname, int verbosity);
+CLIP_API struct clip_ctx * clip_model_load    (const char * fname, const char * omni_vlm_version, int verbosity);
 CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
-struct gpt_params;
-CLIP_API void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params);
+// struct gpt_params;
+// CLIP_API void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params);
 CLIP_API void clip_free(struct clip_ctx * ctx);
@@ -86,7 +86,7 @@ CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ct
 CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
 CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
-CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
+CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype, const char * omni_vlm_version);
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
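
With the gpt_params forward declaration commented out alongside the setter, the header no longer references any common/ type; its surface is back to plain C strings and integers. A minimal consumer of the updated header, including the new failure mode (the model path and version string are placeholders):

    #include <cstdio>
    #include <stdexcept>
    #include "clip.h"

    int main() {
        try {
            // the version is fixed for the context's whole lifetime at load time
            struct clip_ctx * ctx = clip_model_load("mmproj.gguf", "vlm-81-ocr", /*verbosity=*/ 1);
            clip_free(ctx);
        } catch (const std::runtime_error & e) {
            std::fprintf(stderr, "%s\n", e.what()); // unknown version string
            return 1;
        }
        return 0;
    }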


@@ -219,8 +219,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
         prompt = "describe the image in detail.";
     }
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 0);
-    clip_set_omni_vlm_version(ctx_clip, params);
+    auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0);
+    // clip_set_omni_vlm_version(ctx_clip, params);
     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
     ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings


@@ -65,8 +65,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
         prompt = "describe the image in detail.";
     }
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 0);
-    clip_set_omni_vlm_version(ctx_clip, params);
+    auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0);
+    // clip_set_omni_vlm_version(ctx_clip, params);
     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
@@ -280,7 +280,10 @@ const char* omnivlm_inference(const char *prompt, const char *imag_path) {
 void omnivlm_free() {
     if(internal_chars != nullptr) { free(internal_chars); }
-    ctx_omnivlm->model = NULL;
-    omnivlm_free(ctx_omnivlm);
+    if(ctx_omnivlm != nullptr) {
+        // this snippet should never be run!
+        ctx_omnivlm->model = nullptr;
+        omnivlm_free(ctx_omnivlm);
+    }
     llama_free_model(model);
 }
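
This last hunk is the most direct candidate for the segfault named in the commit message: the old teardown dereferenced ctx_omnivlm unconditionally, so calling omnivlm_free() twice, or before a successful init, crashed on a null pointer. A self-contained sketch of the guard pattern (toy names, not the repo's types). Note the sketch also resets the global after freeing, which the diff does not yet do; that gap is consistent with the commit message's "current solution is still not perfect":

    struct omni_ctx { void * model = nullptr; };
    static omni_ctx * g_ctx = nullptr;

    static void teardown() {
        if (g_ctx != nullptr) {       // the missing check behind the crash
            g_ctx->model = nullptr;   // the model is released separately by its owner
            delete g_ctx;
            g_ctx = nullptr;          // makes a second teardown() a no-op
        }
    }

    int main() {
        teardown();             // safe even though init never ran
        g_ctx = new omni_ctx{};
        teardown();
        teardown();             // idempotent
        return 0;
    }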