fix OCR template error.
This commit is contained in:
parent
667a6d9838
commit
d04e354f2f
3 changed files with 6 additions and 116 deletions
|
@ -149,7 +149,7 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
|
||||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str());
|
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
|
// LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
|
||||||
if (params->verbose_prompt) {
|
if (params->verbose_prompt) {
|
||||||
auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, user_prompt, true, true);
|
auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, user_prompt, true, true);
|
||||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||||
|
@ -165,6 +165,9 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
|
||||||
|
|
||||||
LOG("\n");
|
LOG("\n");
|
||||||
|
|
||||||
|
params->sparams.temp = 0.0f;
|
||||||
|
params->sparams.top_k = 1;
|
||||||
|
params->sparams.top_p = 1.0f;
|
||||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
|
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
|
||||||
if (!ctx_sampling) {
|
if (!ctx_sampling) {
|
||||||
LOG_TEE("%s: failed to initialize sampling subsystem\n", __func__);
|
LOG_TEE("%s: failed to initialize sampling subsystem\n", __func__);
|
||||||
|
@ -177,8 +180,8 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
|
||||||
response += tmp;
|
response += tmp;
|
||||||
if (strcmp(tmp, "<|im_end|>") == 0) break;
|
if (strcmp(tmp, "<|im_end|>") == 0) break;
|
||||||
if (strcmp(tmp, "</s>") == 0) break;
|
if (strcmp(tmp, "</s>") == 0) break;
|
||||||
// if (strstr(tmp, "###")) break; // Yi-VL behavior
|
|
||||||
printf("%s", tmp);
|
printf("%s", tmp);
|
||||||
|
// LOG("%s", tmp);
|
||||||
// if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
|
// if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
|
||||||
// if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
|
// if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
|
||||||
// if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
|
// if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
|
||||||
|
@ -265,7 +268,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.omni_vlm_version == "vlm-81-ocr") {
|
if (params.omni_vlm_version == "vlm-81-ocr") {
|
||||||
params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|ocr_start|><|vision_start|><|image_pad|><|vision_end|><|ocr_end|><|im_end|>";
|
params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|vision_start|><|image_pad|><|vision_end|><|im_end|>";
|
||||||
} else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") {
|
} else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") {
|
||||||
params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
|
params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
|
||||||
} else {
|
} else {
|
||||||
|
@ -282,10 +285,6 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
auto * ctx_omnivlm = omnivlm_init_context(&params, model);
|
auto * ctx_omnivlm = omnivlm_init_context(&params, model);
|
||||||
|
|
||||||
// temporarily set to greedy decoding.
|
|
||||||
params.sparams.top_k = 1;
|
|
||||||
params.sparams.top_p = 1.0f;
|
|
||||||
|
|
||||||
for (auto & image : params.image) {
|
for (auto & image : params.image) {
|
||||||
auto * image_embed = load_image(ctx_omnivlm, &params, image);
|
auto * image_embed = load_image(ctx_omnivlm, &params, image);
|
||||||
if (!image_embed) {
|
if (!image_embed) {
|
||||||
|
|
|
@ -222,8 +222,6 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||||
|
|
||||||
// inference interface definition
|
// inference interface definition
|
||||||
void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version) {
|
void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version) {
|
||||||
std::cout << "debug0 " << llm_model_path << std::endl;
|
|
||||||
std::cout << "debug1 " << omni_vlm_version << std::endl;
|
|
||||||
const char* argv = "omni-wrapper-py";
|
const char* argv = "omni-wrapper-py";
|
||||||
char* nc_argv = const_cast<char*>(argv);
|
char* nc_argv = const_cast<char*>(argv);
|
||||||
if (!gpt_params_parse(1, &nc_argv, params)) {
|
if (!gpt_params_parse(1, &nc_argv, params)) {
|
||||||
|
@ -235,8 +233,6 @@ void omnivlm_init(const char* llm_model_path, const char* projector_model_path,
|
||||||
params.omni_vlm_version = omni_vlm_version;
|
params.omni_vlm_version = omni_vlm_version;
|
||||||
|
|
||||||
std::string omni_vlm_ver = params.omni_vlm_version;
|
std::string omni_vlm_ver = params.omni_vlm_version;
|
||||||
std::cout << "\t\t DEBUG omni_ver" << std::endl;
|
|
||||||
std::cout << params.omni_vlm_version << std::endl;
|
|
||||||
if(omni_vlm_ver != "vlm-81-ocr" && omni_vlm_ver != "vlm-81-instruct" && omni_vlm_ver != "nano-vlm-instruct") {
|
if(omni_vlm_ver != "vlm-81-ocr" && omni_vlm_ver != "vlm-81-instruct" && omni_vlm_ver != "nano-vlm-instruct") {
|
||||||
fprintf(stderr, "%s: error: you set wrong omni_vlm_string: %s\n", __func__, omni_vlm_version);
|
fprintf(stderr, "%s: error: you set wrong omni_vlm_string: %s\n", __func__, omni_vlm_version);
|
||||||
fprintf(stderr, "%s: Valid omni_vlm_version set is ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n", __func__);
|
fprintf(stderr, "%s: Valid omni_vlm_version set is ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n", __func__);
|
||||||
|
|
|
@ -258,111 +258,6 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||||
|
|
||||||
*n_img_pos = clip_n_patches(ctx_clip);
|
*n_img_pos = clip_n_patches(ctx_clip);
|
||||||
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
|
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
|
||||||
// cout << "\t\t A NICE START" << endl;
|
|
||||||
// cout << "\t\t" << *n_img_pos << endl;
|
|
||||||
/*
|
|
||||||
if (clip_is_minicpmv(ctx_clip)) {
|
|
||||||
std::vector<float *> image_embd_v;
|
|
||||||
image_embd_v.resize(img_res_v.size);
|
|
||||||
struct clip_image_size * load_image_size = clip_image_size_init();
|
|
||||||
for (size_t i = 0; i < img_res_v.size; i++) {
|
|
||||||
const int64_t t_img_enc_step_start_us = ggml_time_us();
|
|
||||||
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
|
|
||||||
int patch_size=14;
|
|
||||||
load_image_size->width = img_res_v.data[i].nx;
|
|
||||||
load_image_size->height = img_res_v.data[i].ny;
|
|
||||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
|
||||||
bool encoded = false;
|
|
||||||
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
|
|
||||||
if (has_minicpmv_projector == 2) {
|
|
||||||
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
|
|
||||||
}
|
|
||||||
else if (has_minicpmv_projector == 3) {
|
|
||||||
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
|
|
||||||
}
|
|
||||||
if (!encoded) {
|
|
||||||
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
|
|
||||||
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
|
|
||||||
}
|
|
||||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
|
||||||
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
|
||||||
|
|
||||||
int n_img_pos_out = 0;
|
|
||||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
|
||||||
std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
|
|
||||||
n_img_pos_out += clip_n_patches(ctx_clip);
|
|
||||||
}
|
|
||||||
*n_img_pos = n_img_pos_out;
|
|
||||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
|
||||||
free(image_embd_v[i]);
|
|
||||||
}
|
|
||||||
image_embd_v.clear();
|
|
||||||
load_image_size->width = img->nx;
|
|
||||||
load_image_size->height = img->ny;
|
|
||||||
clip_add_load_image_size(ctx_clip, load_image_size);
|
|
||||||
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
|
|
||||||
}
|
|
||||||
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
|
|
||||||
// flat / default llava-1.5 type embedding
|
|
||||||
*n_img_pos = clip_n_patches(ctx_clip);
|
|
||||||
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
|
|
||||||
delete[] img_res_v.data;
|
|
||||||
if (!encoded) {
|
|
||||||
LOG_ERR("Unable to encode image\n");
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// spatial_unpad llava-1.6 type embedding
|
|
||||||
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
|
|
||||||
std::vector<float *> image_embd_v;
|
|
||||||
image_embd_v.resize(img_res_v.size);
|
|
||||||
for (size_t i = 0; i < img_res_v.size; i++) {
|
|
||||||
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
|
|
||||||
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
|
|
||||||
if (!encoded) {
|
|
||||||
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
|
||||||
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
|
||||||
|
|
||||||
const int32_t * image_grid = clip_image_grid(ctx_clip);
|
|
||||||
|
|
||||||
std::vector<std::pair<int, int>> grid_pinpoints;
|
|
||||||
for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
|
|
||||||
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
|
|
||||||
}
|
|
||||||
|
|
||||||
// free all img_res_v - not needed anymore
|
|
||||||
delete[] img_res_v.data;
|
|
||||||
img_res_v.size = 0;
|
|
||||||
img_res_v.data = nullptr;
|
|
||||||
|
|
||||||
const int32_t image_size = clip_image_size(ctx_clip);
|
|
||||||
|
|
||||||
struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
|
|
||||||
|
|
||||||
int n_img_pos_out;
|
|
||||||
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
|
|
||||||
*n_img_pos = n_img_pos_out;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < image_embd_v.size(); i++) {
|
|
||||||
free(image_embd_v[i]);
|
|
||||||
}
|
|
||||||
image_embd_v.clear();
|
|
||||||
|
|
||||||
// debug image/segment/normalization content:
|
|
||||||
// clip_image_u8 * tmp = clip_image_u8_init();
|
|
||||||
// clip_image_convert_f32_to_u8(*image_feature, *tmp);
|
|
||||||
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
LOG("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
|
LOG("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue