diff --git a/examples/xgenmm/imgs/5patches_embeddings.pt b/examples/xgenmm/imgs/5patches_embeddings.pt
new file mode 100644
index 000000000..5a68bc924
Binary files /dev/null and b/examples/xgenmm/imgs/5patches_embeddings.pt differ
diff --git a/examples/xgenmm/imgs/attention_mask_4patches.pt b/examples/xgenmm/imgs/attention_mask_4patches.pt
new file mode 100644
index 000000000..95d7ffec9
Binary files /dev/null and b/examples/xgenmm/imgs/attention_mask_4patches.pt differ
diff --git a/examples/xgenmm/imgs/attention_mask_5patches.pt b/examples/xgenmm/imgs/attention_mask_5patches.pt
new file mode 100644
index 000000000..0266ef1af
Binary files /dev/null and b/examples/xgenmm/imgs/attention_mask_5patches.pt differ
diff --git a/examples/xgenmm/imgs/attention_mask_5patches_stage1.pt b/examples/xgenmm/imgs/attention_mask_5patches_stage1.pt
new file mode 100644
index 000000000..684e6c3a6
Binary files /dev/null and b/examples/xgenmm/imgs/attention_mask_5patches_stage1.pt differ
diff --git a/examples/xgenmm/run_cli.sh b/examples/xgenmm/run_cli.sh
index cbfe57740..0be88ecc9 100644
--- a/examples/xgenmm/run_cli.sh
+++ b/examples/xgenmm/run_cli.sh
@@ -2,8 +2,22 @@
 
 make xgenmm-cli
 
-./xgenmm-cli -m /export/share/llamacpp_models/MiniCPM-Llama3-V-2_5/ggml-model-Q4_K_M.gguf \
+# ./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
+#    --mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
+#    -c 4096 --temp 0.01 --repeat-penalty 1.05 \
+#    --image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9-1.jpg \
+#    -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n\nWhat is the color of this notebook?<|end|>\n<|assistant|>\n"
+
+
+# ./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
+#    --mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
+#    -c 4096 --temp 0 --num_beams 1 \
+#    --image /export/home/on-device-mm/notebooks/open-flamingo/imgs/receipt.jpg \
+#    -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n Describe this image.<|end|>\n<|assistant|>\n"
+
+
+./xgenmm-cli -m /export/share/tawalgaonkar/llama.cpp/models/llm/xgenmm-phi-3-llm-Q4.gguf \
     --mmproj /export/share/yutong/xgenmm/llamacpp_wd/siglip_kosmos_phi3_4k_instruct/gguf_test/mmproj-model-f32.gguf \
-    -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 \
-    --image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9-1.jpg \
-    -p "What is in the image?"
\ No newline at end of file
+    -c 4096 --temp 0.01 --repeat-penalty 1.05 \
+    --image /export/home/llama.cpp/examples/xgenmm/imgs/image-1d100e9.jpg \
+    -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. \nThe assistant gives helpful, detailed, and polite answers to the user's questions.<|end|>\n<|user|>\n\n How many objects are there in this image?<|end|>\n<|assistant|>\n"
\ No newline at end of file
diff --git a/examples/xgenmm/xgenmm-cli.cpp b/examples/xgenmm/xgenmm-cli.cpp
index f894c81a0..a3396a2d1 100644
--- a/examples/xgenmm/xgenmm-cli.cpp
+++ b/examples/xgenmm/xgenmm-cli.cpp
@@ -217,6 +217,92 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri
     return ctx_llava;
 }
 
+static void process_prompt(struct llava_context *ctx_llava, struct llava_image_embed *image_embed, gpt_params *params,
+                           const std::string &prompt)
+{
+    int n_past = 0;
+
+    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
+
+    std::string system_prompt, user_prompt;
+    size_t image_pos = prompt.find("<image>");
+    if (image_pos != std::string::npos)
+    {
+        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for
+        // the image
+        system_prompt = prompt.substr(0, image_pos);
+        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
+        LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
+        if (params->verbose_prompt)
+        {
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
+            for (int i = 0; i < (int)tmp.size(); i++)
+            {
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+            }
+        }
+        LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
+        if (params->verbose_prompt)
+        {
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            for (int i = 0; i < (int)tmp.size(); i++)
+            {
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+            }
+        }
+    }
+    else
+    {
+        // llava-1.5 native mode
+        system_prompt =
+            "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, "
+            "detailed, and polite answers to the human's questions.\nUSER:";
+        user_prompt = prompt + "\nASSISTANT:";
+        if (params->verbose_prompt)
+        {
+            auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+            for (int i = 0; i < (int)tmp.size(); i++)
+            {
+                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+            }
+        }
+    }
+
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true);
+    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
+
+    // generate the response
+
+    LOG_TEE("\n");
+
+    struct llama_sampling_context *ctx_sampling = llama_sampling_init(params->sparams);
+    if (!ctx_sampling)
+    {
+        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+        exit(1);
+    }
+
+    std::string response = "";
+    for (int i = 0; i < max_tgt_len; i++)
+    {
+        const char *tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
+        response += tmp;
+        if (strcmp(tmp, "</s>") == 0) break;
+        if (strstr(tmp, "###")) break; // Yi-VL behavior
+        printf("%s", tmp);
+        if (strstr(response.c_str(), "<|im_end|>"))
+            break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
+        if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
+        if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
+
+        fflush(stdout);
+    }
+
+    llama_sampling_free(ctx_sampling);
+    printf("\n");
+}
+
 static struct llava_context * xgenmm_init(gpt_params * params, const std::string & fname, int &n_past){
     auto ctx_clip = clip_init_context(params);
     std::cout << "clip model has been loaded \n\n";
@@ -227,7 +313,6 @@ static struct llava_context * xgenmm_init(gpt_params * params, const std::string
         return NULL;
     }
     std::cout<< "Start Processing Prompt: " << std::endl;
-    exit(1);
     // TODO:
     // process the prompt
     if (params->prompt.empty() && params->interactive == false) {
@@ -248,7 +333,8 @@ static struct llava_context * xgenmm_init(gpt_params * params, const std::string
     LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
 
     const int64_t t_process_image_start_us = ggml_time_us();
-    process_image(ctx_llava, embeds, params, n_past);
+    process_prompt(ctx_llava, embeds, params, params->prompt);
+    // process_image(ctx_llava, embeds, params, n_past);
     const int64_t t_process_image_end_us = ggml_time_us();
     float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
     LOG_TEE("\n%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms);
@@ -292,6 +378,8 @@ static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sam
     return tmp;
 }
 
+
+
 int main(int argc, char ** argv) {
     ggml_time_init();
 
@@ -320,6 +408,8 @@ int main(int argc, char ** argv) {
     // auto ctx_llava = minicpmv_init(&params, image, n_past);
     auto ctx_llava = xgenmm_init(&params, image, n_past); // generate vision tokens
     std::cout << "Start llava generation: " << std::endl;
+    llama_print_timings(ctx_llava->ctx_llama);
+
     // // TODO: integrate base llm
     // if (!params.prompt.empty()) {
     //     LOG_TEE("%s\n", params.prompt.c_str());
diff --git a/xgenmm-cli b/xgenmm-cli
new file mode 100755
index 000000000..76bcb3713
Binary files /dev/null and b/xgenmm-cli differ
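
Note (illustration only, not part of the patch): process_prompt above splits the prompt around a literal "<image>" placeholder; the text before it is evaluated as the system prompt, then the image embedding, then the remainder as the user prompt. Below is a minimal standalone sketch of that splitting convention; the Phi-3 style prompt string and the file name are made up for the example.

// prompt_split_demo.cpp - illustration only, mirrors the <image> handling in process_prompt
#include <cstdio>
#include <string>

int main() {
    // hypothetical Phi-3 style prompt containing the <image> placeholder
    const std::string prompt =
        "<|system|>\nA chat between a curious user and an artificial intelligence assistant.<|end|>\n"
        "<|user|>\n<image>\nHow many objects are there in this image?<|end|>\n<|assistant|>\n";

    const std::string placeholder = "<image>";
    const size_t image_pos = prompt.find(placeholder);

    if (image_pos == std::string::npos) {
        // no placeholder: process_prompt falls back to the llava-1.5 native template instead
        std::printf("no <image> placeholder found\n");
        return 0;
    }

    // everything before <image> is evaluated first (system prompt), then the image
    // embedding, then everything after <image> (user prompt)
    const std::string system_prompt = prompt.substr(0, image_pos);
    const std::string user_prompt   = prompt.substr(image_pos + placeholder.length());

    std::printf("system_prompt:\n%s\n", system_prompt.c_str());
    std::printf("user_prompt:\n%s\n", user_prompt.c_str());
    return 0;
}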