From f661483ea72407f7342f4a4ae1161834c78fe3f4 Mon Sep 17 00:00:00 2001
From: HimariO
Date: Tue, 1 Oct 2024 23:25:06 +0800
Subject: [PATCH] update qwen2vl cli tool

---
 examples/llava/qwen2vl-cli.cpp | 122 +++++++++++++--------------------
 1 file changed, 49 insertions(+), 73 deletions(-)

diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
index 82583e750..974f10edd 100644
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -1,14 +1,16 @@
-#include "ggml.h"
+#include "arg.h"
+#include "base64.hpp"
 #include "log.h"
 #include "common.h"
+#include "sampling.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
-
-#include "base64.hpp"
+#include "ggml.h"
 
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
 #include <vector>
 #include <iostream>
 #include <fstream>
@@ -17,23 +19,13 @@
 static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
     int N = (int) tokens.size();
-    std::vector<llama_pos> pos;
     for (int i = 0; i < N; i += n_batch) {
         int n_eval = (int) tokens.size() - i;
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        llama_batch batch = llama_batch_get_one(&tokens[i], n_eval, *n_past, 0);
-
-        // TODO: add mrope pos ids somewhere else
-        pos.resize(batch.n_tokens * 3);
-        for (int j = 0; j < batch.n_tokens * 3; j ++) {
-            pos[j] = j % batch.n_tokens;
-        }
-        batch.pos = pos.data();
-
-        if (llama_decode(ctx_llama, batch)) {
-            LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+            LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
             return false;
         }
         *n_past += n_eval;
     }
@@ -54,11 +46,11 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n
     return true;
 }
 
-static const char * sample(struct llama_sampling_context * ctx_sampling,
+static const char * sample(struct gpt_sampler * smpl,
                            struct llama_context * ctx_llama,
                            int * n_past) {
-    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
-    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
+    const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
+    gpt_sampler_accept(smpl, id, true);
     static std::string ret;
     if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
         ret = "</s>";
@@ -88,7 +80,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
     size_t img_base64_str_start, img_base64_str_end;
     find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
     if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-        LOG_TEE("%s: invalid base64 image tag. must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
+        LOG_ERR("%s: invalid base64 image tag. must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
         return NULL;
     }
 
@@ -102,7 +94,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
 
     auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
     if (!embed) {
-        LOG_TEE("%s: could not load image from base64 string.\n", __func__);
+        LOG_ERR("%s: could not load image from base64 string.\n", __func__);
         return NULL;
     }
 
@@ -126,12 +118,10 @@ struct llava_context {
     struct llama_model * model = NULL;
 };
 
-static void print_usage(int argc, char ** argv, const gpt_params & params) {
-    gpt_params_print_usage(argc, argv, params);
-
-    LOG_TEE("\n example usage:\n");
-    LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
+static void print_usage(int, char ** argv) {
+    LOG("\n example usage:\n");
+    LOG("\n     %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 
 static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
@@ -141,11 +131,11 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
     auto prompt = params->prompt;
     if (prompt_contains_image(prompt)) {
         if (!params->image.empty()) {
-            LOG_TEE("using base64 encoded image instead of command line image path\n");
+            LOG_INF("using base64 encoded image instead of command line image path\n");
         }
         embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
         if (!embed) {
-            LOG_TEE("%s: can't load image from prompt\n", __func__);
+            LOG_ERR("%s: can't load image from prompt\n", __func__);
             return NULL;
         }
         params->prompt = remove_image_from_prompt(prompt);
@@ -171,18 +161,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
         system_prompt = prompt.substr(0, image_pos);
         user_prompt = prompt.substr(image_pos + std::string("<image>").length());
-        LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
+        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
-        LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
+        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
     } else {
@@ -192,7 +182,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         if (params->verbose_prompt) {
             auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
             for (int i = 0; i < (int) tmp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+                LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
             }
         }
     }
@@ -204,21 +194,21 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
 
     // generate the response
 
-    LOG_TEE("\n");
+    LOG("\n");
 
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
-    if (!ctx_sampling) {
-        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
+    struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
+    if (!smpl) {
+        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);
     }
 
     std::string response = "";
     for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
+        const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
         response += tmp;
         if (strcmp(tmp, "</s>") == 0) break;
         if (strstr(tmp, "###")) break; // Yi-VL behavior
-        printf("%s", tmp);
+        LOG("%s", tmp);
         if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
         if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
         if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@@ -226,8 +216,8 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
         fflush(stdout);
     }
 
-    llama_sampling_free(ctx_sampling);
-    printf("\n");
+    gpt_sampler_free(smpl);
+    LOG("\n");
 }
 
@@ -238,7 +228,7 @@ static struct llama_model * llava_init(gpt_params * params) {
 
     llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
-        LOG_TEE("%s: error: unable to load model\n" , __func__);
+        LOG_ERR("%s: unable to load model\n" , __func__);
         return NULL;
     }
     return model;
@@ -261,11 +251,11 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
 
     llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
 
     if (ctx_llama == NULL) {
-        LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
+        LOG_ERR("%s: failed to create the llama_context\n" , __func__);
         return NULL;
     }
 
-    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
+    auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
 
     ctx_llava->ctx_llama = ctx_llama;
     ctx_llava->ctx_clip = ctx_clip;
@@ -284,12 +274,6 @@ static void llava_free(struct llava_context * ctx_llava) {
     llama_backend_free();
 }
 
-static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    LOG_TEE("%s", text);
-}
-
 static void tmp_test_conv2d_reshape(struct llava_context * ctx_llava, gpt_params * params) {
     int image_size_width = 256;
     int image_size_height = 256;
@@ -564,43 +548,36 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    if (!gpt_params_parse(argc, argv, params)) {
-        print_usage(argc, argv, params);
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
         return 1;
     }
 
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("llava", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-    llama_log_set(llama_log_callback_logTee, nullptr);
-#endif // LOG_DISABLE_LOGS
+    gpt_init();
 
     if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        print_usage(argc, argv, {});
+        print_usage(argc, argv);
        return 1;
     }
-    auto model = llava_init(&params);
+
+    auto * model = llava_init(&params);
     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
         return 1;
     }
-    params.image.clear();
-
     if (prompt_contains_image(params.prompt)) {
-        auto ctx_llava = llava_init_context(&params, model);
+        auto * ctx_llava = llava_init_context(&params, model);
 
-        auto image_embed = load_image(ctx_llava, &params, "");
+        auto * image_embed = load_image(ctx_llava, &params, "");
 
         // process the prompt
         process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-        llama_print_timings(ctx_llava->ctx_llama);
+        llama_perf_context_print(ctx_llava->ctx_llama);
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
-    } else if (params.image.empty()) {
+    } else if (params.image.empty() | true) {
         // This section is for testing LLM parts of the model during development phase!
         auto ctx_llava = llava_init_context(&params, model);
 
@@ -609,31 +586,30 @@ int main(int argc, char ** argv) {
         // tmp_test_rope(ctx_llava, &params);
         // tmp_test_mrope(ctx_llava, &params);
         tmp_test_mrope_2d(ctx_llava, &params);
-        // process_prompt(ctx_llava, nullptr, &params, params.prompt);
+        process_prompt(ctx_llava, nullptr, &params, params.prompt);
 
-        llama_print_timings(ctx_llava->ctx_llama);
+        llama_perf_context_print(ctx_llava->ctx_llama);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
     } else {
         for (auto & image : params.image) {
-            auto ctx_llava = llava_init_context(&params, model);
+            auto * ctx_llava = llava_init_context(&params, model);
 
-            auto image_embed = load_image(ctx_llava, &params, image);
+            auto * image_embed = load_image(ctx_llava, &params, image);
             if (!image_embed) {
-                std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
+                LOG_ERR("%s: failed to load image %s. Terminating\n\n", __func__, image.c_str());
                 return 1;
             }
 
            // process the prompt
            process_prompt(ctx_llava, image_embed, &params, params.prompt);
 
-            llama_print_timings(ctx_llava->ctx_llama);
+            llama_perf_context_print(ctx_llava->ctx_llama);
            llava_image_embed_free(image_embed);
            ctx_llava->model = NULL;
            llava_free(ctx_llava);
        }
-
    }
 
    llama_free_model(model);
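
Example invocation (an editor's sketch, not part of the patch): with argument parsing now routed
through gpt_params_parse, the tool is driven the same way as the other llava examples, following
the pattern shown in print_usage. The binary name and the model/mmproj/image file names below are
illustrative assumptions, not taken from the patch:

    ./llama-qwen2vl-cli \
        -m Qwen2-VL-7B-Instruct-Q4_K_M.gguf \
        --mmproj qwen2vl-mmproj-f16.gguf \
        --image demo.jpg \
        --temp 0.1 -p "describe the image in detail."

While the "params.image.empty() | true" condition above is in place, every run takes the
development/testing branch (tmp_test_mrope_2d followed by process_prompt with a null image embed)
rather than the per-image embedding path.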