From f83c0606bd26ed14285832df154f4988b3d810de Mon Sep 17 00:00:00 2001
From: Damian Stewart
Date: Sat, 14 Oct 2023 12:58:40 +0200
Subject: [PATCH] further cleanup; move llava-cli into its own file and rename

---
 examples/llava/CMakeLists.txt |  16 +++++-
 examples/llava/README.md      |   6 +-
 examples/llava/clip.cpp       |   2 +-
 examples/llava/clip.h         |   2 +-
 examples/llava/llava-cli.cpp  | 101 ++++++++++++++++++++++++++++++++++
 examples/llava/llava-utils.h  |  10 ++--
 examples/llava/llava.cpp      |  95 +------------------------------
 examples/llava/llava.h        |   6 +-
 8 files changed, 128 insertions(+), 110 deletions(-)
 create mode 100644 examples/llava/llava-cli.cpp

diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index d04dcc5c5..7e05bb3bf 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -11,9 +11,21 @@ if(TARGET BUILD_INFO)
 endif()
 set(TARGET llava)
-add_executable(${TARGET} llava.cpp)
+add_library(${TARGET} llava.cpp llava.h)
+install(TARGETS ${TARGET} LIBRARY)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if (NOT MSVC)
+    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
+endif()
+if(TARGET BUILD_INFO)
+    add_dependencies(${TARGET} BUILD_INFO)
+endif()
+
+set(TARGET llava-cli)
+add_executable(${TARGET} llava-cli.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama clip llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
     add_dependencies(${TARGET} BUILD_INFO)
 endif()
diff --git a/examples/llava/README.md b/examples/llava/README.md
index fc3446b60..b1df8dd16 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -9,12 +9,12 @@ models are available.
 After API is confirmed, more models will be supported / uploaded.
 
 ## Usage
-Build with cmake or run `make llava` to build it.
+Build with cmake or run `make llava-cli` to build it.
 
-After building, run: `./llava` to see the usage. For example:
+After building, run: `./llava-cli` to see the usage. For example:
 
 ```sh
-./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```
 
 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
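With this change `llava` becomes a reusable library and `llava-cli` a thin driver on top of it. As a rough sketch of what the split enables — assuming only the symbols visible in this patch (`llava_init`, `llava_build_img_embed`, `llava_free` from llava.h, `clip_image_load_from_file` from clip.h); the surrounding function is hypothetical — a third-party tool could embed an image without pulling in any of the CLI code:

```cpp
#include <cstdio>
#include <cstdlib>

#include "common.h"
#include "clip.h"
#include "llava.h"

// Hypothetical consumer of the new `llava` library target.
static bool embed_one_image(gpt_params & params, const char * path) {
    llava_context * ctx_llava = llava_init(&params);
    if (ctx_llava == NULL) {
        return false;
    }

    clip_image_u8 img;
    if (!clip_image_load_from_file(path, &img)) {
        llava_free(ctx_llava);
        return false;
    }

    float * image_embd = NULL;
    int    n_image_pos = 0;
    const bool ok = llava_build_img_embed(ctx_llava, params.n_threads, &img, &image_embd, &n_image_pos);
    if (ok) {
        printf("image occupies %d positions\n", n_image_pos);
        free(image_embd); // llava_build_img_embed allocates the buffer with malloc
    }

    llava_free(ctx_llava);
    return ok;
}
```

On the CMake side, such a tool would mirror what `llava-cli` does in the hunk above: add `llava` to its `target_link_libraries` list alongside `common`, `llama`, and `clip`.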
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 5bb2e4c37..d8eb865fc 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -690,7 +690,7 @@ static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
     memcpy(img->data, data, img->size);
 }
 
-bool clip_image_load_from_bytes(const unsigned char * bytes, int bytes_length, clip_image_u8 * img) {
+bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, clip_image_u8 * img) {
     int nx, ny, nc;
     auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
     if (!data) {
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index c0b53d0b8..f161b738e 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -58,7 +58,7 @@ struct clip_image_f32_batch {
 struct clip_image_u8 * make_clip_image_u8();
 struct clip_image_f32 * make_clip_image_f32();
 bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
-bool clip_image_load_from_bytes(const unsigned char * bytes, int bytes_length, clip_image_u8 * img);
+bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, clip_image_u8 * img);
 bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
 bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
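The `int` → `size_t` widening on `clip_image_load_from_bytes` means a length that is already a `size_t` — a `std::vector::size()`, or the iterator difference computed in llava-utils.h below — no longer narrows at the call site. A minimal sketch; the wrapper is hypothetical, only `clip_image_load_from_bytes` comes from this patch:

```cpp
#include <vector>

#include "clip.h"

// buf.size() is a size_t; with the old int parameter this call would
// implicitly narrow (and could overflow for very large buffers).
static bool load_image_from_buffer(const std::vector<unsigned char> & buf, clip_image_u8 * img) {
    return clip_image_load_from_bytes(buf.data(), buf.size(), img);
}
```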
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
new file mode 100644
index 000000000..84c376246
--- /dev/null
+++ b/examples/llava/llava-cli.cpp
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ggml.h"
+#include "common.h"
+#include "clip.h"
+#include "llava.h"
+#include "llava-utils.h"
+
+
+static void show_additional_info(int /*argc*/, char ** argv) {
+    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    printf("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+static bool load_image(llava_context * ctx_llava, gpt_params * params, float **image_embd, int * n_image_pos) {
+    // load and preprocess the image
+    clip_image_u8 img;
+    auto prompt = params->prompt;
+    if (prompt_contains_image(prompt)) {
+        if (!params->image.empty()) {
+            printf("using base64 encoded image instead of command line image path\n");
+        }
+        if (!clip_image_load_from_prompt(prompt, &img)) {
+            fprintf(stderr, "%s: can't load image from prompt\n", __func__);
+            return false;
+        }
+        prompt = remove_image_from_prompt(prompt);
+    } else {
+        if (!clip_image_load_from_file(params->image.c_str(), &img)) {
+            fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
+            return false;
+        }
+    }
+    llava_build_img_embed(ctx_llava, params->n_threads, &img, image_embd, n_image_pos);
+
+    return true;
+}
+
+static void process_prompt(struct llava_context * ctx_llava, float * image_embd, int n_img_pos, gpt_params * params, const char * prompt) {
+    int n_past = 0;
+
+    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
+
+    // llava chat format is "USER: <image>\n<prompt>\nASSISTANT:"
+    // GG: are we sure that there should be a trailing whitespace at the end of this string?
+    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params->n_batch, &n_past);
+    eval_image_embd(ctx_llava->ctx_llama, image_embd, n_img_pos, params->n_batch, &n_past);
+    eval_string(ctx_llava->ctx_llama, prompt, params->n_batch, &n_past);
+    eval_string(ctx_llava->ctx_llama, "\nASSISTANT:", params->n_batch, &n_past);
+
+    // generate the response
+
+    printf("\n");
+
+    for (int i = 0; i < max_tgt_len; i++) {
+        const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
+        if (strcmp(tmp, "</s>") == 0) break;
+
+        printf("%s", tmp);
+        fflush(stdout);
+    }
+
+    printf("\n");
+
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+        gpt_print_usage(argc, argv, params);
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    auto ctx_llava = llava_init(&params);
+    if (ctx_llava == NULL) {
+        fprintf(stderr, "%s: error: failed to init llava\n", __func__);
+        return 1;
+    }
+
+    float * image_embd;
+    int n_image_pos;
+    load_image(ctx_llava, &params, &image_embd, &n_image_pos);
+
+    // process the prompt
+    process_prompt(ctx_llava, image_embd, n_image_pos, &params, params.prompt.c_str());
+
+    llama_print_timings(ctx_llava->ctx_llama);
+
+    free(image_embd);
+    llava_free(ctx_llava);
+    return 0;
+}
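`load_image` above accepts either `--image <path>` or an image embedded directly in the prompt. A sketch of building such a prompt on the caller's side — the helper is hypothetical; the tag strings must match `IMG_BASE64_TAG_BEGIN`/`IMG_BASE64_TAG_END` in llava-utils.h (next hunk), and the base64 encoding itself is assumed to come from elsewhere:

```cpp
#include <string>

// Hypothetical helper: wrap an already base64-encoded JPEG in the tag that
// prompt_contains_image()/clip_image_load_from_prompt() look for, followed
// by the question text.
static std::string make_image_prompt(const std::string & jpeg_base64, const std::string & question) {
    return "<img src=\"data:image/jpeg;base64," + jpeg_base64 + "\">" + question;
}
```

The resulting string is passed as the regular `-p` prompt; `load_image` then strips the tag with `remove_image_from_prompt` before evaluation.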
diff --git a/examples/llava/llava-utils.h b/examples/llava/llava-utils.h
index b794c39cc..3b4fa96cc 100644
--- a/examples/llava/llava-utils.h
+++ b/examples/llava/llava-utils.h
@@ -149,19 +149,19 @@ inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
 
 static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
 static const char* IMG_BASE64_TAG_END = "\">";
 
-static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
+inline void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
     begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
     end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
 }
 
-static bool prompt_contains_image(const std::string& prompt) {
+inline bool prompt_contains_image(const std::string& prompt) {
     size_t begin, end;
     find_image_tag_in_prompt(prompt, begin, end);
     return (begin != std::string::npos);
 }
 
 // replaces the base64 image tag in the prompt with `replacement`
-static bool get_image_from_prompt(const std::string& prompt, clip_image_u8 * img) {
+inline bool clip_image_load_from_prompt(const std::string& prompt, clip_image_u8 * img) {
     size_t img_base64_str_start, img_base64_str_end;
     find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
     if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
@@ -176,7 +176,7 @@ static bool get_image_from_prompt(const std::string& prompt, clip_image_u8 * img) {
     auto required_bytes = base64::required_encode_size(base64_str.size());
     auto img_bytes = std::vector<unsigned char>(required_bytes);
     auto img_bytes_end = base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
-    auto img_bytes_len = img_bytes_end - img_bytes.begin();
+    size_t img_bytes_len = img_bytes_end - img_bytes.begin();
 
     auto img_loaded_ok = clip_image_load_from_bytes(img_bytes.data(), img_bytes_len, img);
     if (!img_loaded_ok) {
@@ -187,7 +187,7 @@ static bool get_image_from_prompt(const std::string& prompt, clip_image_u8 * img) {
     return true;
 }
 
-static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
+inline std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
     size_t begin, end;
     find_image_tag_in_prompt(prompt, begin, end);
     if (begin == std::string::npos || end == std::string::npos) {
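The `static` → `inline` switch matters now that llava-utils.h is included from two translation units (llava.cpp via the library, and llava-cli.cpp): `static` would give each unit its own private copy of every helper, with duplicated code and unused-function warnings for helpers a unit never calls, while `inline` lets the identical definitions fold into one at link time. A distilled illustration, not code from the patch:

```cpp
// util.h -- meant to be included from several .cpp files.
#pragma once

#include <string>

// `inline` permits this same definition to appear in every translation unit
// that includes the header; the linker keeps a single copy. With `static`,
// each .cpp file would carry its own private duplicate instead.
inline bool contains_tag(const std::string & text, const std::string & tag) {
    return text.find(tag) != std::string::npos;
}
```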
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index ffdad9c99..522334c7c 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -10,11 +10,6 @@
 
 #include "base64.hpp"
 
-static void show_additional_info(int /*argc*/, char ** argv) {
-    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    printf("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
-}
-
 static bool encode_image_with_clip(llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_embd, int * n_img_pos) {
     auto ctx_clip = ctx_llava->ctx_clip;
     clip_image_f32 img_res;
@@ -51,7 +46,7 @@ static bool encode_image_with_clip(llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_embd, int * n_img_pos) {
     return true;
 }
 
-static bool llava_build_img_embed(struct llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_image_pos_out) {
+bool llava_build_img_embed(struct llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_image_pos_out) {
     auto ctx_clip = ctx_llava->ctx_clip;
     float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
@@ -128,91 +123,3 @@ void llava_free(struct llava_context * ctx_llava) {
     llama_backend_free();
 }
 
-
-
-static bool load_image(llava_context * ctx_llava, gpt_params * params, float **image_embd, int * n_image_pos) {
-    // load and preprocess the image
-    clip_image_u8 img;
-    auto prompt = params->prompt;
-    if (prompt_contains_image(prompt)) {
-        if (!params->image.empty()) {
-            printf("using base64 encoded image instead of command line image path\n");
-        }
-        if (!get_image_from_prompt(prompt, &img)) {
-            fprintf(stderr, "%s: can't load image from prompt\n", __func__);
-            return false;
-        }
-        prompt = remove_image_from_prompt(prompt);
-    } else {
-        if (!clip_image_load_from_file(params->image.c_str(), &img)) {
-            fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
-            return false;
-        }
-    }
-    llava_build_img_embed(ctx_llava, params->n_threads, &img, image_embd, n_image_pos);
-
-    return true;
-}
-
-static void process_prompt(struct llava_context * ctx_llava, float * image_embd, int n_img_pos, gpt_params * params, const char * prompt) {
-    int n_past = 0;
-
-    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
-
-    // llava chat format is "USER: <image>\n<prompt>\nASSISTANT:"
-    // GG: are we sure that there should be a trailing whitespace at the end of this string?
-    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params->n_batch, &n_past);
-    eval_image_embd(ctx_llava->ctx_llama, image_embd, n_img_pos, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, prompt, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, "\nASSISTANT:", params->n_batch, &n_past);
-
-    // generate the response
-
-    printf("\n");
-
-    for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
-        if (strcmp(tmp, "</s>") == 0) break;
-
-        printf("%s", tmp);
-        fflush(stdout);
-    }
-
-    printf("\n");
-
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        show_additional_info(argc, argv);
-        return 1;
-    }
-    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        gpt_print_usage(argc, argv, params);
-        show_additional_info(argc, argv);
-        return 1;
-    }
-
-    auto ctx_llava = llava_init(&params);
-    if (ctx_llava == NULL) {
-        fprintf(stderr, "%s: error: failed to init llava\n", __func__);
-        return 1;
-    }
-
-    float * image_embd;
-    int n_image_pos;
-    load_image(ctx_llava, &params, &image_embd, &n_image_pos);
-
-    // process the prompt
-    process_prompt(ctx_llava, image_embd, n_image_pos, &params, params.prompt.c_str());
-
-    llama_print_timings(ctx_llava->ctx_llama);
-
-    free(image_embd);
-    llava_free(ctx_llava);
-    return 0;
-}
diff --git a/examples/llava/llava.h b/examples/llava/llava.h
index ddbcc8d43..1d8b87a46 100644
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@@ -2,6 +2,7 @@
 #define LLAVA_H
 
 #include "ggml.h"
+#include "common.h"
 
 struct clip_ctx;
 
@@ -13,15 +14,12 @@ struct llava_context {
     struct clip_ctx * ctx_clip = NULL;
     struct llama_context * ctx_llama = NULL;
     struct llama_model * model = NULL;
-
-// int n_img_pos = 0;
-// float * image_embd = NULL;
 };
 
 struct llava_context * llava_init(gpt_params * params);
 void llava_free(struct llava_context * ctx_llava);
 
-//void llava_process_prompt(struct llava_context * ctx_llava, gpt_params * params, const char * prompt);
+bool llava_build_img_embed(struct llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_image_pos_out);
 
 #ifdef __cplusplus