From f83c0606bd26ed14285832df154f4988b3d810de Mon Sep 17 00:00:00 2001
From: Damian Stewart
Date: Sat, 14 Oct 2023 12:58:40 +0200
Subject: [PATCH] further cleanup; move llava-cli into its own file and rename

---
 examples/llava/CMakeLists.txt |  16 +++++-
 examples/llava/README.md      |   6 +-
 examples/llava/clip.cpp       |   2 +-
 examples/llava/clip.h         |   2 +-
 examples/llava/llava-cli.cpp  | 101 ++++++++++++++++++++++++++++++++++
 examples/llava/llava-utils.h  |  10 ++--
 examples/llava/llava.cpp      |  95 +------------------------------
 examples/llava/llava.h        |   6 +-
 8 files changed, 128 insertions(+), 110 deletions(-)
 create mode 100644 examples/llava/llava-cli.cpp

diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt
index d04dcc5c5..7e05bb3bf 100644
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -11,9 +11,21 @@ if(TARGET BUILD_INFO)
 endif()
 set(TARGET llava)
-add_executable(${TARGET} llava.cpp)
+add_library(${TARGET} llava.cpp llava.h)
+install(TARGETS ${TARGET} LIBRARY)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if (NOT MSVC)
+    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
+endif()
+if(TARGET BUILD_INFO)
+    add_dependencies(${TARGET} BUILD_INFO)
+endif()
+
+set(TARGET llava-cli)
+add_executable(${TARGET} llava-cli.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama clip llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
     add_dependencies(${TARGET} BUILD_INFO)
 endif()
diff --git a/examples/llava/README.md b/examples/llava/README.md
index fc3446b60..b1df8dd16 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -9,12 +9,12 @@ models are available.
 After API is confirmed, more models will be supported / uploaded.
 
 ## Usage
-Build with cmake or run `make llava` to build it.
+Build with cmake or run `make llava-cli` to build it.
 
-After building, run: `./llava` to see the usage. For example:
+After building, run: `./llava-cli` to see the usage. For example:
 
 ```sh
-./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```
 
 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
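With this change `llava` becomes a reusable library and `llava-cli` a thin driver on top of it. As a rough sketch of what the split enables — assuming only the symbols visible in this patch (`llava_init`, `llava_build_img_embed`, `llava_free` from llava.h, `clip_image_load_from_file` from clip.h); the surrounding function is hypothetical — a third-party tool could embed an image without pulling in any of the CLI code:

```cpp
#include <cstdio>
#include <cstdlib>

#include "common.h"
#include "clip.h"
#include "llava.h"

// Hypothetical consumer of the new `llava` library target.
static bool embed_one_image(gpt_params & params, const char * path) {
    llava_context * ctx_llava = llava_init(&params);
    if (ctx_llava == NULL) {
        return false;
    }

    clip_image_u8 img;
    if (!clip_image_load_from_file(path, &img)) {
        llava_free(ctx_llava);
        return false;
    }

    float * image_embd = NULL;
    int    n_image_pos = 0;
    const bool ok = llava_build_img_embed(ctx_llava, params.n_threads, &img, &image_embd, &n_image_pos);
    if (ok) {
        printf("image occupies %d positions\n", n_image_pos);
        free(image_embd); // llava_build_img_embed allocates the buffer with malloc
    }

    llava_free(ctx_llava);
    return ok;
}
```

On the CMake side, such a tool would mirror what `llava-cli` does in the hunk above: add `llava` to its `target_link_libraries` list alongside `common`, `llama`, and `clip`.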
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 5bb2e4c37..d8eb865fc 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -690,7 +690,7 @@ static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
     memcpy(img->data, data, img->size);
 }
 
-bool clip_image_load_from_bytes(const unsigned char * bytes, int bytes_length, clip_image_u8 * img) {
+bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, clip_image_u8 * img) {
     int nx, ny, nc;
     auto data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
     if (!data) {
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index c0b53d0b8..f161b738e 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -58,7 +58,7 @@ struct clip_image_f32_batch {
 struct clip_image_u8 * make_clip_image_u8();
 struct clip_image_f32 * make_clip_image_f32();
 bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
-bool clip_image_load_from_bytes(const unsigned char * bytes, int bytes_length, clip_image_u8 * img);
+bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, clip_image_u8 * img);
 bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
 bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);
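The `int` → `size_t` widening on `clip_image_load_from_bytes` means a length that is already a `size_t` — a `std::vector::size()`, or the iterator difference computed in llava-utils.h below — no longer narrows at the call site. A minimal sketch; the wrapper is hypothetical, only `clip_image_load_from_bytes` comes from this patch:

```cpp
#include <vector>

#include "clip.h"

// buf.size() is a size_t; with the old int parameter this call would
// implicitly narrow (and could overflow for very large buffers).
static bool load_image_from_buffer(const std::vector<unsigned char> & buf, clip_image_u8 * img) {
    return clip_image_load_from_bytes(buf.data(), buf.size(), img);
}
```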
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
new file mode 100644
index 000000000..84c376246
--- /dev/null
+++ b/examples/llava/llava-cli.cpp
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ggml.h"
+#include "common.h"
+#include "clip.h"
+#include "llava.h"
+#include "llava-utils.h"
+
+
+static void show_additional_info(int /*argc*/, char ** argv) {
+    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    printf("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
+
+static bool load_image(llava_context * ctx_llava, gpt_params * params, float **image_embd, int * n_image_pos) {
+    // load and preprocess the image
+    clip_image_u8 img;
+    auto prompt = params->prompt;
+    if (prompt_contains_image(prompt)) {
+        if (!params->image.empty()) {
+            printf("using base64 encoded image instead of command line image path\n");
+        }
+        if (!clip_image_load_from_prompt(prompt, &img)) {
+            fprintf(stderr, "%s: can't load image from prompt\n", __func__);
+            return false;
+        }
+        prompt = remove_image_from_prompt(prompt);
+    } else {
+        if (!clip_image_load_from_file(params->image.c_str(), &img)) {
+            fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
+            return false;
+        }
+    }
+    llava_build_img_embed(ctx_llava, params->n_threads, &img, image_embd, n_image_pos);
+
+    return true;
+}
+
+static void process_prompt(struct llava_context * ctx_llava, float * image_embd, int n_img_pos, gpt_params * params, const char * prompt) {
+    int n_past = 0;
+
+    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
+
+    // llava chat format is "USER: <image>\n<prompt>\nASSISTANT:"
+    // GG: are we sure that there should be a trailing whitespace at the end of this string?
+    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params->n_batch, &n_past);
+    eval_image_embd(ctx_llava->ctx_llama, image_embd, n_img_pos, params->n_batch, &n_past);
+    eval_string(ctx_llava->ctx_llama, prompt, params->n_batch, &n_past);
+    eval_string(ctx_llava->ctx_llama, "\nASSISTANT:", params->n_batch, &n_past);
+
+    // generate the response
+
+    printf("\n");
+
+    for (int i = 0; i < max_tgt_len; i++) {
+        const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
+        if (strcmp(tmp, "</s>") == 0) break;
+
+        printf("%s", tmp);
+        fflush(stdout);
+    }
+
+    printf("\n");
+
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+        gpt_print_usage(argc, argv, params);
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    auto ctx_llava = llava_init(&params);
+    if (ctx_llava == NULL) {
+        fprintf(stderr, "%s: error: failed to init llava\n", __func__);
+        return 1;
+    }
+
+    float * image_embd;
+    int n_image_pos;
+    load_image(ctx_llava, &params, &image_embd, &n_image_pos);
+
+    // process the prompt
+    process_prompt(ctx_llava, image_embd, n_image_pos, &params, params.prompt.c_str());
+
+    llama_print_timings(ctx_llava->ctx_llama);
+
+    free(image_embd);
+    llava_free(ctx_llava);
+    return 0;
+}
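`load_image` above accepts either `--image <path>` or an image embedded directly in the prompt. A sketch of building such a prompt on the caller's side — the helper is hypothetical; the tag strings must match `IMG_BASE64_TAG_BEGIN`/`IMG_BASE64_TAG_END` in llava-utils.h (next hunk), and the base64 encoding itself is assumed to come from elsewhere:

```cpp
#include <string>

// Hypothetical helper: wrap an already base64-encoded JPEG in the tag that
// prompt_contains_image()/clip_image_load_from_prompt() look for, followed
// by the question text.
static std::string make_image_prompt(const std::string & jpeg_base64, const std::string & question) {
    return "<img src=\"data:image/jpeg;base64," + jpeg_base64 + "\">" + question;
}
```

The resulting string is passed as the regular `-p` prompt; `load_image` then strips the tag with `remove_image_from_prompt` before evaluation.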
diff --git a/examples/llava/llava-utils.h b/examples/llava/llava-utils.h
index b794c39cc..3b4fa96cc 100644
--- a/examples/llava/llava-utils.h
+++ b/examples/llava/llava-utils.h
@@ -149,19 +149,19 @@ inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
 
 static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
 static const char* IMG_BASE64_TAG_END = "\">";
 
-static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
+inline void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
     begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
     end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
 }
 
-static bool prompt_contains_image(const std::string& prompt) {
+inline bool prompt_contains_image(const std::string& prompt) {
     size_t begin, end;
     find_image_tag_in_prompt(prompt, begin, end);
     return (begin != std::string::npos);
 }
 
 // replaces the base64 image tag in the prompt with `replacement`
-static bool get_image_from_prompt(const std::string& prompt, clip_image_u8 * img) {
+inline bool clip_image_load_from_prompt(const std::string& prompt, clip_image_u8 * img) {
     size_t img_base64_str_start, img_base64_str_end;
     find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
     if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
@@ -176,7 +176,7 @@ static bool get_image_from_prompt(const std::string& prompt, clip_image_u8 * img) {
     auto required_bytes = base64::required_encode_size(base64_str.size());
     auto img_bytes = std::vector<unsigned char>(required_bytes);
     auto img_bytes_end = base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
-    auto img_bytes_len = img_bytes_end - img_bytes.begin();
+    size_t img_bytes_len = img_bytes_end - img_bytes.begin();
 
     auto img_loaded_ok = clip_image_load_from_bytes(img_bytes.data(), img_bytes_len, img);
     if (!img_loaded_ok) {
@@ -187,7 +187,7 @@ static bool get_image_from_prompt(const std::string& prompt, clip_image_u8 * img) {
     return true;
 }
 
-static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
+inline std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
     size_t begin, end;
     find_image_tag_in_prompt(prompt, begin, end);
     if (begin == std::string::npos || end == std::string::npos) {
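The `static` → `inline` switch matters now that llava-utils.h is included from two translation units (llava.cpp via the library, and llava-cli.cpp): `static` would give each unit its own private copy of every helper, with duplicated code and unused-function warnings for helpers a unit never calls, while `inline` lets the identical definitions fold into one at link time. A distilled illustration, not code from the patch:

```cpp
// util.h -- meant to be included from several .cpp files.
#pragma once

#include <string>

// `inline` permits this same definition to appear in every translation unit
// that includes the header; the linker keeps a single copy. With `static`,
// each .cpp file would carry its own private duplicate instead.
inline bool contains_tag(const std::string & text, const std::string & tag) {
    return text.find(tag) != std::string::npos;
}
```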
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index ffdad9c99..522334c7c 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -10,11 +10,6 @@
 
 #include "base64.hpp"
 
-static void show_additional_info(int /*argc*/, char ** argv) {
-    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    printf("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
-}
-
 static bool encode_image_with_clip(llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_embd, int * n_img_pos) {
     auto ctx_clip = ctx_llava->ctx_clip;
     clip_image_f32 img_res;
@@ -51,7 +46,7 @@ static bool encode_image_with_clip(llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_embd, int * n_img_pos) {
     return true;
 }
 
-static bool llava_build_img_embed(struct llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_image_pos_out) {
+bool llava_build_img_embed(struct llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_image_pos_out) {
     auto ctx_clip = ctx_llava->ctx_clip;
     float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
@@ -128,91 +123,3 @@ void llava_free(struct llava_context * ctx_llava) {
     llama_backend_free();
 }
 
-
-
-static bool load_image(llava_context * ctx_llava, gpt_params * params, float **image_embd, int * n_image_pos) {
-    // load and preprocess the image
-    clip_image_u8 img;
-    auto prompt = params->prompt;
-    if (prompt_contains_image(prompt)) {
-        if (!params->image.empty()) {
-            printf("using base64 encoded image instead of command line image path\n");
-        }
-        if (!get_image_from_prompt(prompt, &img)) {
-            fprintf(stderr, "%s: can't load image from prompt\n", __func__);
-            return false;
-        }
-        prompt = remove_image_from_prompt(prompt);
-    } else {
-        if (!clip_image_load_from_file(params->image.c_str(), &img)) {
-            fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
-            return false;
-        }
-    }
-    llava_build_img_embed(ctx_llava, params->n_threads, &img, image_embd, n_image_pos);
-
-    return true;
-}
-
-static void process_prompt(struct llava_context * ctx_llava, float * image_embd, int n_img_pos, gpt_params * params, const char * prompt) {
-    int n_past = 0;
-
-    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
-
-    // llava chat format is "USER: <image>\n<prompt>\nASSISTANT:"
-    // GG: are we sure that there should be a trailing whitespace at the end of this string?
-    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER: ", params->n_batch, &n_past);
-    eval_image_embd(ctx_llava->ctx_llama, image_embd, n_img_pos, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, prompt, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, "\nASSISTANT:", params->n_batch, &n_past);
-
-    // generate the response
-
-    printf("\n");
-
-    for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(ctx_llava->ctx_llama, *params, &n_past);
-        if (strcmp(tmp, "</s>") == 0) break;
-
-        printf("%s", tmp);
-        fflush(stdout);
-    }
-
-    printf("\n");
-
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        show_additional_info(argc, argv);
-        return 1;
-    }
-    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        gpt_print_usage(argc, argv, params);
-        show_additional_info(argc, argv);
-        return 1;
-    }
-
-    auto ctx_llava = llava_init(&params);
-    if (ctx_llava == NULL) {
-        fprintf(stderr, "%s: error: failed to init llava\n", __func__);
-        return 1;
-    }
-
-    float * image_embd;
-    int n_image_pos;
-    load_image(ctx_llava, &params, &image_embd, &n_image_pos);
-
-    // process the prompt
-    process_prompt(ctx_llava, image_embd, n_image_pos, &params, params.prompt.c_str());
-
-    llama_print_timings(ctx_llava->ctx_llama);
-
-    free(image_embd);
-    llava_free(ctx_llava);
-    return 0;
-}
diff --git a/examples/llava/llava.h b/examples/llava/llava.h
index ddbcc8d43..1d8b87a46 100644
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@@ -2,6 +2,7 @@
 #define LLAVA_H
 
 #include "ggml.h"
+#include "common.h"
 
 struct clip_ctx;
 
@@ -13,15 +14,12 @@ struct llava_context {
     struct clip_ctx * ctx_clip = NULL;
     struct llama_context * ctx_llama = NULL;
     struct llama_model * model = NULL;
-
-// int n_img_pos = 0;
-// float * image_embd = NULL;
 };
 
 struct llava_context * llava_init(gpt_params * params);
 void llava_free(struct llava_context * ctx_llava);
 
-//void llava_process_prompt(struct llava_context * ctx_llava, gpt_params * params, const char * prompt);
+bool llava_build_img_embed(struct llava_context * ctx_llava, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_image_pos_out);
 
 #ifdef __cplusplus