From 2bc1710e2b67b88d61792b5c706711356ae0ad0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?=
Date: Wed, 11 Oct 2023 23:17:50 +0300
Subject: [PATCH] command line: use gpt_params_parse()

---
 common/common.cpp        | 14 ++++++++++++++
 common/common.h          |  4 ++++
 examples/llava/README.md | 18 ++++++++----------
 examples/llava/llava.cpp | 25 ++++++++++++++-----------
 4 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 0f55c33a7..ee1392bad 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -383,6 +383,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_base = argv[i];
+        } else if (arg == "--mmproj") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.mmproj = argv[i];
+        } else if (arg == "--image") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.image = argv[i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--embedding") {
@@ -700,6 +712,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel);
     printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
     printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+    printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
+    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models\n");
     if (llama_mlock_supported()) {
         printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
diff --git a/common/common.h b/common/common.h
index c80215279..72e97f653 100644
--- a/common/common.h
+++ b/common/common.h
@@ -122,6 +122,10 @@ struct gpt_params {
     bool numa           = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt = false; // print prompt tokens before generation
     bool infill         = false; // use infill mode
+
+    // multimodal models (see examples/llava)
+    std::string mmproj = ""; // path to multimodal projector
+    std::string image  = ""; // path to an image file
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
diff --git a/examples/llava/README.md b/examples/llava/README.md
index a2b296331..f4d61414c 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -2,18 +2,22 @@
 Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants.
 
-The pre-converted 7b model can be found [here](https://huggingface.co/mys/ggml_llava-v1.5-7b).
+The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
+and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
+models are available.
 
 After API is confirmed, more models will be supported / uploaded.
 
 ## Usage
 
-The `llava` target is cmake-only for now (TODO: add to `make`) and built as a part of examples.
+Build with cmake or run `make llava` to build it.
 
-After building, run: `./bin/llava` to see the usage. For example:
+After building, run: `./llava` to see the usage. For example:
 
 ```sh
-./bin/llava path/to/llava-v1.5-7b/ggml-model-q5_k.gguf path/to/llava-v1.5-7b/mmproj-model-f16.gguf path/to/an/image.jpg
+./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```
 
+**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
+
 ## Model conversion
 - Clone `llava-v15-7b`` and `clip-vit-large-patch14-336`` locally:
@@ -46,12 +50,6 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director
 
 ## TODO
 
-These will be include in this pr:
-
-- [ ] Better command line interface.
-
-These will be another PR:
-
 - [ ] Support server mode.
 - [ ] Support non-CPU backend for the image encoding part.
 - [ ] Support different sampling methods.
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 213546a3e..fe3b4e87d 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -7,31 +7,34 @@
 #include "common.h"
 #include "llama.h"
 
+static void show_additional_info(int argc, char ** argv) {
+    printf("\n example usage: %s -m --mmproj --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    printf("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
 int main(int argc, char ** argv) {
     ggml_time_init();
 
     gpt_params params;
 
-    if (argc < 4) {
-        printf("usage: %s [a text prompt]\n", argv[0]);
+    if (!gpt_params_parse(argc, argv, params)) {
+        show_additional_info(argc, argv);
         return 1;
     }
 
-    params.model = argv[1];
-    const char * clip_path = argv[2];
-    const char * img_path = argv[3];
-
-    if (argc >= 5) {
-        params.prompt = argv[4];
+    if (params.mmproj.empty() || params.image.empty()) {
+        gpt_print_usage(argc, argv, params);
+        show_additional_info(argc, argv);
+        return 1;
     }
 
+    const char * clip_path = params.mmproj.c_str();
+    const char * img_path = params.image.c_str();
+
     if (params.prompt.empty()) {
         params.prompt = "describe the image in detail.";
     }
 
-    params.temp = 0.1;
-
     auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
 
     // load and preprocess the image
@@ -83,7 +86,7 @@ int main(int argc, char ** argv) {
     }
 
     llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.n_ctx = 2048;
+    ctx_params.n_ctx = 2048; // we need a longer context size to process image embeddings
     ctx_params.n_threads = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
     llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
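
For reference, a rough end-to-end sketch of how the flags introduced by this patch are meant to be exercised once the branch is built. The `-m`, `--mmproj`, `--image`, `--temp`, and `-p` options come from the changes above; the build directory layout, the `build/bin/llava` binary location, and all model/image paths are illustrative assumptions, not something this patch provides.

```sh
# Configure and build the examples with CMake
# (per the README change above, `make llava` is an alternative).
mkdir -p build
cd build
cmake ..
cmake --build . --config Release

# Invoke the example through the new gpt_params_parse()-based flags.
# Binary location and file paths are placeholders; adjust to your setup.
./bin/llava \
    -m ../models/llava-v1.5-7b/ggml-model-q5_k.gguf \
    --mmproj ../models/llava-v1.5-7b/mmproj-model-f16.gguf \
    --image ../assets/example.jpg \
    --temp 0.1 \
    -p "describe the image in detail."
```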