command line: use gpt_params_parse()

M. Yusuf Sarıgöz 2023-10-11 23:17:50 +03:00
parent f0f78345f2
commit 2bc1710e2b
4 changed files with 40 additions and 21 deletions

common/common.cpp

@@ -383,6 +383,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.lora_base = argv[i];
} else if (arg == "--mmproj") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.mmproj = argv[i];
} else if (arg == "--image") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.image = argv[i];
} else if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
} else if (arg == "--embedding") {
@@ -700,6 +712,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
if (llama_mlock_supported()) {
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
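Taken together with the parsing code above, these help lines document how the two new options reach `params.mmproj` and `params.image`. A minimal invocation sketch, assuming the `llava` binary from this PR and placeholder model paths:

```sh
# placeholder paths; gpt_params_parse() stores the values in params.mmproj and params.image
./llava -m models/llava-v1.5-7b/ggml-model-q5_k.gguf \
        --mmproj models/llava-v1.5-7b/mmproj-model-f16.gguf \
        --image path/to/an/image.jpg
```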

common/common.h

@@ -122,6 +122,10 @@ struct gpt_params {
bool numa = false; // attempt optimizations that help on some NUMA systems
bool verbose_prompt = false; // print prompt tokens before generation
bool infill = false; // use infill mode
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::string image = ""; // path to an image file
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

examples/llava/README.md

@@ -2,18 +2,22 @@
Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants.
The pre-converted 7b model can be found [here](https://huggingface.co/mys/ggml_llava-v1.5-7b).
The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
models are available.
After the API is confirmed, more models will be supported / uploaded.
## Usage
The `llava` target is cmake-only for now (TODO: add to `make`) and built as a part of examples.
Build with cmake or run `make llava` to build it.
After building, run: `./bin/llava` to see the usage. For example:
After building, run: `./llava` to see the usage. For example:
```sh
./bin/llava path/to/llava-v1.5-7b/ggml-model-q5_k.gguf path/to/llava-v1.5-7b/mmproj-model-f16.gguf path/to/an/image.jpg
./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
```
**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
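For instance, the command from the usage example above with the recommended temperature and an explicit prompt appended (`--temp` and `-p` are standard options handled by the common argument parser; this mirrors the example usage string added in llava.cpp below):

```sh
./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf \
    --image path/to/an/image.jpg --temp 0.1 -p "describe the image in detail."
```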
## Model conversion
- Clone `llava-v1.5-7b` and `clip-vit-large-patch14-336` locally:
@@ -46,12 +50,6 @@ Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
## TODO
These will be included in this PR:
- [ ] Better command line interface.
These will be in another PR:
- [ ] Support server mode.
- [ ] Support non-CPU backend for the image encoding part.
- [ ] Support different sampling methods.

examples/llava/llava.cpp

@@ -7,31 +7,34 @@
#include "common.h"
#include "llama.h"
static void show_additional_info(int argc, char ** argv) {
printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
int main(int argc, char ** argv) {
ggml_time_init();
gpt_params params;
if (argc < 4) {
printf("usage: %s <path/to/llava-v1.5/ggml-model-q5_k.gguf> <path/to/llava-v1.5/mmproj-model-f16.gguf> <path/to/an/image.jpg> [a text prompt]\n", argv[0]);
if (!gpt_params_parse(argc, argv, params)) {
show_additional_info(argc, argv);
return 1;
}
params.model = argv[1];
const char * clip_path = argv[2];
const char * img_path = argv[3];
if (argc >= 5) {
params.prompt = argv[4];
if (params.mmproj.empty() || params.image.empty()) {
gpt_print_usage(argc, argv, params);
show_additional_info(argc, argv);
return 1;
}
const char * clip_path = params.mmproj.c_str();
const char * img_path = params.image.c_str();
if (params.prompt.empty()) {
params.prompt = "describe the image in detail.";
}
params.temp = 0.1;
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
// load and preprocess the image
@@ -83,7 +86,7 @@ int main(int argc, char ** argv) {
}
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = 2048;
ctx_params.n_ctx = 2048; // we need a longer context size to process image embeddings
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);