From 2bc1710e2b67b88d61792b5c706711356ae0ad0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?=
Date: Wed, 11 Oct 2023 23:17:50 +0300
Subject: [PATCH] command line: use gpt_params_parse()

---
 common/common.cpp        | 14 ++++++++++++++
 common/common.h          |  4 ++++
 examples/llava/README.md | 18 ++++++++----------
 examples/llava/llava.cpp | 25 ++++++++++++++-----------
 4 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 0f55c33a7..ee1392bad 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -383,6 +383,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.lora_base = argv[i];
+        } else if (arg == "--mmproj") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.mmproj = argv[i];
+        } else if (arg == "--image") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.image = argv[i];
         } else if (arg == "-i" || arg == "--interactive") {
             params.interactive = true;
         } else if (arg == "--embedding") {
@@ -700,6 +712,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel);
     printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
     printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+    printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
+    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models\n");
     if (llama_mlock_supported()) {
         printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
diff --git a/common/common.h b/common/common.h
index c80215279..72e97f653 100644
--- a/common/common.h
+++ b/common/common.h
@@ -122,6 +122,10 @@ struct gpt_params {
     bool numa           = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt = false; // print prompt tokens before generation
     bool infill         = false; // use infill mode
+
+    // multimodal models (see examples/llava)
+    std::string mmproj = ""; // path to multimodal projector
+    std::string image  = ""; // path to an image file
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
diff --git a/examples/llava/README.md b/examples/llava/README.md
index a2b296331..f4d61414c 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -2,18 +2,22 @@
 Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants.
 
-The pre-converted 7b model can be found [here](https://huggingface.co/mys/ggml_llava-v1.5-7b).
+The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
+and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
+models are available.
 
 After API is confirmed, more models will be supported / uploaded.
 
 ## Usage
 
-The `llava` target is cmake-only for now (TODO: add to `make`) and built as a part of examples.
+Build with cmake or run `make llava` to build it.
 
-After building, run: `./bin/llava` to see the usage. For example:
+After building, run: `./llava` to see the usage. For example:
 
 ```sh
-./bin/llava path/to/llava-v1.5-7b/ggml-model-q5_k.gguf path/to/llava-v1.5-7b/mmproj-model-f16.gguf path/to/an/image.jpg
+./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```
 
+**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
+
 ## Model conversion
 - Clone `llava-v15-7b`` and `clip-vit-large-patch14-336`` locally:
@@ -46,12 +50,6 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director
 
 ## TODO
 
-These will be include in this pr:
-
-- [ ] Better command line interface.
-
-These will be another PR:
-
 - [ ] Support server mode.
 - [ ] Support non-CPU backend for the image encoding part.
 - [ ] Support different sampling methods.
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp
index 213546a3e..fe3b4e87d 100644
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -7,31 +7,34 @@
 #include "common.h"
 #include "llama.h"
 
+static void show_additional_info(int argc, char ** argv) {
+    printf("\n example usage: %s -m --mmproj --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    printf("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
+}
 int main(int argc, char ** argv) {
     ggml_time_init();
 
     gpt_params params;
 
-    if (argc < 4) {
-        printf("usage: %s [a text prompt]\n", argv[0]);
+    if (!gpt_params_parse(argc, argv, params)) {
+        show_additional_info(argc, argv);
         return 1;
     }
 
-    params.model = argv[1];
-    const char * clip_path = argv[2];
-    const char * img_path = argv[3];
-
-    if (argc >= 5) {
-        params.prompt = argv[4];
+    if (params.mmproj.empty() || params.image.empty()) {
+        gpt_print_usage(argc, argv, params);
+        show_additional_info(argc, argv);
+        return 1;
     }
 
+    const char * clip_path = params.mmproj.c_str();
+    const char * img_path = params.image.c_str();
+
     if (params.prompt.empty()) {
         params.prompt = "describe the image in detail.";
     }
 
-    params.temp = 0.1;
-
     auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
 
     // load and preprocess the image
@@ -83,7 +86,7 @@ int main(int argc, char ** argv) {
     }
 
     llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.n_ctx = 2048;
+    ctx_params.n_ctx = 2048; // we need a longer context size to process image embeddings
     ctx_params.n_threads = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
     llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
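
For reference, a rough end-to-end sketch of how the flags introduced by this patch are meant to be exercised once the branch is built. The `-m`, `--mmproj`, `--image`, `--temp`, and `-p` options come from the changes above; the build directory layout, the `build/bin/llava` binary location, and all model/image paths are illustrative assumptions, not something this patch provides.

```sh
# Configure and build the examples with CMake
# (per the README change above, `make llava` is an alternative).
mkdir -p build
cd build
cmake ..
cmake --build . --config Release

# Invoke the example through the new gpt_params_parse()-based flags.
# Binary location and file paths are placeholders; adjust to your setup.
./bin/llava \
    -m ../models/llava-v1.5-7b/ggml-model-q5_k.gguf \
    --mmproj ../models/llava-v1.5-7b/mmproj-model-f16.gguf \
    --image ../assets/example.jpg \
    --temp 0.1 \
    -p "describe the image in detail."
```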