command line: use gpt_params_parse()

M. Yusuf Sarıgöz 2023-10-11 23:17:50 +03:00
parent f0f78345f2
commit 2bc1710e2b
4 changed files with 40 additions and 21 deletions

common/common.cpp

@@ -383,6 +383,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.lora_base = argv[i];
} else if (arg == "--mmproj") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.mmproj = argv[i];
} else if (arg == "--image") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.image = argv[i];
} else if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
} else if (arg == "--embedding") {
@@ -700,6 +712,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
if (llama_mlock_supported()) {
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
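Taken together with the parsing code above, these help lines document how the two new options reach `params.mmproj` and `params.image`. A minimal invocation sketch, assuming the `llava` binary from this PR and placeholder model paths:

```sh
# placeholder paths; gpt_params_parse() stores the values in params.mmproj and params.image
./llava -m models/llava-v1.5-7b/ggml-model-q5_k.gguf \
        --mmproj models/llava-v1.5-7b/mmproj-model-f16.gguf \
        --image path/to/an/image.jpg
```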

common/common.h

@@ -122,6 +122,10 @@ struct gpt_params {
bool numa = false; // attempt optimizations that help on some NUMA systems
bool verbose_prompt = false; // print prompt tokens before generation
bool infill = false; // use infill mode
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::string image = ""; // path to an image file
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

examples/llava/README.md

@@ -2,18 +2,22 @@
Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants.
The pre-converted 7b model can be found [here](https://huggingface.co/mys/ggml_llava-v1.5-7b).
The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
models are available.
After the API is confirmed, more models will be supported / uploaded.
## Usage
The `llava` target is cmake-only for now (TODO: add to `make`) and built as a part of examples.
Build with cmake or run `make llava` to build it.
After building, run: `./bin/llava` to see the usage. For example:
After building, run: `./llava` to see the usage. For example:
```sh
./bin/llava path/to/llava-v1.5-7b/ggml-model-q5_k.gguf path/to/llava-v1.5-7b/mmproj-model-f16.gguf path/to/an/image.jpg
./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
```
**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
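For instance, the command from the usage example above with the recommended temperature and an explicit prompt appended (`--temp` and `-p` are standard options handled by the common argument parser; this mirrors the example usage string added in llava.cpp below):

```sh
./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf \
    --image path/to/an/image.jpg --temp 0.1 -p "describe the image in detail."
```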
## Model conversion
- Clone `llava-v1.5-7b` and `clip-vit-large-patch14-336` locally:
@@ -46,12 +50,6 @@ Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
## TODO
These will be included in this PR:
- [ ] Better command line interface.
These will be in another PR:
- [ ] Support server mode.
- [ ] Support non-CPU backend for the image encoding part.
- [ ] Support different sampling methods.

examples/llava/llava.cpp

@@ -7,31 +7,34 @@
#include "common.h"
#include "llama.h"
static void show_additional_info(int argc, char ** argv) {
printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
printf(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
int main(int argc, char ** argv) {
ggml_time_init();
gpt_params params;
if (argc < 4) {
printf("usage: %s <path/to/llava-v1.5/ggml-model-q5_k.gguf> <path/to/llava-v1.5/mmproj-model-f16.gguf> <path/to/an/image.jpg> [a text prompt]\n", argv[0]);
if (!gpt_params_parse(argc, argv, params)) {
show_additional_info(argc, argv);
return 1;
}
params.model = argv[1];
const char * clip_path = argv[2];
const char * img_path = argv[3];
if (argc >= 5) {
params.prompt = argv[4];
if (params.mmproj.empty() || params.image.empty()) {
gpt_print_usage(argc, argv, params);
show_additional_info(argc, argv);
return 1;
}
const char * clip_path = params.mmproj.c_str();
const char * img_path = params.image.c_str();
if (params.prompt.empty()) {
params.prompt = "describe the image in detail.";
}
params.temp = 0.1;
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
// load and preprocess the image
@@ -83,7 +86,7 @@ int main(int argc, char ** argv) {
}
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = 2048;
ctx_params.n_ctx = 2048; // we need a longer context size to process image embeddings
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);