From 23bce61a2f43fc252457fe1073a75db174f93485 Mon Sep 17 00:00:00 2001
From: sami
Date: Tue, 4 Feb 2025 12:13:57 +0700
Subject: [PATCH 1/3] Added README

* Also allow qwen2vl-cli to run without an --image argument

---
 examples/llava/README-qwen2vl.md | 62 ++++++++++++++++++++++++++++++++
 examples/llava/qwen2vl-cli.cpp   | 11 +++++-
 2 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 examples/llava/README-qwen2vl.md

diff --git a/examples/llava/README-qwen2vl.md b/examples/llava/README-qwen2vl.md
new file mode 100644
index 000000000..7cb7a2d78
--- /dev/null
+++ b/examples/llava/README-qwen2vl.md
@@ -0,0 +1,62 @@
+# QWEN2-VL
+
+This implementation supports all versions of Qwen2VL, e.g. [Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct).
+
+## Usage
+
+After building, run `./llama-qwen2vl-cli` to use it. Alternatively, you can download a ready-made GGUF from Hugging Face, e.g. [Qwen2-VL-2B-Instruct-GGUF](https://huggingface.co/bartowski/Qwen2-VL-2B-Instruct-GGUF):
+
+### The basic invocation, with an image and a prompt
+
+```sh
+./llama-qwen2vl-cli -m /models/Qwen2-VL-2B-Instruct-Q4_0.gguf --mmproj /models/mmproj-Qwen2-VL-2B-Instruct-f32.gguf -p 'Describe this image.' --image '/models/test_image.jpg'
+```
+
+The `--image` argument is optional in case you just want to use the model for text. However, the `--mmproj` file is still required, as it is always loaded.
+
+If no system prompt is given in the prompt, it defaults to `You are a helpful assistant.`.
+
+### Or with the image embedded directly in the prompt as base64
+
+```sh
+./llama-qwen2vl-cli -m /models/Qwen2-VL-2B-Instruct-Q4_0.gguf --mmproj /models/mmproj-Qwen2-VL-2B-Instruct-f32.gguf -p 'Describe this image.<img src="data:image/jpeg;base64,<BASE64_OF_IMAGE>">'
+```
+
+### Or a complete prompt with the system message
+
+```sh
+./llama-qwen2vl-cli -m /models/Qwen2-VL-2B-Instruct-Q4_0.gguf --mmproj /models/mmproj-Qwen2-VL-2B-Instruct-f32.gguf -p '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|vision_pad|><|vision_end|>Describe this image.' --image '/models/test_image.jpg'
+```
+
+**Note**: A lower temperature like 0.1 is recommended for better quality; add `--temp 0.1` to the command to set it. 
+**Note**: For GPU offloading, make sure to use the `-ngl` flag as usual.
+
+## GGUF Conversion
+
+1. Clone the Qwen2-VL model:
+
+```sh
+git clone https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct
+```
+
+2. Use `qwen2_vl_surgery.py` to prepare the model for conversion:
+
+```sh
+python ./examples/llava/qwen2_vl_surgery.py ./model_path --data_type fp32
+```
+
+This generates the vision model (the mmproj GGUF) and prints its filename in the log.
+
+3. Use `convert_hf_to_gguf.py` to convert the Qwen2-VL model to GGUF:
+
+```sh
+python convert_hf_to_gguf.py ./model_path --outtype f32
+```
+
+Now the model is ready to use in the `model_path` directory. You can quantize the resulting GGUF as you normally would with other GGUF files.
+
+*Have fun with the models! :)*
+
+## Limitations
+
+* Currently, the image must appear at the very beginning of the input prompt to the LLM.
diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
index 132a7da54..571807ac2 100644
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -524,7 +524,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+    if (params.mmproj.empty()) {
         print_usage(argc, argv);
         return 1;
     }
@@ -547,6 +547,15 @@ int main(int argc, char ** argv) {
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
+    } else if (params.image.empty()) {
+        auto ctx_llava = llava_init_context(&params, model);
+
+        // process the prompt
+        process_prompt(ctx_llava, nullptr, &params, params.prompt);
+
+        llama_perf_context_print(ctx_llava->ctx_llama);
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
 #ifndef NDEBUG
     } else if (params.image[0].empty()) {
         auto ctx_llava = llava_init_context(&params, model);

From 8777473a43cd247156814cce5c595fe254698f6c Mon Sep 17 00:00:00 2001
From: sami
Date: Tue, 4 Feb 2025 21:46:39 +0700
Subject: [PATCH 2/3] Remove trailing whitespace

---
 examples/llava/README-qwen2vl.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llava/README-qwen2vl.md b/examples/llava/README-qwen2vl.md
index 7cb7a2d78..8fd93a266 100644
--- a/examples/llava/README-qwen2vl.md
+++ b/examples/llava/README-qwen2vl.md
@@ -28,7 +28,7 @@ If no system prompt is given in the prompt, it defaults to `You are a helpful
 ./llama-qwen2vl-cli -m /models/Qwen2-VL-2B-Instruct-Q4_0.gguf --mmproj /models/mmproj-Qwen2-VL-2B-Instruct-f32.gguf -p '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|vision_pad|><|vision_end|>Describe this image.' --image '/models/test_image.jpg'
 ```
 
-**Note**: A lower temperature like 0.1 is recommended for better quality; add `--temp 0.1` to the command to set it. 
+**Note**: A lower temperature like 0.1 is recommended for better quality; add `--temp 0.1` to the command to set it.
 **Note**: For GPU offloading, make sure to use the `-ngl` flag as usual.
 
 ## GGUF Conversion

From 185e1b107e14185f585ec3dc21840915a76264a9 Mon Sep 17 00:00:00 2001
From: sami
Date: Sun, 9 Feb 2025 15:41:11 +0700
Subject: [PATCH 3/3] Add note about GPU support to the qwen2vl README

Undo the changes to qwen2vl-cli

---
 examples/llava/README-qwen2vl.md | 5 +++--
 examples/llava/qwen2vl-cli.cpp   | 11 +----------
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/examples/llava/README-qwen2vl.md b/examples/llava/README-qwen2vl.md
index 8fd93a266..fea7e9819 100644
--- a/examples/llava/README-qwen2vl.md
+++ b/examples/llava/README-qwen2vl.md
@@ -57,6 +57,7 @@ Now the model is ready to use in the `model_path` directory. You can quantize th
 
 *Have fun with the models! :)*
 
-## Limitations
+## Current limitations
 
-* Currently, the image must appear at the very beginning of the input prompt to the LLM.
+* The image must appear at the very beginning of the input prompt to the LLM.
+* GPU backend support is disabled in the vision model (clip.cpp) that Qwen2VL uses.
diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
index 571807ac2..132a7da54 100644
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -524,7 +524,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.mmproj.empty()) {
+    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
         print_usage(argc, argv);
         return 1;
     }
@@ -547,15 +547,6 @@ int main(int argc, char ** argv) {
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
-    } else if (params.image.empty()) {
-        auto ctx_llava = llava_init_context(&params, model);
-
-        // process the prompt
-        process_prompt(ctx_llava, nullptr, &params, params.prompt);
-
-        llama_perf_context_print(ctx_llava->ctx_llama);
-        ctx_llava->model = NULL;
-        llava_free(ctx_llava);
 #ifndef NDEBUG
     } else if (params.image[0].empty()) {
         auto ctx_llava = llava_init_context(&params, model);
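
Two usage sketches to accompany the README added in PATCH 1/3 (illustrative notes, not part of the patches themselves).

The base64 example in the README relies on the `<img src="data:image/jpeg;base64,...">` tag that `prompt_contains_image()` scans for (the same convention llava-cli uses), and `<BASE64_OF_IMAGE>` there is a placeholder. A minimal way to build such a prompt from a real file, assuming GNU coreutils `base64`:

```sh
# Encode the image with line wrapping disabled (-w 0, GNU coreutils base64);
# /models/test_image.jpg is the placeholder path used throughout the README.
IMG_B64="$(base64 -w 0 /models/test_image.jpg)"

# Pass the encoded image inline in the prompt instead of via --image.
./llama-qwen2vl-cli -m /models/Qwen2-VL-2B-Instruct-Q4_0.gguf \
    --mmproj /models/mmproj-Qwen2-VL-2B-Instruct-f32.gguf \
    -p "Describe this image.<img src=\"data:image/jpeg;base64,${IMG_B64}\">"
```

For the quantization step mentioned at the end of the GGUF conversion section, a sketch using the repo's `llama-quantize` tool; the exact F32 filename emitted by `convert_hf_to_gguf.py` may differ, so treat the input path as hypothetical:

```sh
# Quantize the converted F32 text model to Q4_0 (input filename is hypothetical).
./llama-quantize ./model_path/Qwen2-VL-2B-Instruct-F32.gguf \
    ./model_path/Qwen2-VL-2B-Instruct-Q4_0.gguf Q4_0
```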