diff --git a/examples/llava/README-qwen2vl.md b/examples/llava/README-qwen2vl.md
index 8fd93a266..fea7e9819 100644
--- a/examples/llava/README-qwen2vl.md
+++ b/examples/llava/README-qwen2vl.md
@@ -57,6 +57,7 @@ Now the model is ready to use in the `model_path` directory. You can quantize th
 
 *Have fun with the models ! :)*
 
-## Limitations
+## Current limitations
 
-* Currently, only support the image to be in the very beginning of the input prompt to the LLM.
+* The image must appear at the very beginning of the input prompt to the LLM.
+* GPU backend support in the vision model (clip.cpp) used by Qwen2VL is disabled.
diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
index 571807ac2..132a7da54 100644
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -524,7 +524,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.mmproj.empty()) {
+    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
         print_usage(argc, argv);
         return 1;
     }
@@ -547,15 +547,6 @@ int main(int argc, char ** argv) {
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
-    } else if (params.image.empty()) {
-        auto ctx_llava = llava_init_context(&params, model);
-
-        // process the prompt
-        process_prompt(ctx_llava, nullptr, &params, params.prompt);
-
-        llama_perf_context_print(ctx_llava->ctx_llama);
-        ctx_llava->model = NULL;
-        llava_free(ctx_llava);
 #ifndef NDEBUG
     } else if (params.image[0].empty()) {
         auto ctx_llava = llava_init_context(&params, model);
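
With the stricter argument check above, the CLI prints its usage message and exits unless `--mmproj` is provided and either `--image` is set or the prompt itself embeds an image reference (as detected by prompt_contains_image()). A minimal invocation sketch, assuming the usual llama.cpp flags; the binary, model, projector, and image names below are placeholders and depend on your build and conversion:

    # placeholder file names - adjust for your own build and converted model
    ./llama-qwen2vl-cli \
        -m Qwen2-VL-2B-Instruct-Q4_K_M.gguf \
        --mmproj qwen2vl-vision.gguf \
        --image demo.jpg \
        -p "Describe this image."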