diff --git a/examples/llava/README-qwen2vl.md b/examples/llava/README-qwen2vl.md
index 8fd93a266..fea7e9819 100644
--- a/examples/llava/README-qwen2vl.md
+++ b/examples/llava/README-qwen2vl.md
@@ -57,6 +57,7 @@ Now the model is ready to use in the `model_path` directory. You can quantize th
 
 *Have fun with the models ! :)*
 
-## Limitations
+## Current limitations
 
-* Currently, only support the image to be in the very beginning of the input prompt to the LLM.
+* The image must appear at the very beginning of the input prompt to the LLM.
+* GPU backend support in the vision model (clip.cpp) used by Qwen2VL is disabled.
diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp
index 571807ac2..132a7da54 100644
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -524,7 +524,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    if (params.mmproj.empty()) {
+    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
         print_usage(argc, argv);
         return 1;
     }
@@ -547,15 +547,6 @@ int main(int argc, char ** argv) {
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
         llava_free(ctx_llava);
-    } else if (params.image.empty()) {
-        auto ctx_llava = llava_init_context(&params, model);
-
-        // process the prompt
-        process_prompt(ctx_llava, nullptr, &params, params.prompt);
-
-        llama_perf_context_print(ctx_llava->ctx_llama);
-        ctx_llava->model = NULL;
-        llava_free(ctx_llava);
 #ifndef NDEBUG
     } else if (params.image[0].empty()) {
         auto ctx_llava = llava_init_context(&params, model);
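
With the stricter argument check above, the CLI prints its usage message and exits unless `--mmproj` is provided and either `--image` is set or the prompt itself embeds an image reference (as detected by prompt_contains_image()). A minimal invocation sketch, assuming the usual llama.cpp flags; the binary, model, projector, and image names below are placeholders and depend on your build and conversion:

    # placeholder file names - adjust for your own build and converted model
    ./llama-qwen2vl-cli \
        -m Qwen2-VL-2B-Instruct-Q4_K_M.gguf \
        --mmproj qwen2vl-vision.gguf \
        --image demo.jpg \
        -p "Describe this image."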