From d8f782acc318fa488e4a97dd5ed9ef254c0bff46 Mon Sep 17 00:00:00 2001
From: themanyone
Date: Sun, 21 Jul 2024 16:35:31 -0800
Subject: [PATCH] format batch image output according to --template

Addendum: --template uses [] instead of <>.
---
 common/common.cpp            |  7 +++++++
 common/common.h              |  1 +
 examples/llava/llava-cli.cpp | 21 +++++++++++++++++++--
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index dbb724fbb..e64934a73 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -728,6 +728,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.image.emplace_back(argv[i]);
         return true;
     }
+    if (arg == "--template") {
+        CHECK_ARG
+        params.templ = argv[i];
+        return true;
+    }
     if (arg == "-i" || arg == "--interactive") {
         params.interactive = true;
         return true;
@@ -1545,6 +1550,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "multi-modality" });
     options.push_back({ "*",           "       --mmproj FILE",     "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
     options.push_back({ "*",           "       --image FILE",      "path to an image file. use with multimodal models. Specify multiple times for batching" });
+    options.push_back({ "*",           "       --template STRING", "output template: [image] is replaced with the image path, [description] with the generated text" });
+
 
     options.push_back({ "backend" });
     options.push_back({ "*",           "       --rpc SERVERS",     "comma separated list of RPC servers" });
diff --git a/common/common.h b/common/common.h
index 184a53dc0..adbc1c8d5 100644
--- a/common/common.h
+++ b/common/common.h
@@ -185,6 +185,7 @@ struct gpt_params {
     // multimodal models (see examples/llava)
     std::string mmproj = "";        // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)
+    std::string templ = "";         // output template
 
     // embedding
     bool embedding = false; // get only sentence embedding
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 8c7dd2ae3..570b2f116 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -323,10 +323,27 @@ int main(int argc, char ** argv) {
             std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
             return 1;
         }
-
+        size_t pos = 0;
+        std::string str = params.templ;
+        // print the template up to [description], substituting [image] with the image path
+        if (!params.templ.empty()) {
+            while ((pos = str.find("[image]")) != std::string::npos)
+                str.replace(pos, 7, image);
+            pos = str.find("[description]");
+            if (pos != std::string::npos)
+                std::cout << str.substr(0, pos);
+            else
+                std::cout << params.templ;
+            fflush(stdout);
+        }
         // process the prompt
         process_prompt(ctx_llava, image_embed, &params, params.prompt);
-
+        // after generation, print the rest of the template ("[description]" is 13 chars)
+        if (!params.templ.empty()) {
+            if (pos != std::string::npos)
+                std::cout << str.substr(pos + 13);
+            fflush(stdout);
+        }
         llama_print_timings(ctx_llava->ctx_llama);
         llava_image_embed_free(image_embed);
         ctx_llava->model = NULL;
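
Reviewer note: below is a minimal standalone sketch (not part of the patch) of the
substitution behavior introduced here, assuming the template is split at the first
"[description]" marker and the model's text is streamed between the two halves.
The helper name replace_all and the sample strings are illustrative only.

    // sketch.cpp - illustrates the --template substitution (hypothetical helper)
    #include <iostream>
    #include <string>

    // Replace every occurrence of `tag` in `s` with `value`, resuming the scan
    // after each replacement so a `value` containing `tag` cannot loop forever.
    static std::string replace_all(std::string s, const std::string & tag, const std::string & value) {
        size_t pos = 0;
        while ((pos = s.find(tag, pos)) != std::string::npos) {
            s.replace(pos, tag.size(), value);
            pos += value.size();
        }
        return s;
    }

    int main() {
        const std::string templ = "<img src=\"[image]\" alt=\"[description]\">";
        const std::string image = "cat.jpg";

        std::string str = replace_all(templ, "[image]", image);
        size_t pos = str.find("[description]");
        if (pos != std::string::npos) {
            std::cout << str.substr(0, pos);           // prefix, printed before generation
            std::cout << "a cat sitting on a sofa";    // stand-in for the generated text
            std::cout << str.substr(pos + 13) << "\n"; // suffix ("[description]" is 13 chars)
        } else {
            std::cout << str << "\n";                  // no marker: emit template as-is
        }
        return 0;
    }

With --template '<img src="[image]" alt="[description]">' and --image cat.jpg, the
expected stdout is: <img src="cat.jpg" alt="...generated description...">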