This commit is contained in:
Henry Kroll III 2024-09-02 16:59:45 +02:00 committed by GitHub
commit 8007d0665f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 27 additions and 2 deletions

View file

@@ -1075,6 +1075,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.image.emplace_back(argv[i]); params.image.emplace_back(argv[i]);
return true; return true;
} }
if (arg == "--template") {
CHECK_ARG
params.templ = argv[i];
return true;
}
if (arg == "-i" || arg == "--interactive") { if (arg == "-i" || arg == "--interactive") {
params.interactive = true; params.interactive = true;
return true; return true;
@@ -1927,6 +1932,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "multi-modality" }); options.push_back({ "multi-modality" });
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" }); options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" }); options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
options.push_back({ "*", " --template STRING", "output template replaces [image] and [description] with generated output" });
options.push_back({ "backend" }); options.push_back({ "backend" });
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" }); options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });

View file

@@ -203,6 +203,7 @@ struct gpt_params {
// multimodal models (see examples/llava) // multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s) std::vector<std::string> image; // path to image file(s)
std::string templ = ""; // output template
// embedding // embedding
bool embedding = false; // get only sentence embedding bool embedding = false; // get only sentence embedding

View file

@@ -323,10 +323,27 @@ int main(int argc, char ** argv) {
std::cerr << "error: failed to load image " << image << ". Terminating\n\n"; std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
return 1; return 1;
} }
size_t pos = 0;
std::string str = params.templ;
// format output according to template
if (!params.templ.empty()){
while((pos = str.find("[image]")) != std::string::npos)
str = str.replace(pos, 7, image);
pos = str.find("[description]");
if (pos != std::string::npos)
std::cout << str.substr(0, pos);
else
std::cout << params.templ;
fflush(stdout);
}
// process the prompt // process the prompt
process_prompt(ctx_llava, image_embed, &params, params.prompt); process_prompt(ctx_llava, image_embed, &params, params.prompt);
// terminate output according to template
if (!params.templ.empty()){
if (pos != std::string::npos)
std::cout << str.substr(pos + 13);
fflush(stdout);
}
llama_print_timings(ctx_llava->ctx_llama); llama_print_timings(ctx_llava->ctx_llama);
llava_image_embed_free(image_embed); llava_image_embed_free(image_embed);
ctx_llava->model = NULL; ctx_llava->model = NULL;