common : rework usage print (wip)
commit 8f717fd3bb
parent 123175ea71
9 changed files with 106 additions and 145 deletions
@@ -289,7 +289,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
-        // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context.
+        // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
         params.seed = std::stoul(argv[i]);
         sparams.seed = std::stoul(argv[i]);
         return true;
@@ -901,11 +901,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.interactive = true;
         return true;
     }
-    if (arg == "--interactive-specials") {
-        params.interactive_specials = true;
-        return true;
-    }
-    if (arg == "--special") {
+    if (arg == "-sp" || arg == "--special") {
         params.special = true;
         return true;
     }
@@ -913,7 +909,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.embedding = true;
         return true;
     }
-    if (arg == "--interactive-first") {
+    if (arg == "-if" || arg == "--interactive-first") {
         params.interactive_first = true;
         return true;
     }
@@ -965,7 +961,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.flash_attn = true;
         return true;
     }
-    if (arg == "--color") {
+    if (arg == "-co" || arg == "--color") {
         params.use_color = true;
         return true;
     }
@@ -1252,10 +1248,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
         exit(0);
     }
-    if (arg == "--random-prompt") {
-        params.random_prompt = true;
-        return true;
-    }
     if (arg == "--in-prefix-bos") {
         params.input_prefix_bos = true;
         return true;
@@ -1349,6 +1341,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     return false;
 }
 
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
 void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     const llama_sampling_params & sparams = params.sparams;
 
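Note on the new macro: LLAMA_COMMON_ATTRIBUTE_FORMAT wraps GCC/Clang's format function attribute so the compiler can type-check printf-style calls; on MinGW it selects gnu_printf because the default printf archetype there follows the MSVC format rules. The two indices name the format-string parameter and the first variadic argument, and for non-static member functions the implicit this pointer counts as parameter 1, which is why the option_info constructor in the next hunk is annotated with (4, 5). A minimal standalone sketch of the same pattern, not part of this commit (ATTR_FORMAT and log_fmt are hypothetical names):

#include <cstdarg>
#include <cstdio>

// Mark argument 1 as the format string and argument 2 as the first value to
// check, so a mismatched "%d" against a string argument becomes a -Wformat
// warning at compile time instead of undefined behaviour at runtime.
#ifdef __GNUC__
#define ATTR_FORMAT(fmt_idx, arg_idx) __attribute__((format(printf, fmt_idx, arg_idx)))
#else
#define ATTR_FORMAT(fmt_idx, arg_idx)
#endif

ATTR_FORMAT(1, 2)
static void log_fmt(const char * fmt, ...) {
    va_list args;
    va_start(args, fmt);
    vfprintf(stderr, fmt, args);
    va_end(args);
}

int main() {
    log_fmt("n_ctx = %d\n", 4096);      // ok
    //log_fmt("n_ctx = %d\n", "4096");  // would be flagged by -Wformat
    return 0;
}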
@@ -1360,52 +1362,83 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     }
     sampler_type_names.pop_back();
 
+    struct option_info {
+        LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5)
+        option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) {
+            va_list args_list;
+            va_start(args_list, desc);
+            char buffer[1024];
+            vsnprintf(buffer, sizeof(buffer), desc, args_list);
+            va_end(args_list);
+            this->desc = buffer;
+        }
+
+        std::string tags;
+        std::string args;
+        std::string desc;
+    };
+
+    std::vector<option_info> options;
+
+    // TODO: filter by tags
+
+    options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
+    options.push_back({ "*", " --version", "show version and build info" });
+    options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
+    options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
+    options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
+    options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
+    options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
+    options.push_back({ "speculative", "-tbd, --threads-batch-draft N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+    options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
+    options.push_back({ "*", "-n, --n-predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
+    options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
+    options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
+    options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: empty)" });
+    options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
+    options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
+    options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)" });
+    options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" });
+    options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\nnot supported with --interactive or other interactive options" });
+    options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" });
+    options.push_back({ "main", "-r, --reverse-prompt PROMPT", "halt generation at PROMPT, return control in interactive mode\ncan be specified more than once for multiple prompts" });
+
+    options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
+    options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
+    options.push_back({ "main", "-ins, --instruct", "run in instruction mode (use with Alpaca models) (default: %s)", params.instruct ? "true" : "false" });
+    options.push_back({ "main", "-cml, --chatml", "run in chatml mode (use with ChatML-compatible models) (default: %s)", params.chatml ? "true" : "false" });
+    options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
+    options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
+    options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
+    options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
+    options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
+    options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+
     printf("\n");
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
-    printf("options:\n");
-    printf(" -h, --help, --usage print usage and exit\n");
-    printf(" --version show version and build info\n");
-    printf(" -i, --interactive run in interactive mode\n");
-    printf(" --special special tokens output enabled\n");
-    printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
-    printf(" --interactive-first run in interactive mode and wait for input right away\n");
-    printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
-    printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
-    printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n");
-    printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
-    printf(" -r PROMPT, --reverse-prompt PROMPT\n");
-    printf(" halt generation at PROMPT, return control in interactive mode\n");
-    printf(" (can be specified more than once for multiple prompts).\n");
-    printf(" --color colorise output to distinguish prompt and user input from generations\n");
-    printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
-    printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
-    printf(" -tb N, --threads-batch N\n");
-    printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
-    printf(" -td N, --threads-draft N");
-    printf(" number of threads to use during generation (default: same as --threads)\n");
-    printf(" -tbd N, --threads-batch-draft N\n");
-    printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
-    printf(" -p PROMPT, --prompt PROMPT\n");
-    printf(" prompt to start generation with (default: empty)\n");
-    printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
-    printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
-    printf(" not supported with --interactive or other interactive options\n");
-    printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
-    printf(" --random-prompt start with a randomized prompt.\n");
-    printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
-    printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n");
-    printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
-    printf(" -f FNAME, --file FNAME\n");
-    printf(" prompt file to start generation.\n");
-    printf(" -bf FNAME, --binary-file FNAME\n");
-    printf(" binary file containing multiple choice tasks.\n");
-    printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
-    printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
-    printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
-    printf(" -ub N, --ubatch-size N\n");
-    printf(" physical maximum batch size (default: %d)\n", params.n_ubatch);
+    printf("options:\n\n");
+
+    for (const auto & o : options) {
+        printf(" %-32s", o.args.c_str());
+        if (o.args.length() > 34) {
+            printf("\n%34s", "");
+        }
+
+        //printf("%s\n", o.desc.c_str());
+        // print line by line and pad with spaces
+        const auto desc = o.desc;
+        size_t start = 0;
+        size_t end = desc.find('\n');
+        while (end != std::string::npos) {
+            printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
+            start = end + 1;
+            end = desc.find('\n', start);
+        }
+
+        printf("%s\n", desc.substr(start).c_str());
+    }
+
     printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
     printf(" (default: %s)\n", sampler_type_names.c_str());
     printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
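For context on the hunk above: the usage text is being reworked so that each option lives in the options table and a single loop renders the two-column layout, wrapping multi-line descriptions under the description column instead of hand-padding every printf. A rough, self-contained sketch of that print loop (print_options and the sample entries are illustrative, not the exact llama.cpp code):

#include <cstdio>
#include <string>
#include <vector>

struct option_info {
    std::string args; // e.g. "-t, --threads N"
    std::string desc; // may contain '\n' for multi-line help text
};

// Print each option as a padded two-column row; every '\n' in the description
// starts a continuation line indented to the description column.
static void print_options(const std::vector<option_info> & options) {
    for (const auto & o : options) {
        printf("  %-32s", o.args.c_str());
        if (o.args.length() > 34) {
            printf("\n%34s", ""); // args column overflowed: start desc on its own line
        }
        size_t start = 0;
        size_t end   = o.desc.find('\n');
        while (end != std::string::npos) {
            printf("%s\n%34s", o.desc.substr(start, end - start).c_str(), "");
            start = end + 1;
            end   = o.desc.find('\n', start);
        }
        printf("%s\n", o.desc.substr(start).c_str());
    }
}

int main() {
    const std::vector<option_info> options = {
        { "-h, --help, --usage", "print usage and exit" },
        { "-r, --reverse-prompt PROMPT",
          "halt generation at PROMPT, return control in interactive mode\n"
          "can be specified more than once for multiple prompts" },
    };
    print_options(options);
    return 0;
}

The %-32s left-justified args column and the %34s padding of continuation lines are what keep wrapped descriptions aligned without manually spacing each help string.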
@@ -1549,6 +1582,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     printf(" print token count every N tokens (default: %d)\n", params.n_print);
     printf(" --check-tensors check model tensor data for invalid values\n");
     printf("\n");
+
 #ifndef LOG_DISABLE_LOGS
     log_print_usage();
 #endif // LOG_DISABLE_LOGS
@@ -1611,24 +1645,6 @@ std::string string_get_sortable_timestamp() {
     return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
 }
 
-std::string string_random_prompt(std::mt19937 & rng) {
-    const int r = rng() % 10;
-    switch (r) {
-        case 0: return "So";
-        case 1: return "Once upon a time";
-        case 2: return "When";
-        case 3: return "The";
-        case 4: return "After";
-        case 5: return "If";
-        case 6: return "import";
-        case 7: return "He";
-        case 8: return "She";
-        case 9: return "They";
-    }
-
-    GGML_UNREACHABLE();
-}
-
 void string_process_escapes(std::string & input) {
     std::size_t input_len = input.length();
     std::size_t output_idx = 0;
@@ -2906,7 +2922,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
     fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
-    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
@@ -2956,7 +2971,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
     fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
     yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
-    fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
     fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
 
     fprintf(stream, "reverse_prompt:\n");
@@ -99,23 +99,23 @@ struct gpt_params {
     // // sampling parameters
     struct llama_sampling_params sparams;
 
     std::string model = ""; // model path
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string model_url = ""; // model url to download
     std::string hf_repo = ""; // HF repo
     std::string hf_file = ""; // HF file
     std::string prompt = "";
     std::string prompt_file = ""; // store the external prompt file name
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with
     std::string input_suffix = ""; // string to suffix user inputs with
-    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
     std::string logdir = ""; // directory in which to save YAML log files
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
     std::string logits_file = ""; // file for saving *all* logits
 
+    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
     // TODO: avoid tuple, use struct
@@ -143,11 +143,10 @@ struct gpt_params {
     bool kl_divergence = false; // compute KL divergence
 
     bool usage = false; // print usage
-    bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
-    bool interactive = false; // interactive mode
-    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
     bool special = false; // enable special token output
+    bool interactive = false; // interactive mode
+    bool interactive_first = false; // wait for user input immediately
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
@@ -155,7 +154,6 @@ struct gpt_params {
 
     bool embedding = false; // get only sentence embedding
     bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
-    bool interactive_first = false; // wait for user input immediately
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
@@ -200,7 +198,6 @@ std::vector<std::string> string_split(std::string input, char separator);
 
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
-std::string string_random_prompt(std::mt19937 & rng);
 
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
@@ -80,9 +80,6 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }
 
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -152,9 +152,6 @@ int main(int argc, char ** argv) {
     print_build_info();
 
     std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }
 
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -598,9 +598,6 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }
 
     sparams.dataset = params.prompt_file;
     g_collector.set_parameters(std::move(sparams));
@@ -140,27 +140,6 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
     }
-    if (params.instruct) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for instruct mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-    if (params.chatml) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for chatml mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-    if (!params.antiprompt.empty()) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
     if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
         printf("\n************\n");
         printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
@@ -168,20 +147,6 @@ int main(int argc, char ** argv) {
 
         return 0;
     }
-    if (params.random_prompt) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-    if (!params.path_prompt_cache.empty()) {
-        printf("\n************\n");
-        printf("%s: infill does not support prompt caching\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
 
     if (params.rope_freq_base != 0.0) {
         LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
@@ -182,9 +182,6 @@ int main(int argc, char ** argv) {
     LOG_TEE("%s: seed = %u\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }
 
     LOG("%s: llama backend init\n", __func__);
     llama_backend_init();
@@ -893,7 +890,7 @@ int main(int argc, char ** argv) {
                 }
 
                 const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
+                const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
                 const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
 
                 LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
@@ -1032,7 +1032,7 @@ struct winogrande_entry {
     std::vector<llama_token> seq_tokens[2];
 };
 
-static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string& prompt) {
+static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string & prompt) {
     std::vector<winogrande_entry> result;
     std::istringstream in(prompt);
     std::string line;
@@ -2007,9 +2007,6 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }
 
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -110,7 +110,7 @@
 //
 
 LLAMA_ATTRIBUTE_FORMAT(2, 3)
-static void llama_log_internal (ggml_log_level level, const char* format, ...);
+static void llama_log_internal (ggml_log_level level, const char * format, ...);
 static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 
 #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)