diff --git a/common/common.cpp b/common/common.cpp
index 0b64134af..280595d50 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -289,7 +289,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
-        // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context.
+        // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
         params.seed = std::stoul(argv[i]);
         sparams.seed = std::stoul(argv[i]);
         return true;
@@ -901,11 +901,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.interactive = true;
         return true;
     }
-    if (arg == "--interactive-specials") {
-        params.interactive_specials = true;
-        return true;
-    }
-    if (arg == "--special") {
+    if (arg == "-sp" || arg == "--special") {
         params.special = true;
         return true;
     }
@@ -913,7 +909,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.embedding = true;
         return true;
     }
-    if (arg == "--interactive-first") {
+    if (arg == "-if" || arg == "--interactive-first") {
         params.interactive_first = true;
         return true;
     }
@@ -965,7 +961,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.flash_attn = true;
         return true;
     }
-    if (arg == "--color") {
+    if (arg == "-co" || arg == "--color") {
        params.use_color = true;
         return true;
     }
@@ -1252,10 +1248,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
         exit(0);
     }
-    if (arg == "--random-prompt") {
-        params.random_prompt = true;
-        return true;
-    }
     if (arg == "--in-prefix-bos") {
         params.input_prefix_bos = true;
         return true;
@@ -1349,6 +1341,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     return false;
 }
 
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
 void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     const llama_sampling_params & sparams = params.sparams;
 
@@ -1360,52 +1362,83 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     }
     sampler_type_names.pop_back();
 
+    struct option_info {
+        LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5)
+        option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) {
+            va_list args_list;
+            va_start(args_list, desc);
+            char buffer[1024];
+            vsnprintf(buffer, sizeof(buffer), desc, args_list);
+            va_end(args_list);
+            this->desc = buffer;
+        }
+
+        std::string tags;
+        std::string args;
+        std::string desc;
+    };
+
+    std::vector<option_info> options;
+
+    // TODO: filter by tags
+
+    options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
+    options.push_back({ "*", " --version", "show version and build info" });
+    options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
"true" : "false" }); + options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed }); + options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads }); + options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); + options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); + options.push_back({ "speculative", "-tbd, --threads-batch-draft N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); + options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx }); + options.push_back({ "*", "-n, --n-predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict }); + options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch }); + options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch }); + options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: empty)" }); + options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); + options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); + options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)" }); + options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" }); + options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\nnot supported with --interactive or other interactive options" }); + options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" }); + options.push_back({ "main", "-r, --reverse-prompt PROMPT", "halt generation at PROMPT, return control in interactive mode\ncan be specified more than once for multiple prompts" }); + + options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" }); + options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" }); + options.push_back({ "main", "-ins, --instruct", "run in instruction mode (use with Alpaca models) (default: %s)", params.instruct ? "true" : "false" }); + options.push_back({ "main", "-cml, --chatml", "run in chatml mode (use with ChatML-compatible models) (default: %s)", params.chatml ? "true" : "false" }); + options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" }); + options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? 
"true" : "false" }); + options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); + options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); + options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); + options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); + printf("\n"); printf("usage: %s [options]\n", argv[0]); printf("\n"); - printf("options:\n"); - printf(" -h, --help, --usage print usage and exit\n"); - printf(" --version show version and build info\n"); - printf(" -i, --interactive run in interactive mode\n"); - printf(" --special special tokens output enabled\n"); - printf(" --interactive-specials allow special tokens in user text, in interactive mode\n"); - printf(" --interactive-first run in interactive mode and wait for input right away\n"); - printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n"); - printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); - printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n"); - printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); - printf(" -r PROMPT, --reverse-prompt PROMPT\n"); - printf(" halt generation at PROMPT, return control in interactive mode\n"); - printf(" (can be specified more than once for multiple prompts).\n"); - printf(" --color colorise output to distinguish prompt and user input from generations\n"); - printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); - printf(" -tb N, --threads-batch N\n"); - printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); - printf(" -td N, --threads-draft N"); - printf(" number of threads to use during generation (default: same as --threads)\n"); - printf(" -tbd N, --threads-batch-draft N\n"); - printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n"); - printf(" -p PROMPT, --prompt PROMPT\n"); - printf(" prompt to start generation with (default: empty)\n"); - printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); - printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); - printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); - printf(" not supported with --interactive or other interactive options\n"); - printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n"); - printf(" --random-prompt start with a randomized prompt.\n"); - printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n"); - printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n"); - printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); - printf(" -f FNAME, --file FNAME\n"); - printf(" prompt file to start generation.\n"); - printf(" -bf FNAME, --binary-file FNAME\n"); - printf(" binary file containing multiple choice tasks.\n"); - printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context 
filled)\n", params.n_predict); - printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); - printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch); - printf(" -ub N, --ubatch-size N\n"); - printf(" physical maximum batch size (default: %d)\n", params.n_ubatch); + printf("options:\n\n"); + + for (const auto & o : options) { + printf(" %-32s", o.args.c_str()); + if (o.args.length() > 34) { + printf("\n%34s", ""); + } + + //printf("%s\n", o.desc.c_str()); + // print line by line and pad with spaces + const auto desc = o.desc; + size_t start = 0; + size_t end = desc.find('\n'); + while (end != std::string::npos) { + printf("%s\n%34s", desc.substr(start, end - start).c_str(), ""); + start = end + 1; + end = desc.find('\n', start); + } + + printf("%s\n", desc.substr(start).c_str()); + } + printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n"); printf(" (default: %s)\n", sampler_type_names.c_str()); printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str()); @@ -1549,6 +1582,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param printf(" print token count every N tokens (default: %d)\n", params.n_print); printf(" --check-tensors check model tensor data for invalid values\n"); printf("\n"); + #ifndef LOG_DISABLE_LOGS log_print_usage(); #endif // LOG_DISABLE_LOGS @@ -1611,24 +1645,6 @@ std::string string_get_sortable_timestamp() { return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns); } -std::string string_random_prompt(std::mt19937 & rng) { - const int r = rng() % 10; - switch (r) { - case 0: return "So"; - case 1: return "Once upon a time"; - case 2: return "When"; - case 3: return "The"; - case 4: return "After"; - case 5: return "If"; - case 6: return "import"; - case 7: return "He"; - case 8: return "She"; - case 9: return "They"; - } - - GGML_UNREACHABLE(); -} - void string_process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -2906,7 +2922,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str()); fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false"); fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); - fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false"); fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false"); fprintf(stream, "keep: %d # default: 0\n", params.n_keep); fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); @@ -2956,7 +2971,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens); - fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? 
"true" : "false"); fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); fprintf(stream, "reverse_prompt:\n"); diff --git a/common/common.h b/common/common.h index 46dbf61e5..69bc8e47f 100644 --- a/common/common.h +++ b/common/common.h @@ -99,23 +99,23 @@ struct gpt_params { // // sampling parameters struct llama_sampling_params sparams; - std::string model = ""; // model path - std::string model_draft = ""; // draft model for speculative decoding + std::string model = ""; // model path + std::string model_draft = ""; // draft model for speculative decoding std::string model_alias = "unknown"; // model alias - std::string model_url = ""; // model url to download - std::string hf_repo = ""; // HF repo - std::string hf_file = ""; // HF file + std::string model_url = ""; // model url to download + std::string hf_repo = ""; // HF repo + std::string hf_file = ""; // HF file std::string prompt = ""; - std::string prompt_file = ""; // store the external prompt file name - std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state - std::string input_prefix = ""; // string to prefix user inputs with - std::string input_suffix = ""; // string to suffix user inputs with - std::vector antiprompt; // string upon seeing which more user input is prompted - std::string logdir = ""; // directory in which to save YAML log files + std::string prompt_file = ""; // store the external prompt file name + std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state + std::string input_prefix = ""; // string to prefix user inputs with + std::string input_suffix = ""; // string to suffix user inputs with + std::string logdir = ""; // directory in which to save YAML log files std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding - std::string logits_file = ""; // file for saving *all* logits + std::string logits_file = ""; // file for saving *all* logits + std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. 
 
     std::vector<llama_model_kv_override> kv_overrides;
 
     // TODO: avoid tuple, use struct
@@ -143,11 +143,10 @@ struct gpt_params {
     bool kl_divergence = false; // compute KL divergence
 
     bool usage = false; // print usage
-    bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
-    bool interactive = false; // interactive mode
-    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
     bool special = false; // enable special token output
+    bool interactive = false; // interactive mode
+    bool interactive_first = false; // wait for user input immediately
     bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
@@ -155,7 +154,6 @@
     bool embedding = false; // get only sentence embedding
     bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
 
-    bool interactive_first = false; // wait for user input immediately
     bool multiline_input = false; // reverse the usage of `\`
     bool simple_io = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true; // insert new sequences for decoding on-the-fly
@@ -200,7 +198,6 @@
 std::vector<std::string> string_split(std::string input, char separator);
 std::string string_strip(const std::string & str);
 std::string string_get_sortable_timestamp();
-std::string string_random_prompt(std::mt19937 & rng);
 
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 1b8c443ef..244751e00 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -80,9 +80,6 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }
 
     llama_backend_init();
     llama_numa_init(params.numa);
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index fda3af26b..64cd338c2 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -152,9 +152,6 @@ int main(int argc, char ** argv) {
     print_build_info();
 
     std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }
 
     llama_backend_init();
     llama_numa_init(params.numa);
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index eb5f97cdb..e050c09d2 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -598,9 +598,6 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = string_random_prompt(rng);
-    }
 
     sparams.dataset = params.prompt_file;
     g_collector.set_parameters(std::move(sparams));
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index ad6ecb3f1..baedade15 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -140,27 +140,6 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
     }
-    if (params.instruct) {
-        printf("\n************\n");
printf("%s: please use the 'main' tool for instruct mode\n", __func__); - printf("************\n\n"); - - return 0; - } - if (params.chatml) { - printf("\n************\n"); - printf("%s: please use the 'main' tool for chatml mode\n", __func__); - printf("************\n\n"); - - return 0; - } - if (!params.antiprompt.empty()) { - printf("\n************\n"); - printf("%s: please use the 'main' tool for antiprompt mode\n", __func__); - printf("************\n\n"); - - return 0; - } if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) { printf("\n************\n"); printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); @@ -168,20 +147,6 @@ int main(int argc, char ** argv) { return 0; } - if (params.random_prompt) { - printf("\n************\n"); - printf("%s: please use the 'main' tool for random prompt mode\n", __func__); - printf("************\n\n"); - - return 0; - } - if (!params.path_prompt_cache.empty()) { - printf("\n************\n"); - printf("%s: infill does not support prompt caching\n", __func__); - printf("************\n\n"); - - return 0; - } if (params.rope_freq_base != 0.0) { LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 00ac6bbb1..d35b1d90a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -182,9 +182,6 @@ int main(int argc, char ** argv) { LOG_TEE("%s: seed = %u\n", __func__, params.seed); std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = string_random_prompt(rng); - } LOG("%s: llama backend init\n", __func__); llama_backend_init(); @@ -893,7 +890,7 @@ int main(int argc, char ** argv) { } const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials); + const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index a15c3a4c3..14137a8a7 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1032,7 +1032,7 @@ struct winogrande_entry { std::vector seq_tokens[2]; }; -static std::vector load_winogrande_from_csv(const std::string& prompt) { +static std::vector load_winogrande_from_csv(const std::string & prompt) { std::vector result; std::istringstream in(prompt); std::string line; @@ -2007,9 +2007,6 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = string_random_prompt(rng); - } llama_backend_init(); llama_numa_init(params.numa); diff --git a/llama.cpp b/llama.cpp index b19c6ff34..3c43b87d3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -110,7 +110,7 @@ // LLAMA_ATTRIBUTE_FORMAT(2, 3) -static void llama_log_internal (ggml_log_level level, const char* format, ...); +static void llama_log_internal (ggml_log_level level, const char * format, ...); static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data); #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)