diff --git a/common/common.cpp b/common/common.cpp
index 526fff057..d28f918ef 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -720,21 +720,21 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params]() {
             params.verbose_prompt = true;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"--no-display-prompt"},
         format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
         [&params]() {
             params.display_prompt = false;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"-co", "--color"},
         format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
         [&params]() {
             params.use_color = true;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
     add_opt(llama_arg(
         {"-s", "--seed"}, "SEED",
         format("RNG seed (default: %d, use random seed for < 0)", params.seed),
@@ -996,7 +996,9 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(llama_arg(
         {"-p", "--prompt"}, "PROMPT",
-        "prompt to start generation with\n",
+        ex == LLAMA_EXAMPLE_MAIN
+            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
+            : "prompt to start generation with",
         [&params](std::string value) {
             params.prompt = value;
         }
@@ -1102,7 +1104,13 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"-cnv", "--conversation"},
-        "run in conversation mode, does not print special tokens and suffix/prefix\n",
+        format(
+            "run in conversation mode:\n"
+            "- does not print special tokens and suffix/prefix\n"
+            "- interactive mode is also enabled\n"
+            "(default: %s)",
+            params.conversation ? "true" : "false"
+        ),
         [&params]() {
             params.conversation = true;
         }
@@ -1625,14 +1633,14 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params](std::string value) {
             params.mmproj = value;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
     add_opt(llama_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
         [&params](std::string value) {
             params.image.emplace_back(value);
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
 #ifdef GGML_USE_RPC
     add_opt(llama_arg(
         {"--rpc"}, "SERVERS",
@@ -1692,7 +1700,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
                 fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
             }
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(llama_arg(
         {"-sm", "--split-mode"}, "{none,layer,row}",
         "how to split the model across multiple GPUs, one of:\n"
@@ -1837,7 +1845,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params](std::string value) {
             params.model_draft = value;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(llama_arg(
         {"-mu", "--model-url"}, "MODEL_URL",
         "model download url (default: unused)",
@@ -2178,7 +2186,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params]() {
             params.simple_io = true;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
     add_opt(llama_arg(
         {"-ld", "--logdir"}, "LOGDIR",
         "path under which to save YAML logs (no logging if unset)",
diff --git a/common/common.h b/common/common.h
index 8f5e3a96a..a8aa6fe14 100644
--- a/common/common.h
+++ b/common/common.h
@@ -77,6 +77,7 @@ enum llama_example {
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_LLAVA,
 
     LLAMA_EXAMPLE_COUNT,
 };
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index 4dd17cf68..8a64fe1bb 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -278,7 +278,7 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
     if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 058a6da14..c434ff608 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -40,6 +40,13 @@ static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 static bool need_insert_eot = false;
 
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+    printf("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+    printf("\n");
+}
+
 static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
     return f.good();
 }
@@ -131,7 +138,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector
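Taken together, the C++ hunks above tag options with llama_arg::set_examples() and give each tool its own llama_example value, so gpt_params_parser_init() only registers the options relevant to that tool. Below is a minimal sketch of that entry-point pattern, modelled on the llava-cli.cpp and main.cpp hunks; it assumes common/common.h from this branch, and the file names in the usage string are placeholders.

```cpp
// Sketch only: mirrors the llava-cli.cpp hunk above.
// Assumes common/common.h from this branch (gpt_params, gpt_params_parser_init,
// gpt_params_parse, LLAMA_EXAMPLE_LLAVA); file names in the usage text are placeholders.
#include "common.h"

#include <cstdio>

// Same shape as the print_usage() added to examples/main/main.cpp.
static void print_usage(int /*argc*/, char ** argv) {
    printf("\nexample usage:\n\n  %s -m model.gguf --mmproj mmproj.gguf --image img.png -p \"describe the image\"\n\n", argv[0]);
}

int main(int argc, char ** argv) {
    gpt_params params;

    // Passing LLAMA_EXAMPLE_LLAVA (instead of LLAMA_EXAMPLE_COMMON) limits the
    // registered options to the common set plus those tagged with that example,
    // e.g. --mmproj and --image.
    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
    if (!gpt_params_parse(argc, argv, params, options)) {
        return 1;
    }

    // ... model loading and generation would go here ...
    return 0;
}
```

llava-cli switches to LLAMA_EXAMPLE_LLAVA for exactly this reason: after this diff, --mmproj and --image are tagged for it, while the server no longer lists them.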
diff --git a/examples/server/README.md b/examples/server/README.md
--- a/examples/server/README.md
+++ b/examples/server/README.md
 | `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
 | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
@@ -46,7 +45,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
 | `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
 | `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
-| `-p, --prompt PROMPT` | prompt to start generation with<br/> |
+| `-p, --prompt PROMPT` | prompt to start generation with |
 | `-f, --file FNAME` | a file containing the prompt (default: none) |
 | `--in-file FNAME` | an input file (repeat to specify multiple files) |
 | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
@@ -96,13 +95,10 @@ The project is under active development, and we are [looking for feedback and co
 | `-ns, --sequences N` | number of sequences to decode (default: 1) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
-| `--mmproj FILE` | path to a multimodal projector file for LLaVA. see examples/llava/README.md |
-| `--image FILE` | path to an image file. use with multimodal models. Specify multiple times for batching |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing |
 | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
 | `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
-| `-ngld, --gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
@@ -115,7 +111,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
 | `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_MODEL) |
 | `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
-| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |
 | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
 | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
 | `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
@@ -138,7 +133,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
-| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 | `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
 | `--log-test` | Log test |
 | `--log-disable` | Log disable |
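Finally, a hedged sketch of the filtering idea the whole change relies on. This is a toy model, not the real llama_arg / gpt_params_parser_init API from common/common.h: each option carries a set of example tags (defaulting to the common set), and a tool's parser keeps only the options tagged for it, which is why rows such as `--mmproj`, `--image`, `-md, --model-draft` and `--simple-io` drop out of the server README above once they are tagged for llava, speculative or main.

```cpp
// Toy model of the example-tag filtering (NOT the real llama_arg from common/common.h):
// every option carries a set of example tags; untagged options belong to the common
// set and show up for every tool, tagged ones only where listed.
#include <cstdio>
#include <set>
#include <string>
#include <vector>

enum toy_example { TOY_EXAMPLE_COMMON, TOY_EXAMPLE_MAIN, TOY_EXAMPLE_SERVER, TOY_EXAMPLE_LLAVA };

struct toy_arg {
    std::string           flag;
    std::set<toy_example> examples = {TOY_EXAMPLE_COMMON}; // default: common, i.e. everywhere

    toy_arg & set_examples(std::set<toy_example> ex) { examples = ex; return *this; }

    bool in_example(toy_example ex) const {
        return examples.count(ex) > 0 || examples.count(TOY_EXAMPLE_COMMON) > 0;
    }
};

int main() {
    std::vector<toy_arg> options = {
        toy_arg{"--prompt"},                                      // common: listed for every tool
        toy_arg{"--mmproj"}.set_examples({TOY_EXAMPLE_LLAVA}),    // llava only
        toy_arg{"--simple-io"}.set_examples({TOY_EXAMPLE_MAIN}),  // main only
    };

    // What the server's --help (and its README table) would list after the change:
    for (const auto & opt : options) {
        if (opt.in_example(TOY_EXAMPLE_SERVER)) {
            printf("%s\n", opt.flag.c_str());  // prints only --prompt
        }
    }
    return 0;
}
```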