refine example-specific args

Xuan Son Nguyen 2024-09-06 14:05:51 +02:00
parent 53244f9c58
commit e1281d0d7a
5 changed files with 29 additions and 19 deletions


@@ -720,21 +720,21 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params]() {
             params.verbose_prompt = true;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"--no-display-prompt"},
         format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
         [&params]() {
             params.display_prompt = false;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"-co", "--color"},
         format("colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false"),
         [&params]() {
             params.use_color = true;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
     add_opt(llama_arg(
         {"-s", "--seed"}, "SEED",
         format("RNG seed (default: %d, use random seed for < 0)", params.seed),
@@ -996,7 +996,9 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
     ).set_env("LLAMA_ARG_FLASH_ATTN"));
     add_opt(llama_arg(
         {"-p", "--prompt"}, "PROMPT",
-        "prompt to start generation with\n",
+        ex == LLAMA_EXAMPLE_MAIN
+            ? "prompt to start generation with\nif -cnv is set, this will be used as system prompt"
+            : "prompt to start generation with",
         [&params](std::string value) {
             params.prompt = value;
         }
@@ -1102,7 +1104,13 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(llama_arg(
         {"-cnv", "--conversation"},
-        "run in conversation mode, does not print special tokens and suffix/prefix\n",
+        format(
+            "run in conversation mode:\n"
+            "- does not print special tokens and suffix/prefix\n"
+            "- interactive mode is also enabled\n"
+            "(default: %s)",
+            params.conversation ? "true" : "false"
+        ),
         [&params]() {
             params.conversation = true;
         }
@@ -1625,14 +1633,14 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params](std::string value) {
             params.mmproj = value;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
     add_opt(llama_arg(
         {"--image"}, "FILE",
         "path to an image file. use with multimodal models. Specify multiple times for batching",
         [&params](std::string value) {
             params.image.emplace_back(value);
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
 #ifdef GGML_USE_RPC
     add_opt(llama_arg(
         {"--rpc"}, "SERVERS",
@@ -1692,7 +1700,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
                 fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
             }
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(llama_arg(
         {"-sm", "--split-mode"}, "{none,layer,row}",
         "how to split the model across multiple GPUs, one of:\n"
@@ -1837,7 +1845,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params](std::string value) {
             params.model_draft = value;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(llama_arg(
         {"-mu", "--model-url"}, "MODEL_URL",
         "model download url (default: unused)",
@@ -2178,7 +2186,7 @@ std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example
         [&params]() {
            params.simple_io = true;
         }
-    ));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
     add_opt(llama_arg(
         {"-ld", "--logdir"}, "LOGDIR",
         "path under which to save YAML logs (no logging if unset)",


@@ -77,6 +77,7 @@ enum llama_example {
     LLAMA_EXAMPLE_SERVER,
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
+    LLAMA_EXAMPLE_LLAVA,
     LLAMA_EXAMPLE_COUNT,
 };


@@ -278,7 +278,7 @@ int main(int argc, char ** argv) {
     gpt_params params;
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
     if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
     }


@@ -40,6 +40,13 @@ static std::vector<llama_token> * g_output_tokens;
 static bool is_interacting = false;
 static bool need_insert_eot = false;
 
+static void print_usage(int, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n  text generation:     %s -m your_model.gguf -p \"I believe the meaning of life is\" -n 128\n", argv[0]);
+    printf("\n  chat (conversation): %s -m your_model.gguf -p \"You are a helpful assistant\" -cnv\n", argv[0]);
+    printf("\n");
+}
+
 static bool file_exists(const std::string & path) {
     std::ifstream f(path.c_str());
     return f.good();
@@ -131,7 +138,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN);
+    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
     if (!gpt_params_parse(argc, argv, params, options)) {
         return 1;
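
The main example now hands its own print_usage callback to the shared parser init, so -h/--help can show the text-generation and conversation invocations specific to this example. The sketch below shows that wiring in reduced form; parser_sketch, parser_init_sketch and parse_sketch are placeholder names, not the real gpt_params_parser_init / gpt_params_parse API from common.cpp:

// Reduced sketch of the per-example usage-callback wiring (placeholder names).
#include <cstdio>
#include <string>

using usage_fn = void (*)(int argc, char ** argv);

struct parser_sketch {
    usage_fn print_usage = nullptr;   // example-specific usage printer
};

static parser_sketch parser_init_sketch(usage_fn usage) {
    parser_sketch p;
    p.print_usage = usage;            // stored so the shared parser can call it on -h/--help
    return p;
}

static bool parse_sketch(int argc, char ** argv, const parser_sketch & p) {
    for (int i = 1; i < argc; i++) {
        const std::string a = argv[i];
        if (a == "-h" || a == "--help") {
            if (p.print_usage) {
                p.print_usage(argc, argv);   // delegate to the example's own usage text
            }
            return false;                    // stop normal execution after printing help
        }
    }
    return true;
}

static void my_print_usage(int, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n  %s -m your_model.gguf -p \"hello\" -n 32\n\n", argv[0]);
}

int main(int argc, char ** argv) {
    auto options = parser_init_sketch(my_print_usage);
    if (!parse_sketch(argc, argv, options)) {
        return 1;
    }
    printf("arguments parsed\n");
    return 0;
}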


@@ -25,7 +25,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--verbosity N` | set specific verbosity level (default: 0) |
 | `--verbose-prompt` | print a verbose prompt before generation (default: false) |
 | `--no-display-prompt` | don't print prompt at generation (default: false) |
-| `-co, --color` | colorise output to distinguish prompt and user input from generations (default: false) |
 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
 | `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
 | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
@@ -46,7 +45,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
 | `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
 | `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
-| `-p, --prompt PROMPT` | prompt to start generation with<br/> |
+| `-p, --prompt PROMPT` | prompt to start generation with |
 | `-f, --file FNAME` | a file containing the prompt (default: none) |
 | `--in-file FNAME` | an input file (repeat to specify multiple files) |
 | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
@@ -96,13 +95,10 @@ The project is under active development, and we are [looking for feedback and co
 | `-ns, --sequences N` | number of sequences to decode (default: 1) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
-| `--mmproj FILE` | path to a multimodal projector file for LLaVA. see examples/llava/README.md |
-| `--image FILE` | path to an image file. use with multimodal models. Specify multiple times for batching |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing |
 | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
 | `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
-| `-ngld, --gpu-layers-draft N` | number of layers to store in VRAM for the draft model |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
@@ -115,7 +111,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
 | `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_MODEL) |
 | `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
-| `-md, --model-draft FNAME` | draft model for speculative decoding (default: unused) |
 | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
 | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
 | `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
@@ -138,7 +133,6 @@ The project is under active development, and we are [looking for feedback and co
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
-| `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles |
 | `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
 | `--log-test` | Log test |
 | `--log-disable` | Log disable |