common : rework print_usage
parent eb858ce8a6
commit d89006b2aa
1 changed file with 167 additions and 175 deletions
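The rework replaces the hand-formatted printf wall (removed in the second hunk) with a table-driven approach: each flag becomes an entry in an `options` vector, tagged with the example(s) it applies to ("*", "main", "speculative", ...), and a single loop renders the table. The entry type itself is defined outside these hunks; since entries are brace-initialized with a printf-style description plus optional format arguments, it is presumably a small struct with a variadic constructor. A minimal sketch under that assumption (the name `cli_option` and the buffer size are hypothetical, not from this commit):

#include <cstdarg>
#include <cstdio>
#include <string>
#include <vector>

// hypothetical entry type; the real definition is not shown in this diff
struct cli_option {
    std::string tags; // space-separated example names, "*" = applies to all
    std::string args; // flag spelling printed in the left column
    std::string desc; // description, formatted once at construction

    cli_option(std::string tags, std::string args, const char * fmt, ...)
        : tags(std::move(tags)), args(std::move(args)) {
        va_list ap;
        va_start(ap, fmt);
        char buf[1024];
        vsnprintf(buf, sizeof(buf), fmt, ap); // resolves "(default: %d)" etc.
        va_end(ap);
        desc = buf;
    }
};

std::vector<cli_option> options; // filled by the push_back calls below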
@@ -1382,47 +1382,182 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    // TODO: filter by tags

options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
|
||||
options.push_back({ "*", " --version", "show version and build info" });
|
||||
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
|
||||
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
|
||||
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
|
||||
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
|
||||
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
|
||||
options.push_back({ "speculative", "-tbd, --threads-batch-draft N", "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
|
||||
options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
|
||||
options.push_back({ "*", "-n, --n-predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
|
||||
options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
|
||||
options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
|
||||
options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: empty)" });
|
||||
options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
|
||||
options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
|
||||
options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)" });
|
||||
options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" });
|
||||
options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n"
|
||||
options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
|
||||
options.push_back({ "*", " --version", "show version and build info" });
|
||||
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
|
||||
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
|
||||
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
|
||||
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
|
||||
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
|
||||
options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
|
||||
"number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
|
||||
options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
|
||||
options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
|
||||
options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
|
||||
options.push_back({ "*", "-n, --n-predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
|
||||
options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
|
||||
options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
|
||||
options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
|
||||
options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
|
||||
options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp });
|
||||
options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
|
||||
options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
|
||||
options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
|
||||
options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: empty)" });
|
||||
options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
|
||||
options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
|
||||
options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)" });
|
||||
options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print });
|
||||
options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" });
|
||||
options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n"
|
||||
"not supported with --interactive or other interactive options" });
options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" });
|
||||
options.push_back({ "main", "-r, --reverse-prompt PROMPT",
|
||||
"halt generation at PROMPT, return control in interactive mode\n"
|
||||
"can be specified more than once for multiple prompts" });
options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
|
||||
options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
|
||||
options.push_back({ "main", "-ins, --instruct", "run in instruction mode (use with Alpaca models) (default: %s)", params.instruct ? "true" : "false" });
|
||||
options.push_back({ "main", "-cml, --chatml", "run in chatml mode (use with ChatML-compatible models) (default: %s)", params.chatml ? "true" : "false" });
|
||||
options.push_back({ "main", " --cfg-negative-prompt PROMPT",
|
||||
"negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
|
||||
options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
|
||||
"negative prompt file to use for guidance" });
|
||||
options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
|
||||
options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
|
||||
options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
|
||||
options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
|
||||
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
|
||||
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
|
||||
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
|
||||
|
||||
options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
|
||||
"(default: %s)", sampler_type_names.c_str() });
|
||||
options.push_back({ "*", " --sampling-seq SEQUENCE",
|
||||
"simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
|
||||
options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k });
|
||||
options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
|
||||
options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
|
||||
options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
|
||||
options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p });
|
||||
options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n });
|
||||
options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
|
||||
options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
|
||||
options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq });
|
||||
options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range });
|
||||
options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent });
|
||||
options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n"
|
||||
"Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
|
||||
"(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
|
||||
options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
|
||||
options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
|
||||
options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
|
||||
"i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
|
||||
"or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
|
||||
options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
|
||||
options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
|
||||
options.push_back({ "*", "-j, --json-schema SCHEMA",
|
||||
"JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n"
|
||||
"For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
|
||||
|
||||
options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
|
||||
"RoPE frequency scaling method, defaults to linear unless specified by the model" });
|
||||
options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" });
|
||||
options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" });
|
||||
options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" });
|
||||
options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx });
|
||||
options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor });
|
||||
options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor });
|
||||
options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow });
|
||||
options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast });
|
||||
options.push_back({ "embedding", " --pooling {none,mean,cls}",
|
||||
"pooling type for embeddings, use model default if unspecified" });
|
||||
options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
|
||||
|
||||
options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" });
|
||||
options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" });
|
||||
options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks });
|
||||
options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" });
|
||||
options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks });
|
||||
options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" });
|
||||
options.push_back({ "perplexity", " --multiple-choice-tasks N",
|
||||
"number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks });
|
||||
options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" });
|
||||
|
||||
options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
|
||||
options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
|
||||
options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
|
||||
|
||||
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
|
||||
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
|
||||
|
||||
    if (llama_supports_mlock()) {
        options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
    }
    if (llama_supports_mmap()) {
        options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
    }
    options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
        "  - distribute: spread execution evenly over all nodes\n"
        "  - isolate: only spawn threads on CPUs on the node that execution started on\n"
        "  - numactl: use the CPU map provided by numactl\n"
        "if run without this previously, it is recommended to drop the system page cache before using this\n"
        "see https://github.com/ggerganov/llama.cpp/issues/1437" });

    if (llama_supports_gpu_offload()) {
        options.push_back({ "*", "-ngl, --n-gpu-layers N", "number of layers to store in VRAM" });
        options.push_back({ "*", "-ngld, --n-gpu-layers-draft N", "number of layers to store in VRAM for the draft model" });
        options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", "how to split the model across multiple GPUs, one of:\n"
            "  - none: use one GPU only\n"
            "  - layer (default): split layers and KV across GPUs\n"
            "  - row: split rows across GPUs" });
        options.push_back({ "*", "-ts, --tensor-split SPLIT", "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
        options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
            "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
    }

options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
|
||||
options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
|
||||
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
|
||||
options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n });
|
||||
options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w });
|
||||
options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" });
|
||||
options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" });
|
||||
options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
|
||||
options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
|
||||
options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" });
|
||||
options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
|
||||
options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
|
||||
options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
|
||||
options.push_back({ "*", " --control-vector FNAME", "add a control vector" });
|
||||
options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
|
||||
"add a control vector with user defined scaling SCALE" });
|
||||
options.push_back({ "*", " --control-vector-layer-range START END",
|
||||
"layer range to apply the control vector(s) to, start and end inclusive" });
|
||||
options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
|
||||
options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
|
||||
options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
|
||||
options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
|
||||
options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
|
||||
options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" });
|
||||
options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
|
||||
"path to static lookup cache to use for lookup decoding (not updated by generation)" });
|
||||
options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME",
|
||||
"path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
|
||||
options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
|
||||
"advanced option to override model metadata by key. may be specified multiple times.\n"
|
||||
"types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
|
||||
options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" });
|
||||
|
||||
printf("\n");
|
||||
printf("usage: %s [options]\n", argv[0]);
|
||||
printf("\n");
|
||||
printf("options:\n\n");
|
||||
|
||||
for (const auto & o : options) {
|
||||
printf(" %-32s", o.args.c_str());
|
||||
if (o.args.length() > 34) {
|
||||
if (o.args.length() > 30) {
|
||||
printf("\n%34s", "");
|
||||
}
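The first hunk is truncated here. From the surviving context at the top of the next hunk (`printf("%s\n", desc.substr(start).c_str());`), the elided loop body evidently walks the description string and aligns wrapped lines with the argument column: two leading spaces plus the 32-wide field put descriptions at column 34, matching the `%34s` padding above. A sketch of that step, assuming exactly that behavior (not verbatim from the commit):

        // split the description on embedded '\n' and pad each
        // continuation line to the 34-character argument column
        const std::string desc = o.desc;
        size_t start = 0;
        size_t end   = desc.find('\n');
        while (end != std::string::npos) {
            printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
            start = end + 1;
            end   = desc.find('\n', start);
        }

The context line opening the second hunk then prints the final segment and closes the loop.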
@@ -1439,149 +1574,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
printf("%s\n", desc.substr(start).c_str());
|
||||
}
|
||||
|
||||
printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
|
||||
printf(" (default: %s)\n", sampler_type_names.c_str());
|
||||
printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
|
||||
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
|
||||
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
|
||||
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
|
||||
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
|
||||
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
|
||||
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
|
||||
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
|
||||
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
|
||||
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
|
||||
printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range);
|
||||
printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent);
|
||||
printf(" --mirostat N use Mirostat sampling.\n");
|
||||
printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
|
||||
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
|
||||
printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
|
||||
printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
|
||||
printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
|
||||
printf(" modifies the likelihood of token appearing in the completion,\n");
|
||||
printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
|
||||
printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
|
||||
printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
|
||||
printf(" --grammar-file FNAME file to read grammar from\n");
|
||||
printf(" -j SCHEMA, --json-schema SCHEMA\n");
|
||||
printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
|
||||
printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
|
||||
printf(" --cfg-negative-prompt PROMPT\n");
|
||||
printf(" negative prompt to use for guidance. (default: empty)\n");
|
||||
printf(" --cfg-negative-prompt-file FNAME\n");
|
||||
printf(" negative prompt file to use for guidance. (default: empty)\n");
|
||||
printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
|
||||
printf(" --rope-scaling {none,linear,yarn}\n");
|
||||
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
|
||||
printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n");
|
||||
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
|
||||
printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n");
|
||||
printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
|
||||
printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
|
||||
printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
|
||||
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
|
||||
printf(" --pooling {none,mean,cls}\n");
|
||||
printf(" pooling type for embeddings, use model default if unspecified\n");
|
||||
printf(" -dt N, --defrag-thold N\n");
|
||||
printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
|
||||
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
||||
printf(" --penalize-nl penalize newline tokens\n");
|
||||
printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
|
||||
printf(" --all-logits return logits for all tokens in the batch (default: disabled)\n");
|
||||
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
||||
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
||||
printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
|
||||
printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
|
||||
printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
|
||||
printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
|
||||
printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n");
|
||||
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
|
||||
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
||||
printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
|
||||
printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
|
||||
printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
|
||||
printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
|
||||
printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
|
||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
|
||||
printf(" --image IMAGE_FILE path to an image file. use with multimodal models. Specify multiple times for batching\n");
|
||||
if (llama_supports_mlock()) {
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
if (llama_supports_mmap()) {
|
||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
|
||||
printf(" - distribute: spread execution evenly over all nodes\n");
|
||||
printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
|
||||
printf(" - numactl: use the CPU map provided by numactl\n");
|
||||
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
|
||||
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
|
||||
    if (llama_supports_gpu_offload()) {
        printf(" -ngl N, --n-gpu-layers N\n");
        printf(" number of layers to store in VRAM\n");
        printf(" -ngld N, --n-gpu-layers-draft N\n");
        printf(" number of layers to store in VRAM for the draft model\n");
        printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
        printf(" how to split the model across multiple GPUs, one of:\n");
        printf(" - none: use one GPU only\n");
        printf(" - layer (default): split layers and KV across GPUs\n");
        printf(" - row: split rows across GPUs\n");
        printf(" -ts SPLIT, --tensor-split SPLIT\n");
        printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
        printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
        printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
    }
    printf(" --rpc SERVERS comma separated list of RPC servers\n");
    printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
    printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
    printf(" -gan N, --grp-attn-n N\n");
    printf(" group-attention factor (default: %d)\n", params.grp_attn_n);
    printf(" -gaw N, --grp-attn-w N\n");
    printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
    printf(" -dkvc, --dump-kv-cache\n");
    printf(" verbose print of the KV cache\n");
    printf(" -nkvo, --no-kv-offload\n");
    printf(" disable KV offload\n");
    printf(" -ctk TYPE, --cache-type-k TYPE\n");
    printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
    printf(" -ctv TYPE, --cache-type-v TYPE\n");
    printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
    printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
    printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
    printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
    printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
    printf(" --control-vector FNAME\n");
    printf(" add a control vector\n");
    printf(" --control-vector-scaled FNAME S\n");
    printf(" add a control vector with user defined scaling S\n");
    printf(" --control-vector-layer-range START END\n");
    printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
    printf(" -m FNAME, --model FNAME\n");
    printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
    printf(" -md FNAME, --model-draft FNAME\n");
    printf(" draft model for speculative decoding (default: unused)\n");
    printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
    printf(" model download url (default: unused)\n");
    printf(" -hfr REPO, --hf-repo REPO\n");
    printf(" Hugging Face model repository (default: unused)\n");
    printf(" -hff FILE, --hf-file FILE\n");
    printf(" Hugging Face model file (default: unused)\n");
    printf(" -ld LOGDIR, --logdir LOGDIR\n");
    printf(" path under which to save YAML logs (no logging if unset)\n");
    printf(" -lcs FNAME, --lookup-cache-static FNAME\n");
    printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n");
    printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n");
    printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
    printf(" --override-kv KEY=TYPE:VALUE\n");
    printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
    printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf(" -ptc N, --print-token-count N\n");
    printf(" print token count every N tokens (default: %d)\n", params.n_print);
    printf(" --check-tensors check model tensor data for invalid values\n");
    printf("\n");

#ifndef LOG_DISABLE_LOGS