Update contextual help

parent 9c5d6f0ef6
commit 982c908984

5 changed files with 239 additions and 136 deletions
@@ -621,6 +621,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {

    return true;
}

// Some items were missing from this help list, so the wording needs checking (all were inserted at the end, so reposition too):
// --embedding, --beams, --ppl-stride, --ppl-output-type, memory-f32, no-mmap, mlock, use-color, nprobs
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("usage: %s [options]\n", argv[0]);
    printf("\n");
@@ -667,7 +669,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {

    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
    printf("  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
    printf("  -l T, --logit-bias T  T = TOKEN_ID(plus/minus)BIAS\n");
    printf("                        modifies the likelihood of token appearing in the completion,\n");
    printf("                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
    printf("                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
@@ -682,7 +684,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {

    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
    printf("  --rope-freq-scale N   RoPE frequency linear scaling factor (default: loaded from model)\n");
    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    printf("  --no-penalize-nl      do not penalize newline token\n");
    printf("  --no-penalize-nl      do not penalize newline token (default is DO penalize nl token)\n");
    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
    printf("  --temp N              temperature (default: %.1f)\n", (double)params.temp);
@@ -729,6 +731,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {

    printf("                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
    printf("  -ld LOGDIR, --logdir LOGDIR\n");
    printf("                        path under which to save YAML logs (no logging if unset)\n");
    printf("  --ppl-stride          stride for ppl calcs. 0 (default): the pre-existing approach will be used.\n");
    printf("  --ppl-output-type     0 (default): ppl output as usual, 1: ppl output num_tokens, one per line\n");
    printf("  --embedding           0 (default): get only sentence embedding\n");
    printf("  --beams N             0 (default): if non-zero use beam search of given width N.\n");
    printf("  --memory-f32          0 (default): if true (= 1) disable f16 memory.\n");
    printf("  --no-mmap             0 (default): if true use mmap for faster loads.\n");
    printf("  --mlock               0 (default): if true keep model in memory.\n");
    printf("  --use-color           0 (default): use color to distinguish generations from inputs\n");
    printf("  --nprobs N            if > 0 output the probabilities of the top N tokens\n");
    printf("\n");
}
@@ -75,6 +75,7 @@ struct gpt_params {

    std::string cfg_negative_prompt;     // string to help guidance
    float       cfg_scale = 1.f;         // How strong is guidance

    std::string help        = "";        // universal help parameter
    std::string model       = "models/7B/ggml-model-f16.gguf"; // model path
    std::string model_draft = "";        // draft model for speculative decoding
    std::string model_alias = "unknown"; // model alias
@@ -6,7 +6,22 @@ import collections

import re
import read_common_h

# regenerate the help file - usually 'help_list.txt', hence the default - in case the C++ source file has changed
def update_file(file_from, file_to = "help_list.txt"):
    # Open the file_from file
    with open(file_from, "r") as file:
        lines = file.readlines()

    # Find lines starting with "printf(" and ending with ");" (assumes file_from is written in C/C++)
    pattern = r'printf\("\s(.*?)\);'
    matched_lines = [re.search(pattern, line).group(1) for line in lines if re.search(pattern, line)]

    # Save matched lines to file_to
    with open(file_to, "w") as file:
        for line in matched_lines:
            file.write(line + '\n')

# helper fn to make the hyphenated words in a file snake-case for searching
def replace_dashes_with_underscores(filename):
    with open(filename, 'r') as file:
        content = file.read()
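A quick sanity check of the harvesting pattern above (a sketch, not part of the commit): the lazy capture stops at the first ");", so everything inside the printf call, including any value arguments, is kept, while the trailing ");" itself is dropped - which is why the regenerated help_list.txt lines further down no longer end in ");".

# sketch: what update_file's pattern captures from one common.cpp help line
import re
pattern = r'printf\("\s(.*?)\);'
line = '    printf("  --temp N   temperature (default: %.1f)\\n", (double)params.temp);'
match = re.search(pattern, line)
# \s consumes the first space inside the quotes, so the capture starts one character in:
print(match.group(1))  # ' --temp N   temperature (default: %.1f)\n", (double)params.temp'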
@@ -17,6 +32,13 @@ def replace_dashes_with_underscores(filename):

    with open(filename, 'w') as file:
        file.write(replaced_content)

# helper fn to make the underscored words in a string hyphenated for print
def replace_underscores_with_dashes(parameter):
    # Match '_' surrounded by word characters on both sides and replace with '-'
    return re.sub(r'(\w)_(\w)', r'\1-\2', parameter)


# find all instances of "params." in the *.cpp files in a directory
def find_arguments(directory):
    arguments = {}
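One subtlety of the helper above worth noting (an observation, not a change to the commit): re.sub resumes scanning after the end of each match, so underscores separated by only a single character are half-converted. The parameter names used here all have longer segments, so it works in practice.

# sketch: overlapping-underscore behaviour of replace_underscores_with_dashes
import re
print(re.sub(r'(\w)_(\w)', r'\1-\2', 'a_b_c'))             # -> 'a-b_c' (second '_' survives)
print(re.sub(r'(\w)_(\w)', r'\1-\2', 'prompt_cache_all'))  # -> 'prompt-cache-all' (segments are longer, so both convert)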
@@ -28,21 +50,21 @@ def find_arguments(directory):

        with open(filepath, 'r') as file:
            content = file.read()

        # Search for the expression "params." excluding prefixes and read the attribute without trailing detritus
        # Search for the expression "params." or "params->" excluding prefixes and read the attribute without trailing detritus
        # matches = re.findall(r'(?:^|\s)params\.(.*)(?=[\). <,;}]|\Z)', content)
        matches = set(re.findall(r'(?:^|\b)params\.([a-zA-Z_0-9]*)(?=[\). <,;}]|\Z)', content))
        # Remove duplicates from matches list
        # arguments_list = list(set([match.strip() for match in matches]))
        matches = set(re.findall(r'(?:^|\b)params[->\.]([a-zA-Z_0-9]*)(?=[\). <,;}]|\Z)', content))

        # Add the matches to the dictionary
        arguments[filepath] = matches

    return arguments

# output a list of the params.attributes for each file
def output_results(result):
    sorted_result = collections.OrderedDict(sorted(result.items()))
    all_of_them = set()
    for filename, arguments in sorted_result.items():
        arguments.add("help")
        print(f"Filename: \033[32m{filename.split('/')[-1]}\033[0m, arguments: {arguments}\n")
        for argument in arguments:
            if argument not in all_of_them:
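A quick check of the new pattern (an observation, not a change to the commit): "[->\.]" is a character class, so it matches a single '-', '>' or '.', not the two-character arrow, and "params->n_ctx" still yields no capture because the lookahead fails on the '>'. An alternation handles both forms, as this sketch over hypothetical sample text shows:

# sketch: matching both "params." and "params->" with an alternation
import re
text = 'params.n_ctx; params->n_batch;'
print(re.findall(r'(?:^|\b)params[->\.]([a-zA-Z_0-9]*)(?=[\). <,;}]|\Z)', text))    # -> ['n_ctx'] only
print(re.findall(r'(?:^|\b)params(?:->|\.)([a-zA-Z_0-9]*)(?=[\). <,;}]|\Z)', text)) # -> ['n_ctx', 'n_batch']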
@@ -50,6 +72,7 @@ def output_results(result):

    print(f"\033[32mAll of them: \033[0m{sorted(all_of_them)}.")
    return sorted_result

# put all the words after "//" in a dict back together with spaces
def concatenate(v):
    concatenated_element = ""
    for i, element in enumerate(v):
@@ -57,24 +80,78 @@ def concatenate(v):

            concatenated_element = " ".join(v[i:])
    return concatenated_element

def title_print(filename):
    title = filename.split('/')[-1]
    print("\n\n"+"#"*(10+len(title)))
    print(f"Filename: \033[32m{title}\033[0m")
    print("#"*(10+len(title)))

def substitution_list(parameters):
    # store otherwise-untrapped parameters as identity mappings in case we need to change them later
    sub_dict = {"n_threads": "threads",
                "n_ctx": "ctx_size",
                "n_draft": "draft",
                "n_threads_batch": "threads_batch",
                "n_chunks": "chunks",
                "n_batch": "batch_size",
                "n_sequences": "sequences",
                "n_parallel": "parallel",
                "n_beams": "beams",
                "n_keep": "keep",
                "n_probs": "nprobs",
                "path_prompt_cache": "prompt_cache",
                "input_prefix": "in_prefix",
                "input_suffix": "in_suffix",
                "input_prefix_bos": "in_prefix_bos",
                "antiprompt": "reverse_prompt",
                "mul_mat_q": "no_mul_mat_q",
                "use_mmap": "no_mmap",
                "use_mlock": "mlock",
                "model_alias": "alias",
                "tfs_z": "tfs",
                "use_color": "color",
                "logit_bias": "logit_bias",
                "ignore_eos": "ignore_eos",
                "mirostat_tau": "mirostat_ent",
                "mirostat_eta": "mirostat_lr",
                "penalize_nl": "no_penalize_nl",
                "typical_p": "typical",
                "mem_size": "mem_size",
                "mem_buffer": "mem_buffer",
                "no_alloc": "no_alloc"
                }
    new_parameters = []
    for parameter in parameters:
        if parameter in sub_dict:
            # we need both for future reference
            new_parameters.append(parameter)
            new_parameters.append(sub_dict[parameter])
        else:
            new_parameters.append(parameter)
    return new_parameters

# output the lines of the help file
def find_parameters(file, sorted_result):
    with open(file, "r") as helpfile:
        lines = helpfile.read().split("\n")
    for filename, arguments in sorted_result.items():
        # we try to fix up some variant labelling in help_file.txt
        arguments = substitution_list(arguments)
        parameters = []
        for line in lines:
            for argument in arguments:
                # building pattern to avoid spurious matches
                pattern = r"(?:--{}\s)|(?:params\.{}[\s.,();])".format(argument, argument.split('n_')[-1])
                if re.search(pattern, line):
                    # pattern = r"(?:--{}\s)|(?:params\.{}[\s.,\.();])".format(argument, argument.split('n_')[-1])
                    pattern = r"(?:--{}\s)|(?:params\.{}(?=[\s.,\.\(\);]|\.+\w))".format(argument, argument.split('n_')[-1])
                    # pattern = r"(?<=params\.)\w+(?=\.\w+|\.|,|;|\}|\{|\(|\)|\.)"
                    # bit of a hack to exclude --attributes at the end of help comment lines
                    if re.search(pattern, line[:50]):
                        parameters.append(line)

        all_parameters = set(parameters)
        file = filename.split('/')[-1]
        print("\n\n"+"#"*(10+len(file)))
        print(f"Filename: \033[32m{file}\033[0m")
        print("#"*(10+len(file)))
        print(f"\n\n command-line arguments available and gpt-params functions implemented (TODO: multi-line helps NEED SOME WORK):\n")

        title_print(filename)
        print(f"\nCommand-line arguments available and gpt-params functions implemented (TODO: multi-line helps NEED SOME WORK):\n")

        if not all_parameters:
            print(f" \033[032mNone\033[0m\n")
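To make the cross-referencing above concrete (a sketch with made-up input, not part of the commit): substitution_list keeps the original gpt_params attribute name and appends its help-flag spelling, so the later searches can hit either form.

# sketch: substitution_list keeps both spellings
sub_dict = {"n_ctx": "ctx_size"}   # excerpt of the full table above
parameters = ["n_ctx", "temp"]
new_parameters = []
for p in parameters:
    new_parameters.append(p)
    if p in sub_dict:
        new_parameters.append(sub_dict[p])
print(new_parameters)              # -> ['n_ctx', 'ctx_size', 'temp']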
@@ -83,11 +160,16 @@ def find_parameters(file, sorted_result):

        else:
            help_count = 0
            for parameter in all_parameters:
                # reverse the hyphen/underscore pattern just for printing
                replaced_param = replace_underscores_with_dashes(parameter)
                if not parameter.startswith(" "):
                    help_count += 1
                    print(f"{help_count:>2} help: \033[33m{parameter:<30}\033[0m")
                    print(f"{help_count:>2} help: \033[33m{replaced_param:<30}\033[0m")
                else:
                    print(f"   help: \033[33m{replaced_param:<30}\033[0m")

        # now do it the new way
        print("\nNow we extract the original gpt_params definition and defaults for implemented arguments:\n")
        print("\nNow we extract the original gpt_params definition from common.h with the defaults for implemented arguments:\n")
        gpt_count = 0
        for k,v in read_common_h.parameters.items():
            if not read_common_h.parameters.items():
@@ -99,14 +181,14 @@ def find_parameters(file, sorted_result):

            print(f"{gpt_count:>2} gpt_param: \033[32m{k:>19}; \033[34mrole: \033[33m{concatenated_element:<60}\033[0m; \033[34mdefault: \033[30m{v[1]:<10}\033[0m ")

        # searching the other way round is quicker:
        print("\nSearching the other way round is quicker:\n")
        print("\nSearching the other way round is more efficient:\n")
        key_count = 0
        for argument in arguments:
        for argument in set(arguments):
            if argument in read_common_h.parameters:
                key_count += 1
                print(f"{key_count:>2} key: {argument:>25}; role: {concatenate(read_common_h.parameters[argument]):<60}; default: {read_common_h.parameters[argument][1]:<10}")
        if help_count == gpt_count and gpt_count == key_count:
            print("\n\033[032mNo unresolved help-list incompatibilities with this app.\033[0m")
            print(f"\n\033[032mNo unresolved help-list incompatibilities with \033[33m{filename.split('/')[-1]}\033[0m")
        else:
            print("\n\033[031mThis app requires some attention regarding help-function consistency.\033[0m")
@@ -114,13 +196,17 @@ def find_parameters(file, sorted_result):

directory = '/Users/edsilm2/llama.cpp/examples'

if __name__ == '__main__':

    # update the source help file from C++ source (this works exactly as required)
    update_file("common/common.cpp", "help_list.txt")

    # get the parameters from the common.h file utility we import
    print(read_common_h.parameters)
    # So now we've got the gpt_parameters in this parameters dict

    # First we alter all the hyphenated help words in help-file.txt to underscores
    # replace_dashes_with_underscores('help_list.txt')
    # This above may no longer be needed
    # we later reverse these changes before printing the help lines
    replace_dashes_with_underscores('help_list.txt')

    print("\n####################### find parameters #################################")
    # Call the find function to collect all the params.attributes and output the result
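The examples directory above is hard-coded to one machine's path; a small optional tweak (an assumption on my part, not in the commit) would let callers override it:

# sketch: let the hard-coded examples directory be overridden from the command line
import sys
directory = sys.argv[1] if len(sys.argv) > 1 else '/Users/edsilm2/llama.cpp/examples'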
@@ -7,6 +7,9 @@ with open('common/common.h', 'r') as file:

    lines = file.read().split('\n')

parameters = {}
# we add the logit_bias parameter which otherwise is not found
parameters['logit_bias'] = ['logit_bias', '0', '//', 'way', 'to', 'alter', 'prob', 'of', 'particular', 'words']

inside = False
for line in lines:
    # non_whitespace_elements = re.findall(r"\S+", line)
@@ -18,17 +21,19 @@ for line in lines:

        # note: cannot use nwe[0] because types do not generate unique keys and so overwrite
        # here we deliberately add back the key so we can make a manual change when it is different
        parameters[non_whitespace_elements[1]] = non_whitespace_elements[1:]
        # remove spurious entry caused by eccentric status of logit_bias
        if "float>" in parameters and parameters["float>"][1] == 'logit_bias':
            del parameters["float>"]

    # this is a bit of a hack to terminate the harvest
    if len(non_whitespace_elements) > 2 and non_whitespace_elements[1] == "infill":
        inside = False
        break

for k, v in parameters.items():
    print(f"key: {k:<20}; values: {v}")

    concatenated_element = ""
    for i, element in enumerate(v):
        if element == "//":
            concatenated_element = " ".join(v[i:])
            # break
    print(" "*10 + f"parameter: \033[32m{k:>40} \033[34mdefault: \033[30m{v[1]:>5} \033[34mcomment: \033[33m{concatenated_element:80}\033[0m")

    # this is a bit of a hack to terminate the harvest
    if len(non_whitespace_elements) > 2 and non_whitespace_elements[1] == "infill":
        inside = False
        break
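The keying choice above matters because several struct members share a type. An illustrative sketch with a made-up common.h-style line (not part of the commit):

# sketch: why nwe[1] (the name), not nwe[0] (the type), is used as the dict key
import re
line = "    int32_t n_ctx = 512; // context size"   # hypothetical common.h-style line
nwe = re.findall(r"\S+", line)
print(nwe[0])    # 'int32_t' - shared by many members, so it would overwrite earlier keys
print(nwe[1])    # 'n_ctx'   - unique, so it is used as the key
print(nwe[1:])   # ['n_ctx', '=', '512;', '//', 'context', 'size']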
help_list.txt (208 changed lines)
@@ -1,104 +1,104 @@

-h, --help            show this help message and exit
-i, --interactive     run in interactive mode
--interactive_first   run in interactive mode and wait for input right away
-ins, --instruct      run in instruction mode (use with Alpaca models)
--multiline_input     allows you to write or paste multiple lines without ending each in '\\'
-r PROMPT, --reverse_prompt PROMPT
                      halt generation at PROMPT, return control in interactive mode
                      (can be specified more than once for multiple prompts).
--color               colorise output to distinguish prompt and user input from generations
-s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)
-t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
-tb N, --threads_batch N
                      number of threads to use during batch and prompt processing (default: same as --threads)
-p PROMPT, --prompt PROMPT
                      prompt to start generation with (default: empty)
-e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)
--prompt_cache FNAME  file to cache prompt state for faster startup (default: none)
--prompt_cache_all    if specified, saves user input and generations to cache as well.
                      not supported with --interactive or other interactive options
--prompt_cache_ro     if specified, uses the prompt cache but does not update it.
--random_prompt       start with a randomized prompt.
--in_prefix_bos       prefix BOS to user inputs, preceding the `--in_prefix` string
--in_prefix STRING    string to prefix user inputs with (default: empty)
--in_suffix STRING    string to suffix after user inputs with (default: empty)
-f FNAME, --file FNAME
                      prompt file to start generation.
-n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
-c N, --ctx_size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
-b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
--top_k N             top_k sampling (default: %d, 0 = disabled)\n", params.top_k);
--top_p N             top_p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
--tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
--typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
--repeat_last_n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
--repeat_penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
--presence_penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
--frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
--mirostat N          use Mirostat sampling.
                      Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
                      (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
--mirostat_lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
--mirostat_ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
-l TOKEN_ID(+/-)BIAS, --logit_bias TOKEN_ID(+/-)BIAS
                      modifies the likelihood of token appearing in the completion,
                      i.e. `--logit_bias 15043+1` to increase likelihood of token ' Hello',
                      or `--logit_bias 15043_1` to decrease likelihood of token ' Hello'
--grammar GRAMMAR     BNF_like grammar to constrain generations (see samples in grammars/ dir)
--grammar_file FNAME  file to read grammar from
--cfg_negative_prompt PROMPT
                      negative prompt to use for guidance. (default: empty)
--cfg_negative_prompt_file FNAME
                      negative prompt file to use for guidance. (default: empty)
--cfg_scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
--rope_scale N        RoPE context linear scaling factor, inverse of --rope_freq_scale
--rope_freq_base N    RoPE base frequency, used by NTK_aware scaling (default: loaded from model)
--rope_freq_scale N   RoPE frequency linear scaling factor (default: loaded from model)
--ignore_eos          ignore end of stream token and continue generating (implies --logit_bias 2_inf)
--no_penalize_nl      do not penalize newline token
--memory_f32          use f32 instead of f16 for memory key+value (default: disabled)
                      not recommended: doubles context memory required and no measurable increase in quality
--temp N              temperature (default: %.1f)\n", (double)params.temp);
--logits_all          return logits for all tokens in the batch (default: disabled)
--hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f
--hellaswag_tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
--keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
--draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
--chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
-np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel);
-ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
-cb, --cont_batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)
if (llama_mlock_supported()) {
--mlock               force system to keep model in RAM rather than swapping or compressing
}
if (llama_mmap_supported()) {
--no_mmap             do not memory_map model (slower load but may reduce pageouts if not using mlock)
}
--numa                attempt optimizations that help on some NUMA systems
                      if run without this previously, it is recommended to drop the system page cache before using this
                      see https://github.com/ggerganov/llama.cpp/issues/1437
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-ngl N, --n_gpu_layers N
                      number of layers to store in VRAM
-ngld N, --n_gpu_layers_draft N
                      number of layers to store in VRAM for the draft model
-ts SPLIT --tensor_split SPLIT
                      how to split tensors across multiple GPUs, comma_separated list of proportions, e.g. 3,1
-mg i, --main_gpu i   the GPU to use for scratch and small tensors
#ifdef GGML_USE_CUBLAS
-nommq, --no_mul_mat_q
                      use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.
                      Not recommended since this is both slower and uses more VRAM.
#endif // GGML_USE_CUBLAS
#endif
--verbose_prompt      print prompt before generation
fprintf(stderr, " --simple_io use basic IO for better compatibility in subprocesses and limited consoles
--lora FNAME          apply LoRA adapter (implies --no_mmap)
--lora_scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no_mmap)
--lora_base FNAME     optional model to use as a base for the layers modified by the LoRA adapter
-m FNAME, --model FNAME
                      model path (default: %s)\n", params.model.c_str());
-md FNAME, --model_draft FNAME
                      draft model for speculative decoding (default: %s)\n", params.model.c_str());
-ld LOGDIR, --logdir LOGDIR
                      path under which to save YAML logs (no logging if unset)
-h, --help            show this help message and exit\n"
-i, --interactive     run in interactive mode\n"
--interactive_first   run in interactive mode and wait for input right away\n"
-ins, --instruct      run in instruction mode (use with Alpaca models)\n"
--multiline_input     allows you to write or paste multiple lines without ending each in '\\'\n"
-r PROMPT, --reverse_prompt PROMPT\n"
                      halt generation at PROMPT, return control in interactive mode\n"
                      (can be specified more than once for multiple prompts).\n"
--color               colorise output to distinguish prompt and user input from generations\n"
-s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n"
-t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads
-tb N, --threads_batch N\n"
                      number of threads to use during batch and prompt processing (default: same as --threads)\n"
-p PROMPT, --prompt PROMPT\n"
                      prompt to start generation with (default: empty)\n"
-e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"
--prompt_cache FNAME  file to cache prompt state for faster startup (default: none)\n"
--prompt_cache_all    if specified, saves user input and generations to cache as well.\n"
                      not supported with --interactive or other interactive options\n"
--prompt_cache_ro     if specified, uses the prompt cache but does not update it.\n"
--random_prompt       start with a randomized prompt.\n"
--in_prefix_bos       prefix BOS to user inputs, preceding the `--in_prefix` string\n"
--in_prefix STRING    string to prefix user inputs with (default: empty)\n"
--in_suffix STRING    string to suffix after user inputs with (default: empty)\n"
-f FNAME, --file FNAME\n"
                      prompt file to start generation.\n"
-n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict
-c N, --ctx_size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx
-b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch
--top_k N             top_k sampling (default: %d, 0 = disabled)\n", params.top_k
--top_p N             top_p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p
--tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z
--typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p
--repeat_last_n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n
--repeat_penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty
--presence_penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty
--frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty
--mirostat N          use Mirostat sampling.\n"
                      Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
                      (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat
--mirostat_lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta
--mirostat_ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau
-l T, --logit_bias T  T = TOKEN_ID(plus/minus)BIAS\n"
                      modifies the likelihood of token appearing in the completion,\n"
                      i.e. `--logit_bias 15043+1` to increase likelihood of token ' Hello',\n"
                      or `--logit_bias 15043_1` to decrease likelihood of token ' Hello'\n"
--grammar GRAMMAR     BNF_like grammar to constrain generations (see samples in grammars/ dir)\n"
--grammar_file FNAME  file to read grammar from\n"
--cfg_negative_prompt PROMPT\n"
                      negative prompt to use for guidance. (default: empty)\n"
--cfg_negative_prompt_file FNAME\n"
                      negative prompt file to use for guidance. (default: empty)\n"
--cfg_scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale
--rope_scale N        RoPE context linear scaling factor, inverse of --rope_freq_scale\n"
--rope_freq_base N    RoPE base frequency, used by NTK_aware scaling (default: loaded from model)\n"
--rope_freq_scale N   RoPE frequency linear scaling factor (default: loaded from model)\n"
--ignore_eos          ignore end of stream token and continue generating (implies --logit_bias 2_inf)\n"
--no_penalize_nl      do not penalize newline token (default is DO penalize nl token)\n"
--memory_f32          use f32 instead of f16 for memory key+value (default: disabled)\n"
                      not recommended: doubles context memory required and no measurable increase in quality\n"
--temp N              temperature (default: %.1f)\n", (double)params.temp
--logits_all          return logits for all tokens in the batch (default: disabled)\n"
--hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n"
--hellaswag_tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks
--keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep
--draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft
--chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks
-np N, --parallel N   number of parallel sequences to decode (default: %d)\n", params.n_parallel
-ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences
-cb, --cont_batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"
--mlock               force system to keep model in RAM rather than swapping or compressing\n"
--no_mmap             do not memory_map model (slower load but may reduce pageouts if not using mlock)\n"
--numa                attempt optimizations that help on some NUMA systems\n"
                      if run without this previously, it is recommended to drop the system page cache before using this\n"
                      see https://github.com/ggerganov/llama.cpp/issues/1437\n"
-ngl N, --n_gpu_layers N\n"
                      number of layers to store in VRAM\n"
-ngld N, --n_gpu_layers_draft N\n"
                      number of layers to store in VRAM for the draft model\n"
-ts SPLIT --tensor_split SPLIT\n"
                      how to split tensors across multiple GPUs, comma_separated list of proportions, e.g. 3,1\n"
-mg i, --main_gpu i   the GPU to use for scratch and small tensors\n"
-nommq, --no_mul_mat_q\n"
                      use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"
                      Not recommended since this is both slower and uses more VRAM.\n"
--verbose_prompt      print prompt before generation\n"
--lora FNAME          apply LoRA adapter (implies --no_mmap)\n"
--lora_scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no_mmap)\n"
--lora_base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n"
-m FNAME, --model FNAME\n"
                      model path (default: %s)\n", params.model.c_str()
-md FNAME, --model_draft FNAME\n"
                      draft model for speculative decoding (default: %s)\n", params.model.c_str()
-ld LOGDIR, --logdir LOGDIR\n"
                      path under which to save YAML logs (no logging if unset)\n"
--ppl_stride          stride for ppl calcs. 0 (default): the pre_existing approach will be used.\n"
--ppl_output_type     0 (default): ppl output as usual, 1: ppl output num_tokens, one per line\n"
--embedding           0 (default): get only sentence embedding\n"
--beams N             0 (default): if non_zero use beam search of given width N.\n"
--memory_f32          0 (default): if true (= 1) disable f16 memory.\n"
--no_mmap             0 (default): if true use mmap for faster loads.\n"
--mlock               0 (default): if true keep model in memory.\n"
--use_color           0 (default): use color to distinguish generations from inputs\n"
--nprobs N            if > 0 output the probabilities of the top N tokens\n"