Update contextual help

pudepiedj 2023-10-08 22:26:13 +01:00
parent 9c5d6f0ef6
commit 982c908984
5 changed files with 239 additions and 136 deletions

common/common.cpp (View file)

@@ -621,6 +621,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return true;
 }

+// Some items were missing from this list of helps, so the wording needs checking (all inserted at the end, so reposition too):
+// --embedding, --beams, --ppl-stride, --ppl-output-type, memory-f32, no-mmap, mlock, use-color, nprobs
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
@@ -667,7 +669,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
     printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
     printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
-    printf("  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
+    printf("  -l T, --logit-bias T  T = TOKEN_ID(plus/minus)BIAS\n");
     printf("                        modifies the likelihood of token appearing in the completion,\n");
     printf("                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
     printf("                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
@@ -682,7 +684,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
     printf("  --rope-freq-scale N   RoPE frequency linear scaling factor (default: loaded from model)\n");
     printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    printf("  --no-penalize-nl      do not penalize newline token\n");
+    printf("  --no-penalize-nl      do not penalize newline token (by default the newline token IS penalized)\n");
     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
     printf("  --temp N              temperature (default: %.1f)\n", (double)params.temp);
@@ -729,6 +731,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
     printf("  -ld LOGDIR, --logdir LOGDIR\n");
     printf("                        path under which to save YAML logs (no logging if unset)\n");
+    printf("  --ppl-stride          stride for ppl calcs. 0 (default): the pre-existing approach will be used.\n");
+    printf("  --ppl-output-type     0 (default): ppl output as usual, 1: ppl output num_tokens, one per line\n");
+    printf("  --embedding           0 (default): get only sentence embedding\n");
+    printf("  --beams N             0 (default): if non-zero use beam search of given width N.\n");
+    printf("  --memory-f32          0 (default): if true (= 1) disable f16 memory.\n");
+    printf("  --no-mmap             0 (default): if true use mmap for faster loads.\n");
+    printf("  --mlock               0 (default): if true keep model in memory.\n");
+    printf("  --use-color           0 (default): use color to distinguish generations from inputs\n");
+    printf("  --nprobs N            if > 0 output the probabilities of the top N tokens\n");
     printf("\n");
 }
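A quick way to confirm that the nine appended options actually reach the usage text is to harvest the long-option names back out of a captured dump. A minimal sketch in Python (the capture file name usage.txt is illustrative, not part of this commit):

import re

# Read a captured dump of gpt_print_usage output (file name is illustrative).
with open("usage.txt", "r") as f:
    usage = f.read()

# Long options start with "--" and continue with word characters or hyphens.
flags = sorted(set(re.findall(r"--([a-zA-Z0-9][a-zA-Z0-9-]*)", usage)))
print(f"{len(flags)} distinct long options: {flags}")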

common/common.h (View file)

@@ -75,6 +75,7 @@ struct gpt_params {
     std::string cfg_negative_prompt;   // string to help guidance
     float       cfg_scale = 1.f;       // How strong is guidance
+    std::string help = "";             // universal help parameter
     std::string model = "models/7B/ggml-model-f16.gguf"; // model path
     std::string model_draft = "";      // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias

View file

@@ -6,7 +6,22 @@ import collections
 import re
 import read_common_h

+# regenerate the help file - usually 'help_list.txt', hence the default - in case the C/C++ source has changed
+def update_file(file_from, file_to = "help_list.txt"):
+    # Open the file_from file
+    with open(file_from, "r") as file:
+        lines = file.readlines()
+
+    # Find lines starting with 'printf("' and ending with ');' (assumes file_from is written in C/C++)
+    pattern = r'printf\("\s(.*?)\);'
+    matched_lines = [re.search(pattern, line).group(1) for line in lines if re.search(pattern, line)]
+
+    # Save matched lines to file_to
+    with open(file_to, "w") as file:
+        for line in matched_lines:
+            file.write(line + '\n')
+
+# helper fn to make the hyphenated words in a file snake-case for searching
 def replace_dashes_with_underscores(filename):
     with open(filename, 'r') as file:
         content = file.read()
@@ -17,6 +32,13 @@ def replace_dashes_with_underscores(filename):
     with open(filename, 'w') as file:
         file.write(replaced_content)

+# helper fn to make the underscored words in a file hyphenated for print
+def replace_underscores_with_dashes(parameter):
+    # Match '_' surrounded by word characters on both sides and replace with '-'
+    return re.sub(r'(\w)_(\w)', r'\1-\2', parameter)
+
+# find all instances of "params." in the *.cpp files in a directory
 def find_arguments(directory):
     arguments = {}
@@ -28,21 +50,21 @@ def find_arguments(directory):
             with open(filepath, 'r') as file:
                 content = file.read()

-            # Search for the expression "params." excluding prefixes and read the attribute without trailing detritus
+            # Search for the expression "params." or "params->" excluding prefixes and read the attribute without trailing detritus
             # matches = re.findall(r'(?:^|\s)params\.(.*)(?=[\). <,;}]|\Z)', content)
-            matches = set(re.findall(r'(?:^|\b)params\.([a-zA-Z_0-9]*)(?=[\). <,;}]|\Z)', content))
-            # Remove duplicates from matches list
-            # arguments_list = list(set([match.strip() for match in matches]))
+            # note: a character class cannot match the two-character "->", so use a non-capturing group
+            matches = set(re.findall(r'(?:^|\b)params(?:->|\.)([a-zA-Z_0-9]*)(?=[\). <,;}]|\Z)', content))

             # Add the matches to the dictionary
             arguments[filepath] = matches

     return arguments

+# output a list of the params.attributes for each file
 def output_results(result):
     sorted_result = collections.OrderedDict(sorted(result.items()))
     all_of_them = set()
     for filename, arguments in sorted_result.items():
+        arguments.add("help")
         print(f"Filename: \033[32m{filename.split('/')[-1]}\033[0m, arguments: {arguments}\n")
         for argument in arguments:
             if argument not in all_of_them:
@@ -50,6 +72,7 @@ def output_results(result):
     print(f"\033[32mAll of them: \033[0m{sorted(all_of_them)}.")
     return sorted_result

+# put all the words after "//" in a dict back together with spaces
 def concatenate(v):
     concatenated_element = ""
     for i, element in enumerate(v):
@@ -57,24 +80,78 @@ def concatenate(v):
             concatenated_element = " ".join(v[i:])
     return concatenated_element

+def title_print(filename):
+    title = filename.split('/')[-1]
+    print("\n\n"+"#"*(10+len(title)))
+    print(f"Filename: \033[32m{title}\033[0m")
+    print("#"*(10+len(title)))
+
+def substitution_list(parameters):
+    # store untrapped parameters as identicals in case we need to change them later
+    sub_dict = {"n_threads": "threads",
+                "n_ctx": "ctx_size",
+                "n_draft": "draft",
+                "n_threads_batch": "threads_batch",
+                "n_chunks": "chunks",
+                "n_batch": "batch_size",
+                "n_sequences": "sequences",
+                "n_parallel": "parallel",
+                "n_beams": "beams",
+                "n_keep": "keep",
+                "n_probs": "nprobs",
+                "path_prompt_cache": "prompt_cache",
+                "input_prefix": "in_prefix",
+                "input_suffix": "in_suffix",
+                "input_prefix_bos": "in_prefix_bos",
+                "antiprompt": "reverse_prompt",
+                "mul_mat_q": "no_mul_mat_q",
+                "use_mmap": "no_mmap",
+                "use_mlock": "mlock",
+                "model_alias": "alias",
+                "tfs_z": "tfs",
+                "use_color": "color",
+                "logit_bias": "logit_bias",
+                "ignore_eos": "ignore_eos",
+                "mirostat_tau": "mirostat_ent",
+                "mirostat_eta": "mirostat_lr",
+                "penalize_nl": "no_penalize_nl",
+                "typical_p": "typical",
+                "mem_size": "mem_size",
+                "mem_buffer": "mem_buffer",
+                "no_alloc": "no_alloc"
+                }
+    new_parameters = []
+    for parameter in parameters:
+        if parameter in sub_dict:
+            # we need both for future reference
+            new_parameters.append(parameter)
+            new_parameters.append(sub_dict[parameter])
+        else:
+            new_parameters.append(parameter)
+    return new_parameters
+
+# output the lines of the help file
 def find_parameters(file, sorted_result):
     with open(file, "r") as helpfile:
         lines = helpfile.read().split("\n")

     for filename, arguments in sorted_result.items():
+        # we try to fix up some variant labelling in help_list.txt
+        arguments = substitution_list(arguments)
         parameters = []
         for line in lines:
             for argument in arguments:
                 # building pattern to avoid spurious matches
-                pattern = r"(?:--{}\s)|(?:params\.{}[\s.,();])".format(argument, argument.split('n_')[-1])
-                if re.search(pattern, line):
+                # pattern = r"(?:--{}\s)|(?:params\.{}[\s.,\.();])".format(argument, argument.split('n_')[-1])
+                pattern = r"(?:--{}\s)|(?:params\.{}(?=[\s.,\.\(\);]|\.+\w))".format(argument, argument.split('n_')[-1])
+                # pattern = r"(?<=params\.)\w+(?=\.\w+|\.|,|;|\}|\{|\(|\)|\.)"
+                # bit of a hack to exclude --attributes at the end of help comment lines
+                if re.search(pattern, line[:50]):
                     parameters.append(line)

         all_parameters = set(parameters)
-        file = filename.split('/')[-1]
-        print("\n\n"+"#"*(10+len(file)))
-        print(f"Filename: \033[32m{file}\033[0m")
-        print("#"*(10+len(file)))
-        print(f"\n\n command-line arguments available and gpt-params functions implemented (TODO: multi-line helps NEED SOME WORK):\n")
+        title_print(filename)
+        print(f"\nCommand-line arguments available and gpt-params functions implemented (TODO: multi-line helps NEED SOME WORK):\n")

         if not all_parameters:
             print(f"   \033[032mNone\033[0m\n")
@@ -83,11 +160,16 @@ def find_parameters(file, sorted_result):
         else:
             help_count = 0
             for parameter in all_parameters:
-                help_count += 1
-                print(f"{help_count:>2} help: \033[33m{parameter:<30}\033[0m")
+                # reverse the hyphen/underscore pattern just for printing
+                replaced_param = replace_underscores_with_dashes(parameter)
+                if not parameter.startswith(" "):
+                    help_count += 1
+                    print(f"{help_count:>2} help: \033[33m{replaced_param:<30}\033[0m")
+                else:
+                    print(f"   help: \033[33m{replaced_param:<30}\033[0m")

         # now do it the new way
-        print("\nNow we extract the original gpt_params definition and defaults for implemented arguments:\n")
+        print("\nNow we extract the original gpt_params definition from common.h with the defaults for implemented arguments:\n")
         gpt_count = 0
         for k,v in read_common_h.parameters.items():
             if not read_common_h.parameters.items():
@@ -99,14 +181,14 @@ def find_parameters(file, sorted_result):
             print(f"{gpt_count:>2} gpt_param: \033[32m{k:>19}; \033[34mrole: \033[33m{concatenated_element:<60}\033[0m; \033[34mdefault: \033[30m{v[1]:<10}\033[0m ")

         # searching the other way round is quicker:
-        print("\nSearching the other way round is quicker:\n")
+        print("\nSearching the other way round is more efficient:\n")
         key_count = 0
-        for argument in arguments:
+        for argument in set(arguments):
             if argument in read_common_h.parameters:
                 key_count += 1
                 print(f"{key_count:>2} key: {argument:>25}; role: {concatenate(read_common_h.parameters[argument]):<60}; default: {read_common_h.parameters[argument][1]:<10}")

         if help_count == gpt_count and gpt_count == key_count:
-            print("\n\033[032mNo unresolved help-list incompatibilities with this app.\033[0m")
+            print(f"\n\033[032mNo unresolved help-list incompatibilities with \033[33m{filename.split('/')[-1]}\033[0m")
         else:
             print("\n\033[031mThis app requires some attention regarding help-function consistency.\033[0m")
@@ -114,13 +196,17 @@ def find_parameters(file, sorted_result):
 directory = '/Users/edsilm2/llama.cpp/examples'

 if __name__ == '__main__':
+    # update the source help file from the C++ source (this works exactly as required)
+    update_file("common/common.cpp", "help_list.txt")
+
     # get the parameters from the common.h file utility we import
     print(read_common_h.parameters)
     # So now we've got the gpt_parameters in this parameters dict

     # First we alter all the hyphenated help words in help_list.txt to underscores
-    # replace_dashes_with_underscores('help_list.txt')
-    # This above may no longer be needed
+    # we later reverse these changes before printing the help lines
    replace_dashes_with_underscores('help_list.txt')

     print("\n####################### find parameters #################################")
     # Call the find function to collect all the params.attributes and output the result
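The regex change in find_arguments is the heart of this file's diff: a character class such as [->\.] matches only a single character, so catching both "params." and "params->" needs the non-capturing group used above. A minimal check of that pattern (the sample content string is illustrative):

import re

# Fragments of the kind find_arguments scans in the examples' *.cpp files (illustrative).
content = "int n = params.n_ctx; x = params->n_batch; gpt_params params;"

# The non-capturing group (?:->|\.) handles the two-character arrow form.
pattern = r'(?:^|\b)params(?:->|\.)([a-zA-Z_0-9]*)(?=[\). <,;}]|\Z)'
print(set(re.findall(pattern, content)))  # {'n_ctx', 'n_batch'} (set order varies)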

read_common_h.py (View file)

@@ -7,6 +7,9 @@ with open('common/common.h', 'r') as file:
     lines = file.read().split('\n')

 parameters = {}
+# we add the logit_bias parameter which otherwise is not found
+parameters['logit_bias'] = ['logit_bias', '0', '//', 'way', 'to', 'alter', 'prob', 'of', 'particular', 'words']
+
 inside = False
 for line in lines:
     # non_whitespace_elements = re.findall(r"\S+", line)
@@ -18,17 +21,19 @@ for line in lines:
         # note: cannot use nwe[0] because types do not generate unique keys and so overwrite
         # here we deliberately add back the key so we can make a manual change when it is different
         parameters[non_whitespace_elements[1]] = non_whitespace_elements[1:]
-        for k, v in parameters.items():
-            print(f"key: {k:<20}; values: {v}")
-            concatenated_element = ""
-            for i, element in enumerate(v):
-                if element == "//":
-                    concatenated_element = " ".join(v[i:])
-                    # break
-            print(" "*10 + f"parameter: \033[32m{k:>40} \033[34mdefault: \033[30m{v[1]:>5} \033[34mcomment: \033[33m{concatenated_element:80}\033[0m")
+        # remove spurious entry caused by eccentric status of logit_bias
+        if "float>" in parameters and parameters["float>"][1] == 'logit_bias':
+            del parameters["float>"]
     # this is a bit of a hack to terminate the harvest
     if len(non_whitespace_elements) > 2 and non_whitespace_elements[1] == "infill":
         inside = False
         break
+
+for k, v in parameters.items():
+    print(f"key: {k:<20}; values: {v}")
+    concatenated_element = ""
+    for i, element in enumerate(v):
+        if element == "//":
+            concatenated_element = " ".join(v[i:])
+            # break
+    print(" "*10 + f"parameter: \033[32m{k:>40} \033[34mdefault: \033[30m{v[1]:>5} \033[34mcomment: \033[33m{concatenated_element:80}\033[0m")

help_list.txt (View file)

@@ -1,104 +1,104 @@
--h, --helpshow this help message and exit
+-h, --help show this help message and exit\n"
--i, --interactive run in interactive mode
+-i, --interactive run in interactive mode\n"
---interactive_first run in interactive mode and wait for input right away
+--interactive_first run in interactive mode and wait for input right away\n"
--ins, --instructrun in instruction mode (use with Alpaca models)
+-ins, --instruct run in instruction mode (use with Alpaca models)\n"
---multiline_input allows you to write or paste multiple lines without ending each in '\\'
+--multiline_input allows you to write or paste multiple lines without ending each in '\\'\n"
--r PROMPT, --reverse_prompt PROMPT
+-r PROMPT, --reverse_prompt PROMPT\n"
-halt generation at PROMPT, return control in interactive mode
+halt generation at PROMPT, return control in interactive mode\n"
-(can be specified more than once for multiple prompts).
+(can be specified more than once for multiple prompts).\n"
---color colorise output to distinguish prompt and user input from generations
+--color colorise output to distinguish prompt and user input from generations\n"
--s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)
+-s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"
--t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
+-t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads
--tb N, --threads_batch N
+-tb N, --threads_batch N\n"
-number of threads to use during batch and prompt processing (default: same as --threads)
+number of threads to use during batch and prompt processing (default: same as --threads)\n"
--p PROMPT, --prompt PROMPT
+-p PROMPT, --prompt PROMPT\n"
-prompt to start generation with (default: empty)
+prompt to start generation with (default: empty)\n"
--e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)
+-e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"
---prompt_cache FNAME file to cache prompt state for faster startup (default: none)
+--prompt_cache FNAME file to cache prompt state for faster startup (default: none)\n"
---prompt_cache_all if specified, saves user input and generations to cache as well.
+--prompt_cache_all if specified, saves user input and generations to cache as well.\n"
-not supported with --interactive or other interactive options
+not supported with --interactive or other interactive options\n"
---prompt_cache_ro if specified, uses the prompt cache but does not update it.
+--prompt_cache_ro if specified, uses the prompt cache but does not update it.\n"
---random_prompt start with a randomized prompt.
+--random_prompt start with a randomized prompt.\n"
---in_prefix_bos prefix BOS to user inputs, preceding the `--in_prefix` string
+--in_prefix_bos prefix BOS to user inputs, preceding the `--in_prefix` string\n"
---in_prefix STRING string to prefix user inputs with (default: empty)
+--in_prefix STRING string to prefix user inputs with (default: empty)\n"
---in_suffix STRING string to suffix after user inputs with (default: empty)
+--in_suffix STRING string to suffix after user inputs with (default: empty)\n"
--f FNAME, --file FNAME
+-f FNAME, --file FNAME\n"
-prompt file to start generation.
+prompt file to start generation.\n"
--n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
+-n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict
--c N, --ctx_size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
+-c N, --ctx_size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx
--b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
+-b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch
---top_k N top_k sampling (default: %d, 0 = disabled)\n", params.top_k);
+--top_k N top_k sampling (default: %d, 0 = disabled)\n", params.top_k
---top_p N top_p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+--top_p N top_p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p
---tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+--tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z
---typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+--typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p
---repeat_last_n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+--repeat_last_n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n
---repeat_penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+--repeat_penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty
---presence_penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+--presence_penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty
---frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+--frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty
---mirostat N use Mirostat sampling.
+--mirostat N use Mirostat sampling.\n"
-Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.
+Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
-(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat
---mirostat_lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+--mirostat_lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta
---mirostat_ent NMirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+--mirostat_ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau
--l TOKEN_ID(+/-)BIAS, --logit_bias TOKEN_ID(+/-)BIAS
+-l T, --logit_bias T T = TOKEN_ID(plus/minus)BIAS\n"
-modifies the likelihood of token appearing in the completion,
+modifies the likelihood of token appearing in the completion,\n"
-i.e. `--logit_bias 15043+1` to increase likelihood of token ' Hello',
+i.e. `--logit_bias 15043+1` to increase likelihood of token ' Hello',\n"
-or `--logit_bias 15043_1` to decrease likelihood of token ' Hello'
+or `--logit_bias 15043_1` to decrease likelihood of token ' Hello'\n"
---grammar GRAMMAR BNF_like grammar to constrain generations (see samples in grammars/ dir)
+--grammar GRAMMAR BNF_like grammar to constrain generations (see samples in grammars/ dir)\n"
---grammar_file FNAME file to read grammar from
+--grammar_file FNAME file to read grammar from\n"
---cfg_negative_prompt PROMPT
+--cfg_negative_prompt PROMPT\n"
-negative prompt to use for guidance. (default: empty)
+negative prompt to use for guidance. (default: empty)\n"
---cfg_negative_prompt_file FNAME
+--cfg_negative_prompt_file FNAME\n"
-negative prompt file to use for guidance. (default: empty)
+negative prompt file to use for guidance. (default: empty)\n"
---cfg_scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+--cfg_scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale
---rope_scale N RoPE context linear scaling factor, inverse of --rope_freq_scale
+--rope_scale N RoPE context linear scaling factor, inverse of --rope_freq_scale\n"
---rope_freq_base N RoPE base frequency, used by NTK_aware scaling (default: loaded from model)
+--rope_freq_base N RoPE base frequency, used by NTK_aware scaling (default: loaded from model)\n"
---rope_freq_scale N RoPE frequency linear scaling factor (default: loaded from model)
+--rope_freq_scale N RoPE frequency linear scaling factor (default: loaded from model)\n"
---ignore_eos ignore end of stream token and continue generating (implies --logit_bias 2_inf)
+--ignore_eos ignore end of stream token and continue generating (implies --logit_bias 2_inf)\n"
---no_penalize_nldo not penalize newline token
+--no_penalize_nl do not penalize newline token (by default the newline token IS penalized)\n"
---memory_f32 use f32 instead of f16 for memory key+value (default: disabled)
+--memory_f32 use f32 instead of f16 for memory key+value (default: disabled)\n"
-not recommended: doubles context memory required and no measurable increase in quality
+not recommended: doubles context memory required and no measurable increase in quality\n"
---temp N temperature (default: %.1f)\n", (double)params.temp);
+--temp N temperature (default: %.1f)\n", (double)params.temp
---logits_all return logits for all tokens in the batch (default: disabled)
+--logits_all return logits for all tokens in the batch (default: disabled)\n"
---hellaswag compute HellaSwag score over random tasks from datafile supplied with -f
+--hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"
---hellaswag_tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
+--hellaswag_tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks
---keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+--keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep
---draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
+--draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft
---chunks Nmax number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
+--chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks
--np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
+-np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel
--ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
+-ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences
--cb, --cont_batching enable continuous batching (a.k.a dynamic batching) (default: disabled)
+-cb, --cont_batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"
-if (llama_mlock_supported()) {
---mlock force system to keep model in RAM rather than swapping or compressing
-}
-if (llama_mmap_supported()) {
---no_mmap do not memory_map model (slower load but may reduce pageouts if not using mlock)
-}
---numa attempt optimizations that help on some NUMA systems
-if run without this previously, it is recommended to drop the system page cache before using this
-see https://github.com/ggerganov/llama.cpp/issues/1437
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
--ngl N, --n_gpu_layers N
-number of layers to store in VRAM
--ngld N, --n_gpu_layers_draft N
-number of layers to store in VRAM for the draft model
--ts SPLIT --tensor_split SPLIT
-how to split tensors across multiple GPUs, comma_separated list of proportions, e.g. 3,1
--mg i, --main_gpu i the GPU to use for scratch and small tensors
-#ifdef GGML_USE_CUBLAS
--nommq, --no_mul_mat_q
-use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.
-Not recommended since this is both slower and uses more VRAM.
-#endif // GGML_USE_CUBLAS
-#endif
---verbose_promptprint prompt before generation
-fprintf(stderr, " --simple_io use basic IO for better compatibility in subprocesses and limited consoles
---lora FNAME apply LoRA adapter (implies --no_mmap)
---lora_scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no_mmap)
---lora_base FNAME optional model to use as a base for the layers modified by the LoRA adapter
--m FNAME, --model FNAME
-model path (default: %s)\n", params.model.c_str());
--md FNAME, --model_draft FNAME
-draft model for speculative decoding (default: %s)\n", params.model.c_str());
--ld LOGDIR, --logdir LOGDIR
-path under which to save YAML logs (no logging if unset)
+--mlock force system to keep model in RAM rather than swapping or compressing\n"
+--no_mmap do not memory_map model (slower load but may reduce pageouts if not using mlock)\n"
+--numa attempt optimizations that help on some NUMA systems\n"
+if run without this previously, it is recommended to drop the system page cache before using this\n"
+see https://github.com/ggerganov/llama.cpp/issues/1437\n"
+-ngl N, --n_gpu_layers N\n"
+number of layers to store in VRAM\n"
+-ngld N, --n_gpu_layers_draft N\n"
+number of layers to store in VRAM for the draft model\n"
+-ts SPLIT --tensor_split SPLIT\n"
+how to split tensors across multiple GPUs, comma_separated list of proportions, e.g. 3,1\n"
+-mg i, --main_gpu i the GPU to use for scratch and small tensors\n"
+-nommq, --no_mul_mat_q\n"
+use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"
+Not recommended since this is both slower and uses more VRAM.\n"
+--verbose_prompt print prompt before generation\n"
+--lora FNAME apply LoRA adapter (implies --no_mmap)\n"
+--lora_scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no_mmap)\n"
+--lora_base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"
+-m FNAME, --model FNAME\n"
+model path (default: %s)\n", params.model.c_str()
+-md FNAME, --model_draft FNAME\n"
+draft model for speculative decoding (default: %s)\n", params.model.c_str()
+-ld LOGDIR, --logdir LOGDIR\n"
+path under which to save YAML logs (no logging if unset)\n"
+--ppl_stride stride for ppl calcs. 0 (default): the pre_existing approach will be used.\n"
+--ppl_output_type 0 (default): ppl output as usual, 1: ppl output num_tokens, one per line\n"
+--embedding 0 (default): get only sentence embedding\n"
+--beams N 0 (default): if non_zero use beam search of given width N.\n"
+--memory_f32 0 (default): if true (= 1) disable f16 memory.\n"
+--no_mmap 0 (default): if true use mmap for faster loads.\n"
+--mlock 0 (default): if true keep model in memory.\n"
+--use_color 0 (default): use color to distinguish generations from inputs\n"
+--nprobs N if > 0 output the probabilities of the top N tokens\n"
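Each line above is harvested verbatim from a printf call in common/common.cpp, which is why the regenerated right-hand side keeps the trailing \n" and any format arguments. A minimal sketch of splitting one such line back into flag and description (the input line is copied from the file; the parsing itself is illustrative):

# One regenerated help_list.txt line (raw strings keep the literal \n" residue).
line = r'--top_k N top_k sampling (default: %d, 0 = disabled)\n", params.top_k'

text = line.split(r'\n"')[0]              # drop the printf residue
flag, _, description = text.partition(' ')
print(flag, '|', description)             # --top_k | N top_k sampling (default: %d, 0 = disabled)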