diff --git a/common/common.cpp b/common/common.cpp
index 4b233786a..4d4ecf03d 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -621,6 +621,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 return true;
 }
+// Some items were missing from this help list, so their wording needs checking (all were appended at the end, so they need repositioning too):
+// --embedding, --beams, --ppl-stride, --ppl-output-type, memory-f32, no-mmap, mlock, use-color, nprobs
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf("usage: %s [options]\n", argv[0]);
 printf("\n");
@@ -667,7 +669,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
 printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
 printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
- printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
+ printf(" -l T, --logit-bias T T = TOKEN_ID(plus/minus)BIAS\n");
 printf(" modifies the likelihood of token appearing in the completion,\n");
 printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
 printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
@@ -682,7 +684,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
 printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
 printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
- printf(" --no-penalize-nl do not penalize newline token\n");
+ printf(" --no-penalize-nl do not penalize newline token (by default the newline token IS penalized)\n");
 printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
 printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
 printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
@@ -729,6 +731,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
 printf(" -ld LOGDIR, --logdir LOGDIR\n");
 printf(" path under which to save YAML logs (no logging if unset)\n");
+ printf(" --ppl-stride N stride for perplexity calculations. 
0 (default): use the pre-existing approach.\n");
+ printf(" --ppl-output-type 0 (default): ppl output as usual, 1: ppl output num_tokens, one per line\n");
+ printf(" --embedding 0 (default): if true (= 1) output only the sentence embedding\n");
+ printf(" --beams N 0 (default): if non-zero use beam search of given width N.\n");
+ printf(" --memory-f32 0 (default): if true (= 1) disable f16 memory.\n");
+ printf(" --no-mmap 0 (default): if true (= 1) do not memory-map the model.\n");
+ printf(" --mlock 0 (default): if true keep model in memory.\n");
+ printf(" --use-color 0 (default): use color to distinguish generations from inputs\n");
+ printf(" --nprobs N if > 0 output the probabilities of the top N tokens\n");
 printf("\n");
 }
diff --git a/common/common.h b/common/common.h
index 887142cf9..d48ee22e1 100644
--- a/common/common.h
+++ b/common/common.h
@@ -75,6 +75,7 @@ struct gpt_params {
 std::string cfg_negative_prompt; // string to help guidance
 float cfg_scale = 1.f; // How strong is guidance
+ std::string help = ""; // universal help parameter
 std::string model = "models/7B/ggml-model-f16.gguf"; // model path
 std::string model_draft = ""; // draft model for speculative decoding
 std::string model_alias = "unknown"; // model alias
diff --git a/examples/cmap-example/find_implemented_args.py b/examples/cmap-example/find_implemented_args.py
index 31d439698..ac6e6cfdc 100644
--- a/examples/cmap-example/find_implemented_args.py
+++ b/examples/cmap-example/find_implemented_args.py
@@ -6,7 +6,22 @@ import collections
 import re
 import read_common_h
+# rebuild the help file - usually 'help_list.txt', hence the default - whenever the C++ source has changed
+def update_file(file_from, file_to="help_list.txt"):
+    # Open the file_from file
+    with open(file_from, "r") as file:
+        lines = file.readlines()
+    # Find lines starting with "printf(" and ending with ");" (assumes file_from is written in C/C++)
+    pattern = r'printf\("\s(.*?)\);'
+    matched_lines = [re.search(pattern, line).group(1) for line in lines if re.search(pattern, line)]
+
+    # Save matched lines to file_to
+    with open(file_to, "w") as file:
+        for line in matched_lines:
+            file.write(line + '\n')
+
+# helper fn to make the hyphenated words in a file snake-case for searching
 def replace_dashes_with_underscores(filename):
     with open(filename, 'r') as file:
         content = file.read()
@@ -17,6 +32,13 @@
     with open(filename, 'w') as file:
         file.write(replaced_content)
+# helper fn to make the underscored words in a file hyphenated for print
+def replace_underscores_with_dashes(parameter):
+    # Match '_' surrounded by word characters on both sides and replace with '-'
+    return re.sub(r'(\w)_(\w)', r'\1-\2', parameter)
+
+
+# find all instances of "params." in the *.cpp files in a directory
 def find_arguments(directory):
     arguments = {}
@@ -28,21 +50,21 @@
             with open(filepath, 'r') as file:
                 content = file.read()
-            # Search for the expression "params." excluding prefixes and read the attribute without trailing detritus
+            # Search for the expression "params." or "params->" excluding prefixes and read the attribute without trailing detritus
             # matches = re.findall(r'(?:^|\s)params\.(.*)(?=[\). <,;}]|\Z)', content)
-            matches = set(re.findall(r'(?:^|\b)params\.([a-zA-Z_0-9]*)(?=[\). <,;}]|\Z)', content))
-            # Remove duplicates from matches list
-            # arguments_list = list(set([match.strip() for match in matches]))
+            matches = set(re.findall(r'(?:^|\b)params(?:\.|->)([a-zA-Z_0-9]*)(?=[\). <,;}]|\Z)', content))
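+            # a quick sanity check of the pattern (hypothetical snippet, not executed here):
+            #   >>> re.findall(r'(?:^|\b)params(?:\.|->)([a-zA-Z_0-9]*)(?=[\). <,;}]|\Z)',
+            #   ...            "if (params.use_mmap) { n = params->n_ctx; }")
+            #   ['use_mmap', 'n_ctx']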
            # Add the matches to the dictionary
             arguments[filepath] = matches
     return arguments
+# output a list of the params.attributes for each file
 def output_results(result):
     sorted_result = collections.OrderedDict(sorted(result.items()))
     all_of_them = set()
     for filename, arguments in sorted_result.items():
+        arguments.add("help")
         print(f"Filename: \033[32m{filename.split('/')[-1]}\033[0m, arguments: {arguments}\n")
         for argument in arguments:
             if argument not in all_of_them:
@@ -50,6 +72,7 @@
     print(f"\033[32mAll of them: \033[0m{sorted(all_of_them)}.")
     return sorted_result
+# put all the words after "//" in a dict value back together with spaces
 def concatenate(v):
     concatenated_element = ""
     for i, element in enumerate(v):
@@ -57,24 +80,78 @@
         concatenated_element = " ".join(v[i:])
     return concatenated_element
+def title_print(filename):
+    title = filename.split('/')[-1]
+    print("\n\n"+"#"*(10+len(title)))
+    print(f"Filename: \033[32m{title}\033[0m")
+    print("#"*(10+len(title)))
+
+def substitution_list(parameters):
+    # map gpt_params attribute names to the names used in the help text;
+    # untrapped parameters are stored as identity pairs in case we need to change them later
+    sub_dict = {"n_threads": "threads",
+                "n_ctx": "ctx_size",
+                "n_draft" : "draft",
+                "n_threads_batch" : "threads_batch",
+                "n_chunks" : "chunks",
+                "n_batch" : "batch_size",
+                "n_sequences" : "sequences",
+                "n_parallel" : "parallel",
+                "n_beams" : "beams",
+                "n_keep" : "keep",
+                "n_probs" : "nprobs",
+                "path_prompt_cache" : "prompt_cache",
+                "input_prefix" : "in_prefix",
+                "input_suffix" : "in_suffix",
+                "input_prefix_bos" : "in_prefix_bos",
+                "antiprompt" : "reverse_prompt",
+                "mul_mat_q" : "no_mul_mat_q",
+                "use_mmap" : "no_mmap",
+                "use_mlock" : "mlock",
+                "model_alias" : "alias",
+                "tfs_z" : "tfs",
+                "use_color" : "color",
+                "logit_bias" : "logit_bias",
+                "ignore_eos" : "ignore_eos",
+                "mirostat_tau" : "mirostat_ent",
+                "mirostat_eta" : "mirostat_lr",
+                "penalize_nl" : "no_penalize_nl",
+                "typical_p" : "typical",
+                "mem_size" : "mem_size",
+                "mem_buffer" : "mem_buffer",
+                "no_alloc" : "no_alloc"
+                }
+    new_parameters = []
+    for parameter in parameters:
+        if parameter in sub_dict:
+            # we need both for future reference
+            new_parameters.append(parameter)
+            new_parameters.append(sub_dict[parameter])
+        else:
+            new_parameters.append(parameter)
+    return new_parameters
+
+# output the lines of the help file
 def find_parameters(file, sorted_result):
     with open(file, "r") as helpfile:
         lines = helpfile.read().split("\n")
     for filename, arguments in sorted_result.items():
+        # we try to fix up some variant labelling in help_list.txt
+        arguments = substitution_list(arguments)
         parameters = []
         for line in lines:
             for argument in arguments:
                 # building pattern to avoid spurious matches
-                pattern = r"(?:--{}\s)|(?:params\.{}[\s.,();])".format(argument, argument.split('n_')[-1])
-                if re.search(pattern, line):
+                # pattern = r"(?:--{}\s)|(?:params\.{}[\s.,\.();])".format(argument, argument.split('n_')[-1])
+                pattern = r"(?:--{}\s)|(?:params\.{}(?=[\s.,\.\(\);]|\.+\w))".format(argument, argument.split('n_')[-1])
+                # pattern = r"(?<=params\.)\w+(?=\.\w+|\.|,|;|\}|\{|\(|\)|\.)"
+                # bit of a hack to exclude --attributes at the end of help comment lines
+                if re.search(pattern, line[:50]):
                     parameters.append(line)
         all_parameters = set(parameters)
-        file = filename.split('/')[-1]
-        print("\n\n"+"#"*(10+len(file)))
-        print(f"Filename: \033[32m{file}\033[0m")
-        print("#"*(10+len(file)))
-        print(f"\n\n command-line arguments available and gpt-params functions implemented 
(TODO: multi-line helps NEED SOME WORK):\n")
+
+        title_print(filename)
+        print(f"\nCommand-line arguments available and gpt-params functions implemented (TODO: multi-line helps NEED SOME WORK):\n")
         if not all_parameters:
             print(f" \033[032mNone\033[0m\n")
@@ -83,11 +160,16 @@
         else:
             help_count = 0
             for parameter in all_parameters:
-                help_count += 1
-                print(f"{help_count:>2} help: \033[33m{parameter:<30}\033[0m")
+                # reverse the hyphen/underscore substitution just for printing
+                replaced_param = replace_underscores_with_dashes(parameter)
+                if not parameter.startswith(" "):
+                    help_count += 1
+                    print(f"{help_count:>2} help: \033[33m{replaced_param:<30}\033[0m")
+                else:
+                    print(f"   help: \033[33m{replaced_param:<30}\033[0m")
         # now do it the new way
-        print("\nNow we extract the original gpt_params definition and defaults for implemented arguments:\n")
+        print("\nNow we extract the original gpt_params definition from common.h with the defaults for implemented arguments:\n")
         gpt_count = 0
         for k,v in read_common_h.parameters.items():
             if not read_common_h.parameters.items():
@@ -99,14 +181,14 @@
                 print(f"{gpt_count:>2} gpt_param: \033[32m{k:>19}; \033[34mrole: \033[33m{concatenated_element:<60}\033[0m; \033[34mdefault: \033[30m{v[1]:<10}\033[0m ")
         # searching the other way round is quicker:
-        print("\nSearching the other way round is quicker:\n")
+        print("\nSearching the other way round is more efficient:\n")
         key_count = 0
-        for argument in arguments:
+        for argument in set(arguments):
             if argument in read_common_h.parameters:
                 key_count += 1
                 print(f"{key_count:>2} key: {argument:>25}; role: {concatenate(read_common_h.parameters[argument]):<60}; default: {read_common_h.parameters[argument][1]:<10}")
         if help_count == gpt_count and gpt_count == key_count:
-            print("\n\033[032mNo unresolved help-list incompatibilities with this app.\033[0m")
+            print(f"\n\033[032mNo unresolved help-list incompatibilities with \033[33m{filename.split('/')[-1]}\033[0m")
         else:
             print("\n\033[031mThis app requires some attention regarding help-function consistency.\033[0m")
 directory = '/Users/edsilm2/llama.cpp/examples'
 if __name__ == '__main__':
+
+    # update the source help file from the C++ source
+    update_file("common/common.cpp", "help_list.txt")
+
     # get the parameters from the common.h file utiity we import
     print(read_common_h.parameters)
     # So now we've got the gpt_parameters in this parameters dict
     # First we alter all the hyphenated help words in help-file.txt to underscores
-    # replace_dashes_with_underscores('help_list.txt')
-    # This above may no longer be needed
+    # we later reverse these changes before printing the help lines
+    replace_dashes_with_underscores('help_list.txt')
     print("\n####################### find parameters #################################")
     # Call the find function to collect all the params.attributes and output the result
diff --git a/examples/cmap-example/read_common_h.py b/examples/cmap-example/read_common_h.py
index a683d7662..1c18d4960 100644
--- a/examples/cmap-example/read_common_h.py
+++ b/examples/cmap-example/read_common_h.py
@@ -7,6 +7,9 @@ with open('common/common.h', 'r') as file:
     lines = file.read().split('\n')
 parameters = {}
+# we add the logit_bias parameter which otherwise is not found
+parameters['logit_bias'] = ['logit_bias', '0', '//', 'way', 'to', 'alter', 'prob', 'of', 'particular', 'words']
+
 inside = False 
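+# why the manual entry above is needed (illustrative): the relevant line in common.h,
+#   std::unordered_map<llama_token, float> logit_bias;  // logit bias for specific tokens
+# has a space inside its type, so the whitespace-split harvest below keys it as
+# "float>" rather than "logit_bias"; that spurious entry is deleted in the loop.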
for line in lines:
     # non_whitespace_elements = re.findall(r"\S+", line)
@@ -18,17 +21,19 @@ for line in lines:
             # note: cannot use nwe[0] because types do not generate unique keys and so overwrite
             # here we deliberately add back the key so we can make a manual change when it is different
             parameters[non_whitespace_elements[1]] = non_whitespace_elements[1:]
-            for k, v in parameters.items():
-                print(f"key: {k:<20}; values: {v}")
-
-                concatenated_element = ""
-                for i, element in enumerate(v):
-                    if element == "//":
-                        concatenated_element = " ".join(v[i:])
-                    # break
-                print(" "*10 + f"parameter: \033[32m{k:>40} \033[34mdefault: \033[30m{v[1]:>5} \033[34mcommment: \033[33m{concatenated_element:80}\033[0m")
-
+            # remove the spurious "float>" entry caused by the space inside logit_bias's map type
+            if "float>" in parameters and parameters["float>"][1] == 'logit_bias':
+                del parameters["float>"]
+
         # this is a bit of a hack to terminate the harvest
         if len(non_whitespace_elements) > 2 and non_whitespace_elements[1] == "infill":
             inside = False
-            break
\ No newline at end of file
+            break
+for k, v in parameters.items():
+    print(f"key: {k:<20}; values: {v}")
+    concatenated_element = ""
+    for i, element in enumerate(v):
+        if element == "//":
+            concatenated_element = " ".join(v[i:])
+        # break
+    print(" "*10 + f"parameter: \033[32m{k:>40} \033[34mdefault: \033[30m{v[1]:>5} \033[34mcomment: \033[33m{concatenated_element:80}\033[0m")
diff --git a/help_list.txt b/help_list.txt
index 7bf5b8c78..97b91a982 100644
--- a/help_list.txt
+++ b/help_list.txt
@@ -1,104 +1,104 @@
--h, --helpshow this help message and exit
--i, --interactive run in interactive mode
---interactive_first run in interactive mode and wait for input right away
--ins, --instructrun in instruction mode (use with Alpaca models)
---multiline_input allows you to write or paste multiple lines without ending each in '\\'
--r PROMPT, --reverse_prompt PROMPT
- halt generation at PROMPT, return control in interactive mode
- (can be specified more than once for multiple prompts).
---color colorise output to distinguish prompt and user input from generations
--s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)
--t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
--tb N, --threads_batch N
- number of threads to use during batch and prompt processing (default: same as --threads)
--p PROMPT, --prompt PROMPT
- prompt to start generation with (default: empty)
--e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)
---prompt_cache FNAME file to cache prompt state for faster startup (default: none)
---prompt_cache_all if specified, saves user input and generations to cache as well.
- not supported with --interactive or other interactive options
---prompt_cache_ro if specified, uses the prompt cache but does not update it.
---random_prompt start with a randomized prompt.
---in_prefix_bos prefix BOS to user inputs, preceding the `--in_prefix` string
---in_prefix STRING string to prefix user inputs with (default: empty)
---in_suffix STRING string to suffix after user inputs with (default: empty)
--f FNAME, --file FNAME
- prompt file to start generation. 
--n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); --c N, --ctx_size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); --b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); ---top_k N top_k sampling (default: %d, 0 = disabled)\n", params.top_k); ---top_p N top_p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); ---tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); ---typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p); ---repeat_last_n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); ---repeat_penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); ---presence_penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); ---frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); ---mirostat N use Mirostat sampling. - Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used. - (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat); ---mirostat_lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); ---mirostat_ent NMirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); --l TOKEN_ID(+/-)BIAS, --logit_bias TOKEN_ID(+/-)BIAS - modifies the likelihood of token appearing in the completion, - i.e. `--logit_bias 15043+1` to increase likelihood of token ' Hello', - or `--logit_bias 15043_1` to decrease likelihood of token ' Hello' ---grammar GRAMMAR BNF_like grammar to constrain generations (see samples in grammars/ dir) ---grammar_file FNAME file to read grammar from ---cfg_negative_prompt PROMPT - negative prompt to use for guidance. (default: empty) ---cfg_negative_prompt_file FNAME - negative prompt file to use for guidance. 
(default: empty) ---cfg_scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); ---rope_scale N RoPE context linear scaling factor, inverse of --rope_freq_scale ---rope_freq_base N RoPE base frequency, used by NTK_aware scaling (default: loaded from model) ---rope_freq_scale N RoPE frequency linear scaling factor (default: loaded from model) ---ignore_eos ignore end of stream token and continue generating (implies --logit_bias 2_inf) ---no_penalize_nldo not penalize newline token ---memory_f32 use f32 instead of f16 for memory key+value (default: disabled) - not recommended: doubles context memory required and no measurable increase in quality ---temp N temperature (default: %.1f)\n", (double)params.temp); ---logits_all return logits for all tokens in the batch (default: disabled) ---hellaswag compute HellaSwag score over random tasks from datafile supplied with -f ---hellaswag_tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); ---keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); ---draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); ---chunks Nmax number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); --np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); --ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); --cb, --cont_batching enable continuous batching (a.k.a dynamic batching) (default: disabled) - if (llama_mlock_supported()) { - --mlock force system to keep model in RAM rather than swapping or compressing - } - if (llama_mmap_supported()) { - --no_mmap do not memory_map model (slower load but may reduce pageouts if not using mlock) - } ---numa attempt optimizations that help on some NUMA systems - if run without this previously, it is recommended to drop the system page cache before using this - see https://github.com/ggerganov/llama.cpp/issues/1437 -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD --ngl N, --n_gpu_layers N - number of layers to store in VRAM --ngld N, --n_gpu_layers_draft N - number of layers to store in VRAM for the draft model --ts SPLIT --tensor_split SPLIT - how to split tensors across multiple GPUs, comma_separated list of proportions, e.g. 3,1 --mg i, --main_gpu i the GPU to use for scratch and small tensors -#ifdef GGML_USE_CUBLAS --nommq, --no_mul_mat_q - use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels. - Not recommended since this is both slower and uses more VRAM. 
-#endif // GGML_USE_CUBLAS -#endif ---verbose_promptprint prompt before generation - fprintf(stderr, " --simple_io use basic IO for better compatibility in subprocesses and limited consoles ---lora FNAME apply LoRA adapter (implies --no_mmap) ---lora_scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no_mmap) ---lora_base FNAME optional model to use as a base for the layers modified by the LoRA adapter --m FNAME, --model FNAME - model path (default: %s)\n", params.model.c_str()); --md FNAME, --model_draft FNAME - draft model for speculative decoding (default: %s)\n", params.model.c_str()); --ld LOGDIR, --logdir LOGDIR - path under which to save YAML logs (no logging if unset) \ No newline at end of file + -h, --help show this help message and exit\n" + -i, --interactive run in interactive mode\n" + --interactive_first run in interactive mode and wait for input right away\n" + -ins, --instruct run in instruction mode (use with Alpaca models)\n" + --multiline_input allows you to write or paste multiple lines without ending each in '\\'\n" + -r PROMPT, --reverse_prompt PROMPT\n" + halt generation at PROMPT, return control in interactive mode\n" + (can be specified more than once for multiple prompts).\n" + --color colorise output to distinguish prompt and user input from generations\n" + -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n" + -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads + -tb N, --threads_batch N\n" + number of threads to use during batch and prompt processing (default: same as --threads)\n" + -p PROMPT, --prompt PROMPT\n" + prompt to start generation with (default: empty)\n" + -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n" + --prompt_cache FNAME file to cache prompt state for faster startup (default: none)\n" + --prompt_cache_all if specified, saves user input and generations to cache as well.\n" + not supported with --interactive or other interactive options\n" + --prompt_cache_ro if specified, uses the prompt cache but does not update it.\n" + --random_prompt start with a randomized prompt.\n" + --in_prefix_bos prefix BOS to user inputs, preceding the `--in_prefix` string\n" + --in_prefix STRING string to prefix user inputs with (default: empty)\n" + --in_suffix STRING string to suffix after user inputs with (default: empty)\n" + -f FNAME, --file FNAME\n" + prompt file to start generation.\n" + -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict + -c N, --ctx_size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx + -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch + --top_k N top_k sampling (default: %d, 0 = disabled)\n", params.top_k + --top_p N top_p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p + --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z + --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p + --repeat_last_n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n + --repeat_penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty + --presence_penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty + --frequency_penalty 
N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty
+ --mirostat N use Mirostat sampling.\n"
+ Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
+ (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat
+ --mirostat_lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta
+ --mirostat_ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau
+ -l T, --logit_bias T T = TOKEN_ID(plus/minus)BIAS\n"
+ modifies the likelihood of token appearing in the completion,\n"
+ i.e. `--logit_bias 15043+1` to increase likelihood of token ' Hello',\n"
+ or `--logit_bias 15043_1` to decrease likelihood of token ' Hello'\n"
+ --grammar GRAMMAR BNF_like grammar to constrain generations (see samples in grammars/ dir)\n"
+ --grammar_file FNAME file to read grammar from\n"
+ --cfg_negative_prompt PROMPT\n"
+ negative prompt to use for guidance. (default: empty)\n"
+ --cfg_negative_prompt_file FNAME\n"
+ negative prompt file to use for guidance. (default: empty)\n"
+ --cfg_scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale
+ --rope_scale N RoPE context linear scaling factor, inverse of --rope_freq_scale\n"
+ --rope_freq_base N RoPE base frequency, used by NTK_aware scaling (default: loaded from model)\n"
+ --rope_freq_scale N RoPE frequency linear scaling factor (default: loaded from model)\n"
+ --ignore_eos ignore end of stream token and continue generating (implies --logit_bias 2_inf)\n"
+ --no_penalize_nl do not penalize newline token (by default the newline token IS penalized)\n"
+ --memory_f32 use f32 instead of f16 for memory key+value (default: disabled)\n"
+ not recommended: doubles context memory required and no measurable increase in quality\n"
+ --temp N temperature (default: %.1f)\n", (double)params.temp
+ --logits_all return logits for all tokens in the batch (default: disabled)\n"
+ --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"
+ --hellaswag_tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks
+ --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep
+ --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft
+ --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks
+ -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel
+ -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences
+ -cb, --cont_batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"
+ --mlock force system to keep model in RAM rather than swapping or compressing\n"
+ --no_mmap do not memory_map model (slower load but may reduce pageouts if not using mlock)\n"
+ --numa attempt optimizations that help on some NUMA systems\n"
+ if run without this previously, it is recommended to drop the system page cache before using this\n"
+ see https://github.com/ggerganov/llama.cpp/issues/1437\n"
+ -ngl N, --n_gpu_layers N\n"
+ number of layers to store in VRAM\n"
+ -ngld N, --n_gpu_layers_draft N\n"
+ number of layers to store in VRAM for the draft model\n"
+ -ts SPLIT --tensor_split SPLIT\n"
+ how to split tensors across multiple GPUs, comma_separated list of proportions, e.g. 
3,1\n" + -mg i, --main_gpu i the GPU to use for scratch and small tensors\n" + -nommq, --no_mul_mat_q\n" + use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n" + Not recommended since this is both slower and uses more VRAM.\n" + --verbose_prompt print prompt before generation\n" + --lora FNAME apply LoRA adapter (implies --no_mmap)\n" + --lora_scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no_mmap)\n" + --lora_base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n" + -m FNAME, --model FNAME\n" + model path (default: %s)\n", params.model.c_str() + -md FNAME, --model_draft FNAME\n" + draft model for speculative decoding (default: %s)\n", params.model.c_str() + -ld LOGDIR, --logdir LOGDIR\n" + path under which to save YAML logs (no logging if unset)\n" + --ppl_stride stride for ppl calcs. 0 (default): the pre_existing approach will be used.\n" + --ppl_output_type 0 (default): ppl output as usual, 1: ppl output num_tokens, one per line\n" + --embedding 0 (default): get only sentence embedding\n" + --beams N 0 (default): if non_zero use beam search of given width N.\n" + --memory_f32 0 (default): if true (= 1) disable f16 memory.\n" + --no_mmap 0 (default): if true use mmap for faster loads.\n" + --mlock 0 (default): if true keep model in memory.\n" + --use_color 0 (default): use color to distinguish generations from inputs\n" + --nprobs N if > 0 output the probabilities of the top N tokens\n"