Update helper dev

pudepiedj 2023-10-07 21:40:45 +01:00
parent 0d70518220
commit 9c5d6f0ef6
4 changed files with 112 additions and 31 deletions


@@ -35,21 +35,21 @@ int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads = get_num_physical_cores(); // user-defined or num of internal physical cores
int32_t n_threads_batch = -1; // num threads for batch proc (-1 = use n_threads)
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_batch = 512; // batch size for prompt proc (>=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_parallel = 1; // number of parallel sequences to decode
int32_t n_sequences = 1; // number of sequences to decode
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t n_gpu_layers = -1; // num layers stored in VRAM (-1 for default)
int32_t n_gpu_layers_draft = -1; // num layers stored in VRAM for draft model (-1 for default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t n_probs = 0; // if > 0, output probabilities of top n_probs tokens.
int32_t n_beams = 0; // if non-zero then use beam search of given width.
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
@@ -61,7 +61,7 @@ struct gpt_params {
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // 1.0 = disabled
float repeat_penalty = 1.10f; // 1.0 = disabled
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable, -1 = ctx size)
float frequency_penalty = 0.00f; // 0.0 = disabled
float presence_penalty = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
@@ -78,7 +78,7 @@ struct gpt_params {
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string prompt = "";
std::string prompt = ""; // user-provided single prompt
std::string prompt_file = ""; // store the external prompt file
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
@@ -90,11 +90,11 @@ struct gpt_params {
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
int ppl_stride = 0; // stride for ppl calcs. 0: the pre-existing approach will be used.
int ppl_output_type = 0; // 0: ppl output as usual, 1: ppl output = num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
bool hellaswag = false; // compute HellaSwag score from datafile given in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
@@ -109,7 +109,7 @@ struct gpt_params {
bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool simple_io = false; // improves compat'y with subprocs and ltd consoles
bool cont_batching = false; // insert new sequences for decoding on-the-fly
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix


@@ -4,6 +4,8 @@ import os
import re
import collections
import re
import read_common_h
def replace_dashes_with_underscores(filename):
    with open(filename, 'r') as file:
@@ -48,6 +50,13 @@ def output_results(result):
print(f"\033[32mAll of them: \033[0m{sorted(all_of_them)}.")
return sorted_result
def concatenate(v):
    concatenated_element = ""
    for i, element in enumerate(v):
        if element == "//":
            concatenated_element = " ".join(v[i:])
    return concatenated_element
def find_parameters(file, sorted_result):
    with open(file, "r") as helpfile:
        lines = helpfile.read().split("\n")
@@ -59,36 +68,68 @@ def find_parameters(file, sorted_result):
pattern = r"(?:--{}\s)|(?:params\.{}[\s.,();])".format(argument, argument.split('n_')[-1])
if re.search(pattern, line):
parameters.append(line)
'''for line in lines:
for argument in arguments:
# need to try to avoid spurious matches
argument1 = "--" + argument + " "
if argument1 in line:
parameters.append(line)
# need to try to avoid spurious matches
argument2 = "params." + argument.split('n_')[-1]
if argument2 in line:
parameters.append(line)
argument3 = "params." + argument
if argument3 in line:
parameters.append(line)'''
all_parameters = set(parameters)
print(f"\n\nFilename: \033[32m{filename.split('/')[-1]}\033[0m\n\n command-line arguments available and gpt-params functions implemented:\n")
file = filename.split('/')[-1]
print("\n\n"+"#"*(10+len(file)))
print(f"Filename: \033[32m{file}\033[0m")
print("#"*(10+len(file)))
print(f"\n\n command-line arguments available and gpt-params functions implemented (TODO: multi-line helps NEED SOME WORK):\n")
if not all_parameters:
print(f" \033[032mNone\033[0m\n")
else:
for parameter in all_parameters:
print(f" help: \033[33m{parameter:<30}\033[0m")
# first do it the original way
else:
help_count = 0
for parameter in all_parameters:
help_count += 1
print(f"{help_count:>2} help: \033[33m{parameter:<30}\033[0m")
# now do it the new way
print("\nNow we extract the original gpt_params definition and defaults for implemented arguments:\n")
gpt_count = 0
for k,v in read_common_h.parameters.items():
if not read_common_h.parameters.items():
print(f" \033[032mNone\033[0m\n")
elif k in arguments:
# print(f"gpt_params: \033[33m{k:>20}\033[0m values: {v}")
concatenated_element = concatenate(v)
gpt_count += 1
print(f"{gpt_count:>2} gpt_param: \033[32m{k:>19}; \033[34mrole: \033[33m{concatenated_element:<60}\033[0m; \033[34mdefault: \033[30m{v[1]:<10}\033[0m ")
# searching the other way round is quicker:
print("\nSearching the other way round is quicker:\n")
key_count = 0
for argument in arguments:
if argument in read_common_h.parameters:
key_count += 1
print(f"{key_count:>2} key: {argument:>25}; role: {concatenate(read_common_h.parameters[argument]):<60}; default: {read_common_h.parameters[argument][1]:<10}")
if help_count == gpt_count and gpt_count == key_count:
print("\n\033[032mNo unresolved help-list incompatibilities with this app.\033[0m")
else:
print("\n\033[031mThis app requires some attention regarding help-function consistency.\033[0m")
# Specify the directory you want to search for cpp files
directory = '/Users/edsilm2/llama.cpp/examples'
if __name__ == '__main__':
    # get the parameters from the common.h file utility we import
    print(read_common_h.parameters)
    # So now we've got the gpt_parameters in this parameters dict
    # First we alter all the hyphenated help words in help-file.txt to underscores
    replace_dashes_with_underscores('help_list.txt')
    # Call the find function and output the result
    # replace_dashes_with_underscores('help_list.txt')
    # This above may no longer be needed
    print("\n####################### find parameters #################################")
    # Call the find function to collect all the params.attributes and output the result
    result = find_arguments(directory)
    print("\n######################################## output_results #################################")
    # sort the results and output them
    sorted = output_results(result)
    print("\n######################## find help context parameters #################################")
    # analyse the files and what they contain
    find_parameters("help_list.txt", sorted)


@@ -0,0 +1,34 @@
# read common.h and extract the parameters name list
import re
# Read the file into separate lines
with open('common/common.h', 'r') as file:
lines = file.read().split('\n')
parameters = {}
inside = False
for line in lines:
# non_whitespace_elements = re.findall(r"\S+", line)
non_whitespace_elements = re.findall(r"[^\s}{=;]+", line)
print(f"nwe = \033[33m{non_whitespace_elements}\033[0m")
if non_whitespace_elements and non_whitespace_elements[0] == "struct":
inside = True
if len(non_whitespace_elements) > 2 and inside:
# note: cannot use nwe[0] because types do not generate unique keys and so overwrite
# here we deliberately add back the key so we can make a manual change when it is different
parameters[non_whitespace_elements[1]] = non_whitespace_elements[1:]
for k, v in parameters.items():
print(f"key: {k:<20}; values: {v}")
concatenated_element = ""
for i, element in enumerate(v):
if element == "//":
concatenated_element = " ".join(v[i:])
# break
print(" "*10 + f"parameter: \033[32m{k:>40} \033[34mdefault: \033[30m{v[1]:>5} \033[34mcommment: \033[33m{concatenated_element:80}\033[0m")
# this is a bit of a hack to terminate the harvest
if len(non_whitespace_elements) > 2 and non_whitespace_elements[1] == "infill":
inside = False
break
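For reference, a small sketch of what the tokenising call above, re.findall(r"[^\s}{=;]+", line), produces for one gpt_params member line; the sample line is copied from the common.h hunk earlier in this commit and the result is shown as a comment:

import re

# Sketch only: tokenise a single struct member line the way the harvester above does.
line = "int32_t n_keep = 0; // number of tokens to keep from initial prompt"
nwe = re.findall(r"[^\s}{=;]+", line)
print(nwe)
# ['int32_t', 'n_keep', '0', '//', 'number', 'of', 'tokens', 'to', 'keep', 'from', 'initial', 'prompt']
# nwe[1] ('n_keep') becomes the dictionary key and nwe[1:] the value, so v[1] ('0') is the default,
# while joining everything from the first '//' onwards recovers the trailing comment.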


@@ -1,3 +1,9 @@
# llama.cpp/example/parallel
Simplified simluation for serving incoming requests in parallel
Simplified simulation for serving incoming requests in parallel
Running this using the 100 questions in examples/jeopardy/questions.txt
on an M2 Max (38-core) with 32 GB of unified memory on macOS Sonoma 14.0
takes about 235 seconds with sequential responses (-np 1) and about 45 seconds
with 64 parallel responses (-np 64), in both cases generating 100 answers (-ns 100)
using a context of 8192 (-c 8192).
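For a rough sense of the gain from parallel decoding, a quick check of those timings (numbers copied from the paragraph above; illustrative only):

# Timings quoted above for 100 answers on an M2 Max.
sequential_s = 235  # -np 1
parallel_s = 45     # -np 64
print(f"speedup: {sequential_s / parallel_s:.1f}x")                         # ~5.2x
print(f"per answer: {sequential_s / 100:.2f}s vs {parallel_s / 100:.2f}s")  # 2.35s vs 0.45s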