Update helper dev
This commit is contained in:
parent 0d70518220
commit 9c5d6f0ef6
4 changed files with 112 additions and 31 deletions
common/common.h
@@ -35,21 +35,21 @@ int32_t get_num_physical_cores();

 struct gpt_params {
     uint32_t seed = -1; // RNG seed
-    int32_t n_threads = get_num_physical_cores();
+    int32_t n_threads = get_num_physical_cores(); // user-defined or num of internal physical cores
-    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_threads_batch = -1; // num threads for batch proc (-1 = use n_threads)
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 512; // context size
-    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_batch = 512; // batch size for prompt proc (>=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_draft = 16; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers = -1; // num layers stored in VRAM (-1 for default)
-    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    int32_t n_gpu_layers_draft = -1; // num layers stored in VRAM for draft mod (-1 for default)
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t n_probs = 0; // if > 0, output probabilities of top n_probs tokens.
     int32_t n_beams = 0; // if non-zero then use beam search of given width.
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor

@@ -61,7 +61,7 @@ struct gpt_params {
     float typical_p = 1.00f; // 1.0 = disabled
     float temp = 0.80f; // 1.0 = disabled
     float repeat_penalty = 1.10f; // 1.0 = disabled
-    int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable, -1 = cxt size)
     float frequency_penalty = 0.00f; // 0.0 = disabled
     float presence_penalty = 0.00f; // 0.0 = disabled
     int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0

@@ -78,7 +78,7 @@ struct gpt_params {
     std::string model = "models/7B/ggml-model-f16.gguf"; // model path
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
-    std::string prompt = "";
+    std::string prompt = ""; // user-provided single prompt
     std::string prompt_file = ""; // store the external prompt file
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with

@@ -90,11 +90,11 @@ struct gpt_params {
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base = ""; // base model path for the lora adapter

-    int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int ppl_stride = 0; // stride for ppl calcs. 0: the pre-existing approach will be used.
-    int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+    int ppl_output_type = 0; // 0: ppl output as usual, 1: ppl output = num_tokens, ppl, one per line
     //                        (which is more convenient to use for plotting)
     //
-    bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    bool hellaswag = false; // compute HellaSwag score from datafile given in prompt
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score

     bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS

@@ -109,7 +109,7 @@ struct gpt_params {
     bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
     bool interactive_first = false; // wait for user input immediately
     bool multiline_input = false; // reverse the usage of `\`
-    bool simple_io = false; // improves compatibility with subprocesses and limited consoles
+    bool simple_io = false; // improves compat'y with subprocs and ltd consoles
     bool cont_batching = false; // insert new sequences for decoding on-the-fly

     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
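Several of these fields use -1 as a fall-back sentinel: n_threads_batch = -1 means "use n_threads", and n_gpu_layers = -1 means "use the default". A minimal Python sketch of that resolution rule, using a hypothetical helper name and only the semantics stated in the comments above:

    # Hypothetical helper (not part of the commit) showing the -1 sentinel rule:
    # a value of -1 defers to the primary setting, per the common.h comments.
    def resolve_n_threads_batch(n_threads: int, n_threads_batch: int) -> int:
        return n_threads if n_threads_batch == -1 else n_threads_batch

    print(resolve_n_threads_batch(8, -1))  # -> 8 (falls back to n_threads)
    print(resolve_n_threads_batch(8, 4))   # -> 4 (explicit override)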
@@ -4,6 +4,8 @@ import os
 import re
 import collections
 import re
+
+import read_common_h


 def replace_dashes_with_underscores(filename):
     with open(filename, 'r') as file:
@@ -48,6 +50,13 @@ def output_results(result):
     print(f"\033[32mAll of them: \033[0m{sorted(all_of_them)}.")
     return sorted_result

+def concatenate(v):
+    concatenated_element = ""
+    for i, element in enumerate(v):
+        if element == "//":
+            concatenated_element = " ".join(v[i:])
+    return concatenated_element
+

 def find_parameters(file, sorted_result):
     with open(file, "r") as helpfile:
         lines = helpfile.read().split("\n")
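The new concatenate() helper scans a token list and, on meeting the "//" token, returns it plus everything after it joined as one string, recovering the comment text of a tokenized struct line. Usage sketch (the token list is an assumed example, not taken from the commit):

    tokens = ["int32_t", "n_ctx", "512", "//", "context", "size"]  # assumed sample
    print(concatenate(tokens))  # -> "// context size"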
@@ -59,36 +68,68 @@ def find_parameters(file, sorted_result):
             pattern = r"(?:--{}\s)|(?:params\.{}[\s.,();])".format(argument, argument.split('n_')[-1])
             if re.search(pattern, line):
                 parameters.append(line)
-    '''for line in lines:
-        for argument in arguments:
-            # need to try to avoid spurious matches
-            argument1 = "--" + argument + " "
-            if argument1 in line:
-                parameters.append(line)
-            # need to try to avoid spurious matches
-            argument2 = "params." + argument.split('n_')[-1]
-            if argument2 in line:
-                parameters.append(line)
-            argument3 = "params." + argument
-            if argument3 in line:
-                parameters.append(line)'''
     all_parameters = set(parameters)
-    print(f"\n\nFilename: \033[32m{filename.split('/')[-1]}\033[0m\n\n command-line arguments available and gpt-params functions implemented:\n")
+    file = filename.split('/')[-1]
+    print("\n\n"+"#"*(10+len(file)))
+    print(f"Filename: \033[32m{file}\033[0m")
+    print("#"*(10+len(file)))
+    print(f"\n\n command-line arguments available and gpt-params functions implemented (TODO: multi-line helps NEED SOME WORK):\n")

     if not all_parameters:
         print(f"    \033[032mNone\033[0m\n")
-    else:
-        for parameter in all_parameters:
-            print(f"    help: \033[33m{parameter:<30}\033[0m")
+    # first do it the original way
+    else:
+        help_count = 0
+        for parameter in all_parameters:
+            help_count += 1
+            print(f"{help_count:>2} help: \033[33m{parameter:<30}\033[0m")
+
+        # now do it the new way
+        print("\nNow we extract the original gpt_params definition and defaults for implemented arguments:\n")
+        gpt_count = 0
+        for k,v in read_common_h.parameters.items():
+            if not read_common_h.parameters.items():
+                print(f"    \033[032mNone\033[0m\n")
+            elif k in arguments:
+                # print(f"gpt_params: \033[33m{k:>20}\033[0m values: {v}")
+                concatenated_element = concatenate(v)
+                gpt_count += 1
+                print(f"{gpt_count:>2} gpt_param: \033[32m{k:>19}; \033[34mrole: \033[33m{concatenated_element:<60}\033[0m; \033[34mdefault: \033[30m{v[1]:<10}\033[0m ")
+
+        # searching the other way round is quicker:
+        print("\nSearching the other way round is quicker:\n")
+        key_count = 0
+        for argument in arguments:
+            if argument in read_common_h.parameters:
+                key_count += 1
+                print(f"{key_count:>2} key: {argument:>25}; role: {concatenate(read_common_h.parameters[argument]):<60}; default: {read_common_h.parameters[argument][1]:<10}")
+
+        if help_count == gpt_count and gpt_count == key_count:
+            print("\n\033[032mNo unresolved help-list incompatibilities with this app.\033[0m")
+        else:
+            print("\n\033[031mThis app requires some attention regarding help-function consistency.\033[0m")

 # Specify the directory you want to search for cpp files
 directory = '/Users/edsilm2/llama.cpp/examples'

 if __name__ == '__main__':
+    # get the parameters from the common.h file utility we import
+    print(read_common_h.parameters)
+    # So now we've got the gpt_parameters in this parameters dict

     # First we alter all the hyphenated help words in help-file.txt to underscores
-    replace_dashes_with_underscores('help_list.txt')
-    # Call the find function and output the result
+    # replace_dashes_with_underscores('help_list.txt')
+    # This above may no longer be needed
+
+    print("\n####################### find parameters #################################")
+    # Call the find function to collect all the params.attributes and output the result
     result = find_arguments(directory)
+
+    print("\n######################################## output_results #################################")
+    # sort the results and output them
     sorted = output_results(result)
+
+    print("\n######################## find help context parameters #################################")
     # analyse the files and what they contain
     find_parameters("help_list.txt", sorted)
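The regex built in find_parameters matches an argument in either of two forms: as a CLI flag ("--n_ctx ") or as a struct member access ("params.ctx" followed by a delimiter, with any leading "n_" stripped). A standalone demonstration, with an assumed argument name and sample lines:

    import re

    argument = "n_ctx"  # assumed example
    pattern = r"(?:--{}\s)|(?:params\.{}[\s.,();])".format(argument, argument.split('n_')[-1])

    print(bool(re.search(pattern, "--n_ctx 4096")))      # True: CLI flag form
    print(bool(re.search(pattern, "params.ctx = 512;"))) # True: member access form
    print(bool(re.search(pattern, "params.ctxlen = 1"))) # False: needs a delimiter right after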
examples/cmap-example/read_common_h.py (new file, 34 lines)
@@ -0,0 +1,34 @@
+# read common.h and extract the parameters name list
+
+import re
+
+# Read the file into separate lines
+with open('common/common.h', 'r') as file:
+    lines = file.read().split('\n')
+
+parameters = {}
+inside = False
+for line in lines:
+    # non_whitespace_elements = re.findall(r"\S+", line)
+    non_whitespace_elements = re.findall(r"[^\s}{=;]+", line)
+    print(f"nwe = \033[33m{non_whitespace_elements}\033[0m")
+    if non_whitespace_elements and non_whitespace_elements[0] == "struct":
+        inside = True
+    if len(non_whitespace_elements) > 2 and inside:
+        # note: cannot use nwe[0] because types do not generate unique keys and so overwrite
+        # here we deliberately add back the key so we can make a manual change when it is different
+        parameters[non_whitespace_elements[1]] = non_whitespace_elements[1:]
+        for k, v in parameters.items():
+            print(f"key: {k:<20}; values: {v}")
+
+            concatenated_element = ""
+            for i, element in enumerate(v):
+                if element == "//":
+                    concatenated_element = " ".join(v[i:])
+                    # break
+            print(" "*10 + f"parameter: \033[32m{k:>40} \033[34mdefault: \033[30m{v[1]:>5} \033[34mcomment: \033[33m{concatenated_element:80}\033[0m")
+
+    # this is a bit of a hack to terminate the harvest
+    if len(non_whitespace_elements) > 2 and non_whitespace_elements[1] == "infill":
+        inside = False
+        break
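As a sanity check on the harvesting loop, this is what the findall tokenizer yields for a typical common.h struct line ("=", ";", "{" and "}" act as separators; the sample line is assumed):

    import re

    line = "    int32_t n_ctx = 512; // context size"  # assumed sample input
    print(re.findall(r"[^\s}{=;]+", line))
    # -> ['int32_t', 'n_ctx', '512', '//', 'context', 'size']
    # hence parameters['n_ctx'] = ['n_ctx', '512', '//', 'context', 'size']:
    # v[1] is the default value; tokens from '//' onward are the comment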
examples/parallel/README.md
@@ -1,3 +1,9 @@
 # llama.cpp/example/parallel

-Simplified simluation for serving incoming requests in parallel
+Simplified simulation for serving incoming requests in parallel
+
+Running this using the 100 questions in examples/jeopardy/questions.txt
+on an M2 MAX (38 core) with 32GB unified memory on MacOS Sonoma 14.0
+takes about 235 seconds with sequential responses (-ns 1) and 45 seconds
+with 64 parallel responses (-ns 64) in both cases generating 100 answers (-np 100)
+using a context of 8192 (-c 8192).
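For reference, the quoted timings correspond to an invocation along these lines (a sketch only: the binary name and model path are assumptions, not part of this commit; the flags are those named in the README text):

    ./parallel -m models/7B/ggml-model-f16.gguf -c 8192 -np 100 -ns 64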