Introduce ggml_compute_threadpool

- OpenMP functional: Check
- Vanilla ggml functional: Check
- ggml w/threadpool functional: Check
- OpenMP no regression: No glaring problems
- Vanilla ggml no regression: No glaring problems
- ggml w/threadpool no regression: No glaring problems
This commit is contained in:
Faisal Zaghloul 2024-07-31 12:42:30 -04:00 committed by fmz
parent 3246fe84d7
commit 130adf8415
20 changed files with 1169 additions and 216 deletions

View file

@ -277,6 +277,36 @@ void gpt_params_handle_model_default(gpt_params & params) {
}
}
// Finalize a cpu_params struct: fill in unset fields, optionally inheriting
// from a "role model" parameter set (e.g. batch params inherit from the main
// params), and sanity-check the affinity mask against the thread count.
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
    // A negative thread count means this struct was never configured: copy
    // the role model wholesale if one was given, otherwise default the
    // thread count to the number of hardware threads.
    if (cpuparams.n_threads < 0) {
        if (role_model) {
            cpuparams = *role_model;
        } else {
            cpuparams.n_threads = std::thread::hardware_concurrency();
        }
    }

    // Count how many CPU slots the affinity mask currently enables.
    int32_t n_set = 0;
    for (bool allowed : cpuparams.cpumask) {
        if (allowed) {
            n_set++;
        }
    }

    if (n_set == 0) {
        // An all-zero mask means "no restriction": enable every CPU slot.
        memset(&cpuparams.cpumask[0], 1, GGML_MAX_N_THREADS);
        n_set = GGML_MAX_N_THREADS;
    }

    if (n_set < cpuparams.n_threads) {
        // Fewer enabled CPUs than requested threads: threads will have to
        // share cores, which may hurt performance.
        fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
    }
}
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
bool invalid_param = false;
std::string arg;
@ -296,6 +326,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
}
postprocess_cpu_params(params.cpuparams, nullptr);
postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
}
@ -331,7 +366,7 @@ void gpt_params_parse_from_env(gpt_params & params) {
get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
get_env("LLAMA_ARG_HF_FILE", params.hf_file);
get_env("LLAMA_ARG_THREADS", params.n_threads);
get_env("LLAMA_ARG_THREADS", params.cpuparams.n_threads);
get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
get_env("LLAMA_ARG_BATCH", params.n_batch);
@ -368,6 +403,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
return true;
}
// Parse a CPU affinity range of the form "[<start>]-[<end>]" (e.g. "0-7",
// "-7", "4-") and set the corresponding entries of boolmask to true.
// An omitted start defaults to 0; an omitted end defaults to the last slot.
// Entries of boolmask that are already true are preserved.
// Returns false (with a message on stderr) on malformed or out-of-bounds
// input instead of letting std::stoull's exceptions propagate to the caller.
bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
    size_t dash_loc = range.find('-');
    if (dash_loc == std::string::npos) {
        fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
        return false;
    }

    size_t start_i;
    size_t end_i;

    if (dash_loc == 0) {
        start_i = 0;
    } else {
        try {
            // std::stoull throws std::invalid_argument / std::out_of_range
            // on non-numeric or oversized input; report instead of crashing.
            start_i = std::stoull(range.substr(0, dash_loc));
        } catch (const std::exception &) {
            fprintf(stderr, "Start index is not a valid number!\n");
            return false;
        }
        if (start_i >= GGML_MAX_N_THREADS) {
            fprintf(stderr, "Start index out of bounds!\n");
            return false;
        }
    }

    if (dash_loc == range.length() - 1) {
        end_i = GGML_MAX_N_THREADS - 1;
    } else {
        try {
            end_i = std::stoull(range.substr(dash_loc + 1));
        } catch (const std::exception &) {
            fprintf(stderr, "End index is not a valid number!\n");
            return false;
        }
        if (end_i >= GGML_MAX_N_THREADS) {
            fprintf(stderr, "End index out of bounds!\n");
            return false;
        }
    }

    // Reject inverted ranges instead of silently setting nothing.
    if (start_i > end_i) {
        fprintf(stderr, "Start index must not be greater than end index!\n");
        return false;
    }

    for (size_t i = start_i; i <= end_i; i++) {
        boolmask[i] = true;
    }

    return true;
}
// Parse a hexadecimal CPU affinity mask (optionally prefixed with "0x") into
// a per-CPU boolean mask. The rightmost (least significant) hex digit maps
// to CPUs 0-3, the next digit to CPUs 4-7, and so on. Bits are OR-ed into
// boolmask, so entries that are already true stay true. Returns false on an
// invalid hex character.
bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
    // Discard potential 0x prefix
    size_t start_i = 0;
    if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
        start_i = 2;
    }

    // Cap at 128 hex digits = 512 bits (GGML_MAX_N_THREADS); any digits
    // beyond that are silently ignored.
    size_t num_digits = mask.length() - start_i;
    if (num_digits > 128) num_digits = 128;

    size_t end_i = num_digits + start_i;

    // Walk the digits left to right; `n` is the highest CPU index covered by
    // the current digit (each hex digit covers CPUs n, n-1, n-2, n-3).
    // When num_digits == 0, `n` wraps around (unsigned), but the loop body
    // never executes because start_i == end_i.
    for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
        char c = mask.at(i);
        int8_t id = c;

        // Convert the hex character to its 0-15 numeric value.
        if ((c >= '0' && c <= '9')) {
            id -= '0';
        } else if (c >= 'a' && c <= 'f') {
            id -= 'a' - 10;
        } else if (c >= 'A' && c <= 'F') {
            id -= 'A' - 10;
        } else {
            fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
            return false;
        }

        // OR each of the digit's four bits into the corresponding CPU slot
        // (bit 3 -> CPU n, bit 0 -> CPU n-3).
        boolmask[  n  ] = boolmask[  n  ] || ((id & 8) != 0);
        boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
        boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
        boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
    }

    return true;
}
#define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
@ -384,36 +492,137 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "-t" || arg == "--threads") {
CHECK_ARG
params.n_threads = std::stoi(argv[i]);
if (params.n_threads <= 0) {
params.n_threads = std::thread::hardware_concurrency();
params.cpuparams.n_threads = std::stoi(argv[i]);
if (params.cpuparams.n_threads <= 0) {
params.cpuparams.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-C" || arg == "--cpu-mask") {
CHECK_ARG
std::string mask = argv[i];
params.cpuparams.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
return true;
}
if (arg == "-Cr" || arg == "--cpu-range") {
CHECK_ARG
std::string range = argv[i];
params.cpuparams.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
return true;
}
if (arg == "--prio") {
CHECK_ARG
params.cpuparams.priority = std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict") {
params.cpuparams.strict_cpu = true;
return true;
}
if (arg == "--poll") {
params.cpuparams.poll = true;
return true;
}
if (arg == "-tb" || arg == "--threads-batch") {
CHECK_ARG
params.n_threads_batch = std::stoi(argv[i]);
if (params.n_threads_batch <= 0) {
params.n_threads_batch = std::thread::hardware_concurrency();
params.cpuparams_batch.n_threads = std::stoi(argv[i]);
if (params.cpuparams_batch.n_threads <= 0) {
params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-Cb" || arg == "--cpu-mask-batch") {
CHECK_ARG
std::string mask = argv[i];
params.cpuparams_batch.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
return true;
}
if (arg == "-Crb" || arg == "--cpu-range_batch") {
CHECK_ARG
std::string range = argv[i];
params.cpuparams_batch.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask);
return true;
}
if (arg == "--prio-batch") {
CHECK_ARG
params.cpuparams_batch.priority = std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict-batch") {
params.cpuparams_batch.strict_cpu = true;
return true;
}
if (arg == "--poll-batch") {
params.cpuparams_batch.poll = true;
return true;
}
if (arg == "-td" || arg == "--threads-draft") {
CHECK_ARG
params.n_threads_draft = std::stoi(argv[i]);
if (params.n_threads_draft <= 0) {
params.n_threads_draft = std::thread::hardware_concurrency();
params.draft_cpuparams.n_threads = std::stoi(argv[i]);
if (params.draft_cpuparams.n_threads <= 0) {
params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-Cd" || arg == "--cpu-mask-draft") {
CHECK_ARG
std::string mask = argv[i];
params.draft_cpuparams.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
return true;
}
if (arg == "-Crd" || arg == "--cpu-range-draft") {
CHECK_ARG
std::string range = argv[i];
params.draft_cpuparams.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
return true;
}
if (arg == "--prio-draft") {
CHECK_ARG
params.draft_cpuparams.priority = std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict-draft") {
params.draft_cpuparams.strict_cpu = true;
return true;
}
if (arg == "--poll-draft") {
params.draft_cpuparams.poll = true;
return true;
}
if (arg == "-tbd" || arg == "--threads-batch-draft") {
CHECK_ARG
params.n_threads_batch_draft = std::stoi(argv[i]);
if (params.n_threads_batch_draft <= 0) {
params.n_threads_batch_draft = std::thread::hardware_concurrency();
params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
if (params.draft_cpuparams_batch.n_threads <= 0) {
params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
}
return true;
}
if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
CHECK_ARG
std::string range = argv[i];
params.draft_cpuparams_batch.mask_valid = true;
invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
return true;
}
if (arg == "--prio-batch-draft") {
CHECK_ARG
params.draft_cpuparams_batch.priority = std::stoul(argv[i]);
return true;
}
if (arg == "--cpu-strict-batch-draft") {
params.draft_cpuparams_batch.strict_cpu = true;
return true;
}
if (arg == "--poll-batch-draft") {
params.draft_cpuparams_batch.poll = true;
return true;
}
if (arg == "-p" || arg == "--prompt") {
CHECK_ARG
params.prompt = argv[i];
@ -1498,11 +1707,38 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
options.push_back({ "*", " --cpu-strict", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
options.push_back({ "*", " --poll", "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll});
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi",
"ranges of CPUs for affinity. Complements --cpu-mask-batch"});
options.push_back({ "*", " --cpu-strict-batch", "use strict CPU placement (default: same as --cpu-strict)"});
options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"});
options.push_back({ "*", " --poll-batch", "use polling to wait for work (default: --poll)"});
options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi",
"Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
options.push_back({ "speculative", " --cpu-strict-draft", "Use strict CPU placement for draft model (default: same as --cpu-strict)"});
options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"});
options.push_back({ "speculative", " --poll-draft", "Use polling to wait for draft model work (default: same as --poll])"});
options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
"number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M",
"Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"});
options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
"Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"});
options.push_back({ "speculative", " --cpu-strict-batch-draft",
"Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
options.push_back({ "speculative", " --priority-batch-draft N",
"Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"});
options.push_back({ "speculative", " --poll-batch-draft", "Use polling to wait for draft model work (default: --poll-draft)"});
options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
@ -1774,7 +2010,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "export-lora", "-m, --model", "model path from which to load base model (default '%s')", params.model.c_str() });
options.push_back({ "export-lora", " --lora FNAME", "path to LoRA adapter (can be repeated to use multiple adapters)" });
options.push_back({ "export-lora", " --lora-scaled FNAME S", "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
options.push_back({ "*", "-t, --threads N", "number of threads to use during computation (default: %d)", params.n_threads });
options.push_back({ "export-lora", "-o, --output FNAME", "output file (default: '%s')", params.lora_outfile.c_str() });
printf("usage: %s [options]\n", argv[0]);
@ -1806,9 +2041,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
std::string gpt_params_get_system_info(const gpt_params & params) {
std::ostringstream os;
os << "system_info: n_threads = " << params.n_threads;
if (params.n_threads_batch != -1) {
os << " (n_threads_batch = " << params.n_threads_batch << ")";
os << "system_info: n_threads = " << params.cpuparams.n_threads;
if (params.cpuparams_batch.n_threads != -1) {
os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
}
#if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
// TODO: windows + arm64 + mingw64
@ -2332,8 +2567,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.n_seq_max = params.n_parallel;
cparams.n_batch = params.n_batch;
cparams.n_ubatch = params.n_ubatch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
cparams.n_threads = params.cpuparams.n_threads;
cparams.n_threads_batch = params.cpuparams_batch.n_threads == -1 ?
params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
cparams.seed = params.seed;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;
@ -2359,6 +2595,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
return cparams;
}
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
struct ggml_threadpool_params tpp;
tpp.mask_specified = params.mask_valid;
if (params.mask_valid) {
std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
}
tpp.n_threads = params.n_threads;
tpp.prio = params.priority;
tpp.poll = params.poll;
tpp.strict_cpu = params.strict_cpu;
return tpp;
}
#ifdef LLAMA_USE_CURL
static bool starts_with(const std::string & str, const std::string & prefix) {
@ -3348,7 +3600,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);

View file

@ -67,13 +67,18 @@ enum dimre_method {
DIMRE_METHOD_MEAN,
};
// CPU scheduling parameters; one instance each for the main, batch, draft
// and draft-batch pipelines (see gpt_params). Finalized by
// postprocess_cpu_params() before use.
struct cpu_params {
    int32_t n_threads = -1; // Number of threads (-1 = unset; resolved by postprocess_cpu_params)
    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
    bool mask_valid = false; // Default: any CPU
    int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
    bool strict_cpu = false; // Use strict CPU placement
    bool poll = false; // Use polling (busywait) to wait for work
};
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = cpu_get_num_math();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@ -100,6 +105,11 @@ struct gpt_params {
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
struct cpu_params cpuparams;
struct cpu_params cpuparams_batch;
struct cpu_params draft_cpuparams;
struct cpu_params draft_cpuparams_batch;
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
@ -204,7 +214,7 @@ struct gpt_params {
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
std::string hostname = "127.0.0.1";
std::string public_path = "";
@ -277,6 +287,10 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_params_get_system_info(const gpt_params & params);
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
//
// String utils
//
@ -327,8 +341,9 @@ struct llama_init_result {
struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

View file

@ -50,6 +50,6 @@ else()
endif()
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(speculative)
#add_subdirectory(speculative)
add_subdirectory(tokenize)
endif()

View file

@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
#endif
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);

View file

@ -21,7 +21,7 @@
#endif
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);

View file

@ -486,8 +486,8 @@ int main(int argc, char ** argv) {
if (use_pca) {
// run PCA
PCA::pca_params pca_params;
pca_params.n_threads = params.n_threads;
pca_params.n_batch = params.n_pca_batch;
pca_params.n_threads = params.cpuparams.n_threads;
pca_params.n_batch = params.n_pca_batch;
pca_params.n_iterations = params.n_pca_iterations;
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
} else {

View file

@ -410,7 +410,7 @@ int main(int argc, char ** argv) {
g_verbose = (params.verbosity == 1);
try {
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
ctx.run_merge();
} catch (const std::exception & err) {
fprintf(stderr, "%s\n", err.what());

View file

@ -235,6 +235,7 @@ struct cmd_params {
std::vector<bool> use_mmap;
std::vector<bool> embeddings;
ggml_numa_strategy numa;
cpu_params cpuparams;
int reps;
bool verbose;
output_formats output_format;
@ -261,6 +262,7 @@ static const cmd_params cmd_params_defaults = {
/* use_mmap */ {true},
/* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* cpuparams */ {},
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN,
@ -289,6 +291,11 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -mt, --max-threads <n> (default: %d)\n", cmd_params_defaults.cpuparams.n_threads);
printf(" -C, --cpu-mask <hex> (default: 0x0)\n");
printf(" --cpu-strict <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu);
printf(" --priority <0|1|2|3> (default: %d)\n", cmd_params_defaults.cpuparams.priority);
printf(" --poll <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.poll);
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
@ -492,6 +499,30 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
else { invalid_param = true; break; }
}
} else if (arg == "-mt" || arg == "--max-threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cpuparams.n_threads = std::stoi(argv[i]);
} else if (arg == "-C" || arg == "--cpu-mask") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::string mask = argv[i];
params.cpuparams.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
} else if (arg == "--prio") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cpuparams.priority = std::stoul(argv[i]);
} else if (arg == "--cpu-strict") {
params.cpuparams.strict_cpu = true;
} else if (arg == "--poll") {
params.cpuparams.poll = true;
} else if (arg == "-fa" || arg == "--flash-attn") {
if (++i >= argc) {
invalid_param = true;
@ -1402,6 +1433,23 @@ int main(int argc, char ** argv) {
llama_model * lmodel = nullptr;
const cmd_params_instance * prev_inst = nullptr;
postprocess_cpu_params(params.cpuparams);
struct ggml_threadpool_params tpp;
tpp.n_threads = params.cpuparams.n_threads;
tpp.mask_specified = params.cpuparams.mask_valid;
tpp.strict_cpu = params.cpuparams.strict_cpu;
tpp.prio = params.cpuparams.priority;
tpp.poll = params.cpuparams.poll;
std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS);
struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp);
if (!threadpool) {
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
}
for (const auto & inst : params_instances) {
// keep the same model between tests when possible
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@ -1427,6 +1475,7 @@ int main(int argc, char ** argv) {
test t(inst, lmodel, ctx);
llama_kv_cache_clear(ctx);
llama_attach_threadpool(ctx, threadpool);
// warmup run
if (t.n_prompt > 0) {
@ -1468,6 +1517,8 @@ int main(int argc, char ** argv) {
llama_free(ctx);
}
ggml_release_threadpool(threadpool);
llama_free_model(lmodel);
if (p) {

View file

@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
if (!params->image.empty()) {
LOG_TEE("using base64 encoded image instead of command line image path\n");
}
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
if (!embed) {
LOG_TEE("%s: can't load image from prompt\n", __func__);
return NULL;
}
params->prompt = remove_image_from_prompt(prompt);
} else {
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
if (!embed) {
fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
return NULL;

View file

@ -221,6 +221,33 @@ int main(int argc, char ** argv) {
return 1;
}
LOG("%s: llama threadpool init = n_threads = %d\n",
__func__,
(int32_t) params.cpuparams.n_threads
);
struct ggml_threadpool_params tpp_batch =
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
struct ggml_threadpool_params tpp =
ggml_threadpool_params_from_cpu_params(params.cpuparams);
struct ggml_compute_threadpool * threadpool_batch = ggml_create_threadpool(&tpp_batch);
if (!threadpool_batch) {
LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
exit(1);
}
struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
if (!threadpool) {
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
}
llama_attach_batch_threadpool(ctx, threadpool_batch);
llama_attach_threadpool(ctx, threadpool);
if (ctx_guidance) {
llama_attach_batch_threadpool(ctx_guidance, threadpool_batch);
llama_attach_threadpool(ctx_guidance, threadpool);
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
@ -989,6 +1016,9 @@ int main(int argc, char ** argv) {
llama_sampling_free(ctx_sampling);
llama_backend_free();
ggml_release_threadpool(threadpool);
ggml_release_threadpool(threadpool_batch);
#ifndef LOG_DISABLE_LOGS
LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS

View file

@ -2534,8 +2534,8 @@ int main(int argc, char ** argv) {
});
LOG_INFO("system info", {
{"n_threads", params.n_threads},
{"n_threads_batch", params.n_threads_batch},
{"n_threads", params.cpuparams.n_threads},
{"n_threads_batch", params.cpuparams_batch.n_threads},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});

View file

@ -146,7 +146,7 @@ option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library"
set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"ggml: metal minimum macOS version")
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_OPENMP "ggml: use OpenMP" OFF)
option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)

View file

@ -7,8 +7,9 @@ extern "C" {
#endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
typedef struct ggml_compute_threadpool * ggml_compute_threadpool_t;
// Tensor allocator
struct ggml_tallocr {

View file

@ -102,6 +102,7 @@ extern "C" {
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer

View file

@ -231,6 +231,8 @@
#define GGML_MAX_SRC 10
#ifndef GGML_MAX_NAME
#define GGML_MAX_NAME 64
#define GGML_MAX_N_THREADS 512
#endif
#define GGML_MAX_OP_PARAMS 64
#define GGML_DEFAULT_N_THREADS 4
@ -624,6 +626,17 @@ extern "C" {
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);
// Parameters for creating a compute threadpool (see ggml_create_threadpool).
struct ggml_threadpool_params {
    bool cpumask[GGML_MAX_N_THREADS]; // CPU affinity mask; consulted when mask_specified is true
    bool mask_specified;              // true if cpumask carries a user-provided affinity mask
    int32_t n_threads;                // number of threads in the pool
    int32_t prio;                     // thread priority (0 - normal, 1 - medium, 2 - high, 3 - realtime)
    bool poll;                        // use polling (busywait) to wait for work
    bool strict_cpu;                  // use strict CPU placement
};
struct ggml_compute_threadpool; // forward declaration, see ggml.c
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
@ -631,6 +644,7 @@ extern "C" {
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
struct ggml_compute_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
@ -2010,10 +2024,20 @@ extern "C" {
GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
GGML_API struct ggml_compute_threadpool* ggml_create_threadpool (struct ggml_threadpool_params * params);
GGML_API void ggml_release_threadpool (struct ggml_compute_threadpool * threadpool);
GGML_API int32_t ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool);
GGML_API void ggml_pause_threadpool (struct ggml_compute_threadpool * threadpool);
GGML_API void ggml_resume_threadpool (struct ggml_compute_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
GGML_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_compute_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

View file

@ -722,7 +722,9 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
#endif
struct ggml_backend_cpu_context {
int n_threads;
int n_threads;
ggml_compute_threadpool_t threadpool;
void * work_data;
size_t work_size;
@ -759,7 +761,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
if (cpu_plan->cplan.work_size > 0) {
@ -796,7 +798,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
if (cpu_ctx->work_size < cplan.work_size) {
free(cpu_ctx->work_data);
@ -873,6 +875,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
}
ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->threadpool = NULL;
ctx->work_data = NULL;
ctx->work_size = 0;
ctx->abort_callback = NULL;
@ -903,6 +906,13 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
ctx->n_threads = n_threads;
}
// Attach a caller-owned threadpool to a CPU backend (pass NULL to revert to
// the backend's default disposable threads). The backend does not take
// ownership; the caller must keep the threadpool alive while it is attached.
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool) {
    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
    cpu_ctx->threadpool = threadpool;
}
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

File diff suppressed because it is too large Load diff

View file

@ -428,6 +428,18 @@ extern "C" {
//optional:
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
// Optional: an auto threadpool gets created in ggml if not passed explicitly
LLAMA_API void llama_attach_threadpool(
struct llama_context * ctx,
ggml_compute_threadpool_t threadpool);
LLAMA_API void llama_attach_batch_threadpool(
struct llama_context * ctx,
ggml_compute_threadpool_t threadpool);
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx);
LLAMA_API void llama_detach_threadpools(struct llama_context * ctx);
// Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void);

View file

@ -3091,6 +3091,9 @@ struct llama_context {
#endif
ggml_backend_t backend_cpu = nullptr;
ggml_compute_threadpool_t threadpool = nullptr;
ggml_compute_threadpool_t threadpool_batch = nullptr;
bool has_evaluated_once = false;
int64_t t_start_us;
@ -15494,9 +15497,10 @@ static void llama_output_reorder(struct llama_context * ctx) {
}
static void llama_graph_compute(
llama_context & lctx,
ggml_cgraph * gf,
int n_threads) {
llama_context & lctx,
ggml_cgraph * gf,
int n_threads,
ggml_compute_threadpool * threadpool) {
#ifdef GGML_USE_METAL
if (ggml_backend_is_metal(lctx.backend_metal)) {
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@ -15505,6 +15509,7 @@ static void llama_graph_compute(
if (lctx.backend_cpu != nullptr) {
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
}
#ifdef GGML_USE_BLAS
@ -15518,6 +15523,42 @@ static void llama_graph_compute(
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
}
// Selects (and wakes) the threadpool appropriate for this batch size,
// pausing the other one, and returns {n_threads, threadpool}.
// A null threadpool in the result means "use a disposable threadpool".
static std::pair<int32_t, ggml_compute_threadpool_t> llama_swap_threadpools(
               llama_context & lctx,
               int32_t         n_tokens) {
    const auto & cparams  = lctx.cparams;
    const bool   is_batch = n_tokens > 1;

    // A batch threadpool without a non-batch threadpool isn't supported.
    GGML_ASSERT(!lctx.threadpool_batch || lctx.threadpool);

    if (lctx.threadpool_batch && lctx.threadpool) {
        // Both pools present: keep exactly one of them running.
        if (is_batch) {
            ggml_pause_threadpool (lctx.threadpool);
            ggml_resume_threadpool(lctx.threadpool_batch);
            return { cparams.n_threads_batch, lctx.threadpool_batch };
        }
        ggml_pause_threadpool (lctx.threadpool_batch);
        ggml_resume_threadpool(lctx.threadpool);
        return { cparams.n_threads, lctx.threadpool };
    }

    if (lctx.threadpool) {
        // Single explicit pool: it serves both batch and single-token work.
        ggml_resume_threadpool(lctx.threadpool);
        return { cparams.n_threads, lctx.threadpool };
    }

    // No explicit threadpool attached: fall back to a disposable one,
    // sized by the configured thread counts.
    const int32_t n_threads = is_batch ? cparams.n_threads_batch : cparams.n_threads;
    return { n_threads, nullptr };
}
// decode a batch of tokens by evaluating the transformer
//
// - lctx: llama context
@ -15624,7 +15665,12 @@ static int llama_decode_internal(
lctx.n_outputs = n_outputs_new;
}
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
std::pair<int32_t, ggml_compute_threadpool_t> threads =
llama_swap_threadpools(lctx, n_tokens);
int32_t n_threads = threads.first;
ggml_compute_threadpool_t threadpool = threads.second;
GGML_ASSERT(n_threads > 0);
// non-causal masks do not use the KV cache
@ -15686,7 +15732,7 @@ static int llama_decode_internal(
llama_set_inputs(lctx, ubatch);
llama_graph_compute(lctx, gf, n_threads);
llama_graph_compute(lctx, gf, n_threads, threadpool);
// update the kv ring buffer
{
@ -15863,7 +15909,11 @@ static int llama_encode_internal(
lctx.inp_embd_enc = NULL;
lctx.n_outputs = n_tokens;
const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
std::pair<int32_t, ggml_compute_threadpool_t> threads =
llama_swap_threadpools(lctx, n_tokens);
int32_t n_threads = threads.first;
ggml_compute_threadpool_t threadpool = threads.second;
GGML_ASSERT(n_threads > 0);
ggml_backend_sched_reset(lctx.sched);
@ -15895,7 +15945,7 @@ static int llama_encode_internal(
llama_set_inputs(lctx, ubatch);
llama_graph_compute(lctx, gf, n_threads);
llama_graph_compute(lctx, gf, n_threads, threadpool);
// extract embeddings
if (embd) {
@ -16177,7 +16227,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
#endif
//const int64_t t_end = ggml_time_us();
@ -16203,7 +16253,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
llama_set_k_shift(lctx);
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
need_reserve = true;
}
@ -17451,6 +17501,31 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
}
}
// Attach a caller-owned threadpool used for single-token (non-batch) compute.
// The context does not take ownership; detach before destroying the pool.
void llama_attach_threadpool(
         struct llama_context * ctx, ggml_compute_threadpool_t threadpool) {
    ctx->threadpool = threadpool;
}
// Attach a caller-owned threadpool used for batch (multi-token) compute.
// NOTE(review): llama_swap_threadpools asserts that a batch threadpool is
// only used together with a non-batch one — attach both.
void llama_attach_batch_threadpool(
         struct llama_context * ctx, ggml_compute_threadpool_t threadpool_batch) {
    ctx->threadpool_batch = threadpool_batch;
}
// Detach the single-token threadpool; subsequent compute falls back to a
// disposable threadpool. Does not free the pool — that is the caller's job.
void llama_detach_threadpool(struct llama_context * ctx) {
    ctx->threadpool = nullptr;
}
// Detach the batch threadpool; subsequent batch compute falls back to a
// disposable threadpool. Does not free the pool — that is the caller's job.
void llama_detach_batch_threadpool(struct llama_context * ctx) {
    // Bug fix: previously cleared ctx->threadpool, which dropped the
    // single-token pool and left the batch pool attached (dangling once the
    // caller destroys it). Clear the batch pointer instead.
    ctx->threadpool_batch = nullptr;
}
// Convenience wrapper: detach both the single-token and the batch threadpools.
void llama_detach_threadpools(struct llama_context * ctx) {
    llama_detach_threadpool      (ctx);
    llama_detach_batch_threadpool(ctx);
}
void llama_backend_free(void) {
ggml_quantize_free();
}

View file

@ -113,7 +113,7 @@ static struct ggml_tensor * get_random_tensor_f32(
}
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
if (plan.work_size > 0) {
buf.resize(plan.work_size);