From 65c11d415dcfc0d379c8ab14170bb984212995f4 Mon Sep 17 00:00:00 2001 From: fmz Date: Tue, 28 May 2024 22:31:15 -0700 Subject: [PATCH] llama-bench threadpool CLI params --- common/common.cpp | 10 +- common/common.h | 4 + examples/llama-bench/llama-bench.cpp | 145 +++++++++++++++++++-------- ggml.h | 2 +- 4 files changed, 114 insertions(+), 47 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 6306fb572..1a284c56d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -218,7 +218,7 @@ void gpt_params_handle_model_default(gpt_params & params) { } } -static void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr) { +void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) { int32_t n_set = 0; if (cpuparams.n_threads < 0) { @@ -226,7 +226,7 @@ static void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role if (role_model != nullptr) { cpuparams = *role_model; } else { - cpuparams.n_threads = cpu_get_num_math(); + cpuparams.n_threads = std::thread::hardware_concurrency(); } } @@ -235,11 +235,13 @@ static void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role n_set++; } } + if (n_set == 0) { // You hit the jackpot! memset(&cpuparams.cpumask[0], 1, GGML_N_CORES_MAX); n_set = GGML_N_CORES_MAX; } + if (n_set < cpuparams.n_threads) { // Not enough set bits, may experience performance issues. fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); @@ -313,7 +315,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return result; } -static bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_N_CORES_MAX]) { +bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_N_CORES_MAX]) { size_t dash_loc = range.find('-'); if (dash_loc == std::string::npos) { fprintf(stderr, "Format of CPU range is invalid! 
Expected [<start>]-[<end>].\n"); @@ -350,7 +352,7 @@ static bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_N_C return true; } -static bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_N_CORES_MAX]) { +bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_N_CORES_MAX]) { // Discard potential 0x prefix size_t start_i = 0; if (mask.length() >= 2 && mask.substr(0, 2) == "0x") { diff --git a/common/common.h b/common/common.h index 9c5ab959f..039042c81 100644 --- a/common/common.h +++ b/common/common.h @@ -198,6 +198,10 @@ bool gpt_params_parse (int argc, char ** argv, gpt_params & params); bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param); void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params); +bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_N_CORES_MAX]); +bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_N_CORES_MAX]); +void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr); + std::string gpt_params_get_system_info(const gpt_params & params); // diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 81b84eb44..701d8cbce 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -186,11 +186,18 @@ struct cmd_params { std::vector<bool> use_mmap; std::vector<bool> embeddings; ggml_numa_strategy numa; + cpu_params cpuparams; int reps; bool verbose; output_formats output_format; }; +int32_t n_threads = -1; +bool cpumask[GGML_N_CORES_MAX] = { false }; // CPU affinity mask. 
+bool mask_valid = false; // Default: any CPU +int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) +bool strict_cpu = false; // Use strict CPU placement +bool poll = false; // Use polling (busywait) to wait for work static const cmd_params cmd_params_defaults = { /* model */ {"models/7B/ggml-model-q4_0.gguf"}, /* n_prompt */ {512}, @@ -210,6 +217,7 @@ static const cmd_params cmd_params_defaults = { /* use_mmap */ {true}, /* embeddings */ {false}, /* numa */ GGML_NUMA_STRATEGY_DISABLED, + /* cpuparams */ {int32_t(std::thread::hardware_concurrency()), {false}, false, 1, false, false}, /* reps */ 5, /* verbose */ false, /* output_format */ MARKDOWN @@ -236,6 +244,11 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" --numa (default: disabled)\n"); + printf(" -mt, --max-threads (default: %d)\n", cmd_params_defaults.cpuparams.n_threads); + printf(" -C, --cpu-mask (default: 0x0)\n"); + printf(" --cpu-strict (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu); + printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.cpuparams.priority); + printf(" --poll (default: %d)\n", cmd_params_defaults.cpuparams.poll); printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); @@ -272,7 +285,7 @@ static ggml_type ggml_type_from_name(const std::string & s) { } -static cmd_params parse_cmd_params(int argc, char ** argv) { +static cmd_params parse_cmd_params(int argc, char** argv) { cmd_params params; std::string arg; bool invalid_param = false; @@ -292,28 +305,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (arg == 
"-h" || arg == "--help") { print_usage(argc, argv); exit(0); - } else if (arg == "-m" || arg == "--model") { + } + else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.model.insert(params.model.end(), p.begin(), p.end()); - } else if (arg == "-p" || arg == "--n-prompt") { + } + else if (arg == "-p" || arg == "--n-prompt") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end()); - } else if (arg == "-n" || arg == "--n-gen") { + } + else if (arg == "-n" || arg == "--n-gen") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.n_gen.insert(params.n_gen.end(), p.begin(), p.end()); - } else if (arg == "-pg") { + } + else if (arg == "-pg") { if (++i >= argc) { invalid_param = true; break; @@ -323,29 +340,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - params.n_pg.push_back({std::stoi(p[0]), std::stoi(p[1])}); - } else if (arg == "-b" || arg == "--batch-size") { + params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); + } + else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); - } else if (arg == "-ub" || arg == "--ubatch-size") { + } + else if (arg == "-ub" || arg == "--ubatch-size") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end()); - } else if (arg == "-ctk" || arg == "--cache-type-k") { + } + else if (arg == "-ctk" || arg == "--cache-type-k") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); std::vector types; - for (const auto & t : p) { + for (const auto& t : p) { ggml_type gt = 
ggml_type_from_name(t); if (gt == GGML_TYPE_COUNT) { invalid_param = true; @@ -354,14 +374,15 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { types.push_back(gt); } params.type_k.insert(params.type_k.end(), types.begin(), types.end()); - } else if (arg == "-ctv" || arg == "--cache-type-v") { + } + else if (arg == "-ctv" || arg == "--cache-type-v") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); std::vector types; - for (const auto & t : p) { + for (const auto& t : p) { ggml_type gt = ggml_type_from_name(t); if (gt == GGML_TYPE_COUNT) { invalid_param = true; @@ -370,66 +391,104 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { types.push_back(gt); } params.type_v.insert(params.type_v.end(), types.begin(), types.end()); - } else if (arg == "-t" || arg == "--threads") { + } + else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); - } else if (arg == "-ngl" || arg == "--n-gpu-layers") { + } + else if (arg == "-ngl" || arg == "--n-gpu-layers") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); - } else if (arg == "-sm" || arg == "--split-mode") { + } + else if (arg == "-sm" || arg == "--split-mode") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); std::vector modes; - for (const auto & m : p) { + for (const auto& m : p) { llama_split_mode mode; if (m == "none") { mode = LLAMA_SPLIT_MODE_NONE; - } else if (m == "layer") { + } + else if (m == "layer") { mode = LLAMA_SPLIT_MODE_LAYER; - } else if (m == "row") { + } + else if (m == "row") { mode = LLAMA_SPLIT_MODE_ROW; - } else { + } + else { invalid_param = true; break; } modes.push_back(mode); } params.split_mode.insert(params.split_mode.end(), 
modes.begin(), modes.end()); - } else if (arg == "-mg" || arg == "--main-gpu") { + } + else if (arg == "-mg" || arg == "--main-gpu") { if (++i >= argc) { invalid_param = true; break; } params.main_gpu = split(argv[i], split_delim); - } else if (arg == "-nkvo" || arg == "--no-kv-offload") { + } + else if (arg == "-nkvo" || arg == "--no-kv-offload") { if (++i >= argc) { invalid_param = true; break; } auto p = split(argv[i], split_delim); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); - } else if (arg == "--numa") { + } + else if (arg == "--numa") { if (++i >= argc) { invalid_param = true; break; - } else { + } + else { std::string value(argv[i]); - /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } else { invalid_param = true; break; } } + + } + else if (arg == "-mt" || arg == "--max-threads") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cpuparams.n_threads = std::stoi(argv[i]); + } + else if (arg == "-C" || arg == "--cpu-mask") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string mask = argv[i]; + params.cpuparams.mask_valid = true; + invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask); + } + else if (arg == "--prio") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.cpuparams.priority = std::stoul(argv[i]); + } else if (arg == "--cpu-strict") { + params.cpuparams.strict_cpu = true; + } else if (arg == "--poll") { + params.cpuparams.poll = true; } else if (arg == "-fa" || arg == "--flash-attn") { if (++i >= argc) { 
invalid_param = true; @@ -1303,6 +1362,23 @@ int main(int argc, char ** argv) { llama_model * lmodel = nullptr; const cmd_params_instance * prev_inst = nullptr; + postprocess_cpu_params(params.cpuparams); + + struct ggml_threadpool_params tpp; + tpp.n_threads = params.cpuparams.n_threads; + tpp.mask_specified = params.cpuparams.mask_valid; + tpp.strict_cpu = params.cpuparams.strict_cpu; + tpp.prio = params.cpuparams.priority; + tpp.poll = params.cpuparams.poll; + + std::memcpy(&tpp.cpumask[0], ¶ms.cpuparams.cpumask[0], GGML_N_CORES_MAX); + + struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); + if (!threadpool) { + LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); + exit(1); + } + for (const auto & inst : params_instances) { // keep the same model between tests when possible if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { @@ -1329,21 +1405,6 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); - struct ggml_threadpool_params tpp; - tpp.n_threads = t.n_threads; - - // TODO: expose these via cli opts - tpp.mask_specified = false; - tpp.strict_cpu = false; - tpp.prio = 1; - tpp.poll = false; - - struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp); - if (!threadpool) { - LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); - exit(1); - } - llama_set_n_threads(ctx, t.n_threads, t.n_threads); llama_attach_threadpool(ctx, threadpool); @@ -1378,8 +1439,8 @@ int main(int argc, char ** argv) { llama_free(ctx); - ggml_release_threadpool(threadpool); } + ggml_release_threadpool(threadpool); llama_free_model(lmodel); diff --git a/ggml.h b/ggml.h index 7020cf28f..5c63d825e 100644 --- a/ggml.h +++ b/ggml.h @@ -274,7 +274,7 @@ #define GGML_UNREACHABLE() ((void) 0) #endif -#define GGML_N_CORES_MAX 512 +#define GGML_N_CORES_MAX 16 // used to copy the number of elements and stride in bytes of tensors into local variables. 
// main purpose is to reduce code duplication and improve readability.