From 5d4c0a132707797c809e4d750f7fa6c5c833d5c0 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 27 Aug 2024 16:31:34 -0700 Subject: [PATCH] threadpool: move process priority setting into the apps (bench and cli) This avoids changing the overall process priority on Windows for the apps that use ggml/llama.cpp directly. --- common/common.cpp | 59 ++++++++++++++++++++-- common/common.h | 3 +- examples/llama-bench/llama-bench.cpp | 14 +++--- examples/main/main.cpp | 2 + ggml/include/ggml.h | 20 +++++--- ggml/src/ggml.c | 74 ++++++++++------------------ 6 files changed, 108 insertions(+), 64 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index faaea88cd..9191ade71 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -251,6 +251,57 @@ int32_t cpu_get_num_math() { return cpu_get_num_physical_cores(); } +// Helper for setting process priority (GGML_SCHED_PRIO_NORMAL is a no-op that returns true; returns false only when the OS call fails) + +#if defined(_WIN32) + +bool set_process_priority(enum ggml_sched_priority prio) { + if (prio == GGML_SCHED_PRIO_NORMAL) { + return true; + } + + DWORD p = NORMAL_PRIORITY_CLASS; + switch (prio) { + case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break; + case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break; + case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break; + case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break; + } + + if (!SetPriorityClass(GetCurrentProcess(), p)) { + fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); + return false; + } + + return true; +} + +#else // MacOS and POSIX +#include <sys/types.h> +#include <sys/resource.h> + +bool set_process_priority(enum ggml_sched_priority prio) { + if (prio == GGML_SCHED_PRIO_NORMAL) { + return true; + } + + int32_t p = 0; + switch (prio) { + case GGML_SCHED_PRIO_NORMAL: p = 0; break; + case GGML_SCHED_PRIO_MEDIUM: p = -5; break; + case GGML_SCHED_PRIO_HIGH: p = -10; break; + case GGML_SCHED_PRIO_REALTIME: p = -20; break; + } + + if (setpriority(PRIO_PROCESS, 0, p) != 0) { /* setpriority() returns 0 on success and -1 (errno set) on failure */ + fprintf(stderr, 
"warn: failed to set process priority class %d : %s (%d)\n", prio, strerror(errno), errno); + return false; + } + return true; +} + +#endif + // // CLI argument parsing // @@ -508,7 +559,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--prio") { CHECK_ARG - params.cpuparams.priority = std::stoul(argv[i]); + params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]); return true; } if (arg == "--cpu-strict") { @@ -545,7 +596,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--prio-batch") { CHECK_ARG - params.cpuparams_batch.priority = std::stoul(argv[i]); + params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]); return true; } if (arg == "--cpu-strict-batch") { @@ -581,7 +632,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--prio-draft") { CHECK_ARG - params.draft_cpuparams.priority = std::stoul(argv[i]); + params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]); return true; } if (arg == "--cpu-strict-draft") { @@ -610,7 +661,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } if (arg == "--prio-batch-draft") { CHECK_ARG - params.draft_cpuparams_batch.priority = std::stoul(argv[i]); + params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]); return true; } if (arg == "--cpu-strict-batch-draft") { diff --git a/common/common.h b/common/common.h index a665716be..cb5e7f6df 100644 --- a/common/common.h +++ b/common/common.h @@ -71,7 +71,7 @@ struct cpu_params { int n_threads = -1; bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. 
bool mask_valid = false; // Default: any CPU - int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) + enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) bool strict_cpu = false; // Use strict CPU placement uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) }; @@ -290,6 +290,7 @@ std::string gpt_params_get_system_info(const gpt_params & params); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]); void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr); +bool set_process_priority(enum ggml_sched_priority prio); // // String utils diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index a0cbc2ae9..b20101353 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -240,7 +240,7 @@ struct cmd_params { std::vector<bool> embeddings; ggml_numa_strategy numa; int reps; - int prio; + ggml_sched_priority prio; int delay; bool verbose; output_formats output_format; @@ -271,7 +271,7 @@ static const cmd_params cmd_params_defaults = { /* embeddings */ {false}, /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, - /* prio */ 0, + /* prio */ GGML_SCHED_PRIO_NORMAL, /* delay */ 0, /* verbose */ false, /* output_format */ MARKDOWN, @@ -585,7 +585,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - params.prio = std::stoi(argv[i]); + params.prio = (enum ggml_sched_priority) std::stoi(argv[i]); } else if (arg == "--delay") { if (++i >= argc) { invalid_param = true; @@ -1470,6 +1470,8 @@ int main(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); + set_process_priority(params.prio); + // initialize printer std::unique_ptr<printer> p = 
create_printer(params.output_format); std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr); @@ -1525,9 +1527,9 @@ int main(int argc, char ** argv) { LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str()); exit(1); } - tpp.strict_cpu = t.cpu_strict; - tpp.poll = t.poll; - tpp.prio = params.prio; + tpp.strict_cpu = t.cpu_strict; + tpp.poll = t.poll; + tpp.prio = params.prio; struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); if (!threadpool) { diff --git a/examples/main/main.cpp b/examples/main/main.cpp index a64c1bc25..bdaf0dbb6 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -230,6 +230,8 @@ int main(int argc, char ** argv) { struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams); + set_process_priority(params.cpuparams.priority); + struct ggml_compute_threadpool * threadpool_batch = NULL; if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { threadpool_batch = ggml_create_threadpool(&tpp_batch); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1df73d328..1ced22eec 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -626,15 +626,23 @@ extern "C" { // If it returns true, the computation is aborted typedef bool (*ggml_abort_callback)(void * data); + // Scheduling priorities + enum ggml_sched_priority { + GGML_SCHED_PRIO_NORMAL, + GGML_SCHED_PRIO_MEDIUM, + GGML_SCHED_PRIO_HIGH, + GGML_SCHED_PRIO_REALTIME + }; + // Threadpool params // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults struct ggml_threadpool_params { - bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) - int n_threads; // number of threads - int32_t prio; // thread priority - uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) - bool strict_cpu; // strict cpu placement - bool paused; // start in paused state + bool cpumask[GGML_MAX_N_THREADS]; // 
mask of cpu cores (all-zeros means use default affinity settings) + int n_threads; // number of threads + enum ggml_sched_priority prio; // thread priority + uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) + bool strict_cpu; // strict cpu placement + bool paused; // start in paused state }; struct ggml_compute_threadpool; // forward declaration, see ggml.c diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 030e26104..dd08b77f8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -18655,18 +18655,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { static thread_ret_t ggml_graph_compute_secondary_thread(void* data); -enum { - SCHED_PRIO_NORMAL, - SCHED_PRIO_MEDIUM, - SCHED_PRIO_HIGH, - SCHED_PRIO_REALTIME -}; - #if defined(_WIN32) #include "windows.h" // TODO: support > 64 CPUs -static bool ggml_thread_apply_affinity(bool * mask) { +bool ggml_thread_apply_affinity(bool * mask) { HANDLE h = GetCurrentThread(); uint64_t bitmask = 0ULL; @@ -18700,35 +18693,22 @@ static bool ggml_thread_apply_affinity(bool * mask) { return m != 0; } -static bool ggml_thread_apply_thread_priority(int32_t prio) { - DWORD p = NORMAL_PRIORITY_CLASS; +static bool ggml_thread_apply_priority(int32_t prio) { + // Note that on Windows the Process Priority Class must be updated in order to set Thread priority. + // This is up to the applications. + DWORD p = THREAD_PRIORITY_NORMAL; + switch (prio) { + case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break; + case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break; + case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break; + case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break; + } - if (prio == SCHED_PRIO_NORMAL) { + if (prio == GGML_SCHED_PRIO_NORMAL) { // Keep inherited policy/priority return true; } - // On Windows we have to update Process Priority Class in order to set Thread priority. 
- - switch (prio) { - case SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break; - case SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break; - case SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break; - case SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break; - } - - if (!SetPriorityClass(GetCurrentProcess(), p)) { - fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError()); - return false; - } - - switch (prio) { - case SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break; - case SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break; - case SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break; - case SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break; - } - if (!SetThreadPriority(GetCurrentThread(), p)) { fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError()); return false; @@ -18747,17 +18727,17 @@ static bool ggml_thread_apply_affinity(const bool * mask) { return true; } -static bool ggml_thread_apply_thread_priority(int32_t prio) { +static bool ggml_thread_apply_priority(int32_t prio) { struct sched_param p; int32_t policy = SCHED_OTHER; switch (prio) { - case SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; - case SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; - case SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; - case SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; + case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; + case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; + case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; + case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; } - if (prio == SCHED_PRIO_NORMAL) { + if (prio == GGML_SCHED_PRIO_NORMAL) { // Keep inherited policy/priority return true; } @@ -18802,17 +18782,17 @@ static bool 
ggml_thread_apply_affinity(const bool * mask) { return true; } -static bool ggml_thread_apply_thread_priority(int32_t prio) { +static bool ggml_thread_apply_priority(int32_t prio) { struct sched_param p; int32_t policy = SCHED_OTHER; switch (prio) { - case SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; - case SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; - case SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; - case SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; + case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; + case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; + case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; + case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; } - if (prio == SCHED_PRIO_NORMAL) { + if (prio == GGML_SCHED_PRIO_NORMAL) { // Keep inherited policy/priority return true; } @@ -19190,7 +19170,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_compute_threadpool * threadpool = state->threadpool; - ggml_thread_apply_thread_priority(threadpool->prio); + ggml_thread_apply_priority(threadpool->prio); if (ggml_thread_cpumask_is_valid(state->cpumask)) { ggml_thread_apply_affinity(state->cpumask); } @@ -19238,7 +19218,7 @@ static void ggml_graph_compute_kickoff(struct ggml_compute_threadpool * threadpo if (threadpool->pause) { // Update main thread prio and affinity to match the threadpool settings - ggml_thread_apply_thread_priority(threadpool->prio); + ggml_thread_apply_priority(threadpool->prio); if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { ggml_thread_apply_affinity(threadpool->workers[0].cpumask); } @@ -19333,7 +19313,7 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( if (!threadpool->pause) { 
// Update main thread prio and affinity at the start, otherwise we'll do it in resume - ggml_thread_apply_thread_priority(threadpool->prio); + ggml_thread_apply_priority(threadpool->prio); if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { ggml_thread_apply_affinity(threadpool->workers[0].cpumask); }