threadpool: move process priority setting into the apps (bench and cli)

This avoids changing the overall process priority on Windows for the apps
that use ggml/llama.cpp directly.
This commit is contained in:
Max Krasnyansky 2024-08-27 16:31:34 -07:00
parent 3bcc4dee9a
commit 5d4c0a1327
6 changed files with 108 additions and 64 deletions

View file

@ -251,6 +251,57 @@ int32_t cpu_get_num_math() {
return cpu_get_num_physical_cores(); return cpu_get_num_physical_cores();
} }
// Helper for setting process priority

#if defined(_WIN32)

// Raise the priority class of the current process.
// Returns true on success, or immediately when prio is NORMAL (keep the
// inherited priority class); returns false and warns on stderr if
// SetPriorityClass() fails.
bool set_process_priority(enum ggml_sched_priority prio) {
    if (prio == GGML_SCHED_PRIO_NORMAL) {
        return true;
    }

    DWORD p = NORMAL_PRIORITY_CLASS;
    switch (prio) {
        case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
        case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
        case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
        case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS;     break;
    }

    if (!SetPriorityClass(GetCurrentProcess(), p)) {
        fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
        return false;
    }

    return true;
}

#else // MacOS and POSIX
#include <sys/types.h>
#include <sys/resource.h>

// Lower the nice value of the current process (negative nice = higher
// priority; typically requires elevated privileges).
// Returns true on success, or immediately when prio is NORMAL (keep the
// inherited priority); returns false and warns on stderr if
// setpriority() fails.
bool set_process_priority(enum ggml_sched_priority prio) {
    if (prio == GGML_SCHED_PRIO_NORMAL) {
        return true;
    }

    int32_t p = 0;
    switch (prio) {
        case GGML_SCHED_PRIO_NORMAL:   p =   0; break;
        case GGML_SCHED_PRIO_MEDIUM:   p =  -5; break;
        case GGML_SCHED_PRIO_HIGH:     p = -10; break;
        case GGML_SCHED_PRIO_REALTIME: p = -20; break;
    }

    // setpriority(2) returns 0 on success and -1 on error. The original
    // `!setpriority(...)` check inverted that: it warned and returned false
    // on success, and silently accepted failure.
    if (0 != setpriority(PRIO_PROCESS, 0, p)) {
        fprintf(stderr, "warn: failed to set process priority class %d : %s (%d)\n", prio, strerror(errno), errno);
        return false;
    }

    return true;
}

#endif
// //
// CLI argument parsing // CLI argument parsing
// //
@ -508,7 +559,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
} }
if (arg == "--prio") { if (arg == "--prio") {
CHECK_ARG CHECK_ARG
params.cpuparams.priority = std::stoul(argv[i]); params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true; return true;
} }
if (arg == "--cpu-strict") { if (arg == "--cpu-strict") {
@ -545,7 +596,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
} }
if (arg == "--prio-batch") { if (arg == "--prio-batch") {
CHECK_ARG CHECK_ARG
params.cpuparams_batch.priority = std::stoul(argv[i]); params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true; return true;
} }
if (arg == "--cpu-strict-batch") { if (arg == "--cpu-strict-batch") {
@ -581,7 +632,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
} }
if (arg == "--prio-draft") { if (arg == "--prio-draft") {
CHECK_ARG CHECK_ARG
params.draft_cpuparams.priority = std::stoul(argv[i]); params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true; return true;
} }
if (arg == "--cpu-strict-draft") { if (arg == "--cpu-strict-draft") {
@ -610,7 +661,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
} }
if (arg == "--prio-batch-draft") { if (arg == "--prio-batch-draft") {
CHECK_ARG CHECK_ARG
params.draft_cpuparams_batch.priority = std::stoul(argv[i]); params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
return true; return true;
} }
if (arg == "--cpu-strict-batch-draft") { if (arg == "--cpu-strict-batch-draft") {

View file

@ -71,7 +71,7 @@ struct cpu_params {
int n_threads = -1; int n_threads = -1;
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
bool mask_valid = false; // Default: any CPU bool mask_valid = false; // Default: any CPU
int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime) enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
bool strict_cpu = false; // Use strict CPU placement bool strict_cpu = false; // Use strict CPU placement
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
}; };
@ -290,6 +290,7 @@ std::string gpt_params_get_system_info(const gpt_params & params);
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]); bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]); bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr); void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
bool set_process_priority(enum ggml_sched_priority prio);
// //
// String utils // String utils

View file

@ -240,7 +240,7 @@ struct cmd_params {
std::vector<bool> embeddings; std::vector<bool> embeddings;
ggml_numa_strategy numa; ggml_numa_strategy numa;
int reps; int reps;
int prio; ggml_sched_priority prio;
int delay; int delay;
bool verbose; bool verbose;
output_formats output_format; output_formats output_format;
@ -271,7 +271,7 @@ static const cmd_params cmd_params_defaults = {
/* embeddings */ {false}, /* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED, /* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5, /* reps */ 5,
/* prio */ 0, /* prio */ GGML_SCHED_PRIO_NORMAL,
/* delay */ 0, /* delay */ 0,
/* verbose */ false, /* verbose */ false,
/* output_format */ MARKDOWN, /* output_format */ MARKDOWN,
@ -585,7 +585,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true; invalid_param = true;
break; break;
} }
params.prio = std::stoi(argv[i]); params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
} else if (arg == "--delay") { } else if (arg == "--delay") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -1470,6 +1470,8 @@ int main(int argc, char ** argv) {
llama_backend_init(); llama_backend_init();
llama_numa_init(params.numa); llama_numa_init(params.numa);
set_process_priority(params.prio);
// initialize printer // initialize printer
std::unique_ptr<printer> p = create_printer(params.output_format); std::unique_ptr<printer> p = create_printer(params.output_format);
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr); std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
@ -1525,9 +1527,9 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str()); LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
exit(1); exit(1);
} }
tpp.strict_cpu = t.cpu_strict; tpp.strict_cpu = t.cpu_strict;
tpp.poll = t.poll; tpp.poll = t.poll;
tpp.prio = params.prio; tpp.prio = params.prio;
struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp);
if (!threadpool) { if (!threadpool) {

View file

@ -230,6 +230,8 @@ int main(int argc, char ** argv) {
struct ggml_threadpool_params tpp = struct ggml_threadpool_params tpp =
ggml_threadpool_params_from_cpu_params(params.cpuparams); ggml_threadpool_params_from_cpu_params(params.cpuparams);
set_process_priority(params.cpuparams.priority);
struct ggml_compute_threadpool * threadpool_batch = NULL; struct ggml_compute_threadpool * threadpool_batch = NULL;
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
threadpool_batch = ggml_create_threadpool(&tpp_batch); threadpool_batch = ggml_create_threadpool(&tpp_batch);

View file

@ -626,15 +626,23 @@ extern "C" {
// If it returns true, the computation is aborted // If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data); typedef bool (*ggml_abort_callback)(void * data);
// Scheduling priorities
// NORMAL keeps the inherited scheduling untouched; the higher levels map to
// progressively stronger OS priorities (see set_process_priority and
// ggml_thread_apply_priority for the per-OS mapping). Enumerator order is
// part of the CLI contract: --prio parses the integer 0..3 directly.
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// Threadpool params // Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params { struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads int n_threads; // number of threads
int32_t prio; // thread priority enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement bool strict_cpu; // strict cpu placement
bool paused; // start in paused state bool paused; // start in paused state
}; };
struct ggml_compute_threadpool; // forward declaration, see ggml.c struct ggml_compute_threadpool; // forward declaration, see ggml.c

View file

@ -18655,18 +18655,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
static thread_ret_t ggml_graph_compute_secondary_thread(void* data); static thread_ret_t ggml_graph_compute_secondary_thread(void* data);
enum {
SCHED_PRIO_NORMAL,
SCHED_PRIO_MEDIUM,
SCHED_PRIO_HIGH,
SCHED_PRIO_REALTIME
};
#if defined(_WIN32) #if defined(_WIN32)
#include "windows.h" #include "windows.h"
// TODO: support > 64 CPUs // TODO: support > 64 CPUs
static bool ggml_thread_apply_affinity(bool * mask) { bool ggml_thread_apply_affinity(bool * mask) {
HANDLE h = GetCurrentThread(); HANDLE h = GetCurrentThread();
uint64_t bitmask = 0ULL; uint64_t bitmask = 0ULL;
@ -18700,35 +18693,22 @@ static bool ggml_thread_apply_affinity(bool * mask) {
return m != 0; return m != 0;
} }
static bool ggml_thread_apply_thread_priority(int32_t prio) { static bool ggml_thread_apply_priority(int32_t prio) {
DWORD p = NORMAL_PRIORITY_CLASS; // Note that on Windows the Process Priority Class must be updated in order to set Thread priority.
// This is up to the applications.
DWORD p = THREAD_PRIORITY_NORMAL;
switch (prio) {
case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break;
case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break;
case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break;
case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
}
if (prio == SCHED_PRIO_NORMAL) { if (prio == GGML_SCHED_PRIO_NORMAL) {
// Keep inherited policy/priority // Keep inherited policy/priority
return true; return true;
} }
// On Windows we have to update Process Priority Class in order to set Thread priority.
switch (prio) {
case SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break;
case SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break;
case SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break;
case SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS; break;
}
if (!SetPriorityClass(GetCurrentProcess(), p)) {
fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
return false;
}
switch (prio) {
case SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break;
case SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break;
case SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break;
case SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break;
}
if (!SetThreadPriority(GetCurrentThread(), p)) { if (!SetThreadPriority(GetCurrentThread(), p)) {
fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError()); fprintf(stderr, "warn: failed to set thread priority %d : (%d)\n", prio, (int) GetLastError());
return false; return false;
@ -18747,17 +18727,17 @@ static bool ggml_thread_apply_affinity(const bool * mask) {
return true; return true;
} }
static bool ggml_thread_apply_thread_priority(int32_t prio) { static bool ggml_thread_apply_priority(int32_t prio) {
struct sched_param p; struct sched_param p;
int32_t policy = SCHED_OTHER; int32_t policy = SCHED_OTHER;
switch (prio) { switch (prio) {
case SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
case SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
case SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
case SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break;
} }
if (prio == SCHED_PRIO_NORMAL) { if (prio == GGML_SCHED_PRIO_NORMAL) {
// Keep inherited policy/priority // Keep inherited policy/priority
return true; return true;
} }
@ -18802,17 +18782,17 @@ static bool ggml_thread_apply_affinity(const bool * mask) {
return true; return true;
} }
static bool ggml_thread_apply_thread_priority(int32_t prio) { static bool ggml_thread_apply_priority(int32_t prio) {
struct sched_param p; struct sched_param p;
int32_t policy = SCHED_OTHER; int32_t policy = SCHED_OTHER;
switch (prio) { switch (prio) {
case SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break;
case SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break;
case SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break;
case SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break; case GGML_SCHED_PRIO_REALTIME: policy = SCHED_FIFO; p.sched_priority = 90; break;
} }
if (prio == SCHED_PRIO_NORMAL) { if (prio == GGML_SCHED_PRIO_NORMAL) {
// Keep inherited policy/priority // Keep inherited policy/priority
return true; return true;
} }
@ -19190,7 +19170,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_compute_state * state = (struct ggml_compute_state *) data;
struct ggml_compute_threadpool * threadpool = state->threadpool; struct ggml_compute_threadpool * threadpool = state->threadpool;
ggml_thread_apply_thread_priority(threadpool->prio); ggml_thread_apply_priority(threadpool->prio);
if (ggml_thread_cpumask_is_valid(state->cpumask)) { if (ggml_thread_cpumask_is_valid(state->cpumask)) {
ggml_thread_apply_affinity(state->cpumask); ggml_thread_apply_affinity(state->cpumask);
} }
@ -19238,7 +19218,7 @@ static void ggml_graph_compute_kickoff(struct ggml_compute_threadpool * threadpo
if (threadpool->pause) { if (threadpool->pause) {
// Update main thread prio and affinity to match the threadpool settings // Update main thread prio and affinity to match the threadpool settings
ggml_thread_apply_thread_priority(threadpool->prio); ggml_thread_apply_priority(threadpool->prio);
if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
ggml_thread_apply_affinity(threadpool->workers[0].cpumask); ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
} }
@ -19333,7 +19313,7 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl(
if (!threadpool->pause) { if (!threadpool->pause) {
// Update main thread prio and affinity at the start, otherwise we'll do it in resume // Update main thread prio and affinity at the start, otherwise we'll do it in resume
ggml_thread_apply_thread_priority(threadpool->prio); ggml_thread_apply_priority(threadpool->prio);
if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) {
ggml_thread_apply_affinity(threadpool->workers[0].cpumask); ggml_thread_apply_affinity(threadpool->workers[0].cpumask);
} }