threadpool: add support for hybrid polling
poll params (--poll, ...) now specify the "polling level", i.e. how aggressively we poll before waiting on the condition variable. poll=0 means no polling, 1 means poll for 128K rounds then wait, 2 for 256K rounds, and so on. The default value of 50 (i.e. 50x128K rounds) seems like a decent default across modern platforms. We can tune this further as things evolve.
This commit is contained in:
parent
494e27c793
commit
b630acdb73
4 changed files with 58 additions and 48 deletions
|
@ -1717,7 +1717,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||||
options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
|
options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
|
||||||
options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
|
options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
|
||||||
options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
|
options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
|
||||||
options.push_back({ "*", " --poll <0|1>", "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll});
|
options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
|
||||||
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
|
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
|
||||||
options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
|
options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
|
||||||
options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi",
|
options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi",
|
||||||
|
|
|
@ -73,7 +73,7 @@ struct cpu_params {
|
||||||
bool mask_valid = false; // Default: any CPU
|
bool mask_valid = false; // Default: any CPU
|
||||||
int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
||||||
bool strict_cpu = false; // Use strict CPU placement
|
bool strict_cpu = false; // Use strict CPU placement
|
||||||
bool poll = true; // Use polling (busywait) to wait for work (default matches OpenMP)
|
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
||||||
};
|
};
|
||||||
|
|
||||||
struct gpt_params {
|
struct gpt_params {
|
||||||
|
|
|
@ -627,13 +627,13 @@ extern "C" {
|
||||||
typedef bool (*ggml_abort_callback)(void * data);
|
typedef bool (*ggml_abort_callback)(void * data);
|
||||||
|
|
||||||
struct ggml_threadpool_params {
|
struct ggml_threadpool_params {
|
||||||
bool cpumask[GGML_MAX_N_THREADS];
|
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores
|
||||||
bool mask_specified;
|
bool mask_specified; // mask is non-empty
|
||||||
int32_t n_threads;
|
int32_t n_threads; // number of threads
|
||||||
int32_t prio;
|
int32_t prio; // thread priority
|
||||||
bool poll;
|
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||||
bool strict_cpu;
|
bool strict_cpu; // strict cpu placement
|
||||||
bool paused;
|
bool paused; // start in paused state
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_compute_threadpool; // forward declaration, see ggml.c
|
struct ggml_compute_threadpool; // forward declaration, see ggml.c
|
||||||
|
|
|
@ -1973,7 +1973,7 @@ struct ggml_compute_threadpool {
|
||||||
|
|
||||||
int32_t prio; // Scheduling priority
|
int32_t prio; // Scheduling priority
|
||||||
bool disposable; // Doesn't initialize a conv-var
|
bool disposable; // Doesn't initialize a conv-var
|
||||||
bool poll; // Use polling (busywait) // TODO
|
uint32_t poll; // Polling level (0 - no polling)
|
||||||
|
|
||||||
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
|
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
|
||||||
void * abort_callback_data;
|
void * abort_callback_data;
|
||||||
|
@ -19156,35 +19156,50 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#ifndef GGML_USE_OPENMP
|
#ifndef GGML_USE_OPENMP
|
||||||
|
|
||||||
|
static inline bool ggml_graph_compute_got_work(struct ggml_compute_state *state) {
|
||||||
|
struct ggml_compute_threadpool * threadpool = state->threadpool;
|
||||||
|
return (threadpool->new_work && state->ith < threadpool->n_threads_cur);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
|
||||||
|
struct ggml_compute_threadpool * threadpool = state->threadpool;
|
||||||
|
if (threadpool->stop || threadpool->pause) return true;
|
||||||
|
return ggml_graph_compute_got_work(state);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
|
||||||
|
struct ggml_compute_threadpool * threadpool = state->threadpool;
|
||||||
|
|
||||||
|
// This seems to make 0 ... 100 a decent range for polling level across modern processors.
|
||||||
|
// Perhaps, we can adjust it dynamically based on load and things.
|
||||||
|
const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
|
||||||
|
|
||||||
|
for (uint64_t i=0; !ggml_graph_compute_ready(state) && i<n_rounds; i++) {
|
||||||
|
// No new work. Keep polling.
|
||||||
|
__cpu_relax();
|
||||||
|
}
|
||||||
|
|
||||||
|
return ggml_graph_compute_got_work(state);
|
||||||
|
}
|
||||||
|
|
||||||
static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
|
static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
|
||||||
struct ggml_compute_threadpool * threadpool = state->threadpool;
|
struct ggml_compute_threadpool * threadpool = state->threadpool;
|
||||||
|
|
||||||
if (threadpool->poll) {
|
if (ggml_graph_compute_poll_for_work(state)) {
|
||||||
while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) ||
|
return ggml_graph_compute_got_work(state);
|
||||||
threadpool->stop ||
|
|
||||||
threadpool->pause
|
|
||||||
)
|
|
||||||
) {
|
|
||||||
// No new work. Yield and keep polling.
|
|
||||||
__cpu_relax();
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
ggml_mutex_lock_shared(&threadpool->mutex);
|
ggml_mutex_lock_shared(&threadpool->mutex);
|
||||||
while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) ||
|
while (!ggml_graph_compute_ready(state)) {
|
||||||
threadpool->stop ||
|
|
||||||
threadpool->pause
|
|
||||||
)
|
|
||||||
) {
|
|
||||||
// No new work. Wait for the signal.
|
// No new work. Wait for the signal.
|
||||||
|
GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
|
||||||
ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
|
ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
|
||||||
}
|
}
|
||||||
ggml_mutex_unlock_shared(&threadpool->mutex);
|
ggml_mutex_unlock_shared(&threadpool->mutex);
|
||||||
}
|
|
||||||
return threadpool->new_work;
|
return ggml_graph_compute_got_work(state);
|
||||||
}
|
}
|
||||||
|
|
||||||
static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
|
static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
|
||||||
|
@ -19404,24 +19419,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
||||||
__thread_affinity(threadpool->workers[0].cpumask);
|
__thread_affinity(threadpool->workers[0].cpumask);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!threadpool->poll) {
|
// always take the mutex here because the worker threads are doing hybrid poll/wait
|
||||||
|
|
||||||
ggml_mutex_lock(&threadpool->mutex);
|
ggml_mutex_lock(&threadpool->mutex);
|
||||||
threadpool->new_work = true;
|
threadpool->new_work = true;
|
||||||
if (threadpool->pause) {
|
if (!threadpool->pause) {
|
||||||
__ggml_resume_threadpool(threadpool);
|
|
||||||
} else {
|
|
||||||
ggml_cond_broadcast(&threadpool->cond);
|
ggml_cond_broadcast(&threadpool->cond);
|
||||||
}
|
|
||||||
ggml_mutex_unlock(&threadpool->mutex);
|
|
||||||
} else {
|
} else {
|
||||||
threadpool->new_work = true;
|
// resume does cond broadcast
|
||||||
if (threadpool->pause) {
|
|
||||||
ggml_mutex_lock(&threadpool->mutex);
|
|
||||||
__ggml_resume_threadpool(threadpool);
|
__ggml_resume_threadpool(threadpool);
|
||||||
|
}
|
||||||
ggml_mutex_unlock(&threadpool->mutex);
|
ggml_mutex_unlock(&threadpool->mutex);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
// this is a work thread too
|
// this is a work thread too
|
||||||
ggml_graph_compute_thread(&threadpool->workers[0]);
|
ggml_graph_compute_thread(&threadpool->workers[0]);
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue