diff --git a/common/common.cpp b/common/common.cpp index 2fc4d6da7..56e86a07a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -295,13 +295,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) } } - if (n_set == 0) { - // You hit the jackpot! - memset(&cpuparams.cpumask[0], 1, GGML_MAX_N_THREADS); - n_set = GGML_MAX_N_THREADS; - } - - if (n_set < cpuparams.n_threads) { + if (n_set && n_set < cpuparams.n_threads) { // Not enough set bits, may experience performance issues. fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads); } @@ -2606,16 +2600,15 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) { struct ggml_threadpool_params tpp; - tpp.mask_specified = params.mask_valid; + ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults + if (params.mask_valid) { std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS); } - tpp.n_threads = params.n_threads; tpp.prio = params.priority; tpp.poll = params.poll; tpp.strict_cpu = params.strict_cpu; - tpp.paused = false; return tpp; } diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 571ca6dd2..aca5f83c3 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1462,14 +1462,13 @@ int main(int argc, char ** argv) { llama_kv_cache_clear(ctx); - struct ggml_threadpool_params tpp; - tpp.n_threads = t.n_threads; - tpp.mask_specified = params.cpuparams.mask_valid; + struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads); tpp.strict_cpu = params.cpuparams.strict_cpu; tpp.prio = params.cpuparams.priority; tpp.poll = params.cpuparams.poll; - - std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS); + if (params.cpuparams.mask_valid) { + 
std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS); + } struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); if (!threadpool) { diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1f9e6756e..0accc3908 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -626,9 +626,10 @@ extern "C" { // If it returns true, the computation is aborted typedef bool (*ggml_abort_callback)(void * data); + // Threadpool params + // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults struct ggml_threadpool_params { - bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores - bool mask_specified; // mask is non-empty + bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) int n_threads; // number of threads int32_t prio; // thread priority uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) @@ -2025,6 +2026,8 @@ extern "C" { GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); + GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); + GGML_API void ggml_threadpool_params_init(struct ggml_threadpool_params *p, int n_threads); GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1); GGML_API struct ggml_compute_threadpool* ggml_create_threadpool (struct ggml_threadpool_params * params); GGML_API void ggml_release_threadpool (struct ggml_compute_threadpool * threadpool); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2c8f5a7e3..0e46bcea9 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1987,7 +1987,6 @@ struct ggml_compute_state { #ifndef GGML_USE_OPENMP ggml_thread_t thrd; bool cpumask[GGML_MAX_N_THREADS]; - bool mask_specified; int last_graph; bool pending; #endif @@ -18828,11 +18827,14 @@ static bool ggml_thread_apply_thread_priority(int32_t prio) 
{ #endif -static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { - if (!global_mask) { - memset(local_mask, 1, GGML_MAX_N_THREADS); - return; +static bool ggml_thread_cpumask_is_valid(const bool * mask) { + for (int i = 0; i < GGML_MAX_N_THREADS; i++) { + if (mask[i]) { return true; } } + return false; +} + +static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { if (!strict) { memcpy(local_mask, global_mask, GGML_MAX_N_THREADS); return; @@ -19189,8 +19191,10 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) { struct ggml_compute_threadpool * threadpool = state->threadpool; ggml_thread_apply_thread_priority(threadpool->prio); - if (state->mask_specified) + + if (ggml_thread_cpumask_is_valid(state->cpumask)) { ggml_thread_apply_affinity(state->cpumask); + } while (true) { // Check if we need to sleep @@ -19249,17 +19253,27 @@ static void ggml_graph_compute_kickoff(struct ggml_compute_threadpool * threadpo #endif // GGML_USE_OPENMP +void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) { + p->n_threads = n_threads; + p->prio = 0; // default priority (usually means normal or inherited) + p->poll = 50; // hybrid-polling enabled + p->strict_cpu = false; // no strict placement (all threads share same cpumask) + p->paused = false; // threads are ready to go + memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited) +} + +struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { + struct ggml_threadpool_params p; + ggml_threadpool_params_init(&p, n_threads); + return p; +} + bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { if (p0->n_threads != p1->n_threads ) return false; if (p0->prio != p1->prio ) return false; if (p0->poll != p1->poll ) return false; if (p0->strict_cpu != 
p1->strict_cpu ) return false; - if (p0->mask_specified != p1->mask_specified) return false; - if (p0->mask_specified) { - return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; - } - - return true; + return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; } static struct ggml_compute_threadpool * ggml_create_threadpool_impl( @@ -19312,16 +19326,13 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl( for (int j = 0; j < tpp->n_threads; j++) { workers[j] = (struct ggml_compute_state) { .thrd = 0, - .mask_specified = tpp->mask_specified, .threadpool = threadpool, .ith = j, .last_graph = 0, .pending = false }; - if (tpp->mask_specified) { - ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); - } + ggml_thread_cpumask_next(tpp->cpumask, workers[j].cpumask, tpp->strict_cpu, &cpumask_iter); // Spin threads for all secondary workers if (j > 0) { @@ -19357,15 +19368,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads); disposable_threadpool = true; - struct ggml_threadpool_params ttp = { - .mask_specified = false, - .n_threads = n_threads, - .prio = 0, - .poll = 1, - .strict_cpu = false, - .paused = false - }; - + struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads); threadpool = ggml_create_threadpool_impl(&ttp, cgraph, cplan); } else { // Reset some of the parameters that need resetting @@ -19407,7 +19410,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl } #else // Update main thread affinity to match the current threadpool - if (threadpool->workers[0].mask_specified) { + if (ggml_thread_cpumask_is_valid(threadpool->workers[0].cpumask)) { ggml_thread_apply_affinity(threadpool->workers[0].cpumask); }