threadpool: reduce pause/resume/wakeup overhead in common cases

We now start the non-batch threadpool in the paused state only when two threadpools are in use (a batch and a non-batch one).
The resume is now implicit (i.e., triggered by the arrival of new work), which reduces locking and context-switch overhead.
This commit is contained in:
Max Krasnyansky 2024-08-10 16:12:06 -07:00 committed by fmz
parent 48aa8eec07
commit 494e27c793
5 changed files with 47 additions and 25 deletions

View file

@ -2615,6 +2615,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
tpp.prio = params.priority; tpp.prio = params.priority;
tpp.poll = params.poll; tpp.poll = params.poll;
tpp.strict_cpu = params.strict_cpu; tpp.strict_cpu = params.strict_cpu;
tpp.paused = false;
return tpp; return tpp;
} }

View file

@ -230,17 +230,6 @@ int main(int argc, char ** argv) {
struct ggml_threadpool_params tpp = struct ggml_threadpool_params tpp =
ggml_threadpool_params_from_cpu_params(params.cpuparams); ggml_threadpool_params_from_cpu_params(params.cpuparams);
struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
if (!threadpool) {
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
}
llama_attach_threadpool(ctx, threadpool);
if (ctx_guidance) {
llama_attach_threadpool(ctx_guidance, threadpool);
}
struct ggml_compute_threadpool * threadpool_batch = NULL; struct ggml_compute_threadpool * threadpool_batch = NULL;
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
threadpool_batch = ggml_create_threadpool(&tpp_batch); threadpool_batch = ggml_create_threadpool(&tpp_batch);
@ -253,6 +242,20 @@ int main(int argc, char ** argv) {
if (ctx_guidance) { if (ctx_guidance) {
llama_attach_batch_threadpool(ctx_guidance, threadpool_batch); llama_attach_batch_threadpool(ctx_guidance, threadpool_batch);
} }
// Start the non-batch threadpool in the paused state
tpp.paused = true;
}
struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
if (!threadpool) {
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
exit(1);
}
llama_attach_threadpool(ctx, threadpool);
if (ctx_guidance) {
llama_attach_threadpool(ctx_guidance, threadpool);
} }
const int n_ctx_train = llama_n_ctx_train(model); const int n_ctx_train = llama_n_ctx_train(model);

View file

@ -633,6 +633,7 @@ extern "C" {
int32_t prio; int32_t prio;
bool poll; bool poll;
bool strict_cpu; bool strict_cpu;
bool paused;
}; };
struct ggml_compute_threadpool; // forward declaration, see ggml.c struct ggml_compute_threadpool; // forward declaration, see ggml.c

View file

@ -18885,14 +18885,27 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
GGML_ALIGNED_FREE(threadpool); GGML_ALIGNED_FREE(threadpool);
} }
#ifndef GGML_USE_OPENMP
// pause/resume must be called under mutex
// Put the threadpool into the paused state and wake all workers so they
// observe the new state. Per the note above, must be called with
// threadpool->mutex held.
static void __ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
GGML_PRINT_DEBUG("Pausing threadpool\n");
threadpool->pause = true;
// broadcast so every worker re-checks the pause flag
ggml_cond_broadcast(&threadpool->cond);
}
// Clear the paused state and wake all workers so they resume processing.
// Per the note above, must be called with threadpool->mutex held.
static void __ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
GGML_PRINT_DEBUG("Resuming threadpool\n");
threadpool->pause = false;
// broadcast so every worker re-checks the pause flag
ggml_cond_broadcast(&threadpool->cond);
}
#endif
void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) { void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
#ifndef GGML_USE_OPENMP #ifndef GGML_USE_OPENMP
GGML_ASSERT(!threadpool->disposable); GGML_ASSERT(!threadpool->disposable);
GGML_PRINT_DEBUG("Pausing threadpool\n");
ggml_mutex_lock(&threadpool->mutex); ggml_mutex_lock(&threadpool->mutex);
if (!threadpool->pause) { if (!threadpool->pause) {
threadpool->pause = true; __ggml_pause_threadpool(threadpool);
ggml_cond_broadcast(&threadpool->cond);
} }
ggml_mutex_unlock(&threadpool->mutex); ggml_mutex_unlock(&threadpool->mutex);
#else #else
@ -18903,12 +18916,9 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) { void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
#ifndef GGML_USE_OPENMP #ifndef GGML_USE_OPENMP
GGML_ASSERT(!threadpool->disposable); GGML_ASSERT(!threadpool->disposable);
GGML_PRINT_DEBUG("Resuming threadpool\n");
ggml_mutex_lock(&threadpool->mutex); ggml_mutex_lock(&threadpool->mutex);
if (threadpool->pause) { if (threadpool->pause) {
threadpool->pause = false; __ggml_resume_threadpool(threadpool);
ggml_cond_broadcast(&threadpool->cond);
} }
ggml_mutex_unlock(&threadpool->mutex); ggml_mutex_unlock(&threadpool->mutex);
#else #else
@ -19250,7 +19260,7 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl(
threadpool->n_barrier_passed = 0; threadpool->n_barrier_passed = 0;
threadpool->current_chunk = 0; threadpool->current_chunk = 0;
threadpool->stop = false; threadpool->stop = false;
threadpool->pause = disposable ? false : true; threadpool->pause = disposable ? false : tpp->paused;
threadpool->new_work = false; threadpool->new_work = false;
threadpool->workers = NULL; threadpool->workers = NULL;
threadpool->n_threads_max = tpp->n_threads; threadpool->n_threads_max = tpp->n_threads;
@ -19340,9 +19350,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
struct ggml_threadpool_params ttp = { struct ggml_threadpool_params ttp = {
.mask_specified = false, .mask_specified = false,
.n_threads = n_threads, .n_threads = n_threads,
.prio = 1, .prio = 0,
.poll = false, .poll = false,
.strict_cpu = false .strict_cpu = false,
.paused = false
}; };
threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan); threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan);
@ -19396,10 +19407,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
if (!threadpool->poll) { if (!threadpool->poll) {
ggml_mutex_lock(&threadpool->mutex); ggml_mutex_lock(&threadpool->mutex);
threadpool->new_work = true; threadpool->new_work = true;
if (threadpool->pause) {
__ggml_resume_threadpool(threadpool);
} else {
ggml_cond_broadcast(&threadpool->cond); ggml_cond_broadcast(&threadpool->cond);
}
ggml_mutex_unlock(&threadpool->mutex); ggml_mutex_unlock(&threadpool->mutex);
} else { } else {
threadpool->new_work = true; threadpool->new_work = true;
if (threadpool->pause) {
ggml_mutex_lock(&threadpool->mutex);
__ggml_resume_threadpool(threadpool);
ggml_mutex_unlock(&threadpool->mutex);
}
} }
} }
// this is a work thread too // this is a work thread too

View file

@ -15541,17 +15541,14 @@ static std::pair<int32_t, ggml_compute_threadpool_t> llama_swap_threadpools(
// Switch between the 2 threadpools as needed // Switch between the 2 threadpools as needed
if (n_tokens > 1) { if (n_tokens > 1) {
ggml_pause_threadpool(lctx.threadpool); ggml_pause_threadpool(lctx.threadpool);
ggml_resume_threadpool(lctx.threadpool_batch);
threadpool = lctx.threadpool_batch; threadpool = lctx.threadpool_batch;
n_threads = cparams.n_threads_batch; n_threads = cparams.n_threads_batch;
} else { } else {
ggml_pause_threadpool(lctx.threadpool_batch); ggml_pause_threadpool(lctx.threadpool_batch);
ggml_resume_threadpool(lctx.threadpool);
threadpool = lctx.threadpool; threadpool = lctx.threadpool;
n_threads = cparams.n_threads; n_threads = cparams.n_threads;
} }
} else if (lctx.threadpool) { } else if (lctx.threadpool) {
ggml_resume_threadpool(lctx.threadpool);
threadpool = lctx.threadpool; threadpool = lctx.threadpool;
n_threads = cparams.n_threads; n_threads = cparams.n_threads;
} }