threadpool: reduce pause/resume/wakeup overhead in common cases
We now start the threadpool in a paused state only if we have two threadpools (a regular one and a batch one). The resume is now implicit (i.e. triggered by new work), which allows for reduced locking and context-switch overhead.
This commit is contained in:
parent
48aa8eec07
commit
494e27c793
5 changed files with 47 additions and 25 deletions
|
@ -2615,6 +2615,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
|
||||||
tpp.prio = params.priority;
|
tpp.prio = params.priority;
|
||||||
tpp.poll = params.poll;
|
tpp.poll = params.poll;
|
||||||
tpp.strict_cpu = params.strict_cpu;
|
tpp.strict_cpu = params.strict_cpu;
|
||||||
|
tpp.paused = false;
|
||||||
|
|
||||||
return tpp;
|
return tpp;
|
||||||
}
|
}
|
||||||
|
|
|
@ -230,17 +230,6 @@ int main(int argc, char ** argv) {
|
||||||
struct ggml_threadpool_params tpp =
|
struct ggml_threadpool_params tpp =
|
||||||
ggml_threadpool_params_from_cpu_params(params.cpuparams);
|
ggml_threadpool_params_from_cpu_params(params.cpuparams);
|
||||||
|
|
||||||
struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
|
|
||||||
if (!threadpool) {
|
|
||||||
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
llama_attach_threadpool(ctx, threadpool);
|
|
||||||
if (ctx_guidance) {
|
|
||||||
llama_attach_threadpool(ctx_guidance, threadpool);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct ggml_compute_threadpool * threadpool_batch = NULL;
|
struct ggml_compute_threadpool * threadpool_batch = NULL;
|
||||||
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
|
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
|
||||||
threadpool_batch = ggml_create_threadpool(&tpp_batch);
|
threadpool_batch = ggml_create_threadpool(&tpp_batch);
|
||||||
|
@ -253,6 +242,20 @@ int main(int argc, char ** argv) {
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
llama_attach_batch_threadpool(ctx_guidance, threadpool_batch);
|
llama_attach_batch_threadpool(ctx_guidance, threadpool_batch);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Start the non-batch threadpool in the paused state
|
||||||
|
tpp.paused = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
|
||||||
|
if (!threadpool) {
|
||||||
|
LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_attach_threadpool(ctx, threadpool);
|
||||||
|
if (ctx_guidance) {
|
||||||
|
llama_attach_threadpool(ctx_guidance, threadpool);
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
|
|
|
@ -633,6 +633,7 @@ extern "C" {
|
||||||
int32_t prio;
|
int32_t prio;
|
||||||
bool poll;
|
bool poll;
|
||||||
bool strict_cpu;
|
bool strict_cpu;
|
||||||
|
bool paused;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_compute_threadpool; // forward declaration, see ggml.c
|
struct ggml_compute_threadpool; // forward declaration, see ggml.c
|
||||||
|
|
|
@ -18885,14 +18885,27 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
|
||||||
GGML_ALIGNED_FREE(threadpool);
|
GGML_ALIGNED_FREE(threadpool);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef GGML_USE_OPENMP
|
||||||
|
// pause/resume must be called under mutex
|
||||||
|
static void __ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
|
||||||
|
GGML_PRINT_DEBUG("Pausing threadpool\n");
|
||||||
|
threadpool->pause = true;
|
||||||
|
ggml_cond_broadcast(&threadpool->cond);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
|
||||||
|
GGML_PRINT_DEBUG("Resuming threadpool\n");
|
||||||
|
threadpool->pause = false;
|
||||||
|
ggml_cond_broadcast(&threadpool->cond);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
|
void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
|
||||||
#ifndef GGML_USE_OPENMP
|
#ifndef GGML_USE_OPENMP
|
||||||
GGML_ASSERT(!threadpool->disposable);
|
GGML_ASSERT(!threadpool->disposable);
|
||||||
GGML_PRINT_DEBUG("Pausing threadpool\n");
|
|
||||||
ggml_mutex_lock(&threadpool->mutex);
|
ggml_mutex_lock(&threadpool->mutex);
|
||||||
if (!threadpool->pause) {
|
if (!threadpool->pause) {
|
||||||
threadpool->pause = true;
|
__ggml_pause_threadpool(threadpool);
|
||||||
ggml_cond_broadcast(&threadpool->cond);
|
|
||||||
}
|
}
|
||||||
ggml_mutex_unlock(&threadpool->mutex);
|
ggml_mutex_unlock(&threadpool->mutex);
|
||||||
#else
|
#else
|
||||||
|
@ -18903,12 +18916,9 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
|
||||||
void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
|
void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
|
||||||
#ifndef GGML_USE_OPENMP
|
#ifndef GGML_USE_OPENMP
|
||||||
GGML_ASSERT(!threadpool->disposable);
|
GGML_ASSERT(!threadpool->disposable);
|
||||||
GGML_PRINT_DEBUG("Resuming threadpool\n");
|
|
||||||
|
|
||||||
ggml_mutex_lock(&threadpool->mutex);
|
ggml_mutex_lock(&threadpool->mutex);
|
||||||
if (threadpool->pause) {
|
if (threadpool->pause) {
|
||||||
threadpool->pause = false;
|
__ggml_resume_threadpool(threadpool);
|
||||||
ggml_cond_broadcast(&threadpool->cond);
|
|
||||||
}
|
}
|
||||||
ggml_mutex_unlock(&threadpool->mutex);
|
ggml_mutex_unlock(&threadpool->mutex);
|
||||||
#else
|
#else
|
||||||
|
@ -19250,7 +19260,7 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl(
|
||||||
threadpool->n_barrier_passed = 0;
|
threadpool->n_barrier_passed = 0;
|
||||||
threadpool->current_chunk = 0;
|
threadpool->current_chunk = 0;
|
||||||
threadpool->stop = false;
|
threadpool->stop = false;
|
||||||
threadpool->pause = disposable ? false : true;
|
threadpool->pause = disposable ? false : tpp->paused;
|
||||||
threadpool->new_work = false;
|
threadpool->new_work = false;
|
||||||
threadpool->workers = NULL;
|
threadpool->workers = NULL;
|
||||||
threadpool->n_threads_max = tpp->n_threads;
|
threadpool->n_threads_max = tpp->n_threads;
|
||||||
|
@ -19340,9 +19350,10 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
||||||
struct ggml_threadpool_params ttp = {
|
struct ggml_threadpool_params ttp = {
|
||||||
.mask_specified = false,
|
.mask_specified = false,
|
||||||
.n_threads = n_threads,
|
.n_threads = n_threads,
|
||||||
.prio = 1,
|
.prio = 0,
|
||||||
.poll = false,
|
.poll = false,
|
||||||
.strict_cpu = false
|
.strict_cpu = false,
|
||||||
|
.paused = false
|
||||||
};
|
};
|
||||||
|
|
||||||
threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan);
|
threadpool = ggml_create_threadpool_impl(&ttp, true, cgraph, cplan);
|
||||||
|
@ -19396,10 +19407,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
||||||
if (!threadpool->poll) {
|
if (!threadpool->poll) {
|
||||||
ggml_mutex_lock(&threadpool->mutex);
|
ggml_mutex_lock(&threadpool->mutex);
|
||||||
threadpool->new_work = true;
|
threadpool->new_work = true;
|
||||||
|
if (threadpool->pause) {
|
||||||
|
__ggml_resume_threadpool(threadpool);
|
||||||
|
} else {
|
||||||
ggml_cond_broadcast(&threadpool->cond);
|
ggml_cond_broadcast(&threadpool->cond);
|
||||||
|
}
|
||||||
ggml_mutex_unlock(&threadpool->mutex);
|
ggml_mutex_unlock(&threadpool->mutex);
|
||||||
} else {
|
} else {
|
||||||
threadpool->new_work = true;
|
threadpool->new_work = true;
|
||||||
|
if (threadpool->pause) {
|
||||||
|
ggml_mutex_lock(&threadpool->mutex);
|
||||||
|
__ggml_resume_threadpool(threadpool);
|
||||||
|
ggml_mutex_unlock(&threadpool->mutex);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// this is a work thread too
|
// this is a work thread too
|
||||||
|
|
|
@ -15541,17 +15541,14 @@ static std::pair<int32_t, ggml_compute_threadpool_t> llama_swap_threadpools(
|
||||||
// Switch between the 2 threadpools as needed
|
// Switch between the 2 threadpools as needed
|
||||||
if (n_tokens > 1) {
|
if (n_tokens > 1) {
|
||||||
ggml_pause_threadpool(lctx.threadpool);
|
ggml_pause_threadpool(lctx.threadpool);
|
||||||
ggml_resume_threadpool(lctx.threadpool_batch);
|
|
||||||
threadpool = lctx.threadpool_batch;
|
threadpool = lctx.threadpool_batch;
|
||||||
n_threads = cparams.n_threads_batch;
|
n_threads = cparams.n_threads_batch;
|
||||||
} else {
|
} else {
|
||||||
ggml_pause_threadpool(lctx.threadpool_batch);
|
ggml_pause_threadpool(lctx.threadpool_batch);
|
||||||
ggml_resume_threadpool(lctx.threadpool);
|
|
||||||
threadpool = lctx.threadpool;
|
threadpool = lctx.threadpool;
|
||||||
n_threads = cparams.n_threads;
|
n_threads = cparams.n_threads;
|
||||||
}
|
}
|
||||||
} else if (lctx.threadpool) {
|
} else if (lctx.threadpool) {
|
||||||
ggml_resume_threadpool(lctx.threadpool);
|
|
||||||
threadpool = lctx.threadpool;
|
threadpool = lctx.threadpool;
|
||||||
n_threads = cparams.n_threads;
|
n_threads = cparams.n_threads;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue