From c6328bc0adc8ee412e51c288457c69aea8a6eca3 Mon Sep 17 00:00:00 2001
From: Max Krasnyansky
Date: Tue, 27 Aug 2024 18:55:59 -0700
Subject: [PATCH] threadpool: further api cleanup and prep for future
 refactoring

All threadpool related functions and structs use the ggml_threadpool prefix.
---
 examples/llama-bench/llama-bench.cpp |  4 +--
 examples/main/main.cpp               | 10 +++---
 ggml/include/ggml-backend.h          |  2 +-
 ggml/include/ggml.h                  | 18 +++++-----
 ggml/src/ggml-backend.c              |  6 ++--
 ggml/src/ggml.c                      | 54 ++++++++++++++--------------
 include/llama.h                      |  4 +--
 src/llama.cpp                        | 14 ++++----
 8 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index b6b1efe02..ce461333c 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -1531,7 +1531,7 @@ int main(int argc, char ** argv) {
         tpp.poll = t.poll;
         tpp.prio = params.prio;
 
-        struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp);
+        struct ggml_threadpool* threadpool = ggml_threadpool_create(&tpp);
         if (!threadpool) {
             LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
             exit(1);
@@ -1578,7 +1578,7 @@ int main(int argc, char ** argv) {
 
         llama_free(ctx);
 
-        ggml_release_threadpool(threadpool);
+        ggml_threadpool_release(threadpool);
     }
 
     llama_free_model(lmodel);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 0ccd0558f..4d8b02801 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -232,9 +232,9 @@ int main(int argc, char ** argv) {
 
     set_process_priority(params.cpuparams.priority);
 
-    struct ggml_compute_threadpool * threadpool_batch = NULL;
+    struct ggml_threadpool * threadpool_batch = NULL;
     if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
-        threadpool_batch = ggml_create_threadpool(&tpp_batch);
+        threadpool_batch = ggml_threadpool_create(&tpp_batch);
         if (!threadpool_batch) {
             LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
             exit(1);
@@ -244,7 +244,7 @@ int main(int argc, char ** argv) {
         tpp.paused = true;
     }
 
-    struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
+    struct ggml_threadpool * threadpool = ggml_threadpool_create(&tpp);
     if (!threadpool) {
         LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
         exit(1);
@@ -1023,8 +1023,8 @@ int main(int argc, char ** argv) {
     llama_sampling_free(ctx_sampling);
     llama_backend_free();
 
-    ggml_release_threadpool(threadpool);
-    ggml_release_threadpool(threadpool_batch);
+    ggml_threadpool_release(threadpool);
+    ggml_threadpool_release(threadpool_batch);
 
 #ifndef LOG_DISABLE_LOGS
     LOG_TEE("Log end\n");
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index c59f9f54a..e46121081 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -102,7 +102,7 @@ extern "C" {
     GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
     GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
-    GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool);
+    GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
     GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
 
     // Create a backend buffer from an existing pointer
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 1ced22eec..8b10e025a 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -645,9 +645,9 @@ extern "C" {
         bool paused; // start in paused state
     };
 
-    struct ggml_compute_threadpool; // forward declaration, see ggml.c
+    struct ggml_threadpool;         // forward declaration, see ggml.c
 
-    typedef struct ggml_compute_threadpool * ggml_compute_threadpool_t;
+    typedef struct ggml_threadpool * ggml_threadpool_t;
 
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
@@ -656,7 +656,7 @@ extern "C" {
         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
 
         int n_threads;
-        struct ggml_compute_threadpool * threadpool;
+        struct ggml_threadpool * threadpool;
 
         // abort ggml_graph_compute when true
         ggml_abort_callback abort_callback;
@@ -2039,18 +2039,18 @@ extern "C" {
     GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
     GGML_API void ggml_threadpool_params_init(struct ggml_threadpool_params *p, int n_threads);
     GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
-    GGML_API struct ggml_compute_threadpool* ggml_create_threadpool (struct ggml_threadpool_params * params);
-    GGML_API void ggml_release_threadpool (struct ggml_compute_threadpool * threadpool);
-    GGML_API int ggml_threadpool_get_n_threads(struct ggml_compute_threadpool * threadpool);
-    GGML_API void ggml_pause_threadpool (struct ggml_compute_threadpool * threadpool);
-    GGML_API void ggml_resume_threadpool (struct ggml_compute_threadpool * threadpool);
+    GGML_API struct ggml_threadpool* ggml_threadpool_create (struct ggml_threadpool_params * params);
+    GGML_API void ggml_threadpool_release (struct ggml_threadpool * threadpool);
+    GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
+    GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
+    GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
 
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
     GGML_API struct ggml_cplan ggml_graph_plan(
         const struct ggml_cgraph * cgraph,
         int n_threads, /* = GGML_DEFAULT_N_THREADS */
-        struct ggml_compute_threadpool * threadpool /* = NULL */ );
+        struct ggml_threadpool * threadpool /* = NULL */ );
     GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
 
     // same as ggml_graph_compute() but the work data is allocated as a part of the context
diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index 03e41a09c..99ec15a0f 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -723,7 +723,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 
 struct ggml_backend_cpu_context {
     int n_threads;
-    ggml_compute_threadpool_t threadpool;
+    ggml_threadpool_t threadpool;
 
     void * work_data;
     size_t work_size;
@@ -906,14 +906,14 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }
 
-void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_compute_threadpool_t threadpool) {
+void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
     GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
 
     struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
 
     if (ctx->threadpool && ctx->threadpool != threadpool) {
         // already had a different threadpool, pause/suspend it before switching
-        ggml_pause_threadpool(ctx->threadpool);
+        ggml_threadpool_pause(ctx->threadpool);
     }
     ctx->threadpool = threadpool;
 }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index f05f89a27..c8f6152e5 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1955,7 +1955,7 @@ typedef pthread_mutex_t ggml_mutex_t;
 #endif
 
 // Threadpool def
-struct ggml_compute_threadpool {
+struct ggml_threadpool {
     ggml_mutex_t mutex; // mutex for cond.var
     ggml_cond_t cond;   // cond.var for waiting for new work
@@ -1990,7 +1990,7 @@ struct ggml_compute_state {
     int last_graph;
     bool pending;
 #endif
-    struct ggml_compute_threadpool * threadpool;
+    struct ggml_threadpool * threadpool;
     int ith;
 };
@@ -2002,7 +2002,7 @@ struct ggml_compute_params {
     size_t wsize;
     void * wdata;
 
-    struct ggml_compute_threadpool * threadpool;
+    struct ggml_threadpool * threadpool;
 };
 
 //
@@ -3110,7 +3110,7 @@ inline static void ggml_critical_section_start(void) {
 }
 
 #ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_compute_threadpool * threadpool) {
+static void ggml_barrier(struct ggml_threadpool * threadpool) {
     if (threadpool->n_threads_cur == 1) {
         return;
     }
@@ -3118,7 +3118,7 @@ static void ggml_barrier(struct ggml_compute_threadpool * threadpool) {
 
 #pragma omp barrier
 }
 #else
-static void ggml_barrier(struct ggml_compute_threadpool * threadpool) {
+static void ggml_barrier(struct ggml_threadpool * threadpool) {
     if (threadpool->n_threads_cur == 1) {
         return;
     }
@@ -18837,7 +18837,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
     }
 }
 
-void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
+void ggml_threadpool_release(struct ggml_threadpool* threadpool) {
     if (!threadpool) return;
 
 #ifndef GGML_USE_OPENMP
@@ -18868,24 +18868,24 @@ void ggml_release_threadpool(struct ggml_compute_threadpool* threadpool) {
 
 #ifndef GGML_USE_OPENMP
 // pause/resume must be called under mutex
-static void ggml_pause_threadpool_locked(struct ggml_compute_threadpool * threadpool) {
+static void ggml_threadpool_pause_locked(struct ggml_threadpool * threadpool) {
     GGML_PRINT_DEBUG("Pausing threadpool\n");
     threadpool->pause = true;
     ggml_cond_broadcast(&threadpool->cond);
 }
 
-static void ggml_resume_threadpool_locked(struct ggml_compute_threadpool * threadpool) {
+static void ggml_threadpool_resume_locked(struct ggml_threadpool * threadpool) {
     GGML_PRINT_DEBUG("Resuming threadpool\n");
     threadpool->pause = false;
     ggml_cond_broadcast(&threadpool->cond);
 }
 #endif
 
-void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
+void ggml_threadpool_pause(struct ggml_threadpool * threadpool) {
 #ifndef GGML_USE_OPENMP
     ggml_mutex_lock(&threadpool->mutex);
     if (!threadpool->pause) {
-        ggml_pause_threadpool_locked(threadpool);
+        ggml_threadpool_pause_locked(threadpool);
     }
     ggml_mutex_unlock(&threadpool->mutex);
 #else
@@ -18893,11 +18893,11 @@ void ggml_pause_threadpool(struct ggml_compute_threadpool * threadpool) {
 #endif
 }
 
-void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
+void ggml_threadpool_resume(struct ggml_threadpool * threadpool) {
 #ifndef GGML_USE_OPENMP
     ggml_mutex_lock(&threadpool->mutex);
     if (threadpool->pause) {
-        ggml_resume_threadpool_locked(threadpool);
+        ggml_threadpool_resume_locked(threadpool);
     }
     ggml_mutex_unlock(&threadpool->mutex);
 #else
@@ -18908,7 +18908,7 @@ void ggml_resume_threadpool(struct ggml_compute_threadpool * threadpool) {
 
 struct ggml_cplan ggml_graph_plan(
         const struct ggml_cgraph * cgraph,
         int n_threads,
-        struct ggml_compute_threadpool * threadpool) {
+        struct ggml_threadpool * threadpool) {
 
     if (threadpool == NULL) {
         GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
@@ -19119,7 +19119,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 #ifndef GGML_USE_OPENMP
 
 static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
-    struct ggml_compute_threadpool * threadpool = state->threadpool;
+    struct ggml_threadpool * threadpool = state->threadpool;
 
     if (state->pending || threadpool->stop || threadpool->pause) {
         return true;
     }
@@ -19134,7 +19134,7 @@ static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
 }
 
 static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
-    struct ggml_compute_threadpool * threadpool = state->threadpool;
+    struct ggml_threadpool * threadpool = state->threadpool;
 
     // This seems to make 0 ... 100 a decent range for polling level across modern processors.
     // Perhaps, we can adjust it dynamically based on load and things.
@@ -19149,7 +19149,7 @@ static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state *
 }
 
 static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
-    struct ggml_compute_threadpool * threadpool = state->threadpool;
+    struct ggml_threadpool * threadpool = state->threadpool;
 
     if (ggml_graph_compute_poll_for_work(state)) {
         return state->pending;
@@ -19168,7 +19168,7 @@ static inline bool ggml_graph_compute_check_for_work(struct ggml_compute_state *
 
 static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-    struct ggml_compute_threadpool * threadpool = state->threadpool;
+    struct ggml_threadpool * threadpool = state->threadpool;
 
     ggml_thread_apply_priority(threadpool->prio);
     if (ggml_thread_cpumask_is_valid(state->cpumask)) {
@@ -19205,7 +19205,7 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
 }
 
 // Start processing new graph
-static void ggml_graph_compute_kickoff(struct ggml_compute_threadpool * threadpool)
+static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool)
 {
     // always take the mutex here because the worker threads are doing hybrid poll/wait
 
@@ -19221,7 +19221,7 @@ static void ggml_graph_compute_kickoff(struct ggml_compute_threadpo
         }
 
         // resume does cond broadcast
-        ggml_resume_threadpool_locked(threadpool);
+        ggml_threadpool_resume_locked(threadpool);
     } else {
         ggml_cond_broadcast(&threadpool->cond);
     }
@@ -19254,13 +19254,13 @@ bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, cons
     return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0;
 }
 
-static struct ggml_compute_threadpool * ggml_create_threadpool_impl(
+static struct ggml_threadpool * ggml_threadpool_create_impl(
     struct ggml_threadpool_params * tpp,
     struct ggml_cgraph * cgraph,
     struct ggml_cplan * cplan) {
 
-    struct ggml_compute_threadpool * threadpool =
-        GGML_ALIGNED_MALLOC(sizeof(struct ggml_compute_threadpool));
+    struct ggml_threadpool * threadpool =
+        GGML_ALIGNED_MALLOC(sizeof(struct ggml_threadpool));
     {
         threadpool->cgraph = cgraph;
         threadpool->cplan = cplan;
@@ -19320,8 +19320,8 @@ static struct ggml_compute_threadpool * ggml_create_threadpool_impl(
     return threadpool;
 }
 
-struct ggml_compute_threadpool * ggml_create_threadpool(struct ggml_threadpool_params * tpp) {
-    return ggml_create_threadpool_impl(tpp, NULL, NULL);
+struct ggml_threadpool * ggml_threadpool_create(struct ggml_threadpool_params * tpp) {
+    return ggml_threadpool_create_impl(tpp, NULL, NULL);
 }
 
 enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
@@ -19330,7 +19330,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL);
 
     int n_threads = cplan->n_threads;
-    struct ggml_compute_threadpool * threadpool = cplan->threadpool;
+    struct ggml_threadpool * threadpool = cplan->threadpool;
 
     bool disposable_threadpool = false;
 
@@ -19339,7 +19339,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         disposable_threadpool = true;
 
         struct ggml_threadpool_params ttp = ggml_threadpool_params_default(n_threads);
-        threadpool = ggml_create_threadpool_impl(&ttp, cgraph, cplan);
+        threadpool = ggml_threadpool_create_impl(&ttp, cgraph, cplan);
     } else {
         // Reset some of the parameters that need resetting
        // No worker threads should be accessing the parameters below at this stage
@@ -19384,7 +19384,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
     enum ggml_status ret = threadpool->ec;
 
     if (disposable_threadpool) {
-        ggml_release_threadpool(threadpool);
+        ggml_threadpool_release(threadpool);
     }
 
     return ret;
diff --git a/include/llama.h b/include/llama.h
index c03c4929b..b969689be 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -431,8 +431,8 @@ extern "C" {
     // Optional: an auto threadpool gets created in ggml if not passed explicitly
     LLAMA_API void llama_attach_threadpool(
             struct llama_context * ctx,
-            ggml_compute_threadpool_t threadpool,
-            ggml_compute_threadpool_t threadpool_batch);
+            ggml_threadpool_t threadpool,
+            ggml_threadpool_t threadpool_batch);
     LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
 
     // Call once at the end of the program - currently only used for MPI
diff --git a/src/llama.cpp b/src/llama.cpp
index 57e765ce0..9bf6e22af 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3091,8 +3091,8 @@ struct llama_context {
 #endif
     ggml_backend_t backend_cpu = nullptr;
 
-    ggml_compute_threadpool_t threadpool = nullptr;
-    ggml_compute_threadpool_t threadpool_batch = nullptr;
+    ggml_threadpool_t threadpool = nullptr;
+    ggml_threadpool_t threadpool_batch = nullptr;
 
     bool has_evaluated_once = false;
 
@@ -15500,7 +15500,7 @@ static void llama_graph_compute(
         llama_context & lctx,
         ggml_cgraph * gf,
         int n_threads,
-        ggml_compute_threadpool * threadpool) {
+        ggml_threadpool * threadpool) {
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(lctx.backend_metal)) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -15630,7 +15630,7 @@ static int llama_decode_internal(
     }
 
     int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
-    ggml_compute_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+    ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
 
     GGML_ASSERT(n_threads > 0);
 
@@ -15871,7 +15871,7 @@ static int llama_encode_internal(
     lctx.n_outputs = n_tokens;
 
     int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
-    ggml_compute_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+    ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
 
     GGML_ASSERT(n_threads > 0);
 
@@ -17462,8 +17462,8 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
 
 void llama_attach_threadpool(
         struct llama_context * ctx,
-        ggml_compute_threadpool_t threadpool,
-        ggml_compute_threadpool_t threadpool_batch) {
+        ggml_threadpool_t threadpool,
+        ggml_threadpool_t threadpool_batch) {
     ctx->threadpool = threadpool;
     ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
 }
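
---

Usage note (not part of the patch): the sketch below shows the renamed API end to end, for reviewers who want the new names in one place. Only the threadpool and graph calls are taken from the headers above; the helper name `compute_with_pool`, the thread count, and the error handling are illustrative assumptions, and the construction of `graph` is elided.

    #include <stdint.h>
    #include <stdlib.h>
    #include "ggml.h"

    // Hypothetical helper: run one graph on a persistent threadpool.
    // `graph` is assumed to be a fully constructed ggml_cgraph.
    static enum ggml_status compute_with_pool(struct ggml_cgraph * graph) {
        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8); // 8 threads: arbitrary
        struct ggml_threadpool * tp = ggml_threadpool_create(&tpp);            // was ggml_create_threadpool()
        if (!tp) {
            return GGML_STATUS_FAILED;
        }

        // Plan against this pool; passing NULL here instead would make
        // ggml_graph_compute() create a disposable pool internally.
        struct ggml_cplan plan = ggml_graph_plan(graph, tpp.n_threads, tp);
        if (plan.work_size > 0) {
            plan.work_data = malloc(plan.work_size); // caller allocates work_data
        }

        enum ggml_status status = ggml_graph_compute(graph, &plan);

        // Stop the workers from polling between graphs; the next graph
        // kickoff resumes them (see ggml_graph_compute_kickoff above).
        ggml_threadpool_pause(tp);                   // was ggml_pause_threadpool()

        free(plan.work_data);
        ggml_threadpool_release(tp);                 // was ggml_release_threadpool()
        return status;
    }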