threadpool: add support for hybrid polling
poll params (--poll, ...) now specify the "polling level", i.e. how aggressively we poll before waiting on the condition variable. poll=0 means no polling, 1 means poll for 128K rounds then wait, 2 for 256K rounds, and so on. The default value of 50 (i.e. 50x128K rounds) seems like a decent default across modern platforms. We can tune this further as things evolve.
This commit is contained in:
parent
494e27c793
commit
b630acdb73
4 changed files with 58 additions and 48 deletions
|
@ -1717,7 +1717,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
|
||||||
options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
|
options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
|
||||||
options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
|
options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
|
||||||
options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
|
options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
|
||||||
options.push_back({ "*", " --poll <0|1>", "use polling to wait for work (default: %u)\n", (unsigned) params.cpuparams.poll});
|
options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
|
||||||
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
|
options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
|
||||||
options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
|
options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
|
||||||
options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi",
|
options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi",
|
||||||
|
|
|
@ -73,7 +73,7 @@ struct cpu_params {
|
||||||
bool mask_valid = false; // Default: any CPU
|
bool mask_valid = false; // Default: any CPU
|
||||||
int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
int32_t priority = 0; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
|
||||||
bool strict_cpu = false; // Use strict CPU placement
|
bool strict_cpu = false; // Use strict CPU placement
|
||||||
bool poll = true; // Use polling (busywait) to wait for work (default matches OpenMP)
|
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
||||||
};
|
};
|
||||||
|
|
||||||
struct gpt_params {
|
struct gpt_params {
|
||||||
|
|
|
@ -627,13 +627,13 @@ extern "C" {
|
||||||
typedef bool (*ggml_abort_callback)(void * data);
|
typedef bool (*ggml_abort_callback)(void * data);
|
||||||
|
|
||||||
struct ggml_threadpool_params {
|
struct ggml_threadpool_params {
|
||||||
bool cpumask[GGML_MAX_N_THREADS];
|
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores
|
||||||
bool mask_specified;
|
bool mask_specified; // mask is non-empty
|
||||||
int32_t n_threads;
|
int32_t n_threads; // number of threads
|
||||||
int32_t prio;
|
int32_t prio; // thread priority
|
||||||
bool poll;
|
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
|
||||||
bool strict_cpu;
|
bool strict_cpu; // strict cpu placement
|
||||||
bool paused;
|
bool paused; // start in paused state
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_compute_threadpool; // forward declaration, see ggml.c
|
struct ggml_compute_threadpool; // forward declaration, see ggml.c
|
||||||
|
|
|
@ -1973,7 +1973,7 @@ struct ggml_compute_threadpool {
|
||||||
|
|
||||||
int32_t prio; // Scheduling priority
|
int32_t prio; // Scheduling priority
|
||||||
bool disposable; // Doesn't initialize a conv-var
|
bool disposable; // Doesn't initialize a conv-var
|
||||||
bool poll; // Use polling (busywait) // TODO
|
uint32_t poll; // Polling level (0 - no polling)
|
||||||
|
|
||||||
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
|
ggml_abort_callback abort_callback; // abort ggml_graph_compute when true
|
||||||
void * abort_callback_data;
|
void * abort_callback_data;
|
||||||
|
@ -19156,35 +19156,50 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#ifndef GGML_USE_OPENMP
|
#ifndef GGML_USE_OPENMP
|
||||||
|
|
||||||
|
static inline bool ggml_graph_compute_got_work(struct ggml_compute_state *state) {
|
||||||
|
struct ggml_compute_threadpool * threadpool = state->threadpool;
|
||||||
|
return (threadpool->new_work && state->ith < threadpool->n_threads_cur);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool ggml_graph_compute_ready(struct ggml_compute_state * state) {
|
||||||
|
struct ggml_compute_threadpool * threadpool = state->threadpool;
|
||||||
|
if (threadpool->stop || threadpool->pause) return true;
|
||||||
|
return ggml_graph_compute_got_work(state);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline bool ggml_graph_compute_poll_for_work(struct ggml_compute_state * state) {
|
||||||
|
struct ggml_compute_threadpool * threadpool = state->threadpool;
|
||||||
|
|
||||||
|
// This seems to make 0 ... 100 a decent range for polling level across modern processors.
|
||||||
|
// Perhaps, we can adjust it dynamically based on load and things.
|
||||||
|
const uint64_t n_rounds = 1024UL * 128 * threadpool->poll;
|
||||||
|
|
||||||
|
for (uint64_t i=0; !ggml_graph_compute_ready(state) && i<n_rounds; i++) {
|
||||||
|
// No new work. Keep polling.
|
||||||
|
__cpu_relax();
|
||||||
|
}
|
||||||
|
|
||||||
|
return ggml_graph_compute_got_work(state);
|
||||||
|
}
|
||||||
|
|
||||||
static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
|
static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state) {
|
||||||
struct ggml_compute_threadpool * threadpool = state->threadpool;
|
struct ggml_compute_threadpool * threadpool = state->threadpool;
|
||||||
|
|
||||||
if (threadpool->poll) {
|
if (ggml_graph_compute_poll_for_work(state)) {
|
||||||
while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) ||
|
return ggml_graph_compute_got_work(state);
|
||||||
threadpool->stop ||
|
|
||||||
threadpool->pause
|
|
||||||
)
|
|
||||||
) {
|
|
||||||
// No new work. Yield and keep polling.
|
|
||||||
__cpu_relax();
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
ggml_mutex_lock_shared(&threadpool->mutex);
|
ggml_mutex_lock_shared(&threadpool->mutex);
|
||||||
while (!((threadpool->new_work && state->ith < threadpool->n_threads_cur) ||
|
while (!ggml_graph_compute_ready(state)) {
|
||||||
threadpool->stop ||
|
|
||||||
threadpool->pause
|
|
||||||
)
|
|
||||||
) {
|
|
||||||
// No new work. Wait for the signal.
|
// No new work. Wait for the signal.
|
||||||
|
GGML_PRINT_DEBUG("thread #%d waiting for work\n", state->ith);
|
||||||
ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
|
ggml_cond_wait(&threadpool->cond, &threadpool->mutex);
|
||||||
}
|
}
|
||||||
ggml_mutex_unlock_shared(&threadpool->mutex);
|
ggml_mutex_unlock_shared(&threadpool->mutex);
|
||||||
}
|
|
||||||
return threadpool->new_work;
|
return ggml_graph_compute_got_work(state);
|
||||||
}
|
}
|
||||||
|
|
||||||
static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
|
static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
|
||||||
|
@ -19404,24 +19419,19 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
|
||||||
__thread_affinity(threadpool->workers[0].cpumask);
|
__thread_affinity(threadpool->workers[0].cpumask);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!threadpool->poll) {
|
// always take the mutex here because the worker threads are doing hybrid poll/wait
|
||||||
|
|
||||||
ggml_mutex_lock(&threadpool->mutex);
|
ggml_mutex_lock(&threadpool->mutex);
|
||||||
threadpool->new_work = true;
|
threadpool->new_work = true;
|
||||||
if (threadpool->pause) {
|
if (!threadpool->pause) {
|
||||||
__ggml_resume_threadpool(threadpool);
|
|
||||||
} else {
|
|
||||||
ggml_cond_broadcast(&threadpool->cond);
|
ggml_cond_broadcast(&threadpool->cond);
|
||||||
}
|
|
||||||
ggml_mutex_unlock(&threadpool->mutex);
|
|
||||||
} else {
|
} else {
|
||||||
threadpool->new_work = true;
|
// resume does cond broadcast
|
||||||
if (threadpool->pause) {
|
|
||||||
ggml_mutex_lock(&threadpool->mutex);
|
|
||||||
__ggml_resume_threadpool(threadpool);
|
__ggml_resume_threadpool(threadpool);
|
||||||
|
}
|
||||||
ggml_mutex_unlock(&threadpool->mutex);
|
ggml_mutex_unlock(&threadpool->mutex);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
// this is a work thread too
|
// this is a work thread too
|
||||||
ggml_graph_compute_thread(&threadpool->workers[0]);
|
ggml_graph_compute_thread(&threadpool->workers[0]);
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue