ggml: create thread pool lazily
parent 1f0bccb279
commit 01046648cf

1 changed file with 59 additions and 18 deletions:

ggml.c (77 lines changed)
@@ -16215,8 +16215,11 @@ static void clear_numa_thread_affinity(void) {}
 #endif
 
 struct ggml_compute_state_shared {
     const struct ggml_cgraph * cgraph;
     const struct ggml_cplan * cplan;
 
+    struct ggml_compute_state * workers;
+    bool workers_created;
+
     int64_t perf_node_start_cycles;
     int64_t perf_node_start_time_us;
@@ -16246,6 +16249,8 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
     node->perf_time_us += time_us_cur;
 }
 
+void ggml_create_workers(struct ggml_compute_state_shared * state_shared);
+
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
 
@@ -16264,7 +16269,23 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
             state->shared->node_n += 1;
             return (thread_ret_t) GGML_EXIT_ABORTED;
         }
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
+
+        int n_active;
+        if (!state->shared->workers_created) {
+            // if the worker pool has not yet been created:
+            // there is only a single active thread
+            n_active = 1;
+        } else if (node_n == -1) {
+            // if the worker pool has been created by another thread and this is the first iteration:
+            // go straight to the else block as if the thread had been spinning all along
+            n_active = -1;
+        } else {
+            // if the worker pool has been created and this is not the first iteration:
+            // decrement the number of active threads and start spinning if there are still other active threads
+            n_active = atomic_fetch_sub(&state->shared->n_active, 1);
+        }
+
+        if (n_active == 1) {
             // all other threads are finished and spinning
             // do finalize and init here so we don't have synchronize again
             struct ggml_compute_params params = {
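What this hunk changes: n_active counts the threads still working on the current node, and the thread that brings it down to 1 is the last one active, so it runs the serial finalize/init section alone. The new branches only decide what that count should be when no worker pool exists yet, or when a worker spawned mid-graph enters the loop for the first time (node_n == -1). A minimal standalone sketch of that decision in plain C11 follows; none of the names below are ggml identifiers.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

// illustrative stand-in for the shared compute state
typedef struct {
    atomic_int n_active;        // threads still working on the current node
    bool       workers_created; // set once the worker pool has been spawned
} pool_state;

// true if the calling thread is the last active one and should run the
// serial finalize/init work for the next node by itself
static bool is_last_active(pool_state * ps, int node_n) {
    int n_active;
    if (!ps->workers_created) {
        n_active = 1;   // no pool yet: this thread is alone by definition
    } else if (node_n == -1) {
        n_active = -1;  // worker spawned mid-graph, first iteration: act as if it had been spinning
    } else {
        n_active = atomic_fetch_sub(&ps->n_active, 1);
    }
    return n_active == 1;
}

int main(void) {
    pool_state ps = { .n_active = 1, .workers_created = false };
    printf("last active: %d\n", is_last_active(&ps, 0)); // 1: the single-thread case
    return 0;
}

Note that the single-thread and late-start branches never touch the atomic, which is what lets this loop run before any worker thread exists.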
@@ -16316,6 +16337,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 
                     ggml_graph_compute_perf_stats_node(node, state->shared);
                 } else {
+                    // lazily create worker pool only once there is a node with >1 tasks
+                    if (!state->shared->workers_created) {
+                        state->shared->workers_created = true;
+                        ggml_create_workers(state->shared);
+                    }
                     break;
                 }
 
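This is where the pool actually becomes lazy: the running thread spawns the workers only when it reaches the first node whose task count is greater than 1, so graphs that never need more than one thread never pay for thread creation. A standalone sketch of that guard, using plain pthreads instead of ggml's thread wrappers (all names illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum { N_THREADS = 4 };

typedef struct {
    pthread_t thrd[N_THREADS];
    bool      workers_created;
} pool;

static void * worker_main(void * arg) {
    (void) arg;   // a real worker would spin on the shared state here
    return NULL;
}

// called by the only running thread the first time a node needs > 1 tasks
static void ensure_workers(pool * p) {
    if (!p->workers_created) {
        p->workers_created = true;   // flag first, as in the hunk above
        for (int j = 1; j < N_THREADS; ++j) {
            pthread_create(&p->thrd[j], NULL, worker_main, NULL);
        }
    }
}

int main(void) {
    pool p = { .workers_created = false };
    ensure_workers(&p);   // spawns N_THREADS-1 workers exactly once
    ensure_workers(&p);   // no-op on every later call
    for (int j = 1; j < N_THREADS; ++j) {
        pthread_join(p.thrd[j], NULL);
    }
    printf("workers created: %d\n", p.workers_created);
    return 0;
}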
@@ -16727,6 +16753,16 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                 } break;
         }
 
+        bool node_and_src_all_cpu = node->backend == GGML_BACKEND_CPU;
+        for (int j = 0; node_and_src_all_cpu && j < GGML_MAX_SRC; ++j) {
+            if (node->src[j] != NULL && node->src[j]->backend != GGML_BACKEND_CPU) {
+                node_and_src_all_cpu = false;
+            }
+        }
+        if (!node_and_src_all_cpu) {
+            n_tasks = 1;
+        }
+
         cplan.n_tasks[i] = n_tasks;
     }
 
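This hunk is on the planning side: if a node or any of its sources is not on the CPU backend, its work is clamped to a single task, which also means a fully offloaded graph never triggers worker creation at all. The check is a plain all-of loop over the source array; a standalone sketch with illustrative types (not the ggml definitions):

#include <stdbool.h>
#include <stddef.h>

#define MAX_SRC 6

enum backend { BACKEND_CPU, BACKEND_GPU };

// illustrative stand-ins for a graph node and its source tensors
struct node {
    enum backend  backend;
    struct node * src[MAX_SRC];
};

// true only if the node itself and every non-NULL source live on the CPU
static bool node_and_src_all_cpu(const struct node * n) {
    bool all_cpu = n->backend == BACKEND_CPU;
    for (int j = 0; all_cpu && j < MAX_SRC; ++j) {
        if (n->src[j] != NULL && n->src[j]->backend != BACKEND_CPU) {
            all_cpu = false;
        }
    }
    return all_cpu;
}

int main(void) {
    struct node offloaded = { .backend = BACKEND_GPU };
    struct node mixed     = { .backend = BACKEND_CPU, .src = { &offloaded } };
    return node_and_src_all_cpu(&mixed) ? 0 : 1;   // 1: a GPU source forces n_tasks = 1
}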
@@ -16741,6 +16777,22 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
     return cplan;
 }
 
+void ggml_create_workers(struct ggml_compute_state_shared * state_shared) {
+    if (state_shared->n_threads > 1) {
+        for (int j = 1; j < state_shared->n_threads; ++j) {
+            state_shared->workers[j] = (struct ggml_compute_state) {
+                .thrd = 0,
+                .ith = j,
+                .shared = state_shared,
+            };
+
+            const int rc = ggml_thread_create(&state_shared->workers[j].thrd, NULL,
+                ggml_graph_compute_thread, &state_shared->workers[j]);
+            GGML_ASSERT(rc == 0);
+        }
+    }
+}
+
 int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     {
         GGML_ASSERT(cplan);
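The pool-creation loop that used to live in ggml_graph_compute moves into this helper and now writes into the workers array reachable through the shared state, so the thread that meets the first multi-task node can spawn the pool without access to ggml_graph_compute's locals. A standalone sketch of that shape, with pthreads and illustrative struct names:

#include <pthread.h>
#include <stdio.h>

// illustrative shapes only: each worker holds a thread handle, its index and a
// pointer back to the shared state; the shared state points at worker storage
// that the caller owns (in ggml_graph_compute it lives on the caller's stack)
struct shared;

struct worker {
    pthread_t       thrd;
    int             ith;
    struct shared * shared;
};

struct shared {
    struct worker * workers;
    int             n_threads;
};

static void * worker_main(void * arg) {
    struct worker * w = arg;
    printf("worker %d running\n", w->ith);
    return NULL;
}

// mirrors the shape of the new helper: spawn n_threads-1 workers into the
// storage reachable through the shared state
static void create_workers(struct shared * s) {
    for (int j = 1; j < s->n_threads; ++j) {
        s->workers[j] = (struct worker) { .thrd = 0, .ith = j, .shared = s };
        pthread_create(&s->workers[j].thrd, NULL, worker_main, &s->workers[j]);
    }
}

int main(void) {
    struct worker workers[4];                                   // caller-owned storage
    struct shared s = { .workers = workers, .n_threads = 4 };
    create_workers(&s);
    for (int j = 1; j < s.n_threads; ++j) {
        pthread_join(workers[j].thrd, NULL);
    }
    return 0;
}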
@@ -16759,9 +16811,12 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
 
     const int n_threads = cplan->n_threads;
 
+    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
     struct ggml_compute_state_shared state_shared = {
         /*.cgraph =*/ cgraph,
         /*.cgraph_plan =*/ cplan,
+        /*.workers =*/ workers,
+        /*.workers_created =*/ false,
         /*.perf_node_start_cycles =*/ 0,
         /*.perf_node_start_time_us =*/ 0,
         /*.n_threads =*/ n_threads,
@@ -16770,21 +16825,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         /*.abort_callback =*/ NULL,
         /*.abort_callback_data =*/ NULL,
     };
-    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
-
-    // create thread pool
-    if (n_threads > 1) {
-        for (int j = 1; j < n_threads; ++j) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd = 0,
-                .ith = j,
-                .shared = &state_shared,
-            };
-
-            const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-        }
-    }
+
     workers[0].ith = 0;
     workers[0].shared = &state_shared;
 
@@ -16798,7 +16839,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
     clear_numa_thread_affinity();
 
     // join or kill thread pool
-    if (n_threads > 1) {
+    if (n_threads > 1 && state_shared.workers_created) {
         for (int j = 1; j < n_threads; j++) {
             const int rc = ggml_thread_join(workers[j].thrd, NULL);
             GGML_ASSERT(rc == 0);
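Teardown has to mirror the lazy start: ggml_graph_compute now joins the workers only when state_shared.workers_created says they were actually spawned, since joining thread handles that were never created would be undefined. A minimal sketch of the guarded join (illustrative names, pthreads):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum { N_THREADS = 4 };

typedef struct {
    pthread_t thrd[N_THREADS];
    bool      workers_created;
} pool;

int main(void) {
    pool p = { .workers_created = false };

    // ... run the graph: with only single-task nodes nothing ever sets
    // workers_created, so no worker threads exist ...

    // join or kill thread pool, but only if it was ever created
    if (N_THREADS > 1 && p.workers_created) {
        for (int j = 1; j < N_THREADS; ++j) {
            pthread_join(p.thrd[j], NULL);
        }
    }
    printf("joined workers: %s\n", p.workers_created ? "yes" : "no");
    return 0;
}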