threadpool: further simplify and improve ggml_barrier
Avoid using strict memory order while polling, yet make sure that all threads go through a full memory barrier (memory fence) on ggml_barrier entrance and exit.
This commit is contained in:
parent
2bd9f47800
commit
ed094a5211
1 changed file with 18 additions and 24 deletions
@@ -3178,42 +3178,36 @@ inline static void ggml_critical_section_start(void) {
     }
 }
 
-#ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_threadpool * threadpool) {
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
+static void ggml_barrier(struct ggml_threadpool * tp) {
+    int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
     if (n_threads == 1) {
         return;
     }
 
+#ifdef GGML_USE_OPENMP
     #pragma omp barrier
-}
 #else
-static void ggml_barrier(struct ggml_threadpool * threadpool) {
-    int n_threads = atomic_load_explicit(&threadpool->n_threads_cur, memory_order_relaxed);
-    if (n_threads == 1) {
-        return;
-    }
-
-    atomic_int * n_barrier = &threadpool->n_barrier;
-    atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
+    int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed);
 
-    int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
+    // enter barrier (full seq-cst fence)
+    int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst);
 
-    if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
+    int last = 0;
+    if (n_barrier == (n_threads - 1)) {
         // last thread
-        atomic_store(n_barrier, 0);
-        atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed);
+        atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed);
+        last = 1;
     } else {
         // wait for other threads
-        while (true) {
-            if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) {
-                return;
-            }
+        while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) {
             ggml_thread_cpu_relax();
         }
     }
-}
+
+    // exit barrier (full seq-cst fence)
+    atomic_fetch_add_explicit(&tp->n_barrier_passed, last, memory_order_seq_cst);
 #endif
+}
 
 // TODO: make this somehow automatically executed
 //       some sort of "sentry" mechanism
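For readers following the memory-ordering argument, here is a standalone C11 sketch of the same counter-based barrier pattern. It is illustrative only, not the commit's code: demo_barrier, N_THREADS, and the pthread driver are made up for the example. The only sequentially consistent operations are the fetch-adds on entry and exit, so every thread passes a full fence at both points; the wait loop in between polls with relaxed loads, which is the point of the change.

// Standalone sketch of the barrier pattern above (hypothetical names;
// not the commit's code). Build with: cc -std=c11 -pthread demo.c
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

#define N_THREADS 4

static atomic_int n_barrier        = 0;  // threads that have arrived
static atomic_int n_barrier_passed = 0;  // completed barrier generations

static void demo_barrier(void) {
    // snapshot the generation counter; relaxed is enough because the
    // seq-cst RMW below orders this load for us
    int n_passed = atomic_load_explicit(&n_barrier_passed, memory_order_relaxed);

    // enter barrier (full seq-cst fence)
    int n = atomic_fetch_add_explicit(&n_barrier, 1, memory_order_seq_cst);

    int last = 0;
    if (n == N_THREADS - 1) {
        // last thread to arrive: reset the arrival count for the next use
        atomic_store_explicit(&n_barrier, 0, memory_order_relaxed);
        last = 1;
    } else {
        // cheap relaxed polling until the last thread bumps the generation
        while (atomic_load_explicit(&n_barrier_passed, memory_order_relaxed) == n_passed) {
            // spin (the real code issues a cpu-relax hint here)
        }
    }

    // exit barrier (full seq-cst fence); adds 1 for the last thread and 0
    // for the rest, so everyone executes the same fencing RMW on the way out
    atomic_fetch_add_explicit(&n_barrier_passed, last, memory_order_seq_cst);
}

static void * worker(void * arg) {
    (void) arg;
    demo_barrier();
    printf("thread passed the barrier\n");
    return NULL;
}

int main(void) {
    pthread_t threads[N_THREADS];
    for (int i = 0; i < N_THREADS; i++) {
        pthread_create(&threads[i], NULL, worker, NULL);
    }
    for (int i = 0; i < N_THREADS; i++) {
        pthread_join(&threads[i], NULL);
    }
    return 0;
}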