atomics: always use stdatomics with clang and use relaxed memory order when polling in ggml_barrier

This also removes sched_yield() calls from ggml_barrier() to match OpenMP behavior.
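
For reference, the polling scheme ggml_barrier() moves to here can be sketched as a small standalone C11 barrier: each waiter snapshots a generation counter, the last arriving thread bumps it, and everyone else polls it with relaxed atomic loads plus a CPU relax hint instead of sched_yield(). This is a minimal sketch of the pattern, not the ggml code itself; the names spin_barrier, spin_barrier_wait, and cpu_relax are illustrative only.

// Sketch of a counter-based spin barrier with relaxed polling (illustrative names).
#include <stdatomic.h>

#if defined(__x86_64__)
#include <immintrin.h>   // _mm_pause()
#endif

typedef struct {
    atomic_int n_barrier;        // number of threads that have arrived
    atomic_int n_barrier_passed; // generation counter, bumped by the last thread
    int        n_threads;        // total threads participating in the barrier
} spin_barrier;

static inline void cpu_relax(void) {
#if defined(__x86_64__)
    _mm_pause();                             // x86 spin-wait hint
#elif defined(__aarch64__)
    __asm__ volatile("yield" ::: "memory");  // AArch64 spin-wait hint
#endif
}

static void spin_barrier_wait(spin_barrier * b) {
    // snapshot the generation counter before announcing arrival
    int passed_old = atomic_load_explicit(&b->n_barrier_passed, memory_order_relaxed);

    if (atomic_fetch_add(&b->n_barrier, 1) == b->n_threads - 1) {
        // last thread: reset the arrival counter and release the waiters
        atomic_store(&b->n_barrier, 0);
        atomic_fetch_add_explicit(&b->n_barrier_passed, 1, memory_order_relaxed);
    } else {
        // other threads: poll the generation counter with relaxed loads and a
        // CPU hint only -- no sched_yield(), matching OpenMP-style busy waiting
        while (atomic_load_explicit(&b->n_barrier_passed, memory_order_relaxed) == passed_old) {
            cpu_relax();
        }
    }
}

Each of the n_threads workers would call spin_barrier_wait() at the synchronization point; waiting on a generation counter rather than on n_barrier returning to zero is what lets the barrier be reused immediately for the next phase.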
Max Krasnyansky 2024-08-05 14:25:49 -07:00 committed by fmz
parent 2953441563
commit 6fcc780b5f


@@ -69,23 +69,38 @@ int ggml_sve_cnt_b = 0;
 #endif
 #include <windows.h>
+#if !defined(__clang__)
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
 typedef atomic_int atomic_flag;
 #define ATOMIC_FLAG_INIT 0
+typedef enum {
+    memory_order_relaxed,
+    memory_order_consume,
+    memory_order_acquire,
+    memory_order_release,
+    memory_order_acq_rel,
+    memory_order_seq_cst
+} memory_order;
 static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);
 }
 static LONG atomic_load(atomic_int * ptr) {
     return InterlockedCompareExchange(ptr, 0, 0);
 }
+static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) {
+    // TODO: add support for explicit memory order
+    return InterlockedCompareExchange(ptr, 0, 0);
+}
 static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
     return InterlockedExchangeAdd(ptr, inc);
 }
-static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
-    return atomic_fetch_add(ptr, -(dec));
+static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) {
+    // TODO: add support for explicit memory order
+    return InterlockedExchangeAdd(ptr, inc);
 }
 static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
     return InterlockedExchange(ptr, 1);
@@ -93,6 +108,9 @@ static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
 static void atomic_flag_clear(atomic_flag * ptr) {
     InterlockedExchange(ptr, 0);
 }
+#else // clang
+#include <stdatomic.h>
+#endif
 typedef HANDLE pthread_t;
@@ -3030,6 +3048,19 @@ static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13");
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
+// Helpers for polling loops
+#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
+static inline void __cpu_relax(void) {
+    __asm__ volatile("yield" ::: "memory");
+}
+#elif defined(__x86_64__)
+static inline void __cpu_relax(void) {
+    _mm_pause();
+}
+#else
+static inline void __cpu_relax(void) {;}
+#endif
 //
 // NUMA support
 //
@@ -3094,25 +3125,19 @@ static void ggml_barrier(struct ggml_compute_threadpool * threadpool) {
     atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
     int n_threads = threadpool->n_threads_cur;
-    int passed_old = atomic_load(n_barrier_passed);
+    int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
     if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
         // last thread
         atomic_store(n_barrier, 0);
-        atomic_fetch_add(n_barrier_passed, 1);
+        atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed);
     } else {
         // wait for other threads
-        const int n_spin_before_sleep = 100000;
         while (true) {
-            for (int i = 0; i < n_spin_before_sleep; i++) {
-                if (atomic_load(n_barrier_passed) != passed_old) {
-                    return;
-                }
-            #if defined(__SSE3__)
-                _mm_pause();
-            #endif
+            if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) {
+                return;
             }
-            sched_yield();
+            __cpu_relax();
         }
     }
 }
@@ -18800,18 +18825,6 @@ static bool __thread_priority(int32_t prio) {
 #endif
-#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
-static inline void __cpu_relax(void) {
-    __asm__ volatile("yield" ::: "memory");
-}
-#elif defined(__x86_64__)
-static inline void __cpu_relax(void) {
-    _mm_pause();
-}
-#else
-static inline void __cpu_relax(void) {;}
-#endif
 static void __cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) {
     if (!global_mask) {
         memset(local_mask, 1, GGML_MAX_N_THREADS);