atomics: always use stdatomics with clang and use relaxed memory order when polling in ggml_barrier

This also removes sched_yield() calls from ggml_barrier() to match OpenMP behavior.
Max Krasnyansky 2024-08-05 14:25:49 -07:00 committed by fmz
parent 2953441563
commit 6fcc780b5f

@@ -69,23 +69,38 @@ int ggml_sve_cnt_b = 0;
 #endif
 #include <windows.h>
 
+#if !defined(__clang__)
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
 typedef atomic_int atomic_flag;
 
 #define ATOMIC_FLAG_INIT 0
 
+typedef enum {
+    memory_order_relaxed,
+    memory_order_consume,
+    memory_order_acquire,
+    memory_order_release,
+    memory_order_acq_rel,
+    memory_order_seq_cst
+} memory_order;
+
 static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);
 }
 static LONG atomic_load(atomic_int * ptr) {
     return InterlockedCompareExchange(ptr, 0, 0);
 }
+static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) {
+    // TODO: add support for explicit memory order
+    return InterlockedCompareExchange(ptr, 0, 0);
+}
 static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
     return InterlockedExchangeAdd(ptr, inc);
 }
-static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
-    return atomic_fetch_add(ptr, -(dec));
+static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) {
+    // TODO: add support for explicit memory order
+    return InterlockedExchangeAdd(ptr, inc);
 }
 static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
     return InterlockedExchange(ptr, 1);
@@ -93,6 +108,9 @@ static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
 static void atomic_flag_clear(atomic_flag * ptr) {
     InterlockedExchange(ptr, 0);
 }
+#else // clang
+#include <stdatomic.h>
+#endif
 
 typedef HANDLE pthread_t;
@@ -3030,6 +3048,19 @@ static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13");
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
+// Helpers for polling loops
+#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
+static inline void __cpu_relax(void) {
+    __asm__ volatile("yield" ::: "memory");
+}
+#elif defined(__x86_64__)
+static inline void __cpu_relax(void) {
+    _mm_pause();
+}
+#else
+static inline void __cpu_relax(void) {;}
+#endif
+
 //
 // NUMA support
 //
@@ -3094,25 +3125,19 @@ static void ggml_barrier(struct ggml_compute_threadpool * threadpool) {
     atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
 
     int n_threads = threadpool->n_threads_cur;
-    int passed_old = atomic_load(n_barrier_passed);
+    int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
 
     if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
         // last thread
         atomic_store(n_barrier, 0);
-        atomic_fetch_add(n_barrier_passed, 1);
+        atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed);
     } else {
         // wait for other threads
-        const int n_spin_before_sleep = 100000;
         while (true) {
-            for (int i = 0; i < n_spin_before_sleep; i++) {
-                if (atomic_load(n_barrier_passed) != passed_old) {
-                    return;
-                }
-            #if defined(__SSE3__)
-                _mm_pause();
-            #endif
+            if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) {
+                return;
             }
-            sched_yield();
+            __cpu_relax();
         }
     }
 }
@ -18800,18 +18825,6 @@ static bool __thread_priority(int32_t prio) {
#endif #endif
#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
static inline void __cpu_relax(void) {
__asm__ volatile("yield" ::: "memory");
}
#elif defined(__x86_64__)
static inline void __cpu_relax(void) {
_mm_pause();
}
#else
static inline void __cpu_relax(void) {;}
#endif
static void __cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) { static void __cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) {
if (!global_mask) { if (!global_mask) {
memset(local_mask, 1, GGML_MAX_N_THREADS); memset(local_mask, 1, GGML_MAX_N_THREADS);
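
For context, the polling pattern the new ggml_barrier() uses can be reproduced in a small standalone program. The sketch below is illustrative only and not part of this commit (the file name, thread count, and helper names are made up): threads bump an arrival counter, the last arrival increments a "passed" generation counter with relaxed ordering, and the others spin on relaxed loads with a CPU relax hint instead of calling sched_yield().

// barrier_sketch.c — illustrative only; build with: cc -O2 -pthread barrier_sketch.c
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#if defined(__x86_64__)
#include <immintrin.h>
static inline void cpu_relax(void) { _mm_pause(); }                            // x86: PAUSE hint
#elif defined(__aarch64__)
static inline void cpu_relax(void) { __asm__ volatile("yield" ::: "memory"); } // arm64: YIELD hint
#else
static inline void cpu_relax(void) { }                                         // fallback: plain busy-wait
#endif

#define N_THREADS 4

static atomic_int n_barrier        = 0; // threads that have arrived at the current barrier
static atomic_int n_barrier_passed = 0; // how many barriers have completed (generation counter)

static void barrier(void) {
    int passed_old = atomic_load_explicit(&n_barrier_passed, memory_order_relaxed);

    if (atomic_fetch_add(&n_barrier, 1) == N_THREADS - 1) {
        // last thread to arrive: reset the arrival count and open the barrier
        atomic_store(&n_barrier, 0);
        atomic_fetch_add_explicit(&n_barrier_passed, 1, memory_order_relaxed);
    } else {
        // other threads: poll with relaxed loads and a CPU relax hint (no sched_yield)
        while (atomic_load_explicit(&n_barrier_passed, memory_order_relaxed) == passed_old) {
            cpu_relax();
        }
    }
    // Note: relaxed ordering synchronizes only these counters; if data written
    // before the barrier must be visible after it, stronger ordering or an
    // explicit fence is needed.
}

static void * worker(void * arg) {
    int id = (int)(intptr_t) arg;
    for (int step = 0; step < 3; step++) {
        printf("thread %d reached step %d\n", id, step);
        barrier();
    }
    return NULL;
}

int main(void) {
    pthread_t threads[N_THREADS];
    for (int i = 0; i < N_THREADS; i++) {
        pthread_create(&threads[i], NULL, worker, (void *)(intptr_t) i);
    }
    for (int i = 0; i < N_THREADS; i++) {
        pthread_join(threads[i], NULL);
    }
    return 0;
}

With clang (including clang on Windows) the patch relies on <stdatomic.h> directly for these calls; the Interlocked*-based shims kept above the new #else branch apply only to non-clang Windows builds, which is why they gain *_explicit variants that currently ignore the requested memory order.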