atomics: always use stdatomics with clang and use relaxed memory order when polling in ggml_barrier
This also removes sched_yield() calls from ggml_barrier() to match OpenMP behavior.
This commit is contained in:
parent 2953441563
commit 6fcc780b5f

1 changed file with 38 additions and 25 deletions
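A note on the title: in C11, plain atomic_load()/atomic_fetch_add() default to memory_order_seq_cst, while the _explicit variants with memory_order_relaxed guarantee only the atomicity of the individual access and impose no ordering on surrounding loads and stores. A minimal illustration of the two forms (not code from this commit; the counter name is made up):

    #include <stdatomic.h>

    atomic_int counter;

    int load_default(void) {
        // equivalent to atomic_load_explicit(&counter, memory_order_seq_cst)
        return atomic_load(&counter);
    }

    int load_relaxed(void) {
        // atomic read only; other memory operations may be reordered around it,
        // which is sufficient when all the loop does is wait for the value to change
        return atomic_load_explicit(&counter, memory_order_relaxed);
    }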
@@ -69,23 +69,38 @@ int ggml_sve_cnt_b = 0;
 #endif
 
 #include <windows.h>
 
+#if !defined(__clang__)
 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
 typedef atomic_int atomic_flag;
 
 #define ATOMIC_FLAG_INIT 0
 
+typedef enum {
+    memory_order_relaxed,
+    memory_order_consume,
+    memory_order_acquire,
+    memory_order_release,
+    memory_order_acq_rel,
+    memory_order_seq_cst
+} memory_order;
+
 static void atomic_store(atomic_int * ptr, LONG val) {
     InterlockedExchange(ptr, val);
 }
 static LONG atomic_load(atomic_int * ptr) {
     return InterlockedCompareExchange(ptr, 0, 0);
 }
+static LONG atomic_load_explicit(atomic_int * ptr, memory_order mo) {
+    // TODO: add support for explicit memory order
+    return InterlockedCompareExchange(ptr, 0, 0);
+}
 static LONG atomic_fetch_add(atomic_int * ptr, LONG inc) {
     return InterlockedExchangeAdd(ptr, inc);
 }
-static LONG atomic_fetch_sub(atomic_int * ptr, LONG dec) {
-    return atomic_fetch_add(ptr, -(dec));
+static LONG atomic_fetch_add_explicit(atomic_int * ptr, LONG inc, memory_order mo) {
+    // TODO: add support for explicit memory order
+    return InterlockedExchangeAdd(ptr, inc);
 }
 static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
     return InterlockedExchange(ptr, 1);
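With the #if !defined(__clang__) guard added above (closed by the #else // clang / #include <stdatomic.h> branch in the next hunk), the Interlocked*-based shim is only compiled for MSVC proper, while clang, including clang-cl on Windows, gets the real C11 header. Call sites can then be written once against the C11 names; a rough sketch, with a hypothetical counter that is not part of the diff:

    // Under MSVC this resolves to the InterlockedExchangeAdd()-based shim above;
    // under clang it is the <stdatomic.h> function. The shim currently ignores
    // the memory-order argument, as its TODO comments note.
    static atomic_int n_ready;

    static void mark_ready(void) {
        atomic_fetch_add_explicit(&n_ready, 1, memory_order_relaxed);
    }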
@@ -93,6 +108,9 @@ static atomic_bool atomic_flag_test_and_set(atomic_flag * ptr) {
 static void atomic_flag_clear(atomic_flag * ptr) {
     InterlockedExchange(ptr, 0);
 }
+#else // clang
+#include <stdatomic.h>
+#endif
 
 typedef HANDLE pthread_t;
 
@@ -3030,6 +3048,19 @@ static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13");
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
+// Helpers for polling loops
+#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
+static inline void __cpu_relax(void) {
+    __asm__ volatile("yield" ::: "memory");
+}
+#elif defined(__x86_64__)
+static inline void __cpu_relax(void) {
+    _mm_pause();
+}
+#else
+static inline void __cpu_relax(void) {;}
+#endif
+
 //
 // NUMA support
 //
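The yield (AArch64) and pause (x86, via _mm_pause()) instructions emitted by __cpu_relax() are spin-wait hints to the core: they reduce power draw and pipeline contention while busy-waiting, but unlike sched_yield() they never enter the kernel or give up the time slice. The intended usage pattern is roughly the following sketch, where the ready flag and the waiting function are illustrative only:

    #include <stdatomic.h>

    // Busy-wait until another thread sets `ready`, issuing a CPU-level relax
    // hint (the __cpu_relax() helper defined above) on every iteration instead
    // of a sched_yield() syscall.
    static atomic_int ready;

    static void wait_for_ready(void) {
        while (atomic_load_explicit(&ready, memory_order_relaxed) == 0) {
            __cpu_relax();
        }
    }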
@@ -3094,25 +3125,19 @@ static void ggml_barrier(struct ggml_compute_threadpool * threadpool) {
     atomic_int * n_barrier_passed = &threadpool->n_barrier_passed;
 
     int n_threads = threadpool->n_threads_cur;
-    int passed_old = atomic_load(n_barrier_passed);
+    int passed_old = atomic_load_explicit(n_barrier_passed, memory_order_relaxed);
 
     if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) {
         // last thread
         atomic_store(n_barrier, 0);
-        atomic_fetch_add(n_barrier_passed, 1);
+        atomic_fetch_add_explicit(n_barrier_passed, 1, memory_order_relaxed);
     } else {
         // wait for other threads
-        const int n_spin_before_sleep = 100000;
         while (true) {
-            for (int i = 0; i < n_spin_before_sleep; i++) {
-                if (atomic_load(n_barrier_passed) != passed_old) {
-                    return;
-                }
-            #if defined(__SSE3__)
-                _mm_pause();
-            #endif
+            if (atomic_load_explicit(n_barrier_passed, memory_order_relaxed) != passed_old) {
+                return;
             }
-            sched_yield();
+            __cpu_relax();
         }
     }
 }
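Taken out of context, the barrier is a pass counter: each arriving thread increments n_barrier; the thread whose increment returns n_threads - 1 is the last to arrive, so it resets n_barrier and bumps n_barrier_passed, while every other thread spins until n_barrier_passed moves past the value it read on entry. The following is a small self-contained sketch of the same pattern with C11 atomics and pthreads; the fixed thread count, the names and the toy main() are made up for illustration, and the ggml threadpool plumbing (including a changing n_threads_cur) is deliberately left out:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define N_THREADS 4
    #define N_ROUNDS  1000

    static atomic_int n_barrier;
    static atomic_int n_barrier_passed;

    static void barrier(void) {
        int passed_old = atomic_load_explicit(&n_barrier_passed, memory_order_relaxed);

        if (atomic_fetch_add(&n_barrier, 1) == N_THREADS - 1) {
            // last thread to arrive: reset the arrival counter and open the gate
            atomic_store(&n_barrier, 0);
            atomic_fetch_add_explicit(&n_barrier_passed, 1, memory_order_relaxed);
        } else {
            // spin until the pass counter moves past the value seen on entry
            while (atomic_load_explicit(&n_barrier_passed, memory_order_relaxed) == passed_old) {
                // a cpu-relax hint would go here in the real code
            }
        }
    }

    static void * worker(void * arg) {
        (void) arg;
        for (int i = 0; i < N_ROUNDS; i++) {
            barrier();
        }
        return NULL;
    }

    int main(void) {
        pthread_t threads[N_THREADS];
        for (int i = 0; i < N_THREADS; i++) {
            pthread_create(&threads[i], NULL, worker, NULL);
        }
        for (int i = 0; i < N_THREADS; i++) {
            pthread_join(threads[i], NULL);
        }
        printf("passes: %d (expected %d)\n", atomic_load(&n_barrier_passed), N_ROUNDS);
        return 0;
    }

Built with cc -pthread, the final count should equal N_ROUNDS: exactly one thread per round increments the pass counter, and the relaxed polling load only has to eventually observe that increment, which is all the spin loop relies on.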
@@ -18800,18 +18825,6 @@ static bool __thread_priority(int32_t prio) {
 
 #endif
 
-#if defined(__aarch64__) && ( defined(__clang__) || defined(__GNUC__) )
-static inline void __cpu_relax(void) {
-    __asm__ volatile("yield" ::: "memory");
-}
-#elif defined(__x86_64__)
-static inline void __cpu_relax(void) {
-    _mm_pause();
-}
-#else
-static inline void __cpu_relax(void) {;}
-#endif
-
 static void __cpumask_next(const bool * global_mask, bool * local_mask, bool strict, int32_t* iter) {
     if (!global_mask) {
         memset(local_mask, 1, GGML_MAX_N_THREADS);