threadpool: use relaxed order for chunk sync

A full memory barrier is overkill here, since each thread works on a different chunk.
This commit is contained in:
Max Krasnyansky 2024-08-20 18:43:39 -07:00 committed by fmz
parent db45b6d3a9
commit 307fece5d7

View file

@ -88,6 +88,10 @@ typedef enum {
static void atomic_store(atomic_int * ptr, LONG val) { static void atomic_store(atomic_int * ptr, LONG val) {
InterlockedExchange(ptr, val); InterlockedExchange(ptr, val);
} }
// Fallback for C11 atomic_store_explicit built on the Interlocked API: stores `val` into `*ptr`.
// The `mo` argument is accepted so call sites can use the C11 signature, but it is
// currently ignored: every call performs an InterlockedExchange regardless of the order requested.
static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) {
// TODO: add support for explicit memory order
// NOTE(review): InterlockedExchange is documented as a full-barrier operation, so a
// memory_order_relaxed request gets stronger ordering than asked for (never weaker) — confirm against the Win32 docs.
InterlockedExchange(ptr, val);
}
static LONG atomic_load(atomic_int * ptr) { static LONG atomic_load(atomic_int * ptr) {
return InterlockedCompareExchange(ptr, 0, 0); return InterlockedCompareExchange(ptr, 0, 0);
} }
@ -12472,7 +12476,7 @@ UseGgmlGemm1:;
if (ith == 0) { if (ith == 0) {
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
atomic_store(&params->threadpool->current_chunk, nth); atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed);
} }
ggml_barrier(params->threadpool); ggml_barrier(params->threadpool);
@ -12583,7 +12587,7 @@ UseGgmlGemm2:;
break; break;
} }
current_chunk = atomic_fetch_add(&params->threadpool->current_chunk, 1); current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1, memory_order_relaxed);
} }
} }