From 307fece5d76a204ef668613f4e19f8e321f71149 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Tue, 20 Aug 2024 18:43:39 -0700 Subject: [PATCH] threadpool: use relaxed order for chunk sync A full memory barrier is overkill for this since each thread works on a different chunk --- ggml/src/ggml.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b1400923c..15448a633 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -88,6 +88,10 @@ typedef enum { static void atomic_store(atomic_int * ptr, LONG val) { InterlockedExchange(ptr, val); } +static void atomic_store_explicit(atomic_int * ptr, LONG val, memory_order mo) { + // TODO: add support for explicit memory order + InterlockedExchange(ptr, val); +} static LONG atomic_load(atomic_int * ptr) { return InterlockedCompareExchange(ptr, 0, 0); } @@ -12472,7 +12476,7 @@ UseGgmlGemm1:; if (ith == 0) { // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. - atomic_store(&params->threadpool->current_chunk, nth); + atomic_store_explicit(&params->threadpool->current_chunk, nth, memory_order_relaxed); } ggml_barrier(params->threadpool); @@ -12583,7 +12587,7 @@ UseGgmlGemm2:; break; } - current_chunk = atomic_fetch_add(&params->threadpool->current_chunk, 1); + current_chunk = atomic_fetch_add_explicit(&params->threadpool->current_chunk, 1, memory_order_relaxed); } }