From 12cc2de22ed429a45968d4a91699d19637429321 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Thu, 26 Sep 2024 09:17:51 -0700 Subject: [PATCH] Make contended mutexes 30% faster on aarch64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Raspberry Pi 5, benchmark_mu_contended takes 359µs in *NSYNC upstream and in Cosmopolitan it takes 272µs. --- third_party/nsync/README.cosmo | 4 + third_party/nsync/atomic.internal.h | 8 -- third_party/nsync/mu.c | 107 +++++++++++++++---------- third_party/nsync/mu_semaphore_futex.c | 19 +++-- 4 files changed, 81 insertions(+), 57 deletions(-) diff --git a/third_party/nsync/README.cosmo b/third_party/nsync/README.cosmo index bf0f49919..34eb18291 100644 --- a/third_party/nsync/README.cosmo +++ b/third_party/nsync/README.cosmo @@ -29,6 +29,10 @@ LOCAL CHANGES - Ensure resources such as POSIX semaphores are are released on fork. + - Make contended mutexes go 30% faster by using C11 atomics API. This + lets us use weak cas when appropriate. It also avoids a superfluous + relaxed load on failure. This mostly impacts aarch64, not x86_64. + - Modified *NSYNC to allocate waiter objects on the stack. We need it because we use *NSYNC mutexes to implement POSIX mutexes, which are too low-level to safely depend on malloc, or even mmap in our case. diff --git a/third_party/nsync/atomic.internal.h b/third_party/nsync/atomic.internal.h index 64c3c9412..1b9879f64 100644 --- a/third_party/nsync/atomic.internal.h +++ b/third_party/nsync/atomic.internal.h @@ -85,13 +85,6 @@ static inline int atm_cas_relacq_u32_(nsync_atomic_uint32_ *p, uint32_t o, memory_order_relaxed); } -static inline int atm_cas_seqcst_u32_(nsync_atomic_uint32_ *p, uint32_t o, - uint32_t n) { - return atomic_compare_exchange_strong_explicit(NSYNC_ATOMIC_UINT32_PTR_(p), - &o, n, memory_order_seq_cst, - memory_order_relaxed); -} - #define ATM_CAS_HELPER_(barrier, p, o, n) \ (atm_cas_##barrier##_u32_((p), (o), (n))) @@ -99,7 +92,6 @@ static inline int atm_cas_seqcst_u32_(nsync_atomic_uint32_ *p, uint32_t o, #define ATM_CAS_ACQ(p, o, n) ATM_CAS_HELPER_(acq, (p), (o), (n)) #define ATM_CAS_REL(p, o, n) ATM_CAS_HELPER_(rel, (p), (o), (n)) #define ATM_CAS_RELACQ(p, o, n) ATM_CAS_HELPER_(relacq, (p), (o), (n)) -#define ATM_CAS_SEQCST(p, o, n) ATM_CAS_HELPER_(seqcst, (p), (o), (n)) /* Need a cast to remove "const" from some uses. */ #define ATM_LOAD(p) \ diff --git a/third_party/nsync/mu.c b/third_party/nsync/mu.c index ed6a287ff..317add4fe 100644 --- a/third_party/nsync/mu.c +++ b/third_party/nsync/mu.c @@ -34,9 +34,11 @@ void nsync_mu_init (nsync_mu *mu) { /* Release the mutex spinlock. */ static void mu_release_spinlock (nsync_mu *mu) { - uint32_t old_word = ATM_LOAD (&mu->word); - while (!ATM_CAS_REL (&mu->word, old_word, old_word & ~MU_SPINLOCK)) { - old_word = ATM_LOAD (&mu->word); + uint32_t old_word = atomic_load_explicit (&mu->word, + memory_order_relaxed); + while (!atomic_compare_exchange_weak_explicit ( + &mu->word, &old_word, old_word & ~MU_SPINLOCK, + memory_order_release, memory_order_relaxed)) { } } @@ -68,15 +70,17 @@ void nsync_mu_lock_slow_ (nsync_mu *mu, waiter *w, uint32_t clear, lock_type *l_ if ((old_word & zero_to_acquire) == 0) { /* lock can be acquired; try to acquire, possibly clearing MU_DESIG_WAKER and MU_LONG_WAIT. */ - if (ATM_CAS_ACQ (&mu->word, old_word, - (old_word+l_type->add_to_acquire) & - ~(clear|long_wait|l_type->clear_on_acquire))) { + if (atomic_compare_exchange_weak_explicit (&mu->word, &old_word, + (old_word+l_type->add_to_acquire) & + ~(clear|long_wait|l_type->clear_on_acquire), + memory_order_acquire, memory_order_relaxed)) { break; } } else if ((old_word&MU_SPINLOCK) == 0 && - ATM_CAS_ACQ (&mu->word, old_word, - (old_word|MU_SPINLOCK|long_wait| - l_type->set_when_waiting) & ~(clear | MU_ALL_FALSE))) { + atomic_compare_exchange_weak_explicit (&mu->word, &old_word, + (old_word|MU_SPINLOCK|long_wait| + l_type->set_when_waiting) & ~(clear | MU_ALL_FALSE), + memory_order_acquire, memory_order_relaxed)) { /* Spinlock is now held, and lock is held by someone else; MU_WAITING has also been set; queue ourselves. @@ -133,13 +137,16 @@ void nsync_mu_lock_slow_ (nsync_mu *mu, waiter *w, uint32_t clear, lock_type *l_ int nsync_mu_trylock (nsync_mu *mu) { int result; IGNORE_RACES_START (); - if (ATM_CAS_ACQ (&mu->word, 0, MU_WADD_TO_ACQUIRE)) { /* acquire CAS */ + uint32_t old_word = 0; + if (atomic_compare_exchange_strong_explicit (&mu->word, &old_word, MU_WADD_TO_ACQUIRE, + memory_order_acquire, memory_order_relaxed)) { result = 1; } else { - uint32_t old_word = ATM_LOAD (&mu->word); result = ((old_word & MU_WZERO_TO_ACQUIRE) == 0 && - ATM_CAS_ACQ (&mu->word, old_word, - (old_word + MU_WADD_TO_ACQUIRE) & ~MU_WCLEAR_ON_ACQUIRE)); + atomic_compare_exchange_strong_explicit ( + &mu->word, &old_word, + (old_word + MU_WADD_TO_ACQUIRE) & ~MU_WCLEAR_ON_ACQUIRE, + memory_order_acquire, memory_order_relaxed)); } IGNORE_RACES_END (); return (result); @@ -148,11 +155,13 @@ int nsync_mu_trylock (nsync_mu *mu) { /* Block until *mu is free and then acquire it in writer mode. */ void nsync_mu_lock (nsync_mu *mu) { IGNORE_RACES_START (); - if (!ATM_CAS_ACQ (&mu->word, 0, MU_WADD_TO_ACQUIRE)) { /* acquire CAS */ - uint32_t old_word = ATM_LOAD (&mu->word); + uint32_t old_word = 0; + if (!atomic_compare_exchange_strong_explicit (&mu->word, &old_word, MU_WADD_TO_ACQUIRE, + memory_order_acquire, memory_order_relaxed)) { if ((old_word&MU_WZERO_TO_ACQUIRE) != 0 || - !ATM_CAS_ACQ (&mu->word, old_word, - (old_word+MU_WADD_TO_ACQUIRE) & ~MU_WCLEAR_ON_ACQUIRE)) { + !atomic_compare_exchange_strong_explicit (&mu->word, &old_word, + (old_word+MU_WADD_TO_ACQUIRE) & ~MU_WCLEAR_ON_ACQUIRE, + memory_order_acquire, memory_order_relaxed)) { LOCKTRACE("acquiring nsync_mu_lock(%t)...", mu); waiter *w = nsync_waiter_new_ (); nsync_mu_lock_slow_ (mu, w, 0, nsync_writer_type_); @@ -169,13 +178,15 @@ void nsync_mu_lock (nsync_mu *mu) { int nsync_mu_rtrylock (nsync_mu *mu) { int result; IGNORE_RACES_START (); - if (ATM_CAS_ACQ (&mu->word, 0, MU_RADD_TO_ACQUIRE)) { /* acquire CAS */ + uint32_t old_word = 0; + if (atomic_compare_exchange_strong_explicit (&mu->word, &old_word, MU_RADD_TO_ACQUIRE, + memory_order_acquire, memory_order_relaxed)) { result = 1; } else { - uint32_t old_word = ATM_LOAD (&mu->word); result = ((old_word&MU_RZERO_TO_ACQUIRE) == 0 && - ATM_CAS_ACQ (&mu->word, old_word, - (old_word+MU_RADD_TO_ACQUIRE) & ~MU_RCLEAR_ON_ACQUIRE)); + atomic_compare_exchange_strong_explicit (&mu->word, &old_word, + (old_word+MU_RADD_TO_ACQUIRE) & ~MU_RCLEAR_ON_ACQUIRE, + memory_order_acquire, memory_order_relaxed)); } IGNORE_RACES_END (); return (result); @@ -184,11 +195,13 @@ int nsync_mu_rtrylock (nsync_mu *mu) { /* Block until *mu can be acquired in reader mode and then acquire it. */ void nsync_mu_rlock (nsync_mu *mu) { IGNORE_RACES_START (); - if (!ATM_CAS_ACQ (&mu->word, 0, MU_RADD_TO_ACQUIRE)) { /* acquire CAS */ - uint32_t old_word = ATM_LOAD (&mu->word); + uint32_t old_word = 0; + if (!atomic_compare_exchange_strong_explicit (&mu->word, &old_word, MU_RADD_TO_ACQUIRE, + memory_order_acquire, memory_order_relaxed)) { if ((old_word&MU_RZERO_TO_ACQUIRE) != 0 || - !ATM_CAS_ACQ (&mu->word, old_word, - (old_word+MU_RADD_TO_ACQUIRE) & ~MU_RCLEAR_ON_ACQUIRE)) { + !atomic_compare_exchange_strong_explicit (&mu->word, &old_word, + (old_word+MU_RADD_TO_ACQUIRE) & ~MU_RCLEAR_ON_ACQUIRE, + memory_order_acquire, memory_order_relaxed)) { waiter *w = nsync_waiter_new_ (); nsync_mu_lock_slow_ (mu, w, 0, nsync_reader_type_); nsync_waiter_free_ (w); @@ -236,16 +249,16 @@ struct Dll *nsync_remove_from_mu_queue_ (struct Dll *mu_queue, struct Dll *e) { /* Record previous and next elements in the original queue. */ struct Dll *prev = e->prev; struct Dll *next = e->next; - uint32_t old_value; /* Remove. */ dll_remove (&mu_queue, e); - do { - old_value = ATM_LOAD (&DLL_WAITER (e)->remove_count); - } while (!ATM_CAS (&DLL_WAITER (e)->remove_count, old_value, old_value+1)); + uint32_t old_value = ATM_LOAD (&DLL_WAITER (e)->remove_count); + while (!atomic_compare_exchange_weak_explicit ( + &DLL_WAITER (e)->remove_count, &old_value, old_value+1, + memory_order_relaxed, memory_order_relaxed)) { + } if (!dll_is_empty (mu_queue)) { /* Fix up same_condition. */ struct Dll *e_same_condition = &DLL_WAITER (e)->same_condition; - if (e_same_condition->next != e_same_condition) { /* *e is linked to a same_condition neighbour---just remove it. */ e_same_condition->next->prev = e_same_condition->prev; @@ -290,14 +303,18 @@ void nsync_mu_unlock_slow_ (nsync_mu *mu, lock_type *l_type) { /* no one to wake, there's a designated waker waking up, there are still readers, or it's a reader and all waiters have false conditions */ - if (ATM_CAS_REL (&mu->word, old_word, - (old_word - l_type->add_to_acquire) & - ~l_type->clear_on_uncontended_release)) { + if (atomic_compare_exchange_weak_explicit ( + &mu->word, &old_word, + (old_word - l_type->add_to_acquire) & + ~l_type->clear_on_uncontended_release, + memory_order_release, memory_order_relaxed)) { return; } } else if ((old_word&MU_SPINLOCK) == 0 && - ATM_CAS_SEQCST (&mu->word, old_word, /* [jart] fixes issues on apple silicon */ - (old_word-early_release_mu)|MU_SPINLOCK|MU_DESIG_WAKER)) { + atomic_compare_exchange_weak_explicit ( + &mu->word, &old_word, + (old_word-early_release_mu)|MU_SPINLOCK|MU_DESIG_WAKER, + memory_order_acq_rel, memory_order_relaxed)) { struct Dll *wake; lock_type *wake_type; uint32_t clear_on_release; @@ -433,10 +450,10 @@ void nsync_mu_unlock_slow_ (nsync_mu *mu, lock_type *l_type) { whether any waiters remain, and whether any of them are writers. */ old_word = ATM_LOAD (&mu->word); - while (!ATM_CAS_REL (&mu->word, old_word, - ((old_word-late_release_mu)|set_on_release) & - ~clear_on_release)) { /* release CAS */ - old_word = ATM_LOAD (&mu->word); + while (!atomic_compare_exchange_weak_explicit ( + &mu->word, &old_word, + ((old_word - late_release_mu) | set_on_release) & ~clear_on_release, + memory_order_release, memory_order_relaxed)) { } /* Wake the waiters. */ for (p = dll_first (wake); p != NULL; p = next) { @@ -459,8 +476,10 @@ void nsync_mu_unlock (nsync_mu *mu) { waiter. Another thread could acquire, decrement a reference count and deallocate the mutex before the current thread touched the mutex word again. */ - if (!ATM_CAS_REL (&mu->word, MU_WLOCK, 0)) { - uint32_t old_word = ATM_LOAD (&mu->word); + uint32_t old_word = MU_WLOCK; + if (!atomic_compare_exchange_weak_explicit (&mu->word, &old_word, 0, + memory_order_release, + memory_order_relaxed)) { /* Clear MU_ALL_FALSE because the critical section we're just leaving may have made some conditions true. */ uint32_t new_word = (old_word - MU_WLOCK) & ~MU_ALL_FALSE; @@ -488,8 +507,10 @@ void nsync_mu_unlock (nsync_mu *mu) { void nsync_mu_runlock (nsync_mu *mu) { IGNORE_RACES_START (); /* See comment in nsync_mu_unlock(). */ - if (!ATM_CAS_REL (&mu->word, MU_RLOCK, 0)) { - uint32_t old_word = ATM_LOAD (&mu->word); + uint32_t old_word = MU_RLOCK; + if (!atomic_compare_exchange_weak_explicit (&mu->word, &old_word, 0, + memory_order_release, + memory_order_relaxed)) { /* Sanity check: mutex must not be held in write mode and reader count must not be 0. */ if (((old_word ^ MU_WLOCK) & (MU_WLOCK | MU_RLOCK_FIELD)) == 0) { diff --git a/third_party/nsync/mu_semaphore_futex.c b/third_party/nsync/mu_semaphore_futex.c index 863773da1..051f5e907 100644 --- a/third_party/nsync/mu_semaphore_futex.c +++ b/third_party/nsync/mu_semaphore_futex.c @@ -73,7 +73,10 @@ errno_t nsync_mu_semaphore_p_futex (nsync_semaphore *s) { result = ECANCELED; } } - } while (result == 0 && (i == 0 || !ATM_CAS_ACQ ((nsync_atomic_uint32_ *) &f->i, i, i-1))); + } while (result == 0 && (i == 0 || + !atomic_compare_exchange_weak_explicit ( + (nsync_atomic_uint32_ *) &f->i, &i, i-1, + memory_order_acquire, memory_order_relaxed))); return result; } @@ -118,16 +121,20 @@ errno_t nsync_mu_semaphore_p_with_deadline_futex (nsync_semaphore *s, int clock, result = ECANCELED; } } - } while (result == 0 && (i == 0 || !ATM_CAS_ACQ ((nsync_atomic_uint32_ *) &f->i, i, i - 1))); + } while (result == 0 && (i == 0 || + !atomic_compare_exchange_weak_explicit ( + (nsync_atomic_uint32_ *) &f->i, &i, i-1, + memory_order_acquire, memory_order_relaxed))); return (result); } /* Ensure that the count of *s is at least 1. */ void nsync_mu_semaphore_v_futex (nsync_semaphore *s) { struct futex *f = (struct futex *) s; - uint32_t old_value; - do { - old_value = ATM_LOAD ((nsync_atomic_uint32_ *) &f->i); - } while (!ATM_CAS_REL ((nsync_atomic_uint32_ *) &f->i, old_value, old_value+1)); + uint32_t old_value = ATM_LOAD ((nsync_atomic_uint32_ *) &f->i); + while (!atomic_compare_exchange_weak_explicit ( + (nsync_atomic_uint32_ *) &f->i, &old_value, old_value+1, + memory_order_release, memory_order_relaxed)) { + } ASSERT (nsync_futex_wake_ ((atomic_int *)&f->i, 1, PTHREAD_PROCESS_PRIVATE) >= 0); }