Make contended mutexes 30% faster on aarch64

On Raspberry Pi 5, benchmark_mu_contended takes 359µs in *NSYNC upstream
and in Cosmopolitan it takes 272µs.
This commit is contained in:
Justine Tunney 2024-09-26 09:17:51 -07:00
parent 70603fa6ea
commit 12cc2de22e
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
4 changed files with 81 additions and 57 deletions

View file

@@ -29,6 +29,10 @@ LOCAL CHANGES
- Ensure resources such as POSIX semaphores are released on fork. - Ensure resources such as POSIX semaphores are released on fork.
- Make contended mutexes go 30% faster by using C11 atomics API. This
lets us use weak cas when appropriate. It also avoids a superfluous
relaxed load on failure. This mostly impacts aarch64, not x86_64.
- Modified *NSYNC to allocate waiter objects on the stack. We need it - Modified *NSYNC to allocate waiter objects on the stack. We need it
because we use *NSYNC mutexes to implement POSIX mutexes, which are because we use *NSYNC mutexes to implement POSIX mutexes, which are
too low-level to safely depend on malloc, or even mmap in our case. too low-level to safely depend on malloc, or even mmap in our case.

View file

@@ -85,13 +85,6 @@ static inline int atm_cas_relacq_u32_(nsync_atomic_uint32_ *p, uint32_t o,
memory_order_relaxed); memory_order_relaxed);
} }
static inline int atm_cas_seqcst_u32_(nsync_atomic_uint32_ *p, uint32_t o,
uint32_t n) {
return atomic_compare_exchange_strong_explicit(NSYNC_ATOMIC_UINT32_PTR_(p),
&o, n, memory_order_seq_cst,
memory_order_relaxed);
}
#define ATM_CAS_HELPER_(barrier, p, o, n) \ #define ATM_CAS_HELPER_(barrier, p, o, n) \
(atm_cas_##barrier##_u32_((p), (o), (n))) (atm_cas_##barrier##_u32_((p), (o), (n)))
@@ -99,7 +92,6 @@ static inline int atm_cas_seqcst_u32_(nsync_atomic_uint32_ *p, uint32_t o,
#define ATM_CAS_ACQ(p, o, n) ATM_CAS_HELPER_(acq, (p), (o), (n)) #define ATM_CAS_ACQ(p, o, n) ATM_CAS_HELPER_(acq, (p), (o), (n))
#define ATM_CAS_REL(p, o, n) ATM_CAS_HELPER_(rel, (p), (o), (n)) #define ATM_CAS_REL(p, o, n) ATM_CAS_HELPER_(rel, (p), (o), (n))
#define ATM_CAS_RELACQ(p, o, n) ATM_CAS_HELPER_(relacq, (p), (o), (n)) #define ATM_CAS_RELACQ(p, o, n) ATM_CAS_HELPER_(relacq, (p), (o), (n))
#define ATM_CAS_SEQCST(p, o, n) ATM_CAS_HELPER_(seqcst, (p), (o), (n))
/* Need a cast to remove "const" from some uses. */ /* Need a cast to remove "const" from some uses. */
#define ATM_LOAD(p) \ #define ATM_LOAD(p) \

107
third_party/nsync/mu.c vendored
View file

@@ -34,9 +34,11 @@ void nsync_mu_init (nsync_mu *mu) {
/* Release the mutex spinlock. */ /* Release the mutex spinlock. */
static void mu_release_spinlock (nsync_mu *mu) { static void mu_release_spinlock (nsync_mu *mu) {
uint32_t old_word = ATM_LOAD (&mu->word); uint32_t old_word = atomic_load_explicit (&mu->word,
while (!ATM_CAS_REL (&mu->word, old_word, old_word & ~MU_SPINLOCK)) { memory_order_relaxed);
old_word = ATM_LOAD (&mu->word); while (!atomic_compare_exchange_weak_explicit (
&mu->word, &old_word, old_word & ~MU_SPINLOCK,
memory_order_release, memory_order_relaxed)) {
} }
} }
@@ -68,15 +70,17 @@ void nsync_mu_lock_slow_ (nsync_mu *mu, waiter *w, uint32_t clear, lock_type *l_
if ((old_word & zero_to_acquire) == 0) { if ((old_word & zero_to_acquire) == 0) {
/* lock can be acquired; try to acquire, possibly /* lock can be acquired; try to acquire, possibly
clearing MU_DESIG_WAKER and MU_LONG_WAIT. */ clearing MU_DESIG_WAKER and MU_LONG_WAIT. */
if (ATM_CAS_ACQ (&mu->word, old_word, if (atomic_compare_exchange_weak_explicit (&mu->word, &old_word,
(old_word+l_type->add_to_acquire) & (old_word+l_type->add_to_acquire) &
~(clear|long_wait|l_type->clear_on_acquire))) { ~(clear|long_wait|l_type->clear_on_acquire),
memory_order_acquire, memory_order_relaxed)) {
break; break;
} }
} else if ((old_word&MU_SPINLOCK) == 0 && } else if ((old_word&MU_SPINLOCK) == 0 &&
ATM_CAS_ACQ (&mu->word, old_word, atomic_compare_exchange_weak_explicit (&mu->word, &old_word,
(old_word|MU_SPINLOCK|long_wait| (old_word|MU_SPINLOCK|long_wait|
l_type->set_when_waiting) & ~(clear | MU_ALL_FALSE))) { l_type->set_when_waiting) & ~(clear | MU_ALL_FALSE),
memory_order_acquire, memory_order_relaxed)) {
/* Spinlock is now held, and lock is held by someone /* Spinlock is now held, and lock is held by someone
else; MU_WAITING has also been set; queue ourselves. else; MU_WAITING has also been set; queue ourselves.
@@ -133,13 +137,16 @@ void nsync_mu_lock_slow_ (nsync_mu *mu, waiter *w, uint32_t clear, lock_type *l_
int nsync_mu_trylock (nsync_mu *mu) { int nsync_mu_trylock (nsync_mu *mu) {
int result; int result;
IGNORE_RACES_START (); IGNORE_RACES_START ();
if (ATM_CAS_ACQ (&mu->word, 0, MU_WADD_TO_ACQUIRE)) { /* acquire CAS */ uint32_t old_word = 0;
if (atomic_compare_exchange_strong_explicit (&mu->word, &old_word, MU_WADD_TO_ACQUIRE,
memory_order_acquire, memory_order_relaxed)) {
result = 1; result = 1;
} else { } else {
uint32_t old_word = ATM_LOAD (&mu->word);
result = ((old_word & MU_WZERO_TO_ACQUIRE) == 0 && result = ((old_word & MU_WZERO_TO_ACQUIRE) == 0 &&
ATM_CAS_ACQ (&mu->word, old_word, atomic_compare_exchange_strong_explicit (
(old_word + MU_WADD_TO_ACQUIRE) & ~MU_WCLEAR_ON_ACQUIRE)); &mu->word, &old_word,
(old_word + MU_WADD_TO_ACQUIRE) & ~MU_WCLEAR_ON_ACQUIRE,
memory_order_acquire, memory_order_relaxed));
} }
IGNORE_RACES_END (); IGNORE_RACES_END ();
return (result); return (result);
@@ -148,11 +155,13 @@ int nsync_mu_trylock (nsync_mu *mu) {
/* Block until *mu is free and then acquire it in writer mode. */ /* Block until *mu is free and then acquire it in writer mode. */
void nsync_mu_lock (nsync_mu *mu) { void nsync_mu_lock (nsync_mu *mu) {
IGNORE_RACES_START (); IGNORE_RACES_START ();
if (!ATM_CAS_ACQ (&mu->word, 0, MU_WADD_TO_ACQUIRE)) { /* acquire CAS */ uint32_t old_word = 0;
uint32_t old_word = ATM_LOAD (&mu->word); if (!atomic_compare_exchange_strong_explicit (&mu->word, &old_word, MU_WADD_TO_ACQUIRE,
memory_order_acquire, memory_order_relaxed)) {
if ((old_word&MU_WZERO_TO_ACQUIRE) != 0 || if ((old_word&MU_WZERO_TO_ACQUIRE) != 0 ||
!ATM_CAS_ACQ (&mu->word, old_word, !atomic_compare_exchange_strong_explicit (&mu->word, &old_word,
(old_word+MU_WADD_TO_ACQUIRE) & ~MU_WCLEAR_ON_ACQUIRE)) { (old_word+MU_WADD_TO_ACQUIRE) & ~MU_WCLEAR_ON_ACQUIRE,
memory_order_acquire, memory_order_relaxed)) {
LOCKTRACE("acquiring nsync_mu_lock(%t)...", mu); LOCKTRACE("acquiring nsync_mu_lock(%t)...", mu);
waiter *w = nsync_waiter_new_ (); waiter *w = nsync_waiter_new_ ();
nsync_mu_lock_slow_ (mu, w, 0, nsync_writer_type_); nsync_mu_lock_slow_ (mu, w, 0, nsync_writer_type_);
@@ -169,13 +178,15 @@ void nsync_mu_lock (nsync_mu *mu) {
int nsync_mu_rtrylock (nsync_mu *mu) { int nsync_mu_rtrylock (nsync_mu *mu) {
int result; int result;
IGNORE_RACES_START (); IGNORE_RACES_START ();
if (ATM_CAS_ACQ (&mu->word, 0, MU_RADD_TO_ACQUIRE)) { /* acquire CAS */ uint32_t old_word = 0;
if (atomic_compare_exchange_strong_explicit (&mu->word, &old_word, MU_RADD_TO_ACQUIRE,
memory_order_acquire, memory_order_relaxed)) {
result = 1; result = 1;
} else { } else {
uint32_t old_word = ATM_LOAD (&mu->word);
result = ((old_word&MU_RZERO_TO_ACQUIRE) == 0 && result = ((old_word&MU_RZERO_TO_ACQUIRE) == 0 &&
ATM_CAS_ACQ (&mu->word, old_word, atomic_compare_exchange_strong_explicit (&mu->word, &old_word,
(old_word+MU_RADD_TO_ACQUIRE) & ~MU_RCLEAR_ON_ACQUIRE)); (old_word+MU_RADD_TO_ACQUIRE) & ~MU_RCLEAR_ON_ACQUIRE,
memory_order_acquire, memory_order_relaxed));
} }
IGNORE_RACES_END (); IGNORE_RACES_END ();
return (result); return (result);
@@ -184,11 +195,13 @@ int nsync_mu_rtrylock (nsync_mu *mu) {
/* Block until *mu can be acquired in reader mode and then acquire it. */ /* Block until *mu can be acquired in reader mode and then acquire it. */
void nsync_mu_rlock (nsync_mu *mu) { void nsync_mu_rlock (nsync_mu *mu) {
IGNORE_RACES_START (); IGNORE_RACES_START ();
if (!ATM_CAS_ACQ (&mu->word, 0, MU_RADD_TO_ACQUIRE)) { /* acquire CAS */ uint32_t old_word = 0;
uint32_t old_word = ATM_LOAD (&mu->word); if (!atomic_compare_exchange_strong_explicit (&mu->word, &old_word, MU_RADD_TO_ACQUIRE,
memory_order_acquire, memory_order_relaxed)) {
if ((old_word&MU_RZERO_TO_ACQUIRE) != 0 || if ((old_word&MU_RZERO_TO_ACQUIRE) != 0 ||
!ATM_CAS_ACQ (&mu->word, old_word, !atomic_compare_exchange_strong_explicit (&mu->word, &old_word,
(old_word+MU_RADD_TO_ACQUIRE) & ~MU_RCLEAR_ON_ACQUIRE)) { (old_word+MU_RADD_TO_ACQUIRE) & ~MU_RCLEAR_ON_ACQUIRE,
memory_order_acquire, memory_order_relaxed)) {
waiter *w = nsync_waiter_new_ (); waiter *w = nsync_waiter_new_ ();
nsync_mu_lock_slow_ (mu, w, 0, nsync_reader_type_); nsync_mu_lock_slow_ (mu, w, 0, nsync_reader_type_);
nsync_waiter_free_ (w); nsync_waiter_free_ (w);
@@ -236,16 +249,16 @@ struct Dll *nsync_remove_from_mu_queue_ (struct Dll *mu_queue, struct Dll *e) {
/* Record previous and next elements in the original queue. */ /* Record previous and next elements in the original queue. */
struct Dll *prev = e->prev; struct Dll *prev = e->prev;
struct Dll *next = e->next; struct Dll *next = e->next;
uint32_t old_value;
/* Remove. */ /* Remove. */
dll_remove (&mu_queue, e); dll_remove (&mu_queue, e);
do { uint32_t old_value = ATM_LOAD (&DLL_WAITER (e)->remove_count);
old_value = ATM_LOAD (&DLL_WAITER (e)->remove_count); while (!atomic_compare_exchange_weak_explicit (
} while (!ATM_CAS (&DLL_WAITER (e)->remove_count, old_value, old_value+1)); &DLL_WAITER (e)->remove_count, &old_value, old_value+1,
memory_order_relaxed, memory_order_relaxed)) {
}
if (!dll_is_empty (mu_queue)) { if (!dll_is_empty (mu_queue)) {
/* Fix up same_condition. */ /* Fix up same_condition. */
struct Dll *e_same_condition = &DLL_WAITER (e)->same_condition; struct Dll *e_same_condition = &DLL_WAITER (e)->same_condition;
if (e_same_condition->next != e_same_condition) { if (e_same_condition->next != e_same_condition) {
/* *e is linked to a same_condition neighbour---just remove it. */ /* *e is linked to a same_condition neighbour---just remove it. */
e_same_condition->next->prev = e_same_condition->prev; e_same_condition->next->prev = e_same_condition->prev;
@@ -290,14 +303,18 @@ void nsync_mu_unlock_slow_ (nsync_mu *mu, lock_type *l_type) {
/* no one to wake, there's a designated waker waking /* no one to wake, there's a designated waker waking
up, there are still readers, or it's a reader and all waiters up, there are still readers, or it's a reader and all waiters
have false conditions */ have false conditions */
if (ATM_CAS_REL (&mu->word, old_word, if (atomic_compare_exchange_weak_explicit (
(old_word - l_type->add_to_acquire) & &mu->word, &old_word,
~l_type->clear_on_uncontended_release)) { (old_word - l_type->add_to_acquire) &
~l_type->clear_on_uncontended_release,
memory_order_release, memory_order_relaxed)) {
return; return;
} }
} else if ((old_word&MU_SPINLOCK) == 0 && } else if ((old_word&MU_SPINLOCK) == 0 &&
ATM_CAS_SEQCST (&mu->word, old_word, /* [jart] fixes issues on apple silicon */ atomic_compare_exchange_weak_explicit (
(old_word-early_release_mu)|MU_SPINLOCK|MU_DESIG_WAKER)) { &mu->word, &old_word,
(old_word-early_release_mu)|MU_SPINLOCK|MU_DESIG_WAKER,
memory_order_acq_rel, memory_order_relaxed)) {
struct Dll *wake; struct Dll *wake;
lock_type *wake_type; lock_type *wake_type;
uint32_t clear_on_release; uint32_t clear_on_release;
@@ -433,10 +450,10 @@ void nsync_mu_unlock_slow_ (nsync_mu *mu, lock_type *l_type) {
whether any waiters remain, and whether any of them whether any waiters remain, and whether any of them
are writers. */ are writers. */
old_word = ATM_LOAD (&mu->word); old_word = ATM_LOAD (&mu->word);
while (!ATM_CAS_REL (&mu->word, old_word, while (!atomic_compare_exchange_weak_explicit (
((old_word-late_release_mu)|set_on_release) & &mu->word, &old_word,
~clear_on_release)) { /* release CAS */ ((old_word - late_release_mu) | set_on_release) & ~clear_on_release,
old_word = ATM_LOAD (&mu->word); memory_order_release, memory_order_relaxed)) {
} }
/* Wake the waiters. */ /* Wake the waiters. */
for (p = dll_first (wake); p != NULL; p = next) { for (p = dll_first (wake); p != NULL; p = next) {
@@ -459,8 +476,10 @@ void nsync_mu_unlock (nsync_mu *mu) {
waiter. Another thread could acquire, decrement a reference count waiter. Another thread could acquire, decrement a reference count
and deallocate the mutex before the current thread touched the mutex and deallocate the mutex before the current thread touched the mutex
word again. */ word again. */
if (!ATM_CAS_REL (&mu->word, MU_WLOCK, 0)) { uint32_t old_word = MU_WLOCK;
uint32_t old_word = ATM_LOAD (&mu->word); if (!atomic_compare_exchange_weak_explicit (&mu->word, &old_word, 0,
memory_order_release,
memory_order_relaxed)) {
/* Clear MU_ALL_FALSE because the critical section we're just /* Clear MU_ALL_FALSE because the critical section we're just
leaving may have made some conditions true. */ leaving may have made some conditions true. */
uint32_t new_word = (old_word - MU_WLOCK) & ~MU_ALL_FALSE; uint32_t new_word = (old_word - MU_WLOCK) & ~MU_ALL_FALSE;
@@ -488,8 +507,10 @@ void nsync_mu_runlock (nsync_mu *mu) {
void nsync_mu_runlock (nsync_mu *mu) { void nsync_mu_runlock (nsync_mu *mu) {
IGNORE_RACES_START (); IGNORE_RACES_START ();
/* See comment in nsync_mu_unlock(). */ /* See comment in nsync_mu_unlock(). */
if (!ATM_CAS_REL (&mu->word, MU_RLOCK, 0)) { uint32_t old_word = MU_RLOCK;
uint32_t old_word = ATM_LOAD (&mu->word); if (!atomic_compare_exchange_weak_explicit (&mu->word, &old_word, 0,
memory_order_release,
memory_order_relaxed)) {
/* Sanity check: mutex must not be held in write mode and /* Sanity check: mutex must not be held in write mode and
reader count must not be 0. */ reader count must not be 0. */
if (((old_word ^ MU_WLOCK) & (MU_WLOCK | MU_RLOCK_FIELD)) == 0) { if (((old_word ^ MU_WLOCK) & (MU_WLOCK | MU_RLOCK_FIELD)) == 0) {

View file

@@ -73,7 +73,10 @@ errno_t nsync_mu_semaphore_p_futex (nsync_semaphore *s) {
result = ECANCELED; result = ECANCELED;
} }
} }
} while (result == 0 && (i == 0 || !ATM_CAS_ACQ ((nsync_atomic_uint32_ *) &f->i, i, i-1))); } while (result == 0 && (i == 0 ||
!atomic_compare_exchange_weak_explicit (
(nsync_atomic_uint32_ *) &f->i, &i, i-1,
memory_order_acquire, memory_order_relaxed)));
return result; return result;
} }
@@ -118,16 +121,20 @@ errno_t nsync_mu_semaphore_p_with_deadline_futex (nsync_semaphore *s, int clock,
result = ECANCELED; result = ECANCELED;
} }
} }
} while (result == 0 && (i == 0 || !ATM_CAS_ACQ ((nsync_atomic_uint32_ *) &f->i, i, i - 1))); } while (result == 0 && (i == 0 ||
!atomic_compare_exchange_weak_explicit (
(nsync_atomic_uint32_ *) &f->i, &i, i-1,
memory_order_acquire, memory_order_relaxed)));
return (result); return (result);
} }
/* Ensure that the count of *s is at least 1. */ /* Ensure that the count of *s is at least 1. */
void nsync_mu_semaphore_v_futex (nsync_semaphore *s) { void nsync_mu_semaphore_v_futex (nsync_semaphore *s) {
struct futex *f = (struct futex *) s; struct futex *f = (struct futex *) s;
uint32_t old_value; uint32_t old_value = ATM_LOAD ((nsync_atomic_uint32_ *) &f->i);
do { while (!atomic_compare_exchange_weak_explicit (
old_value = ATM_LOAD ((nsync_atomic_uint32_ *) &f->i); (nsync_atomic_uint32_ *) &f->i, &old_value, old_value+1,
} while (!ATM_CAS_REL ((nsync_atomic_uint32_ *) &f->i, old_value, old_value+1)); memory_order_release, memory_order_relaxed)) {
}
ASSERT (nsync_futex_wake_ ((atomic_int *)&f->i, 1, PTHREAD_PROCESS_PRIVATE) >= 0); ASSERT (nsync_futex_wake_ ((atomic_int *)&f->i, 1, PTHREAD_PROCESS_PRIVATE) >= 0);
} }