Make Cosmo mutexes competitive with Apple Libc

While we have always licked glibc and musl libc on gnu/systemd sadly the
Apple Libc implementation of pthread_mutex_t is better than ours. It may
be due to how the XNU kernel and M2 microprocessor are in league when it
comes to scheduling processes and the NSYNC behavior is being penalized.
We can solve this by leaning more heavily on ulock using Drepper's algo.
It's kind of ironic that Linux's official mutexes work terribly on Linux
but almost as well as Apple Libc if used on macOS.
This commit is contained in:
Justine Tunney 2024-09-02 18:21:03 -07:00
parent 2ec413b5a9
commit 90460ceb3c
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
13 changed files with 349 additions and 202 deletions

View file

@ -33,13 +33,16 @@
static void pthread_mutex_lock_spin(atomic_int *word) {
int backoff = 0;
for (;;) {
if (!atomic_exchange_explicit(word, 1, memory_order_acquire))
break;
if (atomic_exchange_explicit(word, 1, memory_order_acquire)) {
LOCKTRACE("acquiring pthread_mutex_lock_spin(%t)...", word);
for (;;) {
if (!atomic_load_explicit(word, memory_order_relaxed))
for (;;) {
if (!atomic_load_explicit(word, memory_order_relaxed))
break;
backoff = pthread_delay_np(word, backoff);
}
if (!atomic_exchange_explicit(word, 1, memory_order_acquire))
break;
backoff = pthread_delay_np(word, backoff);
}
}
}
@ -47,14 +50,11 @@ static void pthread_mutex_lock_spin(atomic_int *word) {
// see "take 3" algorithm in "futexes are tricky" by ulrich drepper
// slightly improved to attempt acquiring multiple times b4 syscall
static void pthread_mutex_lock_drepper(atomic_int *futex, char pshare) {
int word;
for (int i = 0; i < 4; ++i) {
word = 0;
if (atomic_compare_exchange_strong_explicit(
futex, &word, 1, memory_order_acquire, memory_order_acquire))
return;
pthread_pause_np();
}
int word = 0;
if (atomic_compare_exchange_strong_explicit(
futex, &word, 1, memory_order_acquire, memory_order_acquire))
return;
LOCKTRACE("acquiring pthread_mutex_lock_drepper(%t)...", futex);
if (word == 1)
word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
while (word > 0) {
@ -70,6 +70,7 @@ static errno_t pthread_mutex_lock_recursive(pthread_mutex_t *mutex,
uint64_t lock;
int backoff = 0;
int me = gettid();
bool once = false;
for (;;) {
if (MUTEX_OWNER(word) == me) {
if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
@ -95,6 +96,10 @@ static errno_t pthread_mutex_lock_recursive(pthread_mutex_t *mutex,
mutex->_pid = __pid;
return 0;
}
if (!once) {
LOCKTRACE("acquiring pthread_mutex_lock_recursive(%t)...", mutex);
once = true;
}
for (;;) {
word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
if (MUTEX_OWNER(word) == me)
@ -117,8 +122,12 @@ static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL && //
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_lock)) {
_weaken(nsync_mu_lock)((nsync_mu *)mutex);
return 0;
// on apple silicon we should just put our faith in ulock
// otherwise *nsync gets struck down by the eye of sauron
if (!IsXnuSilicon()) {
_weaken(nsync_mu_lock)((nsync_mu *)mutex);
return 0;
}
}
#endif
@ -169,15 +178,26 @@ static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
*
* This function does nothing in vfork() children.
*
* You can debug the acquisition of locks by building your program
* with `cosmocc -mdbg` and passing the `--strace` flag to your program.
* This will cause a line to be logged each time a mutex or spin lock is
* locked or unlocked. When locking, this is printed after the lock gets
* acquired. The entry to the lock operation will be logged too but only
* if the lock couldn't be immediately acquired. Lock logging works best
* when `mutex` refers to a static variable, in which case its name will
* be printed in the log.
*
* @return 0 on success, or error number on failure
* @see pthread_spin_lock()
* @vforksafe
*/
errno_t pthread_mutex_lock(pthread_mutex_t *mutex) {
if (__vforked)
if (!__vforked) {
errno_t err = pthread_mutex_lock_impl(mutex);
LOCKTRACE("pthread_mutex_lock(%t) → %s", mutex, DescribeErrno(err));
return err;
} else {
LOCKTRACE("skipping pthread_mutex_lock(%t) due to vfork", mutex);
return 0;
LOCKTRACE("acquiring %t...", mutex);
errno_t err = pthread_mutex_lock_impl(mutex);
LOCKTRACE("pthread_mutex_lock(%t) → %s", mutex, DescribeErrno(err));
return err;
}
}

View file

@ -97,10 +97,14 @@ errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_trylock)) {
if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
return 0;
} else {
return EBUSY;
// on apple silicon we should just put our faith in ulock
// otherwise *nsync gets struck down by the eye of sauron
if (!IsXnuSilicon()) {
if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
return 0;
} else {
return EBUSY;
}
}
}
#endif

View file

@ -91,8 +91,12 @@ errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL && //
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_unlock)) {
_weaken(nsync_mu_unlock)((nsync_mu *)mutex);
return 0;
// on apple silicon we should just put our faith in ulock
// otherwise *nsync gets struck down by the eye of sauron
if (!IsXnuSilicon()) {
_weaken(nsync_mu_unlock)((nsync_mu *)mutex);
return 0;
}
}
#endif

View file

@ -17,6 +17,7 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/intrin/atomic.h"
#include "libc/intrin/strace.h"
#include "libc/thread/thread.h"
/**
@ -29,8 +30,17 @@
* pthread_spin_unlock(&lock);
* pthread_spin_destroy(&lock);
*
* This function has undefined behavior when `spin` wasn't intialized,
* was destroyed, or if the lock's already held by the calling thread.
* This function has undefined behavior when `spin` wasn't initialized or
* was destroyed, and if the lock is already held by the calling thread.
*
* You can debug the acquisition of locks by building your program
* with `cosmocc -mdbg` and passing the `--strace` flag to your program.
* This will cause a line to be logged each time a mutex or spin lock is
* locked or unlocked. When locking, this is printed after the lock gets
* acquired. The entry to the lock operation will be logged too but only
* if the lock couldn't be immediately acquired. Lock logging works best
* when `spin` refers to a static variable, in which case its name will
* be printed in the log.
*
* @return 0 on success, or errno on error
* @see pthread_spin_trylock
@ -38,12 +48,16 @@
* @see pthread_spin_init
*/
errno_t pthread_spin_lock(pthread_spinlock_t *spin) {
for (;;) {
if (!atomic_exchange_explicit(&spin->_lock, 1, memory_order_acquire))
break;
for (;;)
if (!atomic_load_explicit(&spin->_lock, memory_order_relaxed))
if (atomic_exchange_explicit(&spin->_lock, 1, memory_order_acquire)) {
LOCKTRACE("acquiring pthread_spin_lock(%t)...", spin);
for (;;) {
for (;;)
if (!atomic_load_explicit(&spin->_lock, memory_order_relaxed))
break;
if (!atomic_exchange_explicit(&spin->_lock, 1, memory_order_acquire))
break;
}
}
LOCKTRACE("pthread_spin_lock(%t)", spin);
return 0;
}

View file

@ -17,6 +17,7 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/intrin/atomic.h"
#include "libc/intrin/strace.h"
#include "libc/thread/thread.h"
/**
@ -29,6 +30,7 @@
* @see pthread_spin_lock
*/
errno_t pthread_spin_unlock(pthread_spinlock_t *spin) {
  // Emit a --strace log line first, so the trace records the release
  // before any waiter can observe the lock becoming free.
  LOCKTRACE("pthread_spin_unlock(%t)", spin);
  // A release store is sufficient here: it publishes the critical
  // section's writes to whichever thread next acquires the word.
  atomic_int *word = &spin->_lock;
  atomic_store_explicit(word, 0, memory_order_release);
  return 0;
}

View file

@ -5,13 +5,19 @@
#define SYSDEBUG 0
#endif
#define _NTTRACE 0 /* not configurable w/ flag yet */
#define _POLLTRACE 0 /* not configurable w/ flag yet */
#define _DATATRACE 1 /* not configurable w/ flag yet */
#define _LOCKTRACE 0 /* not configurable w/ flag yet */
#define _STDIOTRACE 0 /* not configurable w/ flag yet */
#define _KERNTRACE 0 /* not configurable w/ flag yet */
#define _TIMETRACE 0 /* not configurable w/ flag yet */
#ifdef MODE_DBG
#define _STRACE_VERBOSE 1
#else
#define _STRACE_VERBOSE 0
#endif
#define _NTTRACE _STRACE_VERBOSE /* not configurable w/ flag yet */
#define _KERNTRACE _STRACE_VERBOSE /* not configurable w/ flag yet */
#define _POLLTRACE _STRACE_VERBOSE /* not configurable w/ flag yet */
#define _LOCKTRACE _STRACE_VERBOSE /* not configurable w/ flag yet */
#define _DATATRACE 1 /* not configurable w/ flag yet */
#define _STDIOTRACE 0 /* not configurable w/ flag yet */
#define _TIMETRACE 0 /* not configurable w/ flag yet */
#define STRACE_PROLOGUE "%rSYS %6P %6H %'18T "
@ -30,9 +36,10 @@ COSMOPOLITAN_C_START_
((void)(SYSDEBUG && _POLLTRACE && strace_enabled(0) > 0 && \
(__stracef(STRACE_PROLOGUE FMT "\n", ##__VA_ARGS__), 0)))
#define KERNTRACE(FMT, ...) \
((void)(SYSDEBUG && _KERNTRACE && strace_enabled(0) > 0 && \
(__stracef(STRACE_PROLOGUE FMT "\n", ##__VA_ARGS__), 0)))
#define KERNTRACE(FMT, ...) \
((void)(SYSDEBUG && _KERNTRACE && strace_enabled(0) > 0 && \
(__stracef(STRACE_PROLOGUE "\e[2m" FMT "\e[0m\n", ##__VA_ARGS__), \
0)))
#define STDIOTRACE(FMT, ...) \
((void)(SYSDEBUG && _STDIOTRACE && strace_enabled(0) > 0 && \

View file

@ -16,6 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/dce.h"
#include "libc/intrin/atomic.h"
#include "libc/limits.h"
#include "libc/thread/thread.h"
@ -44,7 +45,7 @@ errno_t pthread_cond_broadcast(pthread_cond_t *cond) {
#if PTHREAD_USE_NSYNC
// favor *NSYNC if this is a process private condition variable
// if using Mike Burrows' code isn't possible, use a naive impl
if (!cond->_pshared) {
if (!cond->_pshared && !IsXnuSilicon()) {
nsync_cv_broadcast((nsync_cv *)cond);
return 0;
}

View file

@ -16,6 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/dce.h"
#include "libc/intrin/atomic.h"
#include "libc/thread/thread.h"
#include "third_party/nsync/cv.h"
@ -43,7 +44,7 @@ errno_t pthread_cond_signal(pthread_cond_t *cond) {
#if PTHREAD_USE_NSYNC
// favor *NSYNC if this is a process private condition variable
// if using Mike Burrows' code isn't possible, use a naive impl
if (!cond->_pshared) {
if (!cond->_pshared && !IsXnuSilicon()) {
nsync_cv_signal((nsync_cv *)cond);
return 0;
}

View file

@ -18,6 +18,7 @@
*/
#include "libc/calls/calls.h"
#include "libc/calls/cp.internal.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/thread/lock.h"
#include "libc/thread/posixthread.internal.h"
@ -122,7 +123,7 @@ errno_t pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
#if PTHREAD_USE_NSYNC
// favor *NSYNC if this is a process private condition variable
// if using Mike Burrows' code isn't possible, use a naive impl
if (!cond->_pshared) {
if (!cond->_pshared && !IsXnuSilicon()) {
err = nsync_cv_wait_with_deadline(
(nsync_cv *)cond, (nsync_mu *)mutex,
abstime ? *abstime : nsync_time_no_deadline, 0);