From e398f3887ce4933adfcb8bb7729ea6eb0d39dbb5 Mon Sep 17 00:00:00 2001
From: Justine Tunney
Date: Wed, 24 Jul 2024 01:05:00 -0700
Subject: [PATCH] Make more improvements to threads and mappings

- NetBSD should now have faster synchronization
- POSIX barriers may now be shared across processes
- An edge case with memory map tracking has been fixed
- Grand Central Dispatch is no longer used on MacOS ARM64
- POSIX mutexes in normal mode now use futexes across processes
---
 libc/intrin/cp.c                             |  12 +-
 libc/intrin/maps.h                           |   2 +
 libc/intrin/mmap.c                           |  55 +++--
 libc/intrin/mprotect.c                       |  13 +-
 libc/intrin/pthread_mutex_lock.c             |  92 +++++---
 libc/intrin/pthread_mutex_trylock.c          | 111 +++++----
 libc/intrin/pthread_mutex_unlock.c           |  90 ++++---
 libc/intrin/pthread_yield_np.c               |   2 +-
 libc/thread/pthread_barrier_destroy.c        |  10 +-
 libc/thread/pthread_barrier_init.c           |  14 +-
 libc/thread/pthread_barrier_wait.c           |  42 +++-
 libc/thread/pthread_barrierattr_getpshared.c |   2 +-
 libc/thread/pthread_barrierattr_init.c       |   2 +-
 libc/thread/pthread_barrierattr_setpshared.c |   3 +-
 libc/thread/thread.h                         |  12 +-
 test/libc/thread/footek_test.c               | 236 +++++++++++++++++++
 third_party/nsync/common.c                   |   7 +-
 third_party/nsync/futex.c                    |   5 +-
 third_party/nsync/mu_semaphore.c             |  13 +-
 third_party/nsync/mu_semaphore.internal.h    |  14 ++
 20 files changed, 566 insertions(+), 171 deletions(-)
 create mode 100644 test/libc/thread/footek_test.c

diff --git a/libc/intrin/cp.c b/libc/intrin/cp.c
index 411226157..5f4061033 100644
--- a/libc/intrin/cp.c
+++ b/libc/intrin/cp.c
@@ -26,11 +26,9 @@
 int begin_cancelation_point(void) {
   int state = 0;
-  struct CosmoTib *tib;
-  struct PosixThread *pt;
   if (__tls_enabled) {
-    tib = __get_tls();
-    if ((pt = (struct PosixThread *)tib->tib_pthread)) {
+    struct PosixThread *pt;
+    if ((pt = _pthread_self())) {
       state = pt->pt_flags & PT_INCANCEL;
       pt->pt_flags |= PT_INCANCEL;
     }
@@ -39,11 +37,9 @@ int begin_cancelation_point(void) {
 }
 
 void end_cancelation_point(int state) {
-  struct CosmoTib *tib;
-  struct PosixThread *pt;
   if (__tls_enabled) {
-    tib = __get_tls();
-    if ((pt = (struct PosixThread *)tib->tib_pthread)) {
+    struct PosixThread *pt;
+    if ((pt = _pthread_self())) {
       pt->pt_flags &= ~PT_INCANCEL;
       pt->pt_flags |= state;
     }
diff --git a/libc/intrin/maps.h b/libc/intrin/maps.h
index a438d1221..3a30c752a 100644
--- a/libc/intrin/maps.h
+++ b/libc/intrin/maps.h
@@ -6,6 +6,8 @@
 #include "libc/thread/tls2.internal.h"
 COSMOPOLITAN_C_START_
 
+#define MAPS_RETRY ((void *)-1)
+
 #define MAP_TREE_CONTAINER(e) TREE_CONTAINER(struct Map, tree, e)
 
 struct Map {
diff --git a/libc/intrin/mmap.c b/libc/intrin/mmap.c
index cc342b73f..65cccc769 100644
--- a/libc/intrin/mmap.c
+++ b/libc/intrin/mmap.c
@@ -120,6 +120,7 @@ static int __muntrack(char *addr, size_t size, int pagesz,
   struct Map *map;
   struct Map *next;
   struct Map *floor;
+StartOver:
   floor = __maps_floor(addr);
   for (map = floor; map && map->addr <= addr + size; map = next) {
     next = __maps_next(map);
@@ -148,6 +149,8 @@ static int __muntrack(char *addr, size_t size, int pagesz,
         ASSERT(left > 0);
         struct Map *leftmap;
         if ((leftmap = __maps_alloc())) {
+          if (leftmap == MAPS_RETRY)
+            goto StartOver;
           map->addr += left;
           map->size = right;
           if (!(map->flags & MAP_ANONYMOUS))
@@ -167,6 +170,8 @@ static int __muntrack(char *addr, size_t size, int pagesz,
         size_t right = map_addr + map_size - addr;
         struct Map *rightmap;
         if ((rightmap = __maps_alloc())) {
+          if (rightmap == MAPS_RETRY)
+            goto StartOver;
           map->size = left;
           __maps.pages -= (right + pagesz - 1) / pagesz;
           rightmap->addr = addr;
@@ -184,8 +189,14 @@ static int __muntrack(char *addr, size_t size, int pagesz,
         size_t right = map_size - middle - left;
         struct Map *leftmap;
         if ((leftmap = __maps_alloc())) {
+          if (leftmap == MAPS_RETRY)
+            goto StartOver;
           struct Map *middlemap;
           if ((middlemap = __maps_alloc())) {
+            if (middlemap == MAPS_RETRY) {
+              __maps_free(leftmap);
+              goto StartOver;
+            }
             leftmap->addr = map_addr;
             leftmap->size = left;
             leftmap->off = map->off;
@@ -204,6 +215,7 @@ static int __muntrack(char *addr, size_t size, int pagesz,
             *deleted = middlemap;
             __maps_check();
           } else {
+            __maps_free(leftmap);
             rc = -1;
           }
         } else {
@@ -304,12 +316,11 @@ struct Map *__maps_alloc(void) {
   map->flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NOFORK;
   map->hand = sys.maphandle;
   __maps_lock();
-  __maps_insert(map++);
+  __maps_insert(map);
   __maps_unlock();
-  map->addr = MAP_FAILED;
-  for (int i = 1; i < gransz / sizeof(struct Map) - 1; ++i)
+  for (int i = 1; i < gransz / sizeof(struct Map); ++i)
     __maps_free(map + i);
-  return map;
+  return MAPS_RETRY;
 }
 
 static int __munmap(char *addr, size_t size) {
@@ -396,21 +407,32 @@ void *__maps_pickaddr(size_t size) {
 
 static void *__mmap_chunk(void *addr, size_t size, int prot, int flags, int fd,
                           int64_t off, int pagesz, int gransz) {
+  // allocate Map object
+  struct Map *map;
+  do {
+    if (!(map = __maps_alloc()))
+      return MAP_FAILED;
+  } while (map == MAPS_RETRY);
+
   // polyfill nuances of fixed mappings
   int sysflags = flags;
   bool noreplace = false;
   bool should_untrack = false;
   if (flags & MAP_FIXED_NOREPLACE) {
-    if (flags & MAP_FIXED)
+    if (flags & MAP_FIXED) {
+      __maps_free(map);
       return (void *)einval();
+    }
     sysflags &= ~MAP_FIXED_NOREPLACE;
     if (IsLinux()) {
       noreplace = true;
       sysflags |= MAP_FIXED_NOREPLACE_linux;
     } else if (IsFreebsd() || IsNetbsd()) {
       sysflags |= MAP_FIXED;
-      if (__maps_overlaps(addr, size, pagesz))
+      if (__maps_overlaps(addr, size, pagesz)) {
+        __maps_free(map);
         return (void *)eexist();
+      }
     } else {
       noreplace = true;
     }
@@ -418,11 +440,6 @@ static void *__mmap_chunk(void *addr, size_t size, int prot, int flags, int fd,
     should_untrack = true;
   }
 
-  // allocate Map object
-  struct Map *map;
-  if (!(map = __maps_alloc()))
-    return MAP_FAILED;
-
   // remove mapping we blew away
   if (IsWindows() && should_untrack)
     __munmap(addr, size);
@@ -572,23 +589,27 @@ static void *__mremap_impl(char *old_addr, size_t old_size, size_t new_size,
       return (void *)einval();
   }
 
+  // allocate object for tracking new mapping
+  struct Map *map;
+  do {
+    if (!(map = __maps_alloc()))
+      return (void *)enomem();
+  } while (map == MAPS_RETRY);
+
   // check old interval is fully contained within one mapping
   struct Map *old_map;
   if (!(old_map = __maps_floor(old_addr)) ||
       old_addr + old_size > old_map->addr + PGUP(old_map->size) ||
-      old_addr < old_map->addr)
+      old_addr < old_map->addr) {
+    __maps_free(map);
     return (void *)efault();
+  }
 
   // save old properties
   int old_off = old_map->off;
   int old_prot = old_map->prot;
   int old_flags = old_map->flags;
 
-  // allocate object for tracking new mapping
-  struct Map *map;
-  if (!(map = __maps_alloc()))
-    return (void *)enomem();
-
   // netbsd mremap fixed returns enoent rather than unmapping old pages
   if (IsNetbsd() && (flags & MREMAP_FIXED))
     if (__munmap(new_addr, new_size)) {
diff --git a/libc/intrin/mprotect.c b/libc/intrin/mprotect.c
index cc8a23ee9..784906acc 100644
--- a/libc/intrin/mprotect.c
+++ b/libc/intrin/mprotect.c
@@ -75,6 +75,7 @@ int __mprotect(char *addr, size_t size, int prot) {
       return edeadlk();
   }
   struct Map *map, *floor;
+StartOver:
   floor = __maps_floor(addr);
   for (map = floor; map && map->addr <= addr + size; map = __maps_next(map)) {
     char *map_addr = map->addr;
@@ -93,10 +94,12 @@ int __mprotect(char *addr, size_t size, int prot) {
       }
     } else if (addr <= map_addr) {
       // change lefthand side of mapping
-      size_t left = PGUP(addr + size - map_addr);
+      size_t left = addr + size - map_addr;
       size_t right = map_size - left;
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
         if (!__mprotect_chunk(map_addr, left, prot, false)) {
           leftmap->addr = map_addr;
           leftmap->size = left;
@@ -127,6 +130,8 @@ int __mprotect(char *addr, size_t size, int prot) {
       size_t right = map_addr + map_size - addr;
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
         if (!__mprotect_chunk(map_addr + left, right, prot, false)) {
           leftmap->addr = map_addr;
           leftmap->size = left;
@@ -159,8 +164,14 @@ int __mprotect(char *addr, size_t size, int prot) {
       size_t right = map_size - middle - left;
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
         struct Map *midlmap;
         if ((midlmap = __maps_alloc())) {
+          if (midlmap == MAPS_RETRY) {
+            __maps_free(leftmap);
+            goto StartOver;
+          }
           if (!__mprotect_chunk(map_addr + left, middle, prot, false)) {
             leftmap->addr = map_addr;
             leftmap->size = left;
diff --git a/libc/intrin/pthread_mutex_lock.c b/libc/intrin/pthread_mutex_lock.c
index 0b96949de..b293a6b75 100644
--- a/libc/intrin/pthread_mutex_lock.c
+++ b/libc/intrin/pthread_mutex_lock.c
@@ -27,41 +27,47 @@
 #include "libc/runtime/internal.h"
 #include "libc/thread/lock.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu.h"
 
-static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
-  int me;
+static void pthread_mutex_lock_naive(pthread_mutex_t *mutex, uint64_t word) {
   int backoff = 0;
-  uint64_t word, lock;
-
-  // get current state of lock
-  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
-
-#if PTHREAD_USE_NSYNC
-  // use fancy nsync mutex if possible
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
-      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
-      _weaken(nsync_mu_lock)) {
-    _weaken(nsync_mu_lock)((nsync_mu *)mutex);
-    return 0;
+  uint64_t lock;
+  for (;;) {
+    word = MUTEX_UNLOCK(word);
+    lock = MUTEX_LOCK(word);
+    if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
+                                              memory_order_acquire,
+                                              memory_order_relaxed))
+      return;
+    backoff = pthread_delay_np(mutex, backoff);
   }
-#endif
+}
 
-  // implement barebones normal mutexes
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
-    for (;;) {
-      word = MUTEX_UNLOCK(word);
-      lock = MUTEX_LOCK(word);
-      if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
-                                                memory_order_acquire,
-                                                memory_order_relaxed))
-        return 0;
-      backoff = pthread_delay_np(mutex, backoff);
-    }
+
+// see "take 3" algorithm in "futexes are tricky" by ulrich drepper
+// slightly improved to attempt acquiring multiple times before syscall
+static void pthread_mutex_lock_drepper(atomic_int *futex, char pshare) {
+  int word;
+  for (int i = 0; i < 4; ++i) {
+    word = 0;
+    if (atomic_compare_exchange_strong_explicit(
+            futex, &word, 1, memory_order_acquire, memory_order_acquire))
+      return;
+    pthread_pause_np();
   }
+  if (word == 1)
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  while (word > 0) {
+    _weaken(nsync_futex_wait_)(futex, 2, pshare, 0);
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  }
+}
-  // implement recursive mutexes
-  me = gettid();
+static errno_t pthread_mutex_lock_recursive(pthread_mutex_t *mutex,
+                                            uint64_t word) {
+  uint64_t lock;
+  int backoff = 0;
+  int me = gettid();
   for (;;) {
     if (MUTEX_OWNER(word) == me) {
       if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
@@ -91,6 +97,36 @@ static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
   }
 }
 
+static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
+  uint64_t word;
+
+  // get current state of lock
+  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+
+#if PTHREAD_USE_NSYNC
+  // use superior mutexes if possible
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
+      _weaken(nsync_mu_lock)) {
+    _weaken(nsync_mu_lock)((nsync_mu *)mutex);
+    return 0;
+  }
+#endif
+
+  // handle normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wait_)) {
+      pthread_mutex_lock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
+    } else {
+      pthread_mutex_lock_naive(mutex, word);
+    }
+    return 0;
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_lock_recursive(mutex, word);
+}
+
 /**
  * Locks mutex.
  *
diff --git a/libc/intrin/pthread_mutex_trylock.c b/libc/intrin/pthread_mutex_trylock.c
index a793eed34..5fd06a078 100644
--- a/libc/intrin/pthread_mutex_trylock.c
+++ b/libc/intrin/pthread_mutex_trylock.c
@@ -24,54 +24,33 @@
 #include "libc/runtime/internal.h"
 #include "libc/thread/lock.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu.h"
 
-/**
- * Attempts acquiring lock.
- *
- * Unlike pthread_mutex_lock() this function won't block and instead
- * returns an error immediately if the lock couldn't be acquired.
- *
- * @return 0 if lock was acquired, otherwise an errno
- * @raise EAGAIN if maximum number of recursive locks is held
- * @raise EBUSY if lock is currently held in read or write mode
- * @raise EINVAL if `mutex` doesn't refer to an initialized lock
- * @raise EDEADLK if `mutex` is `PTHREAD_MUTEX_ERRORCHECK` and the
- *     current thread already holds this mutex
- */
-errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
-  int me;
-  uint64_t word, lock;
+static errno_t pthread_mutex_trylock_naive(pthread_mutex_t *mutex,
+                                           uint64_t word) {
+  uint64_t lock;
+  word = MUTEX_UNLOCK(word);
+  lock = MUTEX_LOCK(word);
+  if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
+                                            memory_order_acquire,
+                                            memory_order_relaxed))
+    return 0;
+  return EBUSY;
+}
 
-  // get current state of lock
-  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+static errno_t pthread_mutex_trylock_drepper(atomic_int *futex) {
+  int word = 0;
+  if (atomic_compare_exchange_strong_explicit(
+          futex, &word, 1, memory_order_acquire, memory_order_acquire))
+    return 0;
+  return EBUSY;
+}
 
-#if PTHREAD_USE_NSYNC
-  // delegate to *NSYNC if possible
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&
-      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
-      _weaken(nsync_mu_trylock)) {
-    if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
-      return 0;
-    } else {
-      return EBUSY;
-    }
-  }
-#endif
-
-  // handle normal mutexes
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
-    word = MUTEX_UNLOCK(word);
-    lock = MUTEX_LOCK(word);
-    if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
-                                              memory_order_acquire,
-                                              memory_order_relaxed))
-      return 0;
-    return EBUSY;
-  }
-
-  // handle recursive and error check mutexes
-  me = gettid();
+static errno_t pthread_mutex_trylock_recursive(pthread_mutex_t *mutex,
+                                               uint64_t word) {
+  uint64_t lock;
+  int me = gettid();
   for (;;) {
     if (MUTEX_OWNER(word) == me) {
       if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
@@ -100,3 +79,47 @@ errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
     return EBUSY;
   }
 }
+
+/**
+ * Attempts acquiring lock.
+ *
+ * Unlike pthread_mutex_lock() this function won't block and instead
+ * returns an error immediately if the lock couldn't be acquired.
+ *
+ * @return 0 if lock was acquired, otherwise an errno
+ * @raise EAGAIN if maximum number of recursive locks is held
+ * @raise EBUSY if lock is currently held in read or write mode
+ * @raise EINVAL if `mutex` doesn't refer to an initialized lock
+ * @raise EDEADLK if `mutex` is `PTHREAD_MUTEX_ERRORCHECK` and the
+ *     current thread already holds this mutex
+ */
+errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
+
+  // get current state of lock
+  uint64_t word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+
+#if PTHREAD_USE_NSYNC
+  // use superior mutexes if possible
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
+      _weaken(nsync_mu_trylock)) {
+    if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
+      return 0;
+    } else {
+      return EBUSY;
+    }
+  }
+#endif
+
+  // handle normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wait_)) {
+      return pthread_mutex_trylock_drepper(&mutex->_futex);
+    } else {
+      return pthread_mutex_trylock_naive(mutex, word);
+    }
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_trylock_recursive(mutex, word);
+}
diff --git a/libc/intrin/pthread_mutex_unlock.c b/libc/intrin/pthread_mutex_unlock.c
index 4d796d9a4..fcb549dcb 100644
--- a/libc/intrin/pthread_mutex_unlock.c
+++ b/libc/intrin/pthread_mutex_unlock.c
@@ -25,45 +25,26 @@
 #include "libc/runtime/internal.h"
 #include "libc/thread/lock.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu.h"
 
-/**
- * Releases mutex.
- *
- * This function does nothing in vfork() children.
- *
- * @return 0 on success or error number on failure
- * @raises EPERM if in error check mode and not owned by caller
- * @vforksafe
- */
-errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
-  int me;
-  uint64_t word, lock;
+static void pthread_mutex_unlock_naive(pthread_mutex_t *mutex, uint64_t word) {
+  uint64_t lock = MUTEX_UNLOCK(word);
+  atomic_store_explicit(&mutex->_word, lock, memory_order_release);
+}
 
-  LOCKTRACE("pthread_mutex_unlock(%t)", mutex);
-
-  // get current state of lock
-  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
-
-#if PTHREAD_USE_NSYNC
-  // use fancy nsync mutex if possible
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
-      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
-      _weaken(nsync_mu_unlock)) {
-    _weaken(nsync_mu_unlock)((nsync_mu *)mutex);
-    return 0;
+// see "take 3" algorithm in "futexes are tricky" by ulrich drepper
+static void pthread_mutex_unlock_drepper(atomic_int *futex, char pshare) {
+  int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
+  if (word == 2) {
+    atomic_store_explicit(futex, 0, memory_order_release);
+    _weaken(nsync_futex_wake_)(futex, 1, pshare);
   }
-#endif
+}
 
-  // implement barebones normal mutexes
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
-    lock = MUTEX_UNLOCK(word);
-    atomic_store_explicit(&mutex->_word, lock, memory_order_release);
-    return 0;
-  }
-
-  // implement recursive mutex unlocking
-  me = gettid();
+static errno_t pthread_mutex_unlock_recursive(pthread_mutex_t *mutex,
+                                              uint64_t word) {
+  int me = gettid();
   for (;;) {
 
     // we allow unlocking an initialized lock that wasn't locked, but we
@@ -88,3 +69,44 @@ errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
     return 0;
   }
 }
+
+/**
+ * Releases mutex.
+ *
+ * This function does nothing in vfork() children.
+ *
+ * @return 0 on success or error number on failure
+ * @raises EPERM if in error check mode and not owned by caller
+ * @vforksafe
+ */
+errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
+  uint64_t word;
+
+  LOCKTRACE("pthread_mutex_unlock(%t)", mutex);
+
+  // get current state of lock
+  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+
+#if PTHREAD_USE_NSYNC
+  // use superior mutexes if possible
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
+      _weaken(nsync_mu_unlock)) {
+    _weaken(nsync_mu_unlock)((nsync_mu *)mutex);
+    return 0;
+  }
+#endif
+
+  // implement barebones normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wake_)) {
+      pthread_mutex_unlock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
+    } else {
+      pthread_mutex_unlock_naive(mutex, word);
+    }
+    return 0;
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_unlock_recursive(mutex, word);
+}
diff --git a/libc/intrin/pthread_yield_np.c b/libc/intrin/pthread_yield_np.c
index fefc3f283..4fa0c988c 100644
--- a/libc/intrin/pthread_yield_np.c
+++ b/libc/intrin/pthread_yield_np.c
@@ -32,7 +32,7 @@ void sys_sched_yield(void);
 int pthread_yield_np(void) {
   if (IsXnuSilicon()) {
     __syslib->__pthread_yield_np();
-  } else if (IsOpenbsd() || IsNetbsd()) {
+  } else if (IsOpenbsd()) {
     // sched_yield() is punishingly slow on OpenBSD
     // it's so ruinously slow it'll destroy everything
     pthread_pause_np();
diff --git a/libc/thread/pthread_barrier_destroy.c b/libc/thread/pthread_barrier_destroy.c
index 5893558ac..5a75ad3e3 100644
--- a/libc/thread/pthread_barrier_destroy.c
+++ b/libc/thread/pthread_barrier_destroy.c
@@ -16,9 +16,10 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR               │
 │ PERFORMANCE OF THIS SOFTWARE.                                                  │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/errno.h"
+#include "libc/intrin/atomic.h"
 #include "libc/str/str.h"
 #include "libc/thread/thread.h"
-#include "third_party/nsync/counter.h"
 
 /**
  * Destroys barrier.
@@ -27,9 +28,8 @@
  * @raise EINVAL if threads are still inside the barrier
  */
 errno_t pthread_barrier_destroy(pthread_barrier_t *barrier) {
-  if (barrier->_nsync) {
-    nsync_counter_free(barrier->_nsync);
-    barrier->_nsync = 0;
-  }
+  if (atomic_load_explicit(&barrier->_waiters, memory_order_relaxed))
+    return EINVAL;
+  memset(barrier, -1, sizeof(*barrier));
   return 0;
 }
diff --git a/libc/thread/pthread_barrier_init.c b/libc/thread/pthread_barrier_init.c
index 43770caa2..77079c54a 100644
--- a/libc/thread/pthread_barrier_init.c
+++ b/libc/thread/pthread_barrier_init.c
@@ -17,8 +17,9 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                  │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
+#include "libc/intrin/atomic.h"
+#include "libc/limits.h"
 #include "libc/thread/thread.h"
-#include "third_party/nsync/counter.h"
 
 /**
  * Initializes barrier.
@@ -28,16 +29,17 @@
  * before the barrier is released, which must be greater than zero
  * @return 0 on success, or error number on failure
  * @raise EINVAL if `count` isn't greater than zero
- * @raise ENOMEM if insufficient memory exists
  */
 errno_t pthread_barrier_init(pthread_barrier_t *barrier,
                              const pthread_barrierattr_t *attr,
                              unsigned count) {
-  nsync_counter c;
   if (!count)
     return EINVAL;
-  if (!(c = nsync_counter_new(count)))
-    return ENOMEM;
-  *barrier = (pthread_barrier_t){._nsync = c};
+  if (count > INT_MAX)
+    return EINVAL;
+  barrier->_count = count;
+  barrier->_pshared = attr ? *attr : PTHREAD_PROCESS_PRIVATE;
+  atomic_store_explicit(&barrier->_counter, count, memory_order_relaxed);
+  atomic_store_explicit(&barrier->_waiters, 0, memory_order_relaxed);
   return 0;
 }
diff --git a/libc/thread/pthread_barrier_wait.c b/libc/thread/pthread_barrier_wait.c
index 3dfcec5b5..27f64773e 100644
--- a/libc/thread/pthread_barrier_wait.c
+++ b/libc/thread/pthread_barrier_wait.c
@@ -16,25 +16,53 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR               │
 │ PERFORMANCE OF THIS SOFTWARE.                                                  │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/blockcancel.internal.h"
+#include "libc/errno.h"
+#include "libc/intrin/atomic.h"
+#include "libc/limits.h"
 #include "libc/thread/thread.h"
-#include "third_party/nsync/counter.h"
+#include "third_party/nsync/futex.internal.h"
 
 /**
  * Waits for all threads to arrive at barrier.
  *
  * When the barrier is broken, the state is reset to what it was when
 * pthread_barrier_init() was called, so that the barrier may be
- * used again in the same way. The last thread to arrive shall be the
- * last to leave and it returns a magic value.
+ * used again in the same way.
+ *
+ * Unlike pthread_cond_timedwait() this function is not a cancelation
+ * point. Cleanup handlers aren't needed, since cancelation is blocked
+ * while waiting.
 *
  * @return 0 on success, `PTHREAD_BARRIER_SERIAL_THREAD` to one lucky
  * thread which was the last arrival, or an errno on error
+ * @raise EINVAL if barrier is used incorrectly
  */
 errno_t pthread_barrier_wait(pthread_barrier_t *barrier) {
-  if (nsync_counter_add(barrier->_nsync, -1)) {
-    nsync_counter_wait(barrier->_nsync, nsync_time_no_deadline);
-    return 0;
-  } else {
+  int n;
+
+  // enter barrier
+  atomic_fetch_add_explicit(&barrier->_waiters, 1, memory_order_acq_rel);
+  n = atomic_fetch_sub_explicit(&barrier->_counter, 1, memory_order_acq_rel);
+  n = n - 1;
+
+  // this can only happen on invalid usage
+  if (n < 0)
+    return EINVAL;
+
+  // reset count and wake waiters if we're last at barrier
+  if (!n) {
+    atomic_store_explicit(&barrier->_counter, barrier->_count,
+                          memory_order_release);
+    atomic_store_explicit(&barrier->_waiters, 0, memory_order_release);
+    nsync_futex_wake_(&barrier->_waiters, INT_MAX, barrier->_pshared);
     return PTHREAD_BARRIER_SERIAL_THREAD;
   }
+
+  // wait for everyone else to arrive at barrier
+  BLOCK_CANCELATION;
+  while ((n = atomic_load_explicit(&barrier->_waiters, memory_order_acquire)))
+    nsync_futex_wait_(&barrier->_waiters, n, barrier->_pshared, 0);
+  ALLOW_CANCELATION;
+
+  return 0;
 }
diff --git a/libc/thread/pthread_barrierattr_getpshared.c b/libc/thread/pthread_barrierattr_getpshared.c
index 8c0725963..e71846dd2 100644
--- a/libc/thread/pthread_barrierattr_getpshared.c
+++ b/libc/thread/pthread_barrierattr_getpshared.c
@@ -23,7 +23,7 @@
  *
  * @param pshared is set to one of the following
  *     - `PTHREAD_PROCESS_PRIVATE` (default)
- *     - `PTHREAD_PROCESS_SHARED` (unsupported)
+ *     - `PTHREAD_PROCESS_SHARED`
  * @return 0 on success, or error on failure
  */
 errno_t pthread_barrierattr_getpshared(const pthread_barrierattr_t *attr,
diff --git a/libc/thread/pthread_barrierattr_init.c b/libc/thread/pthread_barrierattr_init.c
index 473038114..f4abec468 100644
--- a/libc/thread/pthread_barrierattr_init.c
+++ b/libc/thread/pthread_barrierattr_init.c
@@ -24,6 +24,6 @@
  * @return 0 on success, or error on failure
  */
 errno_t pthread_barrierattr_init(pthread_barrierattr_t *attr) {
-  *attr = 0;
+  *attr = PTHREAD_PROCESS_PRIVATE;
   return 0;
 }
diff --git a/libc/thread/pthread_barrierattr_setpshared.c b/libc/thread/pthread_barrierattr_setpshared.c
index 05dcee1c1..0f74e0436 100644
--- a/libc/thread/pthread_barrierattr_setpshared.c
+++ b/libc/thread/pthread_barrierattr_setpshared.c
@@ -24,13 +24,14 @@
  *
  * @param pshared can be one of
  *     - `PTHREAD_PROCESS_PRIVATE` (default)
- *     - `PTHREAD_PROCESS_SHARED` (unsupported)
+ *     - `PTHREAD_PROCESS_SHARED`
  * @return 0 on success, or error on failure
  * @raises EINVAL if `pshared` is invalid
  */
 errno_t pthread_barrierattr_setpshared(pthread_barrierattr_t *attr,
                                        int pshared) {
   switch (pshared) {
+    case PTHREAD_PROCESS_SHARED:
     case PTHREAD_PROCESS_PRIVATE:
       *attr = pshared;
       return 0;
diff --git a/libc/thread/thread.h b/libc/thread/thread.h
index 406c19ff8..1c3ff8eff 100644
--- a/libc/thread/thread.h
+++ b/libc/thread/thread.h
@@ -46,7 +46,7 @@ COSMOPOLITAN_C_START_
 
 #define PTHREAD_RWLOCK_INITIALIZER {0}
 #define PTHREAD_MUTEX_INITIALIZER  {0}
-#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, 0, PTHREAD_MUTEX_RECURSIVE}
+#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, {}, PTHREAD_MUTEX_RECURSIVE}
 
 typedef uintptr_t pthread_t;
 typedef int pthread_id_np_t;
@@ -66,7 +66,10 @@ typedef struct pthread_spinlock_s {
 
 typedef struct pthread_mutex_s {
   uint32_t _nsync;
-  int32_t _pid;
+  union {
+    int32_t _pid;
+    _Atomic(int32_t) _futex;
+  };
   _Atomic(uint64_t) _word;
 } pthread_mutex_t;
 
@@ -92,7 +95,10 @@ typedef struct pthread_rwlock_s {
 } pthread_rwlock_t;
 
 typedef struct pthread_barrier_s {
-  void *_nsync;
+  int _count;
+  char _pshared;
+  _Atomic(int) _counter;
+  _Atomic(int) _waiters;
 } pthread_barrier_t;
 
 typedef struct pthread_attr_s {
diff --git a/test/libc/thread/footek_test.c b/test/libc/thread/footek_test.c
new file mode 100644
index 000000000..98e07e5e9
--- /dev/null
+++ b/test/libc/thread/footek_test.c
@@ -0,0 +1,236 @@
+#include <cosmo.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdio.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include "third_party/nsync/futex.internal.h"
+
+// THIS IS AN EXAMPLE OF HOW TO USE COSMOPOLITAN FUTEXES TO IMPLEMENT
+// YOUR OWN MUTEXES FROM SCRATCH. LOOK AT HOW MUCH BETTER IT CAN MAKE
+// THINGS COMPARED TO SPIN LOCKS. ALGORITHM FROM ULRICH DREPPER.
+
+// arm fleet
+// with futexes
+// 30 threads / 100000 iterations
+//
+//          242,604 us real
+//        4,222,946 us user
+//        1,079,229 us sys
+// footek_test on studio.test.          630 µs    17'415 µs   256'782 µs
+//        1,362,557 us real
+//        3,232,978 us user
+//        2,104,824 us sys
+// footek_test on pi.test.              611 µs    21'708 µs 1'385'129 µs
+//        1,346,482 us real
+//        3,370,513 us user
+//        1,992,383 us sys
+// footek_test on freebsdarm.test.      427 µs    19'967 µs 1'393'476 µs
+
+// arm fleet
+// without futexes
+// 30 threads / 100000 iterations
+//
+//        1,282,084 us real
+//       29,359,582 us user
+//           34,553 us sys
+// footek_test on studio.test.          961 µs    12'907 µs 1'287'983 µs
+//        4,070,988 us real
+//       16,203,990 us user
+//            7,999 us sys
+// footek_test on pi.test.              459 µs    16'376 µs 4'095'512 µs
+//        7,012,493 us real
+//       27,936,725 us user
+//            7,871 us sys
+// footek_test on freebsdarm.test.      502 µs    16'446 µs 7'051'545 µs
+
+// x86 fleet
+// with futexes
+// 30 threads / 100000 iterations
+//
+//          146,015 us real
+//          169,427 us user
+//           68,939 us sys
+// footek_test on rhel7.test.           376 µs     2'259 µs   153'024 µs
+//          144,917 us real
+//          383,317 us user
+//          191,203 us sys
+// footek_test on xnu.test.          11'143 µs     9'159 µs   164'865 µs
+//          244,286 us real
+//          405,395 us user
+//          956,122 us sys
+// footek_test on freebsd.test.         394 µs     2'165 µs   256'227 µs
+//          209,095 us real
+//          616,634 us user
+//            9,945 us sys
+// footek_test on netbsd.test.          502 µs     2'020 µs   261'895 µs
+//          344,876 us real
+//           50,000 us user
+//        1,240,000 us sys
+// footek_test on openbsd.test.         457 µs     2'737 µs   396'342 µs
+//        1,193,906 us real
+//       17,546,875 us user
+//        3,000,000 us sys
+// footek_test on win10.test.           462 µs    59'528 µs 1'348'265 µs
+
+// x86 fleet
+// without futexes
+// 30 threads / 100000 iterations
+//
+//          897,815 us real
+//        1,763,705 us user
+//            9,696 us sys
+// footek_test on rhel7.test.           423 µs     2'638 µs   912'241 µs
+//          790,332 us real
+//        2,359,967 us user
+//                0 us sys
+// footek_test on netbsd.test.        1'151 µs     2'634 µs 1'014'867 µs
+//        2,332,724 us real
+//        9,150,000 us user
+//           10,000 us sys
+// footek_test on openbsd.test.         557 µs     3'020 µs 2'554'648 µs
+//        2,528,863 us real
+//       56,546,875 us user
+//        1,671,875 us sys
+// footek_test on win10.test.           962 µs     9'698 µs 2'751'905 µs
+//        2,916,033 us real
+//       17,236,103 us user
+//                0 us sys
+// footek_test on freebsd.test.         690 µs     3'011 µs 2'925'997 µs
+//        4,225,726 us real
+//       16,679,456 us user
+//           16,265 us sys
+// footek_test on xnu.test.          98'468 µs     5'242 µs 5'191'724 µs
+
+#define USE_FUTEX  1
+#define THREADS    30
+#define ITERATIONS 30000
+
+#define MUTEX_LOCKED(word)  ((word) & 8)
+#define MUTEX_WAITING(word) ((word) & 16)
+
+#define MUTEX_LOCK(word)        ((word) | 8)
+#define MUTEX_SET_WAITING(word) ((word) | 16)
+#define MUTEX_UNLOCK(word)      ((word) & ~(8 | 16))
+
+void lock(atomic_int *futex) {
+  int word, cs;
+  for (int i = 0; i < 4; ++i) {
+    word = 0;
+    if (atomic_compare_exchange_strong_explicit(
+            futex, &word, 1, memory_order_acquire, memory_order_acquire))
+      return;
+    pthread_pause_np();
+  }
+  if (word == 1)
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  while (word > 0) {
+    pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
+#if USE_FUTEX
+    nsync_futex_wait_(futex, 2, 0, 0);
+#endif
+    pthread_setcancelstate(cs, 0);
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  }
+}
+
+void unlock(atomic_int *futex) {
+  int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
+  if (word == 2) {
+    atomic_store_explicit(futex, 0, memory_order_release);
+#if USE_FUTEX
+    nsync_futex_wake_(futex, 1, 0);
+#endif
+  }
+}
+
+int g_chores;
+atomic_int g_lock;
+pthread_mutex_t g_locker;
+
+void *worker(void *arg) {
+  for (int i = 0; i < ITERATIONS; ++i) {
+    lock(&g_lock);
+    ++g_chores;
+    unlock(&g_lock);
+  }
+  return 0;
+}
+
+int main() {
+  struct timeval start;
+  gettimeofday(&start, 0);
+
+  pthread_t th[THREADS];
+  for (int i = 0; i < THREADS; ++i)
+    pthread_create(&th[i], 0, worker, 0);
+  for (int i = 0; i < THREADS; ++i)
+    pthread_join(th[i], 0);
+  npassert(g_chores == THREADS * ITERATIONS);
+
+  struct rusage ru;
+  struct timeval end;
+  gettimeofday(&end, 0);
+  getrusage(RUSAGE_SELF, &ru);
+  printf("%,16ld us real\n"
+         "%,16ld us user\n"
+         "%,16ld us sys\n",
+         timeval_tomicros(timeval_sub(end, start)),  //
+         timeval_tomicros(ru.ru_utime),              //
+         timeval_tomicros(ru.ru_stime));
+
+  CheckForMemoryLeaks();
+}
+
+// COMPARE ULRICH DREPPER'S LOCKING ALGORITHM WITH MIKE BURROWS *NSYNC
+// WHICH IS WHAT COSMOPOLITAN LIBC USES FOR YOUR POSIX THREADS MUTEXES
+
+// x86 fleet
+// with pthread_mutex_t
+// 30 threads / 100000 iterations
+//
+//          186,976 us real
+//           43,609 us user
+//          205,585 us sys
+// footek_test on freebsd.test.         410 µs     2'054 µs   195'339 µs
+//          238,902 us real
+//          235,743 us user
+//           97,881 us sys
+// footek_test on rhel7.test.           343 µs     2'339 µs   246'926 µs
+//          201,285 us real
+//          249,612 us user
+//          141,230 us sys
+// footek_test on xnu.test.           1'960 µs     5'350 µs   265'758 µs
+//          303,363 us real
+//           60,000 us user
+//          410,000 us sys
+// footek_test on openbsd.test.         545 µs     3'023 µs   326'200 µs
+//          386,085 us real
+//          586,455 us user
+//          466,991 us sys
+// footek_test on netbsd.test.          344 µs     2'421 µs   413'440 µs
+//          245,010 us real
+//          437,500 us user
+//          140,625 us sys
+// footek_test on win10.test.           300 µs    18'574 µs   441'225 µs
+
+// arm fleet
+// with pthread_mutex_t
+// 30 threads / 100000 iterations
+//
+//           87,132 us real
+//          183,517 us user
+//           20,020 us sys
+// footek_test on studio.test.          560 µs    12'418 µs    92'825 µs
+//          679,374 us real
+//          957,678 us user
+//          605,078 us sys
+// footek_test on pi.test.              462 µs    16'574 µs   702'833 µs
+//          902,343 us real
+//        1,459,706 us user
+//          781,140 us sys
+// footek_test on freebsdarm.test.      400 µs    16'261 µs   970'022 µs
diff --git a/third_party/nsync/common.c b/third_party/nsync/common.c
index 2b150401c..d7fa348cd 100644
--- a/third_party/nsync/common.c
+++ b/third_party/nsync/common.c
@@ -37,6 +37,7 @@
 #include "third_party/nsync/atomic.internal.h"
 #include "third_party/nsync/common.internal.h"
 #include "third_party/nsync/mu_semaphore.h"
+#include "third_party/nsync/mu_semaphore.internal.h"
 #include "third_party/nsync/wait_s.internal.h"
 
 __static_yoink("nsync_notice");
@@ -147,9 +148,9 @@ static void free_waiters_push (waiter *w) {
 
 static void free_waiters_populate (void) {
         int n;
-        if (IsNetbsd () || IsXnuSilicon ()) {
-                // netbsd needs one file descriptor per semaphore (!!)
-                // tim cook wants us to use his grand central dispatch
+        if (IsNetbsd () || (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ())) {
+                // netbsd needs a real file descriptor per semaphore
+                // tim cook wants us to use his lol central dispatch
                 n = 1;
         } else {
                 n = getpagesize() / sizeof(waiter);
diff --git a/third_party/nsync/futex.c b/third_party/nsync/futex.c
index 971501901..9a0f264a5 100644
--- a/third_party/nsync/futex.c
+++ b/third_party/nsync/futex.c
@@ -52,6 +52,7 @@
 #include "third_party/nsync/atomic.h"
 #include "third_party/nsync/common.internal.h"
 #include "third_party/nsync/futex.internal.h"
+#include "libc/intrin/kprintf.h"
 #include "third_party/nsync/time.h"
 
 #define FUTEX_WAIT_BITS_ FUTEX_BITSET_MATCH_ANY
@@ -138,7 +139,7 @@ static int nsync_futex_polyfill_ (atomic_int *w, int expect, struct timespec *ab
                 }
                 if (_weaken (pthread_testcancel_np) &&
                     _weaken (pthread_testcancel_np) ()) {
-                        return -ETIMEDOUT;
+                        return -ECANCELED;
                 }
                 if (abstime && timespec_cmp (timespec_real (), *abstime) >= 0) {
                         return -ETIMEDOUT;
                 }
@@ -163,7 +164,7 @@ static int nsync_futex_wait_win32_ (atomic_int *w, int expect, char pshare,
         for (;;) {
                 now = timespec_real ();
-                if (timespec_cmp (now, deadline) >= 0) {
+                if (timespec_cmp (now, deadline) >= 0) {
                         return etimedout();
                 }
                 wait = timespec_sub (deadline, now);
diff --git a/third_party/nsync/mu_semaphore.c b/third_party/nsync/mu_semaphore.c
index fd9d855ce..274b4e75b 100644
--- a/third_party/nsync/mu_semaphore.c
+++ b/third_party/nsync/mu_semaphore.c
@@ -21,14 +21,9 @@
 #include "third_party/nsync/mu_semaphore.internal.h"
 
 __static_yoink("nsync_notice");
 
-/* Apple's ulock (used by Cosmo futexes) is an internal API, but:
-   1. Unlike GCD it's cancellable, i.e. can be EINTR'd by signals
-   2. We currently always use ulock anyway for joining threads */
-#define PREFER_GCD_OVER_ULOCK 1
-
 /* Initialize *s; the initial value is 0.  */
 bool nsync_mu_semaphore_init (nsync_semaphore *s) {
-        if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+        if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
                 return nsync_mu_semaphore_init_gcd (s);
         } else if (IsNetbsd ()) {
                 return nsync_mu_semaphore_init_sem (s);
@@ -44,7 +39,7 @@ bool nsync_mu_semaphore_init (nsync_semaphore *s) {
 errno_t nsync_mu_semaphore_p (nsync_semaphore *s) {
         errno_t err;
         BEGIN_CANCELATION_POINT;
-        if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+        if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
                 err = nsync_mu_semaphore_p_gcd (s);
         } else if (IsNetbsd ()) {
                 err = nsync_mu_semaphore_p_sem (s);
@@ -62,7 +57,7 @@ errno_t nsync_mu_semaphore_p (nsync_semaphore *s) {
 errno_t nsync_mu_semaphore_p_with_deadline (nsync_semaphore *s, nsync_time abs_deadline) {
         errno_t err;
         BEGIN_CANCELATION_POINT;
-        if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+        if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
                 err = nsync_mu_semaphore_p_with_deadline_gcd (s, abs_deadline);
         } else if (IsNetbsd ()) {
                 err = nsync_mu_semaphore_p_with_deadline_sem (s, abs_deadline);
@@ -75,7 +70,7 @@ errno_t nsync_mu_semaphore_p_with_deadline (nsync_semaphore *s, nsync_time abs_d
 
 /* Ensure that the count of *s is at least 1. */
 void nsync_mu_semaphore_v (nsync_semaphore *s) {
-        if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+        if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
                 return nsync_mu_semaphore_v_gcd (s);
         } else if (IsNetbsd ()) {
                 return nsync_mu_semaphore_v_sem (s);
diff --git a/third_party/nsync/mu_semaphore.internal.h b/third_party/nsync/mu_semaphore.internal.h
index 31b51975d..8795fe349 100755
--- a/third_party/nsync/mu_semaphore.internal.h
+++ b/third_party/nsync/mu_semaphore.internal.h
@@ -4,6 +4,20 @@
 #include "third_party/nsync/time.h"
 COSMOPOLITAN_C_START_
 
+/* XNU ulock (used by cosmo futexes) is an internal API, however:
+
+   1. Unlike GCD it's cancelable i.e. can be EINTR'd by signals
+   2. We have no choice but to use ulock for joining threads
+   3. Grand Central Dispatch requires a busy loop workaround
+   4. ulock makes our mutexes use 20% more system time (meh)
+   5. ulock makes our mutexes use 40% less wall time (good)
+   6. ulock makes our mutexes use 64% less user time (woop)
+
+   ulock is an outstanding system call that must be used.
+   gcd is not an acceptable alternative to ulock. */
+
+#define NSYNC_USE_GRAND_CENTRAL 0
+
 bool nsync_mu_semaphore_init_futex(nsync_semaphore *);
 errno_t nsync_mu_semaphore_p_futex(nsync_semaphore *);
 errno_t nsync_mu_semaphore_p_with_deadline_futex(nsync_semaphore *, nsync_time);
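
Two notes for readers studying this patch.

First, the locking scheme behind pthread_mutex_lock_drepper() above can be
prototyped outside this tree. The sketch below is illustrative rather than
part of this change: it assumes Linux only, calling raw futex(2) through
syscall(2) where the patch goes through Cosmopolitan's portable
nsync_futex_wait_() / nsync_futex_wake_() wrappers, and the futex_mutex
name is invented for the example. The state encoding matches the patch:
0 means unlocked, 1 means locked with no waiters, 2 means locked with
possible waiters.

    // hypothetical standalone sketch of drepper's "take 3" mutex (linux-only)
    #include <linux/futex.h>
    #include <stdatomic.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    typedef struct {
      atomic_int word;  // 0 = unlocked, 1 = locked, 2 = locked w/ waiters
    } futex_mutex;

    static void futex_mutex_lock(futex_mutex *m) {
      int word = 0;
      // fast path: 0 -> 1 acquires an uncontended lock with no syscall
      if (atomic_compare_exchange_strong_explicit(&m->word, &word, 1,
                                                  memory_order_acquire,
                                                  memory_order_acquire))
        return;
      // slow path: advertise contention, then sleep until handed the lock
      if (word == 1)
        word = atomic_exchange_explicit(&m->word, 2, memory_order_acquire);
      while (word > 0) {
        syscall(SYS_futex, &m->word, FUTEX_WAIT_PRIVATE, 2, 0, 0, 0);
        word = atomic_exchange_explicit(&m->word, 2, memory_order_acquire);
      }
    }

    static void futex_mutex_unlock(futex_mutex *m) {
      // a previous value of 2 means a waiter may be sleeping; wake one
      if (atomic_fetch_sub_explicit(&m->word, 1, memory_order_release) == 2) {
        atomic_store_explicit(&m->word, 0, memory_order_release);
        syscall(SYS_futex, &m->word, FUTEX_WAKE_PRIVATE, 1, 0, 0, 0);
      }
    }

The version in the patch additionally spins a few times with
pthread_pause_np() before touching the futex, which is what the "slightly
improved" comment in pthread_mutex_lock.c refers to.

Second, now that barriers honor PTHREAD_PROCESS_SHARED, the intended usage
is the standard POSIX pattern: place the barrier in MAP_SHARED memory and
set the attribute before initializing. This is a minimal usage sketch, with
error handling elided, using only portable POSIX calls rather than anything
introduced by this patch:

    // hedged example: rendezvous of parent and child across fork()
    #include <pthread.h>
    #include <sys/mman.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void) {
      pthread_barrier_t *b =
          mmap(0, sizeof(*b), PROT_READ | PROT_WRITE,
               MAP_SHARED | MAP_ANONYMOUS, -1, 0);  // visible across fork
      pthread_barrierattr_t attr;
      pthread_barrierattr_init(&attr);
      pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
      pthread_barrier_init(b, &attr, 2);  // two parties: parent + child
      if (!fork()) {
        pthread_barrier_wait(b);  // child arrives
        _exit(0);
      }
      pthread_barrier_wait(b);  // parent arrives; both proceed together
      wait(0);
      pthread_barrier_destroy(b);
      pthread_barrierattr_destroy(&attr);
      munmap(b, sizeof(*b));
    }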