Make more improvements to threads and mappings

- NetBSD should now have faster synchronization
- POSIX barriers may now be shared across processes
- An edge case with memory map tracking has been fixed
- Grand Central Dispatch is no longer used on MacOS ARM64
- POSIX mutexes in normal mode now use futexes across processes
2025-10-17 07:06:11 +00:00 · 2024-07-24 01:05:00 -07:00 · 2024-07-24 01:05:00 -07:00 · e398f3887c
commit e398f3887c
parent 2187d6d2dd
20 changed files with 566 additions and 171 deletions
--- a/libc/intrin/cp.c
+++ b/libc/intrin/cp.c
@ -26,11 +26,9 @@

 int begin_cancelation_point(void) {
  int state = 0;
-  struct CosmoTib *tib;
-  struct PosixThread *pt;
  if (__tls_enabled) {
-    tib = __get_tls();
-    if ((pt = (struct PosixThread *)tib->tib_pthread)) {
+    struct PosixThread *pt;
+    if ((pt = _pthread_self())) {
      state = pt->pt_flags & PT_INCANCEL;
      pt->pt_flags |= PT_INCANCEL;
    }
@ -39,11 +37,9 @@ int begin_cancelation_point(void) {
 }

 void end_cancelation_point(int state) {
-  struct CosmoTib *tib;
-  struct PosixThread *pt;
  if (__tls_enabled) {
-    tib = __get_tls();
-    if ((pt = (struct PosixThread *)tib->tib_pthread)) {
+    struct PosixThread *pt;
+    if ((pt = _pthread_self())) {
      pt->pt_flags &= ~PT_INCANCEL;
      pt->pt_flags |= state;
    }
--- a/libc/intrin/maps.h
+++ b/libc/intrin/maps.h
@ -6,6 +6,8 @@
 #include "libc/thread/tls2.internal.h"
 COSMOPOLITAN_C_START_

+#define MAPS_RETRY ((void *)-1)
+
 #define MAP_TREE_CONTAINER(e) TREE_CONTAINER(struct Map, tree, e)

 struct Map {
--- a/libc/intrin/mmap.c
+++ b/libc/intrin/mmap.c
@ -120,6 +120,7 @@ static int __muntrack(char *addr, size_t size, int pagesz,
  struct Map *map;
  struct Map *next;
  struct Map *floor;
+StartOver:
  floor = __maps_floor(addr);
  for (map = floor; map && map->addr <= addr + size; map = next) {
    next = __maps_next(map);
@ -148,6 +149,8 @@ static int __muntrack(char *addr, size_t size, int pagesz,
      ASSERT(left > 0);
      struct Map *leftmap;
      if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
        map->addr += left;
        map->size = right;
        if (!(map->flags & MAP_ANONYMOUS))
@ -167,6 +170,8 @@ static int __muntrack(char *addr, size_t size, int pagesz,
      size_t right = map_addr + map_size - addr;
      struct Map *rightmap;
      if ((rightmap = __maps_alloc())) {
+        if (rightmap == MAPS_RETRY)
+          goto StartOver;
        map->size = left;
        __maps.pages -= (right + pagesz - 1) / pagesz;
        rightmap->addr = addr;
@ -184,8 +189,14 @@ static int __muntrack(char *addr, size_t size, int pagesz,
      size_t right = map_size - middle - left;
      struct Map *leftmap;
      if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
        struct Map *middlemap;
        if ((middlemap = __maps_alloc())) {
+          if (middlemap == MAPS_RETRY) {
+            __maps_free(leftmap);
+            goto StartOver;
+          }
          leftmap->addr = map_addr;
          leftmap->size = left;
          leftmap->off = map->off;
@ -204,6 +215,7 @@ static int __muntrack(char *addr, size_t size, int pagesz,
          *deleted = middlemap;
          __maps_check();
        } else {
+          __maps_free(leftmap);
          rc = -1;
        }
      } else {
@ -304,12 +316,11 @@ struct Map *__maps_alloc(void) {
  map->flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NOFORK;
  map->hand = sys.maphandle;
  __maps_lock();
-  __maps_insert(map++);
+  __maps_insert(map);
  __maps_unlock();
-  map->addr = MAP_FAILED;
-  for (int i = 1; i < gransz / sizeof(struct Map) - 1; ++i)
+  for (int i = 1; i < gransz / sizeof(struct Map); ++i)
    __maps_free(map + i);
-  return map;
+  return MAPS_RETRY;
 }

 static int __munmap(char *addr, size_t size) {
@ -396,21 +407,32 @@ void *__maps_pickaddr(size_t size) {
 static void *__mmap_chunk(void *addr, size_t size, int prot, int flags, int fd,
                          int64_t off, int pagesz, int gransz) {

+  // allocate Map object
+  struct Map *map;
+  do {
+    if (!(map = __maps_alloc()))
+      return MAP_FAILED;
+  } while (map == MAPS_RETRY);
+
  // polyfill nuances of fixed mappings
  int sysflags = flags;
  bool noreplace = false;
  bool should_untrack = false;
  if (flags & MAP_FIXED_NOREPLACE) {
-    if (flags & MAP_FIXED)
+    if (flags & MAP_FIXED) {
+      __maps_free(map);
      return (void *)einval();
+    }
    sysflags &= ~MAP_FIXED_NOREPLACE;
    if (IsLinux()) {
      noreplace = true;
      sysflags |= MAP_FIXED_NOREPLACE_linux;
    } else if (IsFreebsd() || IsNetbsd()) {
      sysflags |= MAP_FIXED;
-      if (__maps_overlaps(addr, size, pagesz))
+      if (__maps_overlaps(addr, size, pagesz)) {
+        __maps_free(map);
        return (void *)eexist();
+      }
    } else {
      noreplace = true;
    }
@ -418,11 +440,6 @@ static void *__mmap_chunk(void *addr, size_t size, int prot, int flags, int fd,
    should_untrack = true;
  }

-  // allocate Map object
-  struct Map *map;
-  if (!(map = __maps_alloc()))
-    return MAP_FAILED;
-
  // remove mapping we blew away
  if (IsWindows() && should_untrack)
    __munmap(addr, size);
@ -572,23 +589,27 @@ static void *__mremap_impl(char *old_addr, size_t old_size, size_t new_size,
        return (void *)einval();
  }

+  // allocate object for tracking new mapping
+  struct Map *map;
+  do {
+    if (!(map = __maps_alloc()))
+      return (void *)enomem();
+  } while (map == MAPS_RETRY);
+
  // check old interval is fully contained within one mapping
  struct Map *old_map;
  if (!(old_map = __maps_floor(old_addr)) ||
      old_addr + old_size > old_map->addr + PGUP(old_map->size) ||
-      old_addr < old_map->addr)
+      old_addr < old_map->addr) {
+    __maps_free(map);
    return (void *)efault();
+  }

  // save old properties
  int old_off = old_map->off;
  int old_prot = old_map->prot;
  int old_flags = old_map->flags;

-  // allocate object for tracking new mapping
-  struct Map *map;
-  if (!(map = __maps_alloc()))
-    return (void *)enomem();
-
  // netbsd mremap fixed returns enoent rather than unmapping old pages
  if (IsNetbsd() && (flags & MREMAP_FIXED))
    if (__munmap(new_addr, new_size)) {
--- a/libc/intrin/mprotect.c
+++ b/libc/intrin/mprotect.c
@ -75,6 +75,7 @@ int __mprotect(char *addr, size_t size, int prot) {
    return edeadlk();
  }
  struct Map *map, *floor;
+StartOver:
  floor = __maps_floor(addr);
  for (map = floor; map && map->addr <= addr + size; map = __maps_next(map)) {
    char *map_addr = map->addr;
@ -93,10 +94,12 @@ int __mprotect(char *addr, size_t size, int prot) {
      }
    } else if (addr <= map_addr) {
      // change lefthand side of mapping
-      size_t left = PGUP(addr + size - map_addr);
+      size_t left = addr + size - map_addr;
      size_t right = map_size - left;
      struct Map *leftmap;
      if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
        if (!__mprotect_chunk(map_addr, left, prot, false)) {
          leftmap->addr = map_addr;
          leftmap->size = left;
@ -127,6 +130,8 @@ int __mprotect(char *addr, size_t size, int prot) {
      size_t right = map_addr + map_size - addr;
      struct Map *leftmap;
      if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
        if (!__mprotect_chunk(map_addr + left, right, prot, false)) {
          leftmap->addr = map_addr;
          leftmap->size = left;
@ -159,8 +164,14 @@ int __mprotect(char *addr, size_t size, int prot) {
      size_t right = map_size - middle - left;
      struct Map *leftmap;
      if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
        struct Map *midlmap;
        if ((midlmap = __maps_alloc())) {
+          if (midlmap == MAPS_RETRY) {
+            __maps_free(leftmap);
+            goto StartOver;
+          }
          if (!__mprotect_chunk(map_addr + left, middle, prot, false)) {
            leftmap->addr = map_addr;
            leftmap->size = left;
--- a/libc/intrin/pthread_mutex_lock.c
+++ b/libc/intrin/pthread_mutex_lock.c
@ -27,41 +27,47 @@
 #include "libc/runtime/internal.h"
 #include "libc/thread/lock.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu.h"

-static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
-  int me;
+static void pthread_mutex_lock_naive(pthread_mutex_t *mutex, uint64_t word) {
  int backoff = 0;
-  uint64_t word, lock;
-
-  // get current state of lock
-  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
-
-#if PTHREAD_USE_NSYNC
-  // use fancy nsync mutex if possible
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
-      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
-      _weaken(nsync_mu_lock)) {
-    _weaken(nsync_mu_lock)((nsync_mu *)mutex);
-    return 0;
-  }
-#endif
-
-  // implement barebones normal mutexes
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+  uint64_t lock;
  for (;;) {
    word = MUTEX_UNLOCK(word);
    lock = MUTEX_LOCK(word);
    if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
                                              memory_order_acquire,
                                              memory_order_relaxed))
-        return 0;
+      return;
    backoff = pthread_delay_np(mutex, backoff);
  }
-  }
+}

-  // implement recursive mutexes
-  me = gettid();
+// see "take 3" algorithm in "futexes are tricky" by ulrich drepper
+// slightly improved to attempt acquiring multiple times before syscall
+static void pthread_mutex_lock_drepper(atomic_int *futex, char pshare) {
+  int word;
+  for (int i = 0; i < 4; ++i) {
+    word = 0;
+    if (atomic_compare_exchange_strong_explicit(
+            futex, &word, 1, memory_order_acquire, memory_order_acquire))
+      return;
+    pthread_pause_np();
+  }
+  if (word == 1)
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  while (word > 0) {
+    _weaken(nsync_futex_wait_)(futex, 2, pshare, 0);
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  }
+}
+
+static errno_t pthread_mutex_lock_recursive(pthread_mutex_t *mutex,
+                                            uint64_t word) {
+  uint64_t lock;
+  int backoff = 0;
+  int me = gettid();
  for (;;) {
    if (MUTEX_OWNER(word) == me) {
      if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
@ -91,6 +97,36 @@ static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
  }
 }

+static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
+  uint64_t word;
+
+  // get current state of lock
+  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+
+#if PTHREAD_USE_NSYNC
+  // use superior mutexes if possible
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
+      _weaken(nsync_mu_lock)) {
+    _weaken(nsync_mu_lock)((nsync_mu *)mutex);
+    return 0;
+  }
+#endif
+
+  // handle normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wait_)) {
+      pthread_mutex_lock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
+    } else {
+      pthread_mutex_lock_naive(mutex, word);
+    }
+    return 0;
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_lock_recursive(mutex, word);
+}
+
 /**
 * Locks mutex.
 *
--- a/libc/intrin/pthread_mutex_trylock.c
+++ b/libc/intrin/pthread_mutex_trylock.c
@ -24,43 +24,12 @@
 #include "libc/runtime/internal.h"
 #include "libc/thread/lock.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu.h"

-/**
- * Attempts acquiring lock.
- *
- * Unlike pthread_mutex_lock() this function won't block and instead
- * returns an error immediately if the lock couldn't be acquired.
- *
- * @return 0 if lock was acquired, otherwise an errno
- * @raise EAGAIN if maximum number of recursive locks is held
- * @raise EBUSY if lock is currently held in read or write mode
- * @raise EINVAL if `mutex` doesn't refer to an initialized lock
- * @raise EDEADLK if `mutex` is `PTHREAD_MUTEX_ERRORCHECK` and the
- *     current thread already holds this mutex
- */
-errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
-  int me;
-  uint64_t word, lock;
-
-  // get current state of lock
-  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
-
-#if PTHREAD_USE_NSYNC
-  // delegate to *NSYNC if possible
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&
-      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
-      _weaken(nsync_mu_trylock)) {
-    if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
-      return 0;
-    } else {
-      return EBUSY;
-    }
-  }
-#endif
-
-  // handle normal mutexes
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+static errno_t pthread_mutex_trylock_naive(pthread_mutex_t *mutex,
+                                           uint64_t word) {
+  uint64_t lock;
  word = MUTEX_UNLOCK(word);
  lock = MUTEX_LOCK(word);
  if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
@ -68,10 +37,20 @@ errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
                                            memory_order_relaxed))
    return 0;
  return EBUSY;
-  }
+}

-  // handle recursive and error check mutexes
-  me = gettid();
+static errno_t pthread_mutex_trylock_drepper(atomic_int *futex) {
+  int word = 0;
+  if (atomic_compare_exchange_strong_explicit(
+          futex, &word, 1, memory_order_acquire, memory_order_acquire))
+    return 0;
+  return EBUSY;
+}
+
+static errno_t pthread_mutex_trylock_recursive(pthread_mutex_t *mutex,
+                                               uint64_t word) {
+  uint64_t lock;
+  int me = gettid();
  for (;;) {
    if (MUTEX_OWNER(word) == me) {
      if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
@ -100,3 +79,47 @@ errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
    return EBUSY;
  }
 }
+
+/**
+ * Attempts acquiring lock.
+ *
+ * Unlike pthread_mutex_lock() this function won't block and instead
+ * returns an error immediately if the lock couldn't be acquired.
+ *
+ * @return 0 if lock was acquired, otherwise an errno
+ * @raise EAGAIN if maximum number of recursive locks is held
+ * @raise EBUSY if lock is currently held in read or write mode
+ * @raise EINVAL if `mutex` doesn't refer to an initialized lock
+ * @raise EDEADLK if `mutex` is `PTHREAD_MUTEX_ERRORCHECK` and the
+ *     current thread already holds this mutex
+ */
+errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
+
+  // get current state of lock
+  uint64_t word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+
+#if PTHREAD_USE_NSYNC
+  // use superior mutexes if possible
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
+      _weaken(nsync_mu_trylock)) {
+    if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
+      return 0;
+    } else {
+      return EBUSY;
+    }
+  }
+#endif
+
+  // handle normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wait_)) {
+      return pthread_mutex_trylock_drepper(&mutex->_futex);
+    } else {
+      return pthread_mutex_trylock_naive(mutex, word);
+    }
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_trylock_recursive(mutex, word);
+}
--- a/libc/intrin/pthread_mutex_unlock.c
+++ b/libc/intrin/pthread_mutex_unlock.c
@ -25,45 +25,26 @@
 #include "libc/runtime/internal.h"
 #include "libc/thread/lock.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu.h"

-/**
- * Releases mutex.
- *
- * This function does nothing in vfork() children.
- *
- * @return 0 on success or error number on failure
- * @raises EPERM if in error check mode and not owned by caller
- * @vforksafe
- */
-errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
-  int me;
-  uint64_t word, lock;
-
-  LOCKTRACE("pthread_mutex_unlock(%t)", mutex);
-
-  // get current state of lock
-  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
-
-#if PTHREAD_USE_NSYNC
-  // use fancy nsync mutex if possible
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
-      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
-      _weaken(nsync_mu_unlock)) {
-    _weaken(nsync_mu_unlock)((nsync_mu *)mutex);
-    return 0;
-  }
-#endif
-
-  // implement barebones normal mutexes
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
-    lock = MUTEX_UNLOCK(word);
+static void pthread_mutex_unlock_naive(pthread_mutex_t *mutex, uint64_t word) {
+  uint64_t lock = MUTEX_UNLOCK(word);
  atomic_store_explicit(&mutex->_word, lock, memory_order_release);
-    return 0;
-  }
+}

-  // implement recursive mutex unlocking
-  me = gettid();
+// see "take 3" algorithm in "futexes are tricky" by ulrich drepper
+static void pthread_mutex_unlock_drepper(atomic_int *futex, char pshare) {
+  int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
+  if (word == 2) {
+    atomic_store_explicit(futex, 0, memory_order_release);
+    _weaken(nsync_futex_wake_)(futex, 1, pshare);
+  }
+}
+
+static errno_t pthread_mutex_unlock_recursive(pthread_mutex_t *mutex,
+                                              uint64_t word) {
+  int me = gettid();
  for (;;) {

    // we allow unlocking an initialized lock that wasn't locked, but we
@ -88,3 +69,44 @@ errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
      return 0;
  }
 }
+
+/**
+ * Releases mutex.
+ *
+ * This function does nothing in vfork() children.
+ *
+ * @return 0 on success or error number on failure
+ * @raises EPERM if in error check mode and not owned by caller
+ * @vforksafe
+ */
+errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
+  uint64_t word;
+
+  LOCKTRACE("pthread_mutex_unlock(%t)", mutex);
+
+  // get current state of lock
+  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+
+#if PTHREAD_USE_NSYNC
+  // use superior mutexes if possible
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
+      _weaken(nsync_mu_unlock)) {
+    _weaken(nsync_mu_unlock)((nsync_mu *)mutex);
+    return 0;
+  }
+#endif
+
+  // implement barebones normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wake_)) {
+      pthread_mutex_unlock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
+    } else {
+      pthread_mutex_unlock_naive(mutex, word);
+    }
+    return 0;
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_unlock_recursive(mutex, word);
+}
--- a/libc/intrin/pthread_yield_np.c
+++ b/libc/intrin/pthread_yield_np.c
@ -32,7 +32,7 @@ void sys_sched_yield(void);
 int pthread_yield_np(void) {
  if (IsXnuSilicon()) {
    __syslib->__pthread_yield_np();
-  } else if (IsOpenbsd() || IsNetbsd()) {
+  } else if (IsOpenbsd()) {
    // sched_yield() is punishingly slow on OpenBSD
    // it's ruinously slow it'll destroy everything
    pthread_pause_np();
--- a/libc/thread/pthread_barrier_destroy.c
+++ b/libc/thread/pthread_barrier_destroy.c
@ -16,9 +16,10 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/errno.h"
+#include "libc/intrin/atomic.h"
 #include "libc/str/str.h"
 #include "libc/thread/thread.h"
-#include "third_party/nsync/counter.h"

 /**
 * Destroys barrier.
@ -27,9 +28,8 @@
 * @raise EINVAL if threads are still inside the barrier
 */
 errno_t pthread_barrier_destroy(pthread_barrier_t *barrier) {
-  if (barrier->_nsync) {
-    nsync_counter_free(barrier->_nsync);
-    barrier->_nsync = 0;
-  }
+  if (atomic_load_explicit(&barrier->_waiters, memory_order_relaxed))
+    return EINVAL;
+  memset(barrier, -1, sizeof(*barrier));
  return 0;
 }
--- a/libc/thread/pthread_barrier_init.c
+++ b/libc/thread/pthread_barrier_init.c
@ -17,8 +17,9 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
+#include "libc/intrin/atomic.h"
+#include "libc/limits.h"
 #include "libc/thread/thread.h"
-#include "third_party/nsync/counter.h"

 /**
 * Initializes barrier.
@ -28,16 +29,17 @@
 *     before the barrier is released, which must be greater than zero
 * @return 0 on success, or error number on failure
 * @raise EINVAL if `count` isn't greater than zero
- * @raise ENOMEM if insufficient memory exists
 */
 errno_t pthread_barrier_init(pthread_barrier_t *barrier,
                             const pthread_barrierattr_t *attr,
                             unsigned count) {
-  nsync_counter c;
  if (!count)
    return EINVAL;
-  if (!(c = nsync_counter_new(count)))
-    return ENOMEM;
-  *barrier = (pthread_barrier_t){._nsync = c};
+  if (count > INT_MAX)
+    return EINVAL;
+  barrier->_count = count;
+  barrier->_pshared = attr ? *attr : PTHREAD_PROCESS_PRIVATE;
+  atomic_store_explicit(&barrier->_counter, count, memory_order_relaxed);
+  atomic_store_explicit(&barrier->_waiters, 0, memory_order_relaxed);
  return 0;
 }
--- a/libc/thread/pthread_barrier_wait.c
+++ b/libc/thread/pthread_barrier_wait.c
@ -16,25 +16,53 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/blockcancel.internal.h"
+#include "libc/errno.h"
+#include "libc/intrin/atomic.h"
+#include "libc/limits.h"
 #include "libc/thread/thread.h"
-#include "third_party/nsync/counter.h"
+#include "third_party/nsync/futex.internal.h"

 /**
 * Waits for all threads to arrive at barrier.
 *
 * When the barrier is broken, the state becomes reset to what it was
 * when pthread_barrier_init() was called, so that the barrior may be
- * used again in the same way. The last thread to arrive shall be the
- * last to leave and it returns a magic value.
+ * used again in the same way.
+ *
+ * Unlike pthread_cond_timedwait() this function is not a cancelation
+ * point. Cleanup handlers aren't needed since cancels get blocked.
 *
 * @return 0 on success, `PTHREAD_BARRIER_SERIAL_THREAD` to one lucky
 *     thread which was the last arrival, or an errno on error
+ * @raise EINVAL if barrier is used incorrectly
 */
 errno_t pthread_barrier_wait(pthread_barrier_t *barrier) {
-  if (nsync_counter_add(barrier->_nsync, -1)) {
-    nsync_counter_wait(barrier->_nsync, nsync_time_no_deadline);
-    return 0;
-  } else {
+  int n;
+
+  // enter barrier
+  atomic_fetch_add_explicit(&barrier->_waiters, 1, memory_order_acq_rel);
+  n = atomic_fetch_sub_explicit(&barrier->_counter, 1, memory_order_acq_rel);
+  n = n - 1;
+
+  // this can only happen on invalid usage
+  if (n < 0)
+    return EINVAL;
+
+  // reset count and wake waiters if we're last at barrier
+  if (!n) {
+    atomic_store_explicit(&barrier->_counter, barrier->_count,
+                          memory_order_release);
+    atomic_store_explicit(&barrier->_waiters, 0, memory_order_release);
+    nsync_futex_wake_(&barrier->_waiters, INT_MAX, barrier->_pshared);
    return PTHREAD_BARRIER_SERIAL_THREAD;
  }
+
+  // wait for everyone else to arrive at barrier
+  BLOCK_CANCELATION;
+  while ((n = atomic_load_explicit(&barrier->_waiters, memory_order_acquire)))
+    nsync_futex_wait_(&barrier->_waiters, n, barrier->_pshared, 0);
+  ALLOW_CANCELATION;
+
+  return 0;
 }
--- a/libc/thread/pthread_barrierattr_getpshared.c
+++ b/libc/thread/pthread_barrierattr_getpshared.c
@ -23,7 +23,7 @@
 *
 * @param pshared is set to one of the following
 *     - `PTHREAD_PROCESS_PRIVATE` (default)
- *     - `PTHREAD_PROCESS_SHARED` (unsupported)
+ *     - `PTHREAD_PROCESS_SHARED`
 * @return 0 on success, or error on failure
 */
 errno_t pthread_barrierattr_getpshared(const pthread_barrierattr_t *attr,
--- a/libc/thread/pthread_barrierattr_init.c
+++ b/libc/thread/pthread_barrierattr_init.c
@ -24,6 +24,6 @@
 * @return 0 on success, or error on failure
 */
 errno_t pthread_barrierattr_init(pthread_barrierattr_t *attr) {
-  *attr = 0;
+  *attr = PTHREAD_PROCESS_PRIVATE;
  return 0;
 }
--- a/libc/thread/pthread_barrierattr_setpshared.c
+++ b/libc/thread/pthread_barrierattr_setpshared.c
@ -24,13 +24,14 @@
 *
 * @param pshared can be one of
 *     - `PTHREAD_PROCESS_PRIVATE` (default)
- *     - `PTHREAD_PROCESS_SHARED` (unsupported)
+ *     - `PTHREAD_PROCESS_SHARED`
 * @return 0 on success, or error on failure
 * @raises EINVAL if `pshared` is invalid
 */
 errno_t pthread_barrierattr_setpshared(pthread_barrierattr_t *attr,
                                       int pshared) {
  switch (pshared) {
+    case PTHREAD_PROCESS_SHARED:
    case PTHREAD_PROCESS_PRIVATE:
      *attr = pshared;
      return 0;
--- a/libc/thread/thread.h
+++ b/libc/thread/thread.h
@ -46,7 +46,7 @@ COSMOPOLITAN_C_START_
 #define PTHREAD_RWLOCK_INITIALIZER {0}
 #define PTHREAD_MUTEX_INITIALIZER  {0}

-#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, 0, PTHREAD_MUTEX_RECURSIVE}
+#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, {}, PTHREAD_MUTEX_RECURSIVE}

 typedef uintptr_t pthread_t;
 typedef int pthread_id_np_t;
@ -66,7 +66,10 @@ typedef struct pthread_spinlock_s {

 typedef struct pthread_mutex_s {
  uint32_t _nsync;
+  union {
    int32_t _pid;
+    _Atomic(int32_t) _futex;
+  };
  _Atomic(uint64_t) _word;
 } pthread_mutex_t;

@ -92,7 +95,10 @@ typedef struct pthread_rwlock_s {
 } pthread_rwlock_t;

 typedef struct pthread_barrier_s {
-  void *_nsync;
+  int _count;
+  char _pshared;
+  _Atomic(int) _counter;
+  _Atomic(int) _waiters;
 } pthread_barrier_t;

 typedef struct pthread_attr_s {
--- a/test/libc/thread/footek_test.c
+++ b/test/libc/thread/footek_test.c
@ -0,0 +1,236 @@
+#include <assert.h>
+#include <cosmo.h>
+#include <linux/futex.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdio.h>
+#include <sys/resource.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <unistd.h>
+#include "third_party/nsync/futex.internal.h"
+
+// THIS IS AN EXAMPLE OF HOW TO USE COSMOPOLITAN FUTEXES TO IMPLEMENT
+// YOUR OWN MUTEXES FROM SCRATCH. LOOK AT HOW MUCH BETTER THIS CAN
+// MAKE THINGS COMPARED TO SPIN LOCKS. ALGORITHM FROM ULRICH DREPPER.
+
+// arm fleet
+// with futexes
+// 30 threads / 100000 iterations
+//
+//          242,604 us real
+//        4,222,946 us user
+//        1,079,229 us sys
+// footek_test on studio.test.          630 µs   17'415 µs     256'782 µs
+//        1,362,557 us real
+//        3,232,978 us user
+//        2,104,824 us sys
+// footek_test on pi.test.              611 µs   21'708 µs   1'385'129 µs
+//        1,346,482 us real
+//        3,370,513 us user
+//        1,992,383 us sys
+// footek_test on freebsdarm.test.      427 µs   19'967 µs   1'393'476 µs
+
+// arm fleet
+// without futexes
+// 30 threads / 100000 iterations
+//
+//        1,282,084 us real
+//       29,359,582 us user
+//           34,553 us sys
+// footek_test on studio.test.          961 µs   12'907 µs   1'287'983 µs
+//        4,070,988 us real
+//       16,203,990 us user
+//            7,999 us sys
+// footek_test on pi.test.              459 µs   16'376 µs   4'095'512 µs
+//        7,012,493 us real
+//       27,936,725 us user
+//            7,871 us sys
+// footek_test on freebsdarm.test.      502 µs   16'446 µs   7'051'545 µs
+
+// x86 fleet
+// with futexes
+// 30 threads / 100000 iterations
+//
+//          146,015 us real
+//          169,427 us user
+//           68,939 us sys
+// footek_test on rhel7.test.           376 µs    2'259 µs     153'024 µs
+//          144,917 us real
+//          383,317 us user
+//          191,203 us sys
+// footek_test on xnu.test.          11'143 µs    9'159 µs     164'865 µs
+//          244,286 us real
+//          405,395 us user
+//          956,122 us sys
+// footek_test on freebsd.test.         394 µs    2'165 µs     256'227 µs
+//          209,095 us real
+//          616,634 us user
+//            9,945 us sys
+// footek_test on netbsd.test.          502 µs    2'020 µs     261'895 µs
+//          344,876 us real
+//           50,000 us user
+//        1,240,000 us sys
+// footek_test on openbsd.test.         457 µs    2'737 µs     396'342 µs
+//        1,193,906 us real
+//       17,546,875 us user
+//        3,000,000 us sys
+// footek_test on win10.test.           462 µs   59'528 µs   1'348'265 µs
+
+// x86 fleet
+// without futexes
+// 30 threads / 100000 iterations
+//
+//          897,815 us real
+//        1,763,705 us user
+//            9,696 us sys
+// footek_test on rhel7.test.           423 µs    2'638 µs     912'241 µs
+//          790,332 us real
+//        2,359,967 us user
+//                0 us sys
+// footek_test on netbsd.test.        1'151 µs    2'634 µs   1'014'867 µs
+//        2,332,724 us real
+//        9,150,000 us user
+//           10,000 us sys
+// footek_test on openbsd.test.         557 µs    3'020 µs   2'554'648 µs
+//        2,528,863 us real
+//       56,546,875 us user
+//        1,671,875 us sys
+// footek_test on win10.test.           962 µs    9'698 µs   2'751'905 µs
+//        2,916,033 us real
+//       17,236,103 us user
+//                0 us sys
+// footek_test on freebsd.test.         690 µs    3'011 µs   2'925'997 µs
+//        4,225,726 us real
+//       16,679,456 us user
+//           16,265 us sys
+// footek_test on xnu.test.          98'468 µs    5'242 µs   5'191'724 µs
+
+#define USE_FUTEX  1
+#define THREADS    30
+#define ITERATIONS 30000
+
+#define MUTEX_LOCKED(word)  ((word) & 8)
+#define MUTEX_WAITING(word) ((word) & 16)
+
+#define MUTEX_LOCK(word)        ((word) | 8)
+#define MUTEX_SET_WAITING(word) ((word) | 16)
+#define MUTEX_UNLOCK(word)      ((word) & ~(8 | 16))
+
+void lock(atomic_int *futex) {
+  int word, cs;
+  for (int i = 0; i < 4; ++i) {
+    word = 0;
+    if (atomic_compare_exchange_strong_explicit(
+            futex, &word, 1, memory_order_acquire, memory_order_acquire))
+      return;
+    pthread_pause_np();
+  }
+  if (word == 1)
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  while (word > 0) {
+    pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
+#if USE_FUTEX
+    nsync_futex_wait_(futex, 2, 0, 0);
+#endif
+    pthread_setcancelstate(cs, 0);
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  }
+}
+
+void unlock(atomic_int *futex) {
+  int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
+  if (word == 2) {
+    atomic_store_explicit(futex, 0, memory_order_release);
+#if USE_FUTEX
+    nsync_futex_wake_(futex, 1, 0);
+#endif
+  }
+}
+
+int g_chores;
+atomic_int g_lock;
+pthread_mutex_t g_locker;
+
+void *worker(void *arg) {
+  for (int i = 0; i < ITERATIONS; ++i) {
+    lock(&g_lock);
+    ++g_chores;
+    unlock(&g_lock);
+  }
+  return 0;
+}
+
+int main() {
+  struct timeval start;
+  gettimeofday(&start, 0);
+
+  pthread_t th[THREADS];
+  for (int i = 0; i < THREADS; ++i)
+    pthread_create(&th[i], 0, worker, 0);
+  for (int i = 0; i < THREADS; ++i)
+    pthread_join(th[i], 0);
+  npassert(g_chores == THREADS * ITERATIONS);
+
+  struct rusage ru;
+  struct timeval end;
+  gettimeofday(&end, 0);
+  getrusage(RUSAGE_SELF, &ru);
+  printf("%,16ld us real\n"
+         "%,16ld us user\n"
+         "%,16ld us sys\n",
+         timeval_tomicros(timeval_sub(end, start)),  //
+         timeval_tomicros(ru.ru_utime),              //
+         timeval_tomicros(ru.ru_stime));
+
+  CheckForMemoryLeaks();
+}
+
+// COMPARE ULRICH DREPPER'S LOCKING ALGORITHM WITH MIKE BURROWS *NSYNC
+// WHICH IS WHAT COSMOPOLITAN LIBC USES FOR YOUR POSIX THREADS MUTEXES
+
+// x86 fleet
+// with pthread_mutex_t
+// 30 threads / 100000 iterations
+//
+//          186,976 us real
+//           43,609 us user
+//          205,585 us sys
+// footek_test on freebsd.test.         410 µs    2'054 µs     195'339 µs
+//          238,902 us real
+//          235,743 us user
+//           97,881 us sys
+// footek_test on rhel7.test.           343 µs    2'339 µs     246'926 µs
+//          201,285 us real
+//          249,612 us user
+//          141,230 us sys
+// footek_test on xnu.test.           1'960 µs    5'350 µs     265'758 µs
+//          303,363 us real
+//           60,000 us user
+//          410,000 us sys
+// footek_test on openbsd.test.         545 µs    3'023 µs     326'200 µs
+//          386,085 us real
+//          586,455 us user
+//          466,991 us sys
+// footek_test on netbsd.test.          344 µs    2'421 µs     413'440 µs
+//          245,010 us real
+//          437,500 us user
+//          140,625 us sys
+// footek_test on win10.test.           300 µs   18'574 µs     441'225 µs
+
+// arm fleet
+// with pthread_mutex_t
+// 30 threads / 100000 iterations
+//
+//           87,132 us real
+//          183,517 us user
+//           20,020 us sys
+// footek_test on studio.test.          560 µs   12'418 µs      92'825 µs
+//          679,374 us real
+//          957,678 us user
+//          605,078 us sys
+// footek_test on pi.test.              462 µs   16'574 µs     702'833 µs
+//          902,343 us real
+//        1,459,706 us user
+//          781,140 us sys
+// footek_test on freebsdarm.test.      400 µs   16'261 µs     970'022 µs
--- a/third_party/nsync/common.c
+++ b/third_party/nsync/common.c
@ -37,6 +37,7 @@
 #include "third_party/nsync/atomic.internal.h"
 #include "third_party/nsync/common.internal.h"
 #include "third_party/nsync/mu_semaphore.h"
+#include "third_party/nsync/mu_semaphore.internal.h"
 #include "third_party/nsync/wait_s.internal.h"
 __static_yoink("nsync_notice");

@ -147,9 +148,9 @@ static void free_waiters_push (waiter *w) {

 static void free_waiters_populate (void) {
 	int n;
-	if (IsNetbsd () || IsXnuSilicon ()) {
-		// netbsd needs one file descriptor per semaphore (!!)
-		// tim cook wants us to use his grand central dispatch
+	if (IsNetbsd () || (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ())) {
+		// netbsd needs a real file descriptor per semaphore
+		// xnu silicon takes this path too if grand central dispatch is on
 		n = 1;
 	} else {
 		n = getpagesize() / sizeof(waiter);
--- a/third_party/nsync/futex.c
+++ b/third_party/nsync/futex.c
@ -52,6 +52,7 @@
 #include "third_party/nsync/atomic.h"
 #include "third_party/nsync/common.internal.h"
 #include "third_party/nsync/futex.internal.h"
+#include "libc/intrin/kprintf.h"
 #include "third_party/nsync/time.h"

 #define FUTEX_WAIT_BITS_ FUTEX_BITSET_MATCH_ANY
@ -138,7 +139,7 @@ static int nsync_futex_polyfill_ (atomic_int *w, int expect, struct timespec *ab
 		}
 		if (_weaken (pthread_testcancel_np) &&
 		    _weaken (pthread_testcancel_np) ()) {
-			return -ETIMEDOUT;
+			return -ECANCELED;
 		}
 		if (abstime && timespec_cmp (timespec_real (), *abstime) >= 0) {
 			return -ETIMEDOUT;
@ -163,7 +164,7 @@ static int nsync_futex_wait_win32_ (atomic_int *w, int expect, char pshare,

 	for (;;) {
 		now = timespec_real ();
-		if (timespec_cmp (now, deadline) > 0) {
+		if (timespec_cmp (now, deadline) >= 0) {
 			return etimedout();
 		}
 		wait = timespec_sub (deadline, now);
--- a/third_party/nsync/mu_semaphore.c
+++ b/third_party/nsync/mu_semaphore.c
@ -21,14 +21,9 @@
 #include "third_party/nsync/mu_semaphore.internal.h"
 __static_yoink("nsync_notice");

-/* Apple's ulock (part by Cosmo futexes) is an internal API, but:
-   1. Unlike GCD it's cancellable, i.e. can be EINTR'd by signals
-   2. We currently always use ulock anyway for joining threads */
-#define PREFER_GCD_OVER_ULOCK 1
-
 /* Initialize *s; the initial value is 0. */
 bool nsync_mu_semaphore_init (nsync_semaphore *s) {
-	if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+	if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
 		return nsync_mu_semaphore_init_gcd (s);
 	} else if (IsNetbsd ()) {
 		return nsync_mu_semaphore_init_sem (s);
@ -44,7 +39,7 @@ bool nsync_mu_semaphore_init (nsync_semaphore *s) {
 errno_t nsync_mu_semaphore_p (nsync_semaphore *s) {
 	errno_t err;
 	BEGIN_CANCELATION_POINT;
-	if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+	if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
 		err = nsync_mu_semaphore_p_gcd (s);
 	} else if (IsNetbsd ()) {
 		err = nsync_mu_semaphore_p_sem (s);
@ -62,7 +57,7 @@ errno_t nsync_mu_semaphore_p (nsync_semaphore *s) {
 errno_t nsync_mu_semaphore_p_with_deadline (nsync_semaphore *s, nsync_time abs_deadline) {
 	errno_t err;
 	BEGIN_CANCELATION_POINT;
-	if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+	if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
 		err = nsync_mu_semaphore_p_with_deadline_gcd (s, abs_deadline);
 	} else if (IsNetbsd ()) {
 		err = nsync_mu_semaphore_p_with_deadline_sem (s, abs_deadline);
@ -75,7 +70,7 @@ errno_t nsync_mu_semaphore_p_with_deadline (nsync_semaphore *s, nsync_time abs_d

 /* Ensure that the count of *s is at least 1. */
 void nsync_mu_semaphore_v (nsync_semaphore *s) {
-	if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+	if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
 		return nsync_mu_semaphore_v_gcd (s);
 	} else if (IsNetbsd ()) {
 		return nsync_mu_semaphore_v_sem (s);
--- a/third_party/nsync/mu_semaphore.internal.h
+++ b/third_party/nsync/mu_semaphore.internal.h
@ -4,6 +4,20 @@
 #include "third_party/nsync/time.h"
 COSMOPOLITAN_C_START_

+/* XNU ulock (used by cosmo futexes) is an internal API, however:
+
+     1. Unlike GCD it's cancelable i.e. can be EINTR'd by signals
+     2. We have no choice but to use ulock for joining threads
+     3. Grand Central Dispatch requires a busy loop workaround
+     4. ulock makes our mutexes use 20% more system time (meh)
+     5. ulock makes our mutexes use 40% less wall time (good)
+     6. ulock makes our mutexes use 64% less user time (woop)
+
+   ulock is an outstanding system call that must be used.
+   gcd is not an acceptable alternative to ulock. */
+
+#define NSYNC_USE_GRAND_CENTRAL 0
+
 bool nsync_mu_semaphore_init_futex(nsync_semaphore *);
 errno_t nsync_mu_semaphore_p_futex(nsync_semaphore *);
 errno_t nsync_mu_semaphore_p_with_deadline_futex(nsync_semaphore *, nsync_time);