Make more improvements to threads and mappings

- NetBSD should now have faster synchronization
- POSIX barriers may now be shared across processes
- An edge case with memory map tracking has been fixed
- Grand Central Dispatch is no longer used on macOS ARM64
- POSIX mutexes in normal mode now use futexes across processes
Justine Tunney 2024-07-24 01:05:00 -07:00
parent 2187d6d2dd
commit e398f3887c
GPG key ID: BE714B4575D6E328
20 changed files with 566 additions and 171 deletions
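
The last bullet means a PTHREAD_MUTEX_NORMAL mutex marked PTHREAD_PROCESS_SHARED now parks waiters on a futex instead of spinning. As an illustration only, and not part of this commit, here is a minimal usage sketch under assumed conditions: the mutex lives in an anonymous MAP_SHARED mapping, and the `shared` struct, loop counts, and fork() rendezvous are invented for the example.

#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void) {
  // put the mutex and a counter where both processes can see them
  struct shared { pthread_mutex_t mu; int count; } *s =
      mmap(NULL, sizeof(*s), PROT_READ | PROT_WRITE,
           MAP_SHARED | MAP_ANONYMOUS, -1, 0);
  pthread_mutexattr_t attr;
  pthread_mutexattr_init(&attr);
  pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
  pthread_mutex_init(&s->mu, &attr);  // normal (non-recursive) type by default
  pthread_mutexattr_destroy(&attr);
  if (!fork()) {
    for (int i = 0; i < 100000; ++i) {
      pthread_mutex_lock(&s->mu);
      ++s->count;
      pthread_mutex_unlock(&s->mu);
    }
    _exit(0);
  }
  for (int i = 0; i < 100000; ++i) {
    pthread_mutex_lock(&s->mu);
    ++s->count;
    pthread_mutex_unlock(&s->mu);
  }
  wait(0);
  printf("count = %d\n", s->count);  // prints 200000 when the lock works
  pthread_mutex_destroy(&s->mu);
  return 0;
}

With the futex path added below, whichever process loses the race sleeps in the kernel until the holder's unlock wakes it, rather than burning CPU in a spin loop.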

View file

@@ -26,11 +26,9 @@
int begin_cancelation_point(void) {
int state = 0;
struct CosmoTib *tib;
struct PosixThread *pt;
if (__tls_enabled) {
tib = __get_tls();
if ((pt = (struct PosixThread *)tib->tib_pthread)) {
struct PosixThread *pt;
if ((pt = _pthread_self())) {
state = pt->pt_flags & PT_INCANCEL;
pt->pt_flags |= PT_INCANCEL;
}
@@ -39,11 +37,9 @@ int begin_cancelation_point(void) {
}
void end_cancelation_point(int state) {
struct CosmoTib *tib;
struct PosixThread *pt;
if (__tls_enabled) {
tib = __get_tls();
if ((pt = (struct PosixThread *)tib->tib_pthread)) {
struct PosixThread *pt;
if ((pt = _pthread_self())) {
pt->pt_flags &= ~PT_INCANCEL;
pt->pt_flags |= state;
}

View file

@@ -6,6 +6,8 @@
#include "libc/thread/tls2.internal.h"
COSMOPOLITAN_C_START_
#define MAPS_RETRY ((void *)-1)
#define MAP_TREE_CONTAINER(e) TREE_CONTAINER(struct Map, tree, e)
struct Map {

View file

@@ -120,6 +120,7 @@ static int __muntrack(char *addr, size_t size, int pagesz,
struct Map *map;
struct Map *next;
struct Map *floor;
StartOver:
floor = __maps_floor(addr);
for (map = floor; map && map->addr <= addr + size; map = next) {
next = __maps_next(map);
@@ -148,6 +149,8 @@ static int __muntrack(char *addr, size_t size, int pagesz,
ASSERT(left > 0);
struct Map *leftmap;
if ((leftmap = __maps_alloc())) {
if (leftmap == MAPS_RETRY)
goto StartOver;
map->addr += left;
map->size = right;
if (!(map->flags & MAP_ANONYMOUS))
@@ -167,6 +170,8 @@ static int __muntrack(char *addr, size_t size, int pagesz,
size_t right = map_addr + map_size - addr;
struct Map *rightmap;
if ((rightmap = __maps_alloc())) {
if (rightmap == MAPS_RETRY)
goto StartOver;
map->size = left;
__maps.pages -= (right + pagesz - 1) / pagesz;
rightmap->addr = addr;
@@ -184,8 +189,14 @@ static int __muntrack(char *addr, size_t size, int pagesz,
size_t right = map_size - middle - left;
struct Map *leftmap;
if ((leftmap = __maps_alloc())) {
if (leftmap == MAPS_RETRY)
goto StartOver;
struct Map *middlemap;
if ((middlemap = __maps_alloc())) {
if (middlemap == MAPS_RETRY) {
__maps_free(leftmap);
goto StartOver;
}
leftmap->addr = map_addr;
leftmap->size = left;
leftmap->off = map->off;
@@ -204,6 +215,7 @@ static int __muntrack(char *addr, size_t size, int pagesz,
*deleted = middlemap;
__maps_check();
} else {
__maps_free(leftmap);
rc = -1;
}
} else {
@@ -304,12 +316,11 @@ struct Map *__maps_alloc(void) {
map->flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NOFORK;
map->hand = sys.maphandle;
__maps_lock();
__maps_insert(map++);
__maps_insert(map);
__maps_unlock();
map->addr = MAP_FAILED;
for (int i = 1; i < gransz / sizeof(struct Map) - 1; ++i)
for (int i = 1; i < gransz / sizeof(struct Map); ++i)
__maps_free(map + i);
return map;
return MAPS_RETRY;
}
static int __munmap(char *addr, size_t size) {
@@ -396,21 +407,32 @@ void *__maps_pickaddr(size_t size) {
static void *__mmap_chunk(void *addr, size_t size, int prot, int flags, int fd,
int64_t off, int pagesz, int gransz) {
// allocate Map object
struct Map *map;
do {
if (!(map = __maps_alloc()))
return MAP_FAILED;
} while (map == MAPS_RETRY);
// polyfill nuances of fixed mappings
int sysflags = flags;
bool noreplace = false;
bool should_untrack = false;
if (flags & MAP_FIXED_NOREPLACE) {
if (flags & MAP_FIXED)
if (flags & MAP_FIXED) {
__maps_free(map);
return (void *)einval();
}
sysflags &= ~MAP_FIXED_NOREPLACE;
if (IsLinux()) {
noreplace = true;
sysflags |= MAP_FIXED_NOREPLACE_linux;
} else if (IsFreebsd() || IsNetbsd()) {
sysflags |= MAP_FIXED;
if (__maps_overlaps(addr, size, pagesz))
if (__maps_overlaps(addr, size, pagesz)) {
__maps_free(map);
return (void *)eexist();
}
} else {
noreplace = true;
}
@@ -418,11 +440,6 @@ static void *__mmap_chunk(void *addr, size_t size, int prot, int flags, int fd,
should_untrack = true;
}
// allocate Map object
struct Map *map;
if (!(map = __maps_alloc()))
return MAP_FAILED;
// remove mapping we blew away
if (IsWindows() && should_untrack)
__munmap(addr, size);
@@ -572,23 +589,27 @@ static void *__mremap_impl(char *old_addr, size_t old_size, size_t new_size,
return (void *)einval();
}
// allocate object for tracking new mapping
struct Map *map;
do {
if (!(map = __maps_alloc()))
return (void *)enomem();
} while (map == MAPS_RETRY);
// check old interval is fully contained within one mapping
struct Map *old_map;
if (!(old_map = __maps_floor(old_addr)) ||
old_addr + old_size > old_map->addr + PGUP(old_map->size) ||
old_addr < old_map->addr)
old_addr < old_map->addr) {
__maps_free(map);
return (void *)efault();
}
// save old properties
int old_off = old_map->off;
int old_prot = old_map->prot;
int old_flags = old_map->flags;
// allocate object for tracking new mapping
struct Map *map;
if (!(map = __maps_alloc()))
return (void *)enomem();
// netbsd mremap fixed returns enoent rather than unmapping old pages
if (IsNetbsd() && (flags & MREMAP_FIXED))
if (__munmap(new_addr, new_size)) {

View file

@@ -75,6 +75,7 @@ int __mprotect(char *addr, size_t size, int prot) {
return edeadlk();
}
struct Map *map, *floor;
StartOver:
floor = __maps_floor(addr);
for (map = floor; map && map->addr <= addr + size; map = __maps_next(map)) {
char *map_addr = map->addr;
@@ -93,10 +94,12 @@
}
} else if (addr <= map_addr) {
// change lefthand side of mapping
size_t left = PGUP(addr + size - map_addr);
size_t left = addr + size - map_addr;
size_t right = map_size - left;
struct Map *leftmap;
if ((leftmap = __maps_alloc())) {
if (leftmap == MAPS_RETRY)
goto StartOver;
if (!__mprotect_chunk(map_addr, left, prot, false)) {
leftmap->addr = map_addr;
leftmap->size = left;
@@ -127,6 +130,8 @@ int __mprotect(char *addr, size_t size, int prot) {
size_t right = map_addr + map_size - addr;
struct Map *leftmap;
if ((leftmap = __maps_alloc())) {
if (leftmap == MAPS_RETRY)
goto StartOver;
if (!__mprotect_chunk(map_addr + left, right, prot, false)) {
leftmap->addr = map_addr;
leftmap->size = left;
@@ -159,8 +164,14 @@ int __mprotect(char *addr, size_t size, int prot) {
size_t right = map_size - middle - left;
struct Map *leftmap;
if ((leftmap = __maps_alloc())) {
if (leftmap == MAPS_RETRY)
goto StartOver;
struct Map *midlmap;
if ((midlmap = __maps_alloc())) {
if (midlmap == MAPS_RETRY) {
__maps_free(leftmap);
goto StartOver;
}
if (!__mprotect_chunk(map_addr + left, middle, prot, false)) {
leftmap->addr = map_addr;
leftmap->size = left;

View file

@@ -27,41 +27,47 @@
#include "libc/runtime/internal.h"
#include "libc/thread/lock.h"
#include "libc/thread/thread.h"
#include "third_party/nsync/futex.internal.h"
#include "third_party/nsync/mu.h"
static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
int me;
static void pthread_mutex_lock_naive(pthread_mutex_t *mutex, uint64_t word) {
int backoff = 0;
uint64_t word, lock;
// get current state of lock
word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
#if PTHREAD_USE_NSYNC
// use fancy nsync mutex if possible
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL && //
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_lock)) {
_weaken(nsync_mu_lock)((nsync_mu *)mutex);
return 0;
uint64_t lock;
for (;;) {
word = MUTEX_UNLOCK(word);
lock = MUTEX_LOCK(word);
if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
memory_order_acquire,
memory_order_relaxed))
return;
backoff = pthread_delay_np(mutex, backoff);
}
#endif
}
// implement barebones normal mutexes
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
for (;;) {
word = MUTEX_UNLOCK(word);
lock = MUTEX_LOCK(word);
if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
memory_order_acquire,
memory_order_relaxed))
return 0;
backoff = pthread_delay_np(mutex, backoff);
}
// see "take 3" algorithm in "futexes are tricky" by ulrich drepper
// slightly improved to attempt acquiring multiple times b4 syscall
static void pthread_mutex_lock_drepper(atomic_int *futex, char pshare) {
int word;
for (int i = 0; i < 4; ++i) {
word = 0;
if (atomic_compare_exchange_strong_explicit(
futex, &word, 1, memory_order_acquire, memory_order_acquire))
return;
pthread_pause_np();
}
if (word == 1)
word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
while (word > 0) {
_weaken(nsync_futex_wait_)(futex, 2, pshare, 0);
word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
}
}
// implement recursive mutexes
me = gettid();
static errno_t pthread_mutex_lock_recursive(pthread_mutex_t *mutex,
uint64_t word) {
uint64_t lock;
int backoff = 0;
int me = gettid();
for (;;) {
if (MUTEX_OWNER(word) == me) {
if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
@@ -91,6 +97,36 @@ static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
}
}
static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
uint64_t word;
// get current state of lock
word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
#if PTHREAD_USE_NSYNC
// use superior mutexes if possible
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL && //
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_lock)) {
_weaken(nsync_mu_lock)((nsync_mu *)mutex);
return 0;
}
#endif
// handle normal mutexes
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
if (_weaken(nsync_futex_wait_)) {
pthread_mutex_lock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
} else {
pthread_mutex_lock_naive(mutex, word);
}
return 0;
}
// handle recursive and error checking mutexes
return pthread_mutex_lock_recursive(mutex, word);
}
/**
* Locks mutex.
*

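pthread_mutex_lock_drepper above (together with the unlock half later in this commit) follows the "take 3" algorithm from Ulrich Drepper's paper "Futexes Are Tricky", where the lock word is 0 when free, 1 when held with no waiters, and 2 when held with possible waiters. The following is a self-contained sketch of that state machine, not Cosmopolitan code: it assumes Linux and calls the raw futex syscall directly, and the futex_wait/futex_wake/mutex_lock/mutex_unlock names are invented for the example.

#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

// sleep until *w might no longer equal `expect`
static void futex_wait(atomic_int *w, int expect) {
  syscall(SYS_futex, w, FUTEX_WAIT_PRIVATE, expect, 0, 0, 0);
}

// wake up to `howmany` sleepers parked on *w
static void futex_wake(atomic_int *w, int howmany) {
  syscall(SYS_futex, w, FUTEX_WAKE_PRIVATE, howmany, 0, 0, 0);
}

static void mutex_lock(atomic_int *w) {
  int word = 0;
  // fast path: 0 -> 1 takes an uncontended lock with one atomic op
  if (atomic_compare_exchange_strong(w, &word, 1))
    return;
  // slow path: advertise contention by forcing the word to 2, then sleep
  if (word == 1)
    word = atomic_exchange(w, 2);
  while (word > 0) {
    futex_wait(w, 2);
    word = atomic_exchange(w, 2);
  }
}

static void mutex_unlock(atomic_int *w) {
  // old value 2 means someone may be asleep: clear the word and wake one
  if (atomic_fetch_sub(w, 1) == 2) {
    atomic_store(w, 0);
    futex_wake(w, 1);
  }
}

The reason for distinguishing 1 from 2 is the uncontended unlock: when the old value was 1 there can be no sleeper, so the wake syscall is skipped entirely.
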
View file

@@ -24,54 +24,33 @@
#include "libc/runtime/internal.h"
#include "libc/thread/lock.h"
#include "libc/thread/thread.h"
#include "third_party/nsync/futex.internal.h"
#include "third_party/nsync/mu.h"
/**
* Attempts acquiring lock.
*
* Unlike pthread_mutex_lock() this function won't block and instead
* returns an error immediately if the lock couldn't be acquired.
*
* @return 0 if lock was acquired, otherwise an errno
* @raise EAGAIN if maximum number of recursive locks is held
* @raise EBUSY if lock is currently held in read or write mode
* @raise EINVAL if `mutex` doesn't refer to an initialized lock
* @raise EDEADLK if `mutex` is `PTHREAD_MUTEX_ERRORCHECK` and the
* current thread already holds this mutex
*/
errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
int me;
uint64_t word, lock;
static errno_t pthread_mutex_trylock_naive(pthread_mutex_t *mutex,
uint64_t word) {
uint64_t lock;
word = MUTEX_UNLOCK(word);
lock = MUTEX_LOCK(word);
if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
memory_order_acquire,
memory_order_relaxed))
return 0;
return EBUSY;
}
// get current state of lock
word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
static errno_t pthread_mutex_trylock_drepper(atomic_int *futex) {
int word = 0;
if (atomic_compare_exchange_strong_explicit(
futex, &word, 1, memory_order_acquire, memory_order_acquire))
return 0;
return EBUSY;
}
#if PTHREAD_USE_NSYNC
// delegate to *NSYNC if possible
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_trylock)) {
if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
return 0;
} else {
return EBUSY;
}
}
#endif
// handle normal mutexes
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
word = MUTEX_UNLOCK(word);
lock = MUTEX_LOCK(word);
if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
memory_order_acquire,
memory_order_relaxed))
return 0;
return EBUSY;
}
// handle recursive and error check mutexes
me = gettid();
static errno_t pthread_mutex_trylock_recursive(pthread_mutex_t *mutex,
uint64_t word) {
uint64_t lock;
int me = gettid();
for (;;) {
if (MUTEX_OWNER(word) == me) {
if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
@@ -100,3 +79,47 @@ errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
return EBUSY;
}
}
/**
* Attempts acquiring lock.
*
* Unlike pthread_mutex_lock() this function won't block and instead
* returns an error immediately if the lock couldn't be acquired.
*
* @return 0 if lock was acquired, otherwise an errno
* @raise EAGAIN if maximum number of recursive locks is held
* @raise EBUSY if lock is currently held in read or write mode
* @raise EINVAL if `mutex` doesn't refer to an initialized lock
* @raise EDEADLK if `mutex` is `PTHREAD_MUTEX_ERRORCHECK` and the
* current thread already holds this mutex
*/
errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
// get current state of lock
uint64_t word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
#if PTHREAD_USE_NSYNC
// use superior mutexes if possible
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_trylock)) {
if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
return 0;
} else {
return EBUSY;
}
}
#endif
// handle normal mutexes
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
if (_weaken(nsync_futex_wait_)) {
return pthread_mutex_trylock_drepper(&mutex->_futex);
} else {
return pthread_mutex_trylock_naive(mutex, word);
}
}
// handle recursive and error checking mutexes
return pthread_mutex_trylock_recursive(mutex, word);
}

View file

@@ -25,45 +25,26 @@
#include "libc/runtime/internal.h"
#include "libc/thread/lock.h"
#include "libc/thread/thread.h"
#include "third_party/nsync/futex.internal.h"
#include "third_party/nsync/mu.h"
/**
* Releases mutex.
*
* This function does nothing in vfork() children.
*
* @return 0 on success or error number on failure
* @raises EPERM if in error check mode and not owned by caller
* @vforksafe
*/
errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
int me;
uint64_t word, lock;
static void pthread_mutex_unlock_naive(pthread_mutex_t *mutex, uint64_t word) {
uint64_t lock = MUTEX_UNLOCK(word);
atomic_store_explicit(&mutex->_word, lock, memory_order_release);
}
LOCKTRACE("pthread_mutex_unlock(%t)", mutex);
// get current state of lock
word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
#if PTHREAD_USE_NSYNC
// use fancy nsync mutex if possible
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL && //
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_unlock)) {
_weaken(nsync_mu_unlock)((nsync_mu *)mutex);
return 0;
// see "take 3" algorithm in "futexes are tricky" by ulrich drepper
static void pthread_mutex_unlock_drepper(atomic_int *futex, char pshare) {
int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
if (word == 2) {
atomic_store_explicit(futex, 0, memory_order_release);
_weaken(nsync_futex_wake_)(futex, 1, pshare);
}
#endif
}
// implement barebones normal mutexes
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
lock = MUTEX_UNLOCK(word);
atomic_store_explicit(&mutex->_word, lock, memory_order_release);
return 0;
}
// implement recursive mutex unlocking
me = gettid();
static errno_t pthread_mutex_unlock_recursive(pthread_mutex_t *mutex,
uint64_t word) {
int me = gettid();
for (;;) {
// we allow unlocking an initialized lock that wasn't locked, but we
@@ -88,3 +69,44 @@ errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
return 0;
}
}
/**
* Releases mutex.
*
* This function does nothing in vfork() children.
*
* @return 0 on success or error number on failure
* @raises EPERM if in error check mode and not owned by caller
* @vforksafe
*/
errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
uint64_t word;
LOCKTRACE("pthread_mutex_unlock(%t)", mutex);
// get current state of lock
word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
#if PTHREAD_USE_NSYNC
// use superior mutexes if possible
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL && //
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_unlock)) {
_weaken(nsync_mu_unlock)((nsync_mu *)mutex);
return 0;
}
#endif
// implement barebones normal mutexes
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
if (_weaken(nsync_futex_wake_)) {
pthread_mutex_unlock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
} else {
pthread_mutex_unlock_naive(mutex, word);
}
return 0;
}
// handle recursive and error checking mutexes
return pthread_mutex_unlock_recursive(mutex, word);
}

View file

@@ -32,7 +32,7 @@ void sys_sched_yield(void);
int pthread_yield_np(void) {
if (IsXnuSilicon()) {
__syslib->__pthread_yield_np();
} else if (IsOpenbsd() || IsNetbsd()) {
} else if (IsOpenbsd()) {
// sched_yield() is punishingly slow on OpenBSD
// it's ruinously slow it'll destroy everything
pthread_pause_np();

View file

@@ -16,9 +16,10 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/errno.h"
#include "libc/intrin/atomic.h"
#include "libc/str/str.h"
#include "libc/thread/thread.h"
#include "third_party/nsync/counter.h"
/**
* Destroys barrier.
@@ -27,9 +28,8 @@
* @raise EINVAL if threads are still inside the barrier
*/
errno_t pthread_barrier_destroy(pthread_barrier_t *barrier) {
if (barrier->_nsync) {
nsync_counter_free(barrier->_nsync);
barrier->_nsync = 0;
}
if (atomic_load_explicit(&barrier->_waiters, memory_order_relaxed))
return EINVAL;
memset(barrier, -1, sizeof(*barrier));
return 0;
}

View file

@@ -17,8 +17,9 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/errno.h"
#include "libc/intrin/atomic.h"
#include "libc/limits.h"
#include "libc/thread/thread.h"
#include "third_party/nsync/counter.h"
/**
* Initializes barrier.
@@ -28,16 +29,17 @@
* before the barrier is released, which must be greater than zero
* @return 0 on success, or error number on failure
* @raise EINVAL if `count` isn't greater than zero
* @raise ENOMEM if insufficient memory exists
*/
errno_t pthread_barrier_init(pthread_barrier_t *barrier,
const pthread_barrierattr_t *attr,
unsigned count) {
nsync_counter c;
if (!count)
return EINVAL;
if (!(c = nsync_counter_new(count)))
return ENOMEM;
*barrier = (pthread_barrier_t){._nsync = c};
if (count > INT_MAX)
return EINVAL;
barrier->_count = count;
barrier->_pshared = attr ? *attr : PTHREAD_PROCESS_PRIVATE;
atomic_store_explicit(&barrier->_counter, count, memory_order_relaxed);
atomic_store_explicit(&barrier->_waiters, 0, memory_order_relaxed);
return 0;
}

View file

@@ -16,25 +16,53 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/calls/blockcancel.internal.h"
#include "libc/errno.h"
#include "libc/intrin/atomic.h"
#include "libc/limits.h"
#include "libc/thread/thread.h"
#include "third_party/nsync/counter.h"
#include "third_party/nsync/futex.internal.h"
/**
* Waits for all threads to arrive at barrier.
*
* When the barrier is broken, the state becomes reset to what it was
* when pthread_barrier_init() was called, so that the barrier may be
* used again in the same way. The last thread to arrive shall be the
* last to leave and it returns a magic value.
* used again in the same way.
*
* Unlike pthread_cond_timedwait() this function is not a cancelation
* point. It is not needed to have cleanup handlers on block cancels.
*
* @return 0 on success, `PTHREAD_BARRIER_SERIAL_THREAD` to one lucky
* thread which was the last arrival, or an errno on error
* @raise EINVAL if barrier is used incorrectly
*/
errno_t pthread_barrier_wait(pthread_barrier_t *barrier) {
if (nsync_counter_add(barrier->_nsync, -1)) {
nsync_counter_wait(barrier->_nsync, nsync_time_no_deadline);
return 0;
} else {
int n;
// enter barrier
atomic_fetch_add_explicit(&barrier->_waiters, 1, memory_order_acq_rel);
n = atomic_fetch_sub_explicit(&barrier->_counter, 1, memory_order_acq_rel);
n = n - 1;
// this can only happen on invalid usage
if (n < 0)
return EINVAL;
// reset count and wake waiters if we're last at barrier
if (!n) {
atomic_store_explicit(&barrier->_counter, barrier->_count,
memory_order_release);
atomic_store_explicit(&barrier->_waiters, 0, memory_order_release);
nsync_futex_wake_(&barrier->_waiters, INT_MAX, barrier->_pshared);
return PTHREAD_BARRIER_SERIAL_THREAD;
}
// wait for everyone else to arrive at barrier
BLOCK_CANCELATION;
while ((n = atomic_load_explicit(&barrier->_waiters, memory_order_acquire)))
nsync_futex_wait_(&barrier->_waiters, n, barrier->_pshared, 0);
ALLOW_CANCELATION;
return 0;
}
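
Because the barrier's whole state now lives in the structure itself (a count, a pshared flag, and two atomics) and waiters are woken through nsync_futex_wake_ with the pshared flag honored, a barrier placed in shared memory can synchronize separate processes. A minimal usage sketch follows; it is not taken from the commit, and the two-process rendezvous across fork() is an assumed scenario.

#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void) {
  // the barrier itself is the only shared state we need
  pthread_barrier_t *b = mmap(NULL, sizeof(*b), PROT_READ | PROT_WRITE,
                              MAP_SHARED | MAP_ANONYMOUS, -1, 0);
  pthread_barrierattr_t attr;
  pthread_barrierattr_init(&attr);
  pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
  pthread_barrier_init(b, &attr, 2);  // rendezvous of parent + child
  pthread_barrierattr_destroy(&attr);
  if (!fork()) {
    printf("child arriving\n");
    pthread_barrier_wait(b);
    printf("child released\n");
    _exit(0);
  }
  printf("parent arriving\n");
  pthread_barrier_wait(b);
  printf("parent released\n");
  wait(0);
  pthread_barrier_destroy(b);
  return 0;
}

Exactly one caller returns PTHREAD_BARRIER_SERIAL_THREAD (here, whichever of the two arrives last); the other returns 0, and the barrier resets itself for reuse.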

View file

@@ -23,7 +23,7 @@
*
* @param pshared is set to one of the following
* - `PTHREAD_PROCESS_PRIVATE` (default)
* - `PTHREAD_PROCESS_SHARED` (unsupported)
* - `PTHREAD_PROCESS_SHARED`
* @return 0 on success, or error on failure
*/
errno_t pthread_barrierattr_getpshared(const pthread_barrierattr_t *attr,

View file

@@ -24,6 +24,6 @@
* @return 0 on success, or error on failure
*/
errno_t pthread_barrierattr_init(pthread_barrierattr_t *attr) {
*attr = 0;
*attr = PTHREAD_PROCESS_PRIVATE;
return 0;
}

View file

@@ -24,13 +24,14 @@
*
* @param pshared can be one of
* - `PTHREAD_PROCESS_PRIVATE` (default)
* - `PTHREAD_PROCESS_SHARED` (unsupported)
* - `PTHREAD_PROCESS_SHARED`
* @return 0 on success, or error on failure
* @raises EINVAL if `pshared` is invalid
*/
errno_t pthread_barrierattr_setpshared(pthread_barrierattr_t *attr,
int pshared) {
switch (pshared) {
case PTHREAD_PROCESS_SHARED:
case PTHREAD_PROCESS_PRIVATE:
*attr = pshared;
return 0;

View file

@@ -46,7 +46,7 @@ COSMOPOLITAN_C_START_
#define PTHREAD_RWLOCK_INITIALIZER {0}
#define PTHREAD_MUTEX_INITIALIZER {0}
#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, 0, PTHREAD_MUTEX_RECURSIVE}
#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, {}, PTHREAD_MUTEX_RECURSIVE}
typedef uintptr_t pthread_t;
typedef int pthread_id_np_t;
@@ -66,7 +66,10 @@ typedef struct pthread_spinlock_s {
typedef struct pthread_mutex_s {
uint32_t _nsync;
int32_t _pid;
union {
int32_t _pid;
_Atomic(int32_t) _futex;
};
_Atomic(uint64_t) _word;
} pthread_mutex_t;
@@ -92,7 +95,10 @@ typedef struct pthread_rwlock_s {
} pthread_rwlock_t;
typedef struct pthread_barrier_s {
void *_nsync;
int _count;
char _pshared;
_Atomic(int) _counter;
_Atomic(int) _waiters;
} pthread_barrier_t;
typedef struct pthread_attr_s {