Mirror of https://github.com/jart/cosmopolitan.git (synced 2025-01-31)
Make more improvements to threads and mappings
- NetBSD should now have faster synchronization
- POSIX barriers may now be shared across processes
- An edge case with memory map tracking has been fixed
- Grand Central Dispatch is no longer used on MacOS ARM64
- POSIX mutexes in normal mode now use futexes across processes
This commit is contained in: parent 2187d6d2dd, commit e398f3887c
20 changed files with 566 additions and 171 deletions
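For readers skimming the diffs below: the normal-mode mutex fast path now follows Drepper's "take 3" three-state futex protocol (0 = unlocked, 1 = locked, 2 = contended). A minimal sketch of that protocol, assembled from the code this commit adds (the nsync_futex_wait_/nsync_futex_wake_ calls and orderings are taken verbatim from the hunks below; the sketch_ function names are illustrative only):

    #include <stdatomic.h>
    #include "third_party/nsync/futex.internal.h"

    // drepper "take 3": 0 = unlocked, 1 = locked, 2 = contended
    static void sketch_lock(atomic_int *futex, char pshare) {
      int word = 0;
      if (atomic_compare_exchange_strong_explicit(  // uncontended fast path
              futex, &word, 1, memory_order_acquire, memory_order_acquire))
        return;
      if (word == 1)  // mark contended before sleeping
        word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
      while (word > 0) {
        nsync_futex_wait_(futex, 2, pshare, 0);  // sleep while word == 2
        word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
      }
    }

    static void sketch_unlock(atomic_int *futex, char pshare) {
      // 2 -> 1 means a waiter may be sleeping; reset and wake one
      if (atomic_fetch_sub_explicit(futex, 1, memory_order_release) == 2) {
        atomic_store_explicit(futex, 0, memory_order_release);
        nsync_futex_wake_(futex, 1, pshare);
      }
    }

Because the futex word lives inside the mutex itself and the pshared flag is honored, this works across processes, which is what the last commit bullet refers to.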
@@ -26,11 +26,9 @@
 int begin_cancelation_point(void) {
   int state = 0;
-  struct CosmoTib *tib;
-  struct PosixThread *pt;
   if (__tls_enabled) {
-    tib = __get_tls();
-    if ((pt = (struct PosixThread *)tib->tib_pthread)) {
+    struct PosixThread *pt;
+    if ((pt = _pthread_self())) {
       state = pt->pt_flags & PT_INCANCEL;
       pt->pt_flags |= PT_INCANCEL;
     }

@@ -39,11 +37,9 @@ int begin_cancelation_point(void) {
 }

 void end_cancelation_point(int state) {
-  struct CosmoTib *tib;
-  struct PosixThread *pt;
   if (__tls_enabled) {
-    tib = __get_tls();
-    if ((pt = (struct PosixThread *)tib->tib_pthread)) {
+    struct PosixThread *pt;
+    if ((pt = _pthread_self())) {
       pt->pt_flags &= ~PT_INCANCEL;
       pt->pt_flags |= state;
     }
@@ -6,6 +6,8 @@
 #include "libc/thread/tls2.internal.h"
 COSMOPOLITAN_C_START_

+#define MAPS_RETRY ((void *)-1)
+
 #define MAP_TREE_CONTAINER(e) TREE_CONTAINER(struct Map, tree, e)

 struct Map {
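The new MAPS_RETRY sentinel changes the __maps_alloc() contract: NULL still means out of memory, while MAPS_RETRY means the allocator just mapped and inserted a fresh pool of Map objects (dropping the maps lock in the process), so the caller must redo its lookup, hence the StartOver: labels added in the hunks below. A minimal sketch of the caller-side loop, mirroring the do/while this commit adds to __mmap_chunk:

    // sketch: retry allocation until we get a real Map object
    struct Map *map;
    do {
      if (!(map = __maps_alloc()))  // NULL means ENOMEM
        return MAP_FAILED;
    } while (map == MAPS_RETRY);    // pool grew; look up again and retry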
@@ -120,6 +120,7 @@ static int __muntrack(char *addr, size_t size, int pagesz,
   struct Map *map;
   struct Map *next;
   struct Map *floor;
+StartOver:
   floor = __maps_floor(addr);
   for (map = floor; map && map->addr <= addr + size; map = next) {
     next = __maps_next(map);

@@ -148,6 +149,8 @@ static int __muntrack(char *addr, size_t size, int pagesz,
       ASSERT(left > 0);
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
         map->addr += left;
         map->size = right;
         if (!(map->flags & MAP_ANONYMOUS))

@@ -167,6 +170,8 @@ static int __muntrack(char *addr, size_t size, int pagesz,
       size_t right = map_addr + map_size - addr;
       struct Map *rightmap;
       if ((rightmap = __maps_alloc())) {
+        if (rightmap == MAPS_RETRY)
+          goto StartOver;
         map->size = left;
         __maps.pages -= (right + pagesz - 1) / pagesz;
         rightmap->addr = addr;

@@ -184,8 +189,14 @@ static int __muntrack(char *addr, size_t size, int pagesz,
       size_t right = map_size - middle - left;
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
         struct Map *middlemap;
         if ((middlemap = __maps_alloc())) {
+          if (middlemap == MAPS_RETRY) {
+            __maps_free(leftmap);
+            goto StartOver;
+          }
           leftmap->addr = map_addr;
           leftmap->size = left;
           leftmap->off = map->off;

@@ -204,6 +215,7 @@ static int __muntrack(char *addr, size_t size, int pagesz,
           *deleted = middlemap;
           __maps_check();
         } else {
+          __maps_free(leftmap);
           rc = -1;
         }
       } else {

@@ -304,12 +316,11 @@ struct Map *__maps_alloc(void) {
   map->flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NOFORK;
   map->hand = sys.maphandle;
   __maps_lock();
-  __maps_insert(map++);
+  __maps_insert(map);
   __maps_unlock();
-  map->addr = MAP_FAILED;
-  for (int i = 1; i < gransz / sizeof(struct Map) - 1; ++i)
+  for (int i = 1; i < gransz / sizeof(struct Map); ++i)
     __maps_free(map + i);
-  return map;
+  return MAPS_RETRY;
 }

 static int __munmap(char *addr, size_t size) {

@@ -396,21 +407,32 @@ void *__maps_pickaddr(size_t size) {
 static void *__mmap_chunk(void *addr, size_t size, int prot, int flags, int fd,
                           int64_t off, int pagesz, int gransz) {

+  // allocate Map object
+  struct Map *map;
+  do {
+    if (!(map = __maps_alloc()))
+      return MAP_FAILED;
+  } while (map == MAPS_RETRY);
+
   // polyfill nuances of fixed mappings
   int sysflags = flags;
   bool noreplace = false;
   bool should_untrack = false;
   if (flags & MAP_FIXED_NOREPLACE) {
-    if (flags & MAP_FIXED)
+    if (flags & MAP_FIXED) {
+      __maps_free(map);
       return (void *)einval();
+    }
     sysflags &= ~MAP_FIXED_NOREPLACE;
     if (IsLinux()) {
       noreplace = true;
       sysflags |= MAP_FIXED_NOREPLACE_linux;
     } else if (IsFreebsd() || IsNetbsd()) {
       sysflags |= MAP_FIXED;
-      if (__maps_overlaps(addr, size, pagesz))
+      if (__maps_overlaps(addr, size, pagesz)) {
+        __maps_free(map);
         return (void *)eexist();
+      }
     } else {
       noreplace = true;
     }

@@ -418,11 +440,6 @@ static void *__mmap_chunk(void *addr, size_t size, int prot, int flags, int fd,
     should_untrack = true;
   }

-  // allocate Map object
-  struct Map *map;
-  if (!(map = __maps_alloc()))
-    return MAP_FAILED;
-
   // remove mapping we blew away
   if (IsWindows() && should_untrack)
     __munmap(addr, size);

@@ -572,23 +589,27 @@ static void *__mremap_impl(char *old_addr, size_t old_size, size_t new_size,
     return (void *)einval();
   }

+  // allocate object for tracking new mapping
+  struct Map *map;
+  do {
+    if (!(map = __maps_alloc()))
+      return (void *)enomem();
+  } while (map == MAPS_RETRY);
+
   // check old interval is fully contained within one mapping
   struct Map *old_map;
   if (!(old_map = __maps_floor(old_addr)) ||
       old_addr + old_size > old_map->addr + PGUP(old_map->size) ||
-      old_addr < old_map->addr)
+      old_addr < old_map->addr) {
+    __maps_free(map);
     return (void *)efault();
+  }

   // save old properties
   int old_off = old_map->off;
   int old_prot = old_map->prot;
   int old_flags = old_map->flags;

-  // allocate object for tracking new mapping
-  struct Map *map;
-  if (!(map = __maps_alloc()))
-    return (void *)enomem();
-
   // netbsd mremap fixed returns enoent rather than unmapping old pages
   if (IsNetbsd() && (flags & MREMAP_FIXED))
     if (__munmap(new_addr, new_size)) {
@@ -75,6 +75,7 @@ int __mprotect(char *addr, size_t size, int prot) {
     return edeadlk();
   }
   struct Map *map, *floor;
+StartOver:
   floor = __maps_floor(addr);
   for (map = floor; map && map->addr <= addr + size; map = __maps_next(map)) {
     char *map_addr = map->addr;

@@ -93,10 +94,12 @@ int __mprotect(char *addr, size_t size, int prot) {
       }
     } else if (addr <= map_addr) {
       // change lefthand side of mapping
-      size_t left = PGUP(addr + size - map_addr);
+      size_t left = addr + size - map_addr;
       size_t right = map_size - left;
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
         if (!__mprotect_chunk(map_addr, left, prot, false)) {
           leftmap->addr = map_addr;
           leftmap->size = left;

@@ -127,6 +130,8 @@ int __mprotect(char *addr, size_t size, int prot) {
       size_t right = map_addr + map_size - addr;
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
         if (!__mprotect_chunk(map_addr + left, right, prot, false)) {
           leftmap->addr = map_addr;
           leftmap->size = left;

@@ -159,8 +164,14 @@ int __mprotect(char *addr, size_t size, int prot) {
       size_t right = map_size - middle - left;
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
         struct Map *midlmap;
         if ((midlmap = __maps_alloc())) {
+          if (midlmap == MAPS_RETRY) {
+            __maps_free(leftmap);
+            goto StartOver;
+          }
           if (!__mprotect_chunk(map_addr + left, middle, prot, false)) {
             leftmap->addr = map_addr;
             leftmap->size = left;
@@ -27,41 +27,47 @@
 #include "libc/runtime/internal.h"
 #include "libc/thread/lock.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu.h"

-static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
-  int me;
-  int backoff = 0;
-  uint64_t word, lock;
-
-  // get current state of lock
-  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
-
-#if PTHREAD_USE_NSYNC
-  // use fancy nsync mutex if possible
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
-      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
-      _weaken(nsync_mu_lock)) {
-    _weaken(nsync_mu_lock)((nsync_mu *)mutex);
-    return 0;
-  }
-#endif
-
-  // implement barebones normal mutexes
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
-    for (;;) {
-      word = MUTEX_UNLOCK(word);
-      lock = MUTEX_LOCK(word);
-      if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
-                                                memory_order_acquire,
-                                                memory_order_relaxed))
-        return 0;
-      backoff = pthread_delay_np(mutex, backoff);
-    }
-  }
-
-  // implement recursive mutexes
-  me = gettid();
+static void pthread_mutex_lock_naive(pthread_mutex_t *mutex, uint64_t word) {
+  int backoff = 0;
+  uint64_t lock;
+  for (;;) {
+    word = MUTEX_UNLOCK(word);
+    lock = MUTEX_LOCK(word);
+    if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
+                                              memory_order_acquire,
+                                              memory_order_relaxed))
+      return;
+    backoff = pthread_delay_np(mutex, backoff);
+  }
+}
+
+// see "take 3" algorithm in "futexes are tricky" by ulrich drepper
+// slightly improved to attempt acquiring multiple times b4 syscall
+static void pthread_mutex_lock_drepper(atomic_int *futex, char pshare) {
+  int word;
+  for (int i = 0; i < 4; ++i) {
+    word = 0;
+    if (atomic_compare_exchange_strong_explicit(
+            futex, &word, 1, memory_order_acquire, memory_order_acquire))
+      return;
+    pthread_pause_np();
+  }
+  if (word == 1)
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  while (word > 0) {
+    _weaken(nsync_futex_wait_)(futex, 2, pshare, 0);
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  }
+}
+
+static errno_t pthread_mutex_lock_recursive(pthread_mutex_t *mutex,
+                                            uint64_t word) {
+  uint64_t lock;
+  int backoff = 0;
+  int me = gettid();
   for (;;) {
     if (MUTEX_OWNER(word) == me) {
       if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {

@@ -91,6 +97,36 @@ static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
     }
   }
 }
+
+static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
+  uint64_t word;
+
+  // get current state of lock
+  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+
+#if PTHREAD_USE_NSYNC
+  // use superior mutexes if possible
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
+      _weaken(nsync_mu_lock)) {
+    _weaken(nsync_mu_lock)((nsync_mu *)mutex);
+    return 0;
+  }
+#endif
+
+  // handle normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wait_)) {
+      pthread_mutex_lock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
+    } else {
+      pthread_mutex_lock_naive(mutex, word);
+    }
+    return 0;
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_lock_recursive(mutex, word);
+}

 /**
  * Locks mutex.
  *
@@ -24,54 +24,33 @@
 #include "libc/runtime/internal.h"
 #include "libc/thread/lock.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu.h"

-/**
- * Attempts acquiring lock.
- *
- * Unlike pthread_mutex_lock() this function won't block and instead
- * returns an error immediately if the lock couldn't be acquired.
- *
- * @return 0 if lock was acquired, otherwise an errno
- * @raise EAGAIN if maximum number of recursive locks is held
- * @raise EBUSY if lock is currently held in read or write mode
- * @raise EINVAL if `mutex` doesn't refer to an initialized lock
- * @raise EDEADLK if `mutex` is `PTHREAD_MUTEX_ERRORCHECK` and the
- *     current thread already holds this mutex
- */
-errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
-  int me;
-  uint64_t word, lock;
-
-  // get current state of lock
-  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
-
-#if PTHREAD_USE_NSYNC
-  // delegate to *NSYNC if possible
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&
-      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
-      _weaken(nsync_mu_trylock)) {
-    if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
-      return 0;
-    } else {
-      return EBUSY;
-    }
-  }
-#endif
-
-  // handle normal mutexes
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
-    word = MUTEX_UNLOCK(word);
-    lock = MUTEX_LOCK(word);
-    if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
-                                              memory_order_acquire,
-                                              memory_order_relaxed))
-      return 0;
-    return EBUSY;
-  }
-
-  // handle recursive and error check mutexes
-  me = gettid();
+static errno_t pthread_mutex_trylock_naive(pthread_mutex_t *mutex,
+                                           uint64_t word) {
+  uint64_t lock;
+  word = MUTEX_UNLOCK(word);
+  lock = MUTEX_LOCK(word);
+  if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
+                                            memory_order_acquire,
+                                            memory_order_relaxed))
+    return 0;
+  return EBUSY;
+}
+
+static errno_t pthread_mutex_trylock_drepper(atomic_int *futex) {
+  int word = 0;
+  if (atomic_compare_exchange_strong_explicit(
+          futex, &word, 1, memory_order_acquire, memory_order_acquire))
+    return 0;
+  return EBUSY;
+}
+
+static errno_t pthread_mutex_trylock_recursive(pthread_mutex_t *mutex,
+                                               uint64_t word) {
+  uint64_t lock;
+  int me = gettid();
   for (;;) {
     if (MUTEX_OWNER(word) == me) {
       if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {

@@ -100,3 +79,47 @@ errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
     return EBUSY;
   }
 }
+
+/**
+ * Attempts acquiring lock.
+ *
+ * Unlike pthread_mutex_lock() this function won't block and instead
+ * returns an error immediately if the lock couldn't be acquired.
+ *
+ * @return 0 if lock was acquired, otherwise an errno
+ * @raise EAGAIN if maximum number of recursive locks is held
+ * @raise EBUSY if lock is currently held in read or write mode
+ * @raise EINVAL if `mutex` doesn't refer to an initialized lock
+ * @raise EDEADLK if `mutex` is `PTHREAD_MUTEX_ERRORCHECK` and the
+ *     current thread already holds this mutex
+ */
+errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
+
+  // get current state of lock
+  uint64_t word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+
+#if PTHREAD_USE_NSYNC
+  // use superior mutexes if possible
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
+      _weaken(nsync_mu_trylock)) {
+    if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
+      return 0;
+    } else {
+      return EBUSY;
+    }
+  }
+#endif
+
+  // handle normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wait_)) {
+      return pthread_mutex_trylock_drepper(&mutex->_futex);
+    } else {
+      return pthread_mutex_trylock_naive(mutex, word);
+    }
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_trylock_recursive(mutex, word);
+}
@@ -25,45 +25,26 @@
 #include "libc/runtime/internal.h"
 #include "libc/thread/lock.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu.h"

-/**
- * Releases mutex.
- *
- * This function does nothing in vfork() children.
- *
- * @return 0 on success or error number on failure
- * @raises EPERM if in error check mode and not owned by caller
- * @vforksafe
- */
-errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
-  int me;
-  uint64_t word, lock;
-
-  LOCKTRACE("pthread_mutex_unlock(%t)", mutex);
-
-  // get current state of lock
-  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
-
-#if PTHREAD_USE_NSYNC
-  // use fancy nsync mutex if possible
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
-      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
-      _weaken(nsync_mu_unlock)) {
-    _weaken(nsync_mu_unlock)((nsync_mu *)mutex);
-    return 0;
-  }
-#endif
-
-  // implement barebones normal mutexes
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
-    lock = MUTEX_UNLOCK(word);
-    atomic_store_explicit(&mutex->_word, lock, memory_order_release);
-    return 0;
-  }
-
-  // implement recursive mutex unlocking
-  me = gettid();
+static void pthread_mutex_unlock_naive(pthread_mutex_t *mutex, uint64_t word) {
+  uint64_t lock = MUTEX_UNLOCK(word);
+  atomic_store_explicit(&mutex->_word, lock, memory_order_release);
+}
+
+// see "take 3" algorithm in "futexes are tricky" by ulrich drepper
+static void pthread_mutex_unlock_drepper(atomic_int *futex, char pshare) {
+  int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
+  if (word == 2) {
+    atomic_store_explicit(futex, 0, memory_order_release);
+    _weaken(nsync_futex_wake_)(futex, 1, pshare);
+  }
+}
+
+static errno_t pthread_mutex_unlock_recursive(pthread_mutex_t *mutex,
+                                              uint64_t word) {
+  int me = gettid();
   for (;;) {

     // we allow unlocking an initialized lock that wasn't locked, but we

@@ -88,3 +69,44 @@ errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
     return 0;
   }
 }
+
+/**
+ * Releases mutex.
+ *
+ * This function does nothing in vfork() children.
+ *
+ * @return 0 on success or error number on failure
+ * @raises EPERM if in error check mode and not owned by caller
+ * @vforksafe
+ */
+errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
+  uint64_t word;
+
+  LOCKTRACE("pthread_mutex_unlock(%t)", mutex);
+
+  // get current state of lock
+  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+
+#if PTHREAD_USE_NSYNC
+  // use superior mutexes if possible
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
+      _weaken(nsync_mu_unlock)) {
+    _weaken(nsync_mu_unlock)((nsync_mu *)mutex);
+    return 0;
+  }
+#endif
+
+  // implement barebones normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wake_)) {
+      pthread_mutex_unlock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
+    } else {
+      pthread_mutex_unlock_naive(mutex, word);
+    }
+    return 0;
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_unlock_recursive(mutex, word);
+}
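Since a normal-mode mutex now keeps its contended state in the `_futex` word, and the pshared flag is passed through to `nsync_futex_wait_`/`nsync_futex_wake_`, a `PTHREAD_PROCESS_SHARED` normal mutex placed in shared memory can coordinate separate processes. A hedged usage sketch using only standard POSIX calls (this example is not part of the diff):

    #include <pthread.h>
    #include <sys/mman.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void) {
      // place the mutex in anonymous shared memory so the child sees it
      pthread_mutex_t *mu =
          mmap(0, sizeof(pthread_mutex_t), PROT_READ | PROT_WRITE,
               MAP_SHARED | MAP_ANONYMOUS, -1, 0);  // error check omitted
      pthread_mutexattr_t attr;
      pthread_mutexattr_init(&attr);
      pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
      pthread_mutex_init(mu, &attr);  // normal (non-recursive) by default
      pthread_mutexattr_destroy(&attr);
      if (!fork()) {
        pthread_mutex_lock(mu);  // contended waits use the shared futex
        pthread_mutex_unlock(mu);
        _exit(0);
      }
      pthread_mutex_lock(mu);
      pthread_mutex_unlock(mu);
      wait(0);
    }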
@@ -32,7 +32,7 @@ void sys_sched_yield(void);
 int pthread_yield_np(void) {
   if (IsXnuSilicon()) {
     __syslib->__pthread_yield_np();
-  } else if (IsOpenbsd() || IsNetbsd()) {
+  } else if (IsOpenbsd()) {
     // sched_yield() is punishingly slow on OpenBSD
     // it's ruinously slow it'll destroy everything
     pthread_pause_np();
@@ -16,9 +16,10 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR            │
 │ PERFORMANCE OF THIS SOFTWARE.                                               │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/errno.h"
+#include "libc/intrin/atomic.h"
 #include "libc/str/str.h"
 #include "libc/thread/thread.h"
-#include "third_party/nsync/counter.h"

 /**
  * Destroys barrier.

@@ -27,9 +28,8 @@
  * @raise EINVAL if threads are still inside the barrier
  */
 errno_t pthread_barrier_destroy(pthread_barrier_t *barrier) {
-  if (barrier->_nsync) {
-    nsync_counter_free(barrier->_nsync);
-    barrier->_nsync = 0;
-  }
+  if (atomic_load_explicit(&barrier->_waiters, memory_order_relaxed))
+    return EINVAL;
+  memset(barrier, -1, sizeof(*barrier));
   return 0;
 }
@@ -17,8 +17,9 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                               │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
+#include "libc/intrin/atomic.h"
+#include "libc/limits.h"
 #include "libc/thread/thread.h"
-#include "third_party/nsync/counter.h"

 /**
  * Initializes barrier.

@@ -28,16 +29,17 @@
  *     before the barrier is released, which must be greater than zero
  * @return 0 on success, or error number on failure
  * @raise EINVAL if `count` isn't greater than zero
- * @raise ENOMEM if insufficient memory exists
  */
 errno_t pthread_barrier_init(pthread_barrier_t *barrier,
                              const pthread_barrierattr_t *attr,
                              unsigned count) {
-  nsync_counter c;
   if (!count)
     return EINVAL;
-  if (!(c = nsync_counter_new(count)))
-    return ENOMEM;
-  *barrier = (pthread_barrier_t){._nsync = c};
+  if (count > INT_MAX)
+    return EINVAL;
+  barrier->_count = count;
+  barrier->_pshared = attr ? *attr : PTHREAD_PROCESS_PRIVATE;
+  atomic_store_explicit(&barrier->_counter, count, memory_order_relaxed);
+  atomic_store_explicit(&barrier->_waiters, 0, memory_order_relaxed);
   return 0;
 }
@@ -16,25 +16,53 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR            │
 │ PERFORMANCE OF THIS SOFTWARE.                                               │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/blockcancel.internal.h"
+#include "libc/errno.h"
+#include "libc/intrin/atomic.h"
+#include "libc/limits.h"
 #include "libc/thread/thread.h"
-#include "third_party/nsync/counter.h"
+#include "third_party/nsync/futex.internal.h"

 /**
  * Waits for all threads to arrive at barrier.
  *
  * When the barrier is broken, the state becomes reset to what it was
  * when pthread_barrier_init() was called, so that the barrier may be
- * used again in the same way. The last thread to arrive shall be the
- * last to leave and it returns a magic value.
+ * used again in the same way.
+ *
+ * Unlike pthread_cond_timedwait() this function is not a cancelation
+ * point. It is not needed to have cleanup handlers on block cancels.
  *
  * @return 0 on success, `PTHREAD_BARRIER_SERIAL_THREAD` to one lucky
  *     thread which was the last arrival, or an errno on error
+ * @raise EINVAL if barrier is used incorrectly
  */
 errno_t pthread_barrier_wait(pthread_barrier_t *barrier) {
-  if (nsync_counter_add(barrier->_nsync, -1)) {
-    nsync_counter_wait(barrier->_nsync, nsync_time_no_deadline);
-    return 0;
-  } else {
+  int n;
+
+  // enter barrier
+  atomic_fetch_add_explicit(&barrier->_waiters, 1, memory_order_acq_rel);
+  n = atomic_fetch_sub_explicit(&barrier->_counter, 1, memory_order_acq_rel);
+  n = n - 1;
+
+  // this can only happen on invalid usage
+  if (n < 0)
+    return EINVAL;
+
+  // reset count and wake waiters if we're last at barrier
+  if (!n) {
+    atomic_store_explicit(&barrier->_counter, barrier->_count,
+                          memory_order_release);
+    atomic_store_explicit(&barrier->_waiters, 0, memory_order_release);
+    nsync_futex_wake_(&barrier->_waiters, INT_MAX, barrier->_pshared);
     return PTHREAD_BARRIER_SERIAL_THREAD;
   }
+
+  // wait for everyone else to arrive at barrier
+  BLOCK_CANCELATION;
+  while ((n = atomic_load_explicit(&barrier->_waiters, memory_order_acquire)))
+    nsync_futex_wait_(&barrier->_waiters, n, barrier->_pshared, 0);
+  ALLOW_CANCELATION;
+
+  return 0;
 }
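With the barrier state now stored inline (`_count`, `_pshared`, `_counter`, `_waiters`) rather than behind an nsync counter handle, a barrier initialized with `PTHREAD_PROCESS_SHARED` can likewise live in shared memory. A minimal usage sketch of the API within one process (standard POSIX calls, not part of the diff):

    #include <pthread.h>
    #include <stdio.h>

    #define N 4

    static pthread_barrier_t barrier;

    static void *worker(void *arg) {
      // all N threads rendezvous here; exactly one gets the magic value
      if (pthread_barrier_wait(&barrier) == PTHREAD_BARRIER_SERIAL_THREAD)
        puts("last arrival");
      return 0;
    }

    int main(void) {
      pthread_t th[N];
      pthread_barrier_init(&barrier, 0, N);  // null attr = process-private
      for (int i = 0; i < N; ++i)
        pthread_create(&th[i], 0, worker, 0);
      for (int i = 0; i < N; ++i)
        pthread_join(th[i], 0);
      pthread_barrier_destroy(&barrier);
    }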
@@ -23,7 +23,7 @@
  *
  * @param pshared is set to one of the following
  *     - `PTHREAD_PROCESS_PRIVATE` (default)
- *     - `PTHREAD_PROCESS_SHARED` (unsupported)
+ *     - `PTHREAD_PROCESS_SHARED`
  * @return 0 on success, or error on failure
  */
 errno_t pthread_barrierattr_getpshared(const pthread_barrierattr_t *attr,
@@ -24,6 +24,6 @@
  * @return 0 on success, or error on failure
  */
 errno_t pthread_barrierattr_init(pthread_barrierattr_t *attr) {
-  *attr = 0;
+  *attr = PTHREAD_PROCESS_PRIVATE;
   return 0;
 }
@@ -24,13 +24,14 @@
  *
  * @param pshared can be one of
  *     - `PTHREAD_PROCESS_PRIVATE` (default)
- *     - `PTHREAD_PROCESS_SHARED` (unsupported)
+ *     - `PTHREAD_PROCESS_SHARED`
  * @return 0 on success, or error on failure
  * @raises EINVAL if `pshared` is invalid
  */
 errno_t pthread_barrierattr_setpshared(pthread_barrierattr_t *attr,
                                        int pshared) {
   switch (pshared) {
+    case PTHREAD_PROCESS_SHARED:
     case PTHREAD_PROCESS_PRIVATE:
       *attr = pshared;
       return 0;
@@ -46,7 +46,7 @@ COSMOPOLITAN_C_START_
 #define PTHREAD_RWLOCK_INITIALIZER {0}
 #define PTHREAD_MUTEX_INITIALIZER {0}

-#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, 0, PTHREAD_MUTEX_RECURSIVE}
+#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, {}, PTHREAD_MUTEX_RECURSIVE}

 typedef uintptr_t pthread_t;
 typedef int pthread_id_np_t;

@@ -66,7 +66,10 @@ typedef struct pthread_spinlock_s {

 typedef struct pthread_mutex_s {
   uint32_t _nsync;
-  int32_t _pid;
+  union {
+    int32_t _pid;
+    _Atomic(int32_t) _futex;
+  };
   _Atomic(uint64_t) _word;
 } pthread_mutex_t;

@@ -92,7 +95,10 @@ typedef struct pthread_rwlock_s {
 } pthread_rwlock_t;

 typedef struct pthread_barrier_s {
-  void *_nsync;
+  int _count;
+  char _pshared;
+  _Atomic(int) _counter;
+  _Atomic(int) _waiters;
 } pthread_barrier_t;

 typedef struct pthread_attr_s {
test/libc/thread/footek_test.c (new file, 236 lines)
@@ -0,0 +1,236 @@
#include <assert.h>
#include <cosmo.h>
#include <linux/futex.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>
#include "third_party/nsync/futex.internal.h"

// THIS IS AN EXAMPLE OF HOW TO USE COSMOPOLITAN FUTEXES TO IMPLEMENT
// YOUR OWN MUTEXES FROM SCRATCH. LOOK AT HOW MUCH BETTER IT CAN
// MAKE THINGS COMPARED TO SPIN LOCKS. ALGORITHM FROM ULRICH DREPPER.

// arm fleet
// with futexes
// 30 threads / 100000 iterations
//
//       242,604 us real
//     4,222,946 us user
//     1,079,229 us sys
// footek_test on studio.test.          630 µs   17'415 µs   256'782 µs
//     1,362,557 us real
//     3,232,978 us user
//     2,104,824 us sys
// footek_test on pi.test.              611 µs   21'708 µs 1'385'129 µs
//     1,346,482 us real
//     3,370,513 us user
//     1,992,383 us sys
// footek_test on freebsdarm.test.      427 µs   19'967 µs 1'393'476 µs

// arm fleet
// without futexes
// 30 threads / 100000 iterations
//
//     1,282,084 us real
//    29,359,582 us user
//        34,553 us sys
// footek_test on studio.test.          961 µs   12'907 µs 1'287'983 µs
//     4,070,988 us real
//    16,203,990 us user
//         7,999 us sys
// footek_test on pi.test.              459 µs   16'376 µs 4'095'512 µs
//     7,012,493 us real
//    27,936,725 us user
//         7,871 us sys
// footek_test on freebsdarm.test.      502 µs   16'446 µs 7'051'545 µs

// x86 fleet
// with futexes
// 30 threads / 100000 iterations
//
//       146,015 us real
//       169,427 us user
//        68,939 us sys
// footek_test on rhel7.test.           376 µs    2'259 µs   153'024 µs
//       144,917 us real
//       383,317 us user
//       191,203 us sys
// footek_test on xnu.test.          11'143 µs    9'159 µs   164'865 µs
//       244,286 us real
//       405,395 us user
//       956,122 us sys
// footek_test on freebsd.test.         394 µs    2'165 µs   256'227 µs
//       209,095 us real
//       616,634 us user
//         9,945 us sys
// footek_test on netbsd.test.          502 µs    2'020 µs   261'895 µs
//       344,876 us real
//        50,000 us user
//     1,240,000 us sys
// footek_test on openbsd.test.         457 µs    2'737 µs   396'342 µs
//     1,193,906 us real
//    17,546,875 us user
//     3,000,000 us sys
// footek_test on win10.test.           462 µs   59'528 µs 1'348'265 µs

// x86 fleet
// without futexes
// 30 threads / 100000 iterations
//
//       897,815 us real
//     1,763,705 us user
//         9,696 us sys
// footek_test on rhel7.test.           423 µs    2'638 µs   912'241 µs
//       790,332 us real
//     2,359,967 us user
//             0 us sys
// footek_test on netbsd.test.        1'151 µs    2'634 µs 1'014'867 µs
//     2,332,724 us real
//     9,150,000 us user
//        10,000 us sys
// footek_test on openbsd.test.         557 µs    3'020 µs 2'554'648 µs
//     2,528,863 us real
//    56,546,875 us user
//     1,671,875 us sys
// footek_test on win10.test.           962 µs    9'698 µs 2'751'905 µs
//     2,916,033 us real
//    17,236,103 us user
//             0 us sys
// footek_test on freebsd.test.         690 µs    3'011 µs 2'925'997 µs
//     4,225,726 us real
//    16,679,456 us user
//        16,265 us sys
// footek_test on xnu.test.          98'468 µs    5'242 µs 5'191'724 µs

#define USE_FUTEX 1
#define THREADS 30
#define ITERATIONS 30000

#define MUTEX_LOCKED(word)      ((word) & 8)
#define MUTEX_WAITING(word)     ((word) & 16)

#define MUTEX_LOCK(word)        ((word) | 8)
#define MUTEX_SET_WAITING(word) ((word) | 16)
#define MUTEX_UNLOCK(word)      ((word) & ~(8 | 16))

void lock(atomic_int *futex) {
  int word, cs;
  for (int i = 0; i < 4; ++i) {
    word = 0;
    if (atomic_compare_exchange_strong_explicit(
            futex, &word, 1, memory_order_acquire, memory_order_acquire))
      return;
    pthread_pause_np();
  }
  if (word == 1)
    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
  while (word > 0) {
    pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
#if USE_FUTEX
    nsync_futex_wait_(futex, 2, 0, 0);
#endif
    pthread_setcancelstate(cs, 0);
    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
  }
}

void unlock(atomic_int *futex) {
  int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
  if (word == 2) {
    atomic_store_explicit(futex, 0, memory_order_release);
#if USE_FUTEX
    nsync_futex_wake_(futex, 1, 0);
#endif
  }
}

int g_chores;
atomic_int g_lock;
pthread_mutex_t g_locker;

void *worker(void *arg) {
  for (int i = 0; i < ITERATIONS; ++i) {
    lock(&g_lock);
    ++g_chores;
    unlock(&g_lock);
  }
  return 0;
}

int main() {
  struct timeval start;
  gettimeofday(&start, 0);

  pthread_t th[THREADS];
  for (int i = 0; i < THREADS; ++i)
    pthread_create(&th[i], 0, worker, 0);
  for (int i = 0; i < THREADS; ++i)
    pthread_join(th[i], 0);
  npassert(g_chores == THREADS * ITERATIONS);

  struct rusage ru;
  struct timeval end;
  gettimeofday(&end, 0);
  getrusage(RUSAGE_SELF, &ru);
  printf("%,16ld us real\n"
         "%,16ld us user\n"
         "%,16ld us sys\n",
         timeval_tomicros(timeval_sub(end, start)),  //
         timeval_tomicros(ru.ru_utime),              //
         timeval_tomicros(ru.ru_stime));

  CheckForMemoryLeaks();
}

// COMPARE ULRICH DREPPER'S LOCKING ALGORITHM WITH MIKE BURROWS *NSYNC
// WHICH IS WHAT COSMOPOLITAN LIBC USES FOR YOUR POSIX THREADS MUTEXES

// x86 fleet
// with pthread_mutex_t
// 30 threads / 100000 iterations
//
//       186,976 us real
//        43,609 us user
//       205,585 us sys
// footek_test on freebsd.test.         410 µs    2'054 µs   195'339 µs
//       238,902 us real
//       235,743 us user
//        97,881 us sys
// footek_test on rhel7.test.           343 µs    2'339 µs   246'926 µs
//       201,285 us real
//       249,612 us user
//       141,230 us sys
// footek_test on xnu.test.           1'960 µs    5'350 µs   265'758 µs
//       303,363 us real
//        60,000 us user
//       410,000 us sys
// footek_test on openbsd.test.         545 µs    3'023 µs   326'200 µs
//       386,085 us real
//       586,455 us user
//       466,991 us sys
// footek_test on netbsd.test.          344 µs    2'421 µs   413'440 µs
//       245,010 us real
//       437,500 us user
//       140,625 us sys
// footek_test on win10.test.           300 µs   18'574 µs   441'225 µs

// arm fleet
// with pthread_mutex_t
// 30 threads / 100000 iterations
//
//        87,132 us real
//       183,517 us user
//        20,020 us sys
// footek_test on studio.test.          560 µs   12'418 µs    92'825 µs
//       679,374 us real
//       957,678 us user
//       605,078 us sys
// footek_test on pi.test.              462 µs   16'574 µs   702'833 µs
//       902,343 us real
//     1,459,706 us user
//       781,140 us sys
// footek_test on freebsdarm.test.      400 µs   16'261 µs   970'022 µs
third_party/nsync/common.c (vendored, 7 changes)
@@ -37,6 +37,7 @@
 #include "third_party/nsync/atomic.internal.h"
 #include "third_party/nsync/common.internal.h"
 #include "third_party/nsync/mu_semaphore.h"
+#include "third_party/nsync/mu_semaphore.internal.h"
 #include "third_party/nsync/wait_s.internal.h"
 __static_yoink("nsync_notice");

@@ -147,9 +148,9 @@ static void free_waiters_push (waiter *w) {

 static void free_waiters_populate (void) {
 	int n;
-	if (IsNetbsd () || IsXnuSilicon ()) {
-		// netbsd needs one file descriptor per semaphore (!!)
-		// tim cook wants us to use his grand central dispatch
+	if (IsNetbsd () || (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ())) {
+		// netbsd needs a real file descriptor per semaphore
+		// tim cook wants us to use his lol central dispatch
 		n = 1;
 	} else {
 		n = getpagesize() / sizeof(waiter);
third_party/nsync/futex.c (vendored, 5 changes)
@@ -52,6 +52,7 @@
 #include "third_party/nsync/atomic.h"
 #include "third_party/nsync/common.internal.h"
 #include "third_party/nsync/futex.internal.h"
+#include "libc/intrin/kprintf.h"
 #include "third_party/nsync/time.h"

 #define FUTEX_WAIT_BITS_ FUTEX_BITSET_MATCH_ANY

@@ -138,7 +139,7 @@ static int nsync_futex_polyfill_ (atomic_int *w, int expect, struct timespec *abstime) {
 	}
 	if (_weaken (pthread_testcancel_np) &&
 	    _weaken (pthread_testcancel_np) ()) {
-		return -ETIMEDOUT;
+		return -ECANCELED;
 	}
 	if (abstime && timespec_cmp (timespec_real (), *abstime) >= 0) {
 		return -ETIMEDOUT;

@@ -163,7 +164,7 @@ static int nsync_futex_wait_win32_ (atomic_int *w, int expect, char pshare,

 	for (;;) {
 		now = timespec_real ();
-		if (timespec_cmp (now, deadline) > 0) {
+		if (timespec_cmp (now, deadline) >= 0) {
 			return etimedout();
 		}
 		wait = timespec_sub (deadline, now);
third_party/nsync/mu_semaphore.c (vendored, 13 changes)
@@ -21,14 +21,9 @@
 #include "third_party/nsync/mu_semaphore.internal.h"
 __static_yoink("nsync_notice");

-/* Apple's ulock (part by Cosmo futexes) is an internal API, but:
-   1. Unlike GCD it's cancellable, i.e. can be EINTR'd by signals
-   2. We currently always use ulock anyway for joining threads */
-#define PREFER_GCD_OVER_ULOCK 1
-
 /* Initialize *s; the initial value is 0. */
 bool nsync_mu_semaphore_init (nsync_semaphore *s) {
-	if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+	if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
 		return nsync_mu_semaphore_init_gcd (s);
 	} else if (IsNetbsd ()) {
 		return nsync_mu_semaphore_init_sem (s);

@@ -44,7 +39,7 @@ bool nsync_mu_semaphore_init (nsync_semaphore *s) {
 errno_t nsync_mu_semaphore_p (nsync_semaphore *s) {
 	errno_t err;
 	BEGIN_CANCELATION_POINT;
-	if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+	if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
 		err = nsync_mu_semaphore_p_gcd (s);
 	} else if (IsNetbsd ()) {
 		err = nsync_mu_semaphore_p_sem (s);

@@ -62,7 +57,7 @@ errno_t nsync_mu_semaphore_p (nsync_semaphore *s) {
 errno_t nsync_mu_semaphore_p_with_deadline (nsync_semaphore *s, nsync_time abs_deadline) {
 	errno_t err;
 	BEGIN_CANCELATION_POINT;
-	if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+	if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
 		err = nsync_mu_semaphore_p_with_deadline_gcd (s, abs_deadline);
 	} else if (IsNetbsd ()) {
 		err = nsync_mu_semaphore_p_with_deadline_sem (s, abs_deadline);

@@ -75,7 +70,7 @@ errno_t nsync_mu_semaphore_p_with_deadline (nsync_semaphore *s, nsync_time abs_deadline) {

 /* Ensure that the count of *s is at least 1. */
 void nsync_mu_semaphore_v (nsync_semaphore *s) {
-	if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+	if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
 		return nsync_mu_semaphore_v_gcd (s);
 	} else if (IsNetbsd ()) {
 		return nsync_mu_semaphore_v_sem (s);
third_party/nsync/mu_semaphore.internal.h (vendored, 14 changes)
@@ -4,6 +4,20 @@
 #include "third_party/nsync/time.h"
 COSMOPOLITAN_C_START_

+/* XNU ulock (used by cosmo futexes) is an internal API, however:
+
+   1. Unlike GCD it's cancelable i.e. can be EINTR'd by signals
+   2. We have no choice but to use ulock for joining threads
+   3. Grand Central Dispatch requires a busy loop workaround
+   4. ulock makes our mutexes use 20% more system time (meh)
+   5. ulock makes our mutexes use 40% less wall time (good)
+   6. ulock makes our mutexes use 64% less user time (woop)
+
+   ulock is an outstanding system call that must be used.
+   gcd is not an acceptable alternative to ulock. */
+
+#define NSYNC_USE_GRAND_CENTRAL 0
+
 bool nsync_mu_semaphore_init_futex(nsync_semaphore *);
 errno_t nsync_mu_semaphore_p_futex(nsync_semaphore *);
 errno_t nsync_mu_semaphore_p_with_deadline_futex(nsync_semaphore *, nsync_time);