From e398f3887ce4933adfcb8bb7729ea6eb0d39dbb5 Mon Sep 17 00:00:00 2001
From: Justine Tunney
Date: Wed, 24 Jul 2024 01:05:00 -0700
Subject: [PATCH] Make more improvements to threads and mappings

- NetBSD should now have faster synchronization
- POSIX barriers may now be shared across processes
- An edge case with memory map tracking has been fixed
- Grand Central Dispatch is no longer used on MacOS ARM64
- POSIX mutexes in normal mode now use futexes across processes
---
 libc/intrin/cp.c                             |  12 +-
 libc/intrin/maps.h                           |   2 +
 libc/intrin/mmap.c                           |  55 +++--
 libc/intrin/mprotect.c                       |  13 +-
 libc/intrin/pthread_mutex_lock.c             |  92 +++++---
 libc/intrin/pthread_mutex_trylock.c          | 111 +++++----
 libc/intrin/pthread_mutex_unlock.c           |  90 ++++---
 libc/intrin/pthread_yield_np.c               |   2 +-
 libc/thread/pthread_barrier_destroy.c        |  10 +-
 libc/thread/pthread_barrier_init.c           |  14 +-
 libc/thread/pthread_barrier_wait.c           |  42 +++-
 libc/thread/pthread_barrierattr_getpshared.c |   2 +-
 libc/thread/pthread_barrierattr_init.c       |   2 +-
 libc/thread/pthread_barrierattr_setpshared.c |   3 +-
 libc/thread/thread.h                         |  12 +-
 test/libc/thread/footek_test.c               | 236 +++++++++++++++++++
 third_party/nsync/common.c                   |   7 +-
 third_party/nsync/futex.c                    |   5 +-
 third_party/nsync/mu_semaphore.c             |  13 +-
 third_party/nsync/mu_semaphore.internal.h    |  14 ++
 20 files changed, 566 insertions(+), 171 deletions(-)
 create mode 100644 test/libc/thread/footek_test.c

diff --git a/libc/intrin/cp.c b/libc/intrin/cp.c
index 411226157..5f4061033 100644
--- a/libc/intrin/cp.c
+++ b/libc/intrin/cp.c
@@ -26,11 +26,9 @@
 int begin_cancelation_point(void) {
   int state = 0;
-  struct CosmoTib *tib;
-  struct PosixThread *pt;
   if (__tls_enabled) {
-    tib = __get_tls();
-    if ((pt = (struct PosixThread *)tib->tib_pthread)) {
+    struct PosixThread *pt;
+    if ((pt = _pthread_self())) {
       state = pt->pt_flags & PT_INCANCEL;
       pt->pt_flags |= PT_INCANCEL;
     }
@@ -39,11 +37,9 @@ int begin_cancelation_point(void) {
 }
 
 void end_cancelation_point(int state) {
-  struct CosmoTib *tib;
-  struct PosixThread *pt;
   if (__tls_enabled) {
-    tib = __get_tls();
-    if ((pt = (struct PosixThread *)tib->tib_pthread)) {
+    struct PosixThread *pt;
+    if ((pt = _pthread_self())) {
       pt->pt_flags &= ~PT_INCANCEL;
       pt->pt_flags |= state;
     }
diff --git a/libc/intrin/maps.h b/libc/intrin/maps.h
index a438d1221..3a30c752a 100644
--- a/libc/intrin/maps.h
+++ b/libc/intrin/maps.h
@@ -6,6 +6,8 @@
 #include "libc/thread/tls2.internal.h"
 COSMOPOLITAN_C_START_
 
+#define MAPS_RETRY ((void *)-1)
+
 #define MAP_TREE_CONTAINER(e) TREE_CONTAINER(struct Map, tree, e)
 
 struct Map {
diff --git a/libc/intrin/mmap.c b/libc/intrin/mmap.c
index cc342b73f..65cccc769 100644
--- a/libc/intrin/mmap.c
+++ b/libc/intrin/mmap.c
@@ -120,6 +120,7 @@ static int __muntrack(char *addr, size_t size, int pagesz,
   struct Map *map;
   struct Map *next;
   struct Map *floor;
+StartOver:
   floor = __maps_floor(addr);
   for (map = floor; map && map->addr <= addr + size; map = next) {
     next = __maps_next(map);
@@ -148,6 +149,8 @@ static int __muntrack(char *addr, size_t size, int pagesz,
         ASSERT(left > 0);
         struct Map *leftmap;
         if ((leftmap = __maps_alloc())) {
+          if (leftmap == MAPS_RETRY)
+            goto StartOver;
           map->addr += left;
           map->size = right;
           if (!(map->flags & MAP_ANONYMOUS))
@@ -167,6 +170,8 @@ static int __muntrack(char *addr, size_t size, int pagesz,
         size_t right = map_addr + map_size - addr;
         struct Map *rightmap;
         if ((rightmap = __maps_alloc())) {
+          if (rightmap == MAPS_RETRY)
+            goto StartOver;
           map->size = left;
           __maps.pages -= (right + pagesz - 1) / pagesz;
           rightmap->addr = addr;
@@ -184,8 +189,14 @@ static int __muntrack(char *addr, size_t size, int pagesz,
         size_t right = map_size - middle - left;
         struct Map *leftmap;
         if ((leftmap = __maps_alloc())) {
+          if (leftmap == MAPS_RETRY)
+            goto StartOver;
           struct Map *middlemap;
           if ((middlemap = __maps_alloc())) {
+            if (middlemap == MAPS_RETRY) {
+              __maps_free(leftmap);
+              goto StartOver;
+            }
             leftmap->addr = map_addr;
             leftmap->size = left;
             leftmap->off = map->off;
@@ -204,6 +215,7 @@ static int __muntrack(char *addr, size_t size, int pagesz,
             *deleted = middlemap;
             __maps_check();
           } else {
+            __maps_free(leftmap);
             rc = -1;
           }
         } else {
@@ -304,12 +316,11 @@ struct Map *__maps_alloc(void) {
   map->flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NOFORK;
   map->hand = sys.maphandle;
   __maps_lock();
-  __maps_insert(map++);
+  __maps_insert(map);
   __maps_unlock();
-  map->addr = MAP_FAILED;
-  for (int i = 1; i < gransz / sizeof(struct Map) - 1; ++i)
+  for (int i = 1; i < gransz / sizeof(struct Map); ++i)
     __maps_free(map + i);
-  return map;
+  return MAPS_RETRY;
 }
 
 static int __munmap(char *addr, size_t size) {
@@ -396,21 +407,32 @@ void *__maps_pickaddr(size_t size) {
 
 static void *__mmap_chunk(void *addr, size_t size, int prot, int flags, int fd,
                           int64_t off, int pagesz, int gransz) {
+  // allocate Map object
+  struct Map *map;
+  do {
+    if (!(map = __maps_alloc()))
+      return MAP_FAILED;
+  } while (map == MAPS_RETRY);
+
   // polyfill nuances of fixed mappings
   int sysflags = flags;
   bool noreplace = false;
   bool should_untrack = false;
   if (flags & MAP_FIXED_NOREPLACE) {
-    if (flags & MAP_FIXED)
+    if (flags & MAP_FIXED) {
+      __maps_free(map);
       return (void *)einval();
+    }
     sysflags &= ~MAP_FIXED_NOREPLACE;
     if (IsLinux()) {
       noreplace = true;
       sysflags |= MAP_FIXED_NOREPLACE_linux;
     } else if (IsFreebsd() || IsNetbsd()) {
       sysflags |= MAP_FIXED;
-      if (__maps_overlaps(addr, size, pagesz))
+      if (__maps_overlaps(addr, size, pagesz)) {
+        __maps_free(map);
         return (void *)eexist();
+      }
     } else {
       noreplace = true;
     }
@@ -418,11 +440,6 @@ static void *__mmap_chunk(void *addr, size_t size, int prot, int flags, int fd,
     should_untrack = true;
   }
 
-  // allocate Map object
-  struct Map *map;
-  if (!(map = __maps_alloc()))
-    return MAP_FAILED;
-
   // remove mapping we blew away
   if (IsWindows() && should_untrack)
     __munmap(addr, size);
@@ -572,23 +589,27 @@ static void *__mremap_impl(char *old_addr, size_t old_size, size_t new_size,
       return (void *)einval();
   }
 
+  // allocate object for tracking new mapping
+  struct Map *map;
+  do {
+    if (!(map = __maps_alloc()))
+      return (void *)enomem();
+  } while (map == MAPS_RETRY);
+
   // check old interval is fully contained within one mapping
   struct Map *old_map;
   if (!(old_map = __maps_floor(old_addr)) ||
       old_addr + old_size > old_map->addr + PGUP(old_map->size) ||
-      old_addr < old_map->addr)
+      old_addr < old_map->addr) {
+    __maps_free(map);
     return (void *)efault();
+  }
 
   // save old properties
   int old_off = old_map->off;
   int old_prot = old_map->prot;
   int old_flags = old_map->flags;
 
-  // allocate object for tracking new mapping
-  struct Map *map;
-  if (!(map = __maps_alloc()))
-    return (void *)enomem();
-
   // netbsd mremap fixed returns enoent rather than unmapping old pages
   if (IsNetbsd() && (flags & MREMAP_FIXED))
     if (__munmap(new_addr, new_size)) {
diff --git a/libc/intrin/mprotect.c b/libc/intrin/mprotect.c
index cc8a23ee9..784906acc 100644
--- a/libc/intrin/mprotect.c
+++ b/libc/intrin/mprotect.c
@@ -75,6 +75,7 @@ int __mprotect(char *addr, size_t size, int prot) {
       return edeadlk();
   }
   struct Map *map, *floor;
+StartOver:
   floor = __maps_floor(addr);
   for (map = floor; map && map->addr <= addr + size; map = __maps_next(map)) {
     char *map_addr = map->addr;
@@ -93,10 +94,12 @@ int __mprotect(char *addr, size_t size, int prot) {
       }
     } else if (addr <= map_addr) {
       // change lefthand side of mapping
-      size_t left = PGUP(addr + size - map_addr);
+      size_t left = addr + size - map_addr;
       size_t right = map_size - left;
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
         if (!__mprotect_chunk(map_addr, left, prot, false)) {
           leftmap->addr = map_addr;
           leftmap->size = left;
@@ -127,6 +130,8 @@ int __mprotect(char *addr, size_t size, int prot) {
       size_t right = map_addr + map_size - addr;
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
         if (!__mprotect_chunk(map_addr + left, right, prot, false)) {
           leftmap->addr = map_addr;
           leftmap->size = left;
@@ -159,8 +164,14 @@ int __mprotect(char *addr, size_t size, int prot) {
       size_t right = map_size - middle - left;
       struct Map *leftmap;
       if ((leftmap = __maps_alloc())) {
+        if (leftmap == MAPS_RETRY)
+          goto StartOver;
         struct Map *midlmap;
         if ((midlmap = __maps_alloc())) {
+          if (midlmap == MAPS_RETRY) {
+            __maps_free(leftmap);
+            goto StartOver;
+          }
           if (!__mprotect_chunk(map_addr + left, middle, prot, false)) {
             leftmap->addr = map_addr;
             leftmap->size = left;
diff --git a/libc/intrin/pthread_mutex_lock.c b/libc/intrin/pthread_mutex_lock.c
index 0b96949de..b293a6b75 100644
--- a/libc/intrin/pthread_mutex_lock.c
+++ b/libc/intrin/pthread_mutex_lock.c
@@ -27,41 +27,47 @@
 #include "libc/runtime/internal.h"
 #include "libc/thread/lock.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu.h"
 
-static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
-  int me;
+static void pthread_mutex_lock_naive(pthread_mutex_t *mutex, uint64_t word) {
   int backoff = 0;
-  uint64_t word, lock;
-
-  // get current state of lock
-  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
-
-#if PTHREAD_USE_NSYNC
-  // use fancy nsync mutex if possible
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
-      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
-      _weaken(nsync_mu_lock)) {
-    _weaken(nsync_mu_lock)((nsync_mu *)mutex);
-    return 0;
+  uint64_t lock;
+  for (;;) {
+    word = MUTEX_UNLOCK(word);
+    lock = MUTEX_LOCK(word);
+    if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
+                                              memory_order_acquire,
+                                              memory_order_relaxed))
+      return;
+    backoff = pthread_delay_np(mutex, backoff);
   }
-#endif
+}
 
-  // implement barebones normal mutexes
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
-    for (;;) {
-      word = MUTEX_UNLOCK(word);
-      lock = MUTEX_LOCK(word);
-      if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
-                                                memory_order_acquire,
-                                                memory_order_relaxed))
-        return 0;
-      backoff = pthread_delay_np(mutex, backoff);
-    }
+
+// see "take 3" algorithm in "futexes are tricky" by ulrich drepper
+// slightly improved to attempt acquiring multiple times before syscall
+static void pthread_mutex_lock_drepper(atomic_int *futex, char pshare) {
+  int word;
+  for (int i = 0; i < 4; ++i) {
+    word = 0;
+    if (atomic_compare_exchange_strong_explicit(
+            futex, &word, 1, memory_order_acquire, memory_order_acquire))
+      return;
+    pthread_pause_np();
   }
+  if (word == 1)
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  while (word > 0) {
+    _weaken(nsync_futex_wait_)(futex, 2, pshare, 0);
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  }
+}
-  // implement recursive mutexes
-  me = gettid();
+static errno_t pthread_mutex_lock_recursive(pthread_mutex_t *mutex,
+                                            uint64_t word) {
+  uint64_t lock;
+  int backoff = 0;
+  int me = gettid();
   for (;;) {
     if (MUTEX_OWNER(word) == me) {
       if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
@@ -91,6 +97,36 @@ static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
   }
 }
 
+static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
+  uint64_t word;
+
+  // get current state of lock
+  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+
+#if PTHREAD_USE_NSYNC
+  // use superior mutexes if possible
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
+      _weaken(nsync_mu_lock)) {
+    _weaken(nsync_mu_lock)((nsync_mu *)mutex);
+    return 0;
+  }
+#endif
+
+  // handle normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wait_)) {
+      pthread_mutex_lock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
+    } else {
+      pthread_mutex_lock_naive(mutex, word);
+    }
+    return 0;
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_lock_recursive(mutex, word);
+}
+
 /**
  * Locks mutex.
  *
diff --git a/libc/intrin/pthread_mutex_trylock.c b/libc/intrin/pthread_mutex_trylock.c
index a793eed34..5fd06a078 100644
--- a/libc/intrin/pthread_mutex_trylock.c
+++ b/libc/intrin/pthread_mutex_trylock.c
@@ -24,54 +24,33 @@
 #include "libc/runtime/internal.h"
 #include "libc/thread/lock.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu.h"
 
-/**
- * Attempts acquiring lock.
- *
- * Unlike pthread_mutex_lock() this function won't block and instead
- * returns an error immediately if the lock couldn't be acquired.
- *
- * @return 0 if lock was acquired, otherwise an errno
- * @raise EAGAIN if maximum number of recursive locks is held
- * @raise EBUSY if lock is currently held in read or write mode
- * @raise EINVAL if `mutex` doesn't refer to an initialized lock
- * @raise EDEADLK if `mutex` is `PTHREAD_MUTEX_ERRORCHECK` and the
- *     current thread already holds this mutex
- */
-errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
-  int me;
-  uint64_t word, lock;
+static errno_t pthread_mutex_trylock_naive(pthread_mutex_t *mutex,
+                                           uint64_t word) {
+  uint64_t lock;
+  word = MUTEX_UNLOCK(word);
+  lock = MUTEX_LOCK(word);
+  if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
+                                            memory_order_acquire,
+                                            memory_order_relaxed))
+    return 0;
+  return EBUSY;
+}
 
-  // get current state of lock
-  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+static errno_t pthread_mutex_trylock_drepper(atomic_int *futex) {
+  int word = 0;
+  if (atomic_compare_exchange_strong_explicit(
+          futex, &word, 1, memory_order_acquire, memory_order_acquire))
+    return 0;
+  return EBUSY;
+}
 
-#if PTHREAD_USE_NSYNC
-  // delegate to *NSYNC if possible
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&
-      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
-      _weaken(nsync_mu_trylock)) {
-    if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
-      return 0;
-    } else {
-      return EBUSY;
-    }
-  }
-#endif
-
-  // handle normal mutexes
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
-    word = MUTEX_UNLOCK(word);
-    lock = MUTEX_LOCK(word);
-    if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
-                                              memory_order_acquire,
-                                              memory_order_relaxed))
-      return 0;
-    return EBUSY;
-  }
-
-  // handle recursive and error check mutexes
-  me = gettid();
+static errno_t pthread_mutex_trylock_recursive(pthread_mutex_t *mutex,
+                                               uint64_t word) {
+  uint64_t lock;
+  int me = gettid();
   for (;;) {
     if (MUTEX_OWNER(word) == me) {
       if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
@@ -100,3 +79,47 @@ errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
     return EBUSY;
   }
 }
+
+/**
+ * Attempts acquiring lock.
+ *
+ * Unlike pthread_mutex_lock() this function won't block and instead
+ * returns an error immediately if the lock couldn't be acquired.
+ *
+ * @return 0 if lock was acquired, otherwise an errno
+ * @raise EAGAIN if maximum number of recursive locks is held
+ * @raise EBUSY if lock is currently held in read or write mode
+ * @raise EINVAL if `mutex` doesn't refer to an initialized lock
+ * @raise EDEADLK if `mutex` is `PTHREAD_MUTEX_ERRORCHECK` and the
+ *     current thread already holds this mutex
+ */
+errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
+
+  // get current state of lock
+  uint64_t word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+
+#if PTHREAD_USE_NSYNC
+  // use superior mutexes if possible
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
+      _weaken(nsync_mu_trylock)) {
+    if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
+      return 0;
+    } else {
+      return EBUSY;
+    }
+  }
+#endif
+
+  // handle normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wait_)) {
+      return pthread_mutex_trylock_drepper(&mutex->_futex);
+    } else {
+      return pthread_mutex_trylock_naive(mutex, word);
+    }
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_trylock_recursive(mutex, word);
+}
diff --git a/libc/intrin/pthread_mutex_unlock.c b/libc/intrin/pthread_mutex_unlock.c
index 4d796d9a4..fcb549dcb 100644
--- a/libc/intrin/pthread_mutex_unlock.c
+++ b/libc/intrin/pthread_mutex_unlock.c
@@ -25,45 +25,26 @@
 #include "libc/runtime/internal.h"
 #include "libc/thread/lock.h"
 #include "libc/thread/thread.h"
+#include "third_party/nsync/futex.internal.h"
 #include "third_party/nsync/mu.h"
 
-/**
- * Releases mutex.
- *
- * This function does nothing in vfork() children.
- *
- * @return 0 on success or error number on failure
- * @raises EPERM if in error check mode and not owned by caller
- * @vforksafe
- */
-errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
-  int me;
-  uint64_t word, lock;
+static void pthread_mutex_unlock_naive(pthread_mutex_t *mutex, uint64_t word) {
+  uint64_t lock = MUTEX_UNLOCK(word);
+  atomic_store_explicit(&mutex->_word, lock, memory_order_release);
+}
 
-  LOCKTRACE("pthread_mutex_unlock(%t)", mutex);
-
-  // get current state of lock
-  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
-
-#if PTHREAD_USE_NSYNC
-  // use fancy nsync mutex if possible
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
-      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
-      _weaken(nsync_mu_unlock)) {
-    _weaken(nsync_mu_unlock)((nsync_mu *)mutex);
-    return 0;
+// see "take 3" algorithm in "futexes are tricky" by ulrich drepper
+static void pthread_mutex_unlock_drepper(atomic_int *futex, char pshare) {
+  int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
+  if (word == 2) {
+    atomic_store_explicit(futex, 0, memory_order_release);
+    _weaken(nsync_futex_wake_)(futex, 1, pshare);
   }
-#endif
+}
 
-  // implement barebones normal mutexes
-  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
-    lock = MUTEX_UNLOCK(word);
-    atomic_store_explicit(&mutex->_word, lock, memory_order_release);
-    return 0;
-  }
-
-  // implement recursive mutex unlocking
-  me = gettid();
+static errno_t pthread_mutex_unlock_recursive(pthread_mutex_t *mutex,
+                                              uint64_t word) {
+  int me = gettid();
   for (;;) {
 
     // we allow unlocking an initialized lock that wasn't locked, but we
@@ -88,3 +69,44 @@ errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
     return 0;
   }
 }
+
+/**
+ * Releases mutex.
+ *
+ * This function does nothing in vfork() children.
+ *
+ * @return 0 on success or error number on failure
+ * @raises EPERM if in error check mode and not owned by caller
+ * @vforksafe
+ */
+errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
+  uint64_t word;
+
+  LOCKTRACE("pthread_mutex_unlock(%t)", mutex);
+
+  // get current state of lock
+  word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
+
+#if PTHREAD_USE_NSYNC
+  // use superior mutexes if possible
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&        //
+      MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE &&  //
+      _weaken(nsync_mu_unlock)) {
+    _weaken(nsync_mu_unlock)((nsync_mu *)mutex);
+    return 0;
+  }
+#endif
+
+  // implement barebones normal mutexes
+  if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
+    if (_weaken(nsync_futex_wake_)) {
+      pthread_mutex_unlock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
+    } else {
+      pthread_mutex_unlock_naive(mutex, word);
+    }
+    return 0;
+  }
+
+  // handle recursive and error checking mutexes
+  return pthread_mutex_unlock_recursive(mutex, word);
+}
diff --git a/libc/intrin/pthread_yield_np.c b/libc/intrin/pthread_yield_np.c
index fefc3f283..4fa0c988c 100644
--- a/libc/intrin/pthread_yield_np.c
+++ b/libc/intrin/pthread_yield_np.c
@@ -32,7 +32,7 @@ void sys_sched_yield(void);
 int pthread_yield_np(void) {
   if (IsXnuSilicon()) {
     __syslib->__pthread_yield_np();
-  } else if (IsOpenbsd() || IsNetbsd()) {
+  } else if (IsOpenbsd()) {
     // sched_yield() is punishingly slow on OpenBSD
     // it's so ruinously slow it'll destroy everything
     pthread_pause_np();
diff --git a/libc/thread/pthread_barrier_destroy.c b/libc/thread/pthread_barrier_destroy.c
index 5893558ac..5a75ad3e3 100644
--- a/libc/thread/pthread_barrier_destroy.c
+++ b/libc/thread/pthread_barrier_destroy.c
@@ -16,9 +16,10 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR               │
 │ PERFORMANCE OF THIS SOFTWARE.                                                  │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/errno.h"
+#include "libc/intrin/atomic.h"
 #include "libc/str/str.h"
 #include "libc/thread/thread.h"
-#include "third_party/nsync/counter.h"
 
 /**
  * Destroys barrier.
@@ -27,9 +28,8 @@
  * @raise EINVAL if threads are still inside the barrier
  */
 errno_t pthread_barrier_destroy(pthread_barrier_t *barrier) {
-  if (barrier->_nsync) {
-    nsync_counter_free(barrier->_nsync);
-    barrier->_nsync = 0;
-  }
+  if (atomic_load_explicit(&barrier->_waiters, memory_order_relaxed))
+    return EINVAL;
+  memset(barrier, -1, sizeof(*barrier));
   return 0;
 }
diff --git a/libc/thread/pthread_barrier_init.c b/libc/thread/pthread_barrier_init.c
index 43770caa2..77079c54a 100644
--- a/libc/thread/pthread_barrier_init.c
+++ b/libc/thread/pthread_barrier_init.c
@@ -17,8 +17,9 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                  │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/errno.h"
+#include "libc/intrin/atomic.h"
+#include "libc/limits.h"
 #include "libc/thread/thread.h"
-#include "third_party/nsync/counter.h"
 
 /**
  * Initializes barrier.
@@ -28,16 +29,17 @@
  * before the barrier is released, which must be greater than zero
  * @return 0 on success, or error number on failure
  * @raise EINVAL if `count` isn't greater than zero
- * @raise ENOMEM if insufficient memory exists
  */
 errno_t pthread_barrier_init(pthread_barrier_t *barrier,
                              const pthread_barrierattr_t *attr,
                              unsigned count) {
-  nsync_counter c;
   if (!count)
     return EINVAL;
-  if (!(c = nsync_counter_new(count)))
-    return ENOMEM;
-  *barrier = (pthread_barrier_t){._nsync = c};
+  if (count > INT_MAX)
+    return EINVAL;
+  barrier->_count = count;
+  barrier->_pshared = attr ? *attr : PTHREAD_PROCESS_PRIVATE;
+  atomic_store_explicit(&barrier->_counter, count, memory_order_relaxed);
+  atomic_store_explicit(&barrier->_waiters, 0, memory_order_relaxed);
   return 0;
 }
diff --git a/libc/thread/pthread_barrier_wait.c b/libc/thread/pthread_barrier_wait.c
index 3dfcec5b5..27f64773e 100644
--- a/libc/thread/pthread_barrier_wait.c
+++ b/libc/thread/pthread_barrier_wait.c
@@ -16,25 +16,53 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR               │
 │ PERFORMANCE OF THIS SOFTWARE.                                                  │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/blockcancel.internal.h"
+#include "libc/errno.h"
+#include "libc/intrin/atomic.h"
+#include "libc/limits.h"
 #include "libc/thread/thread.h"
-#include "third_party/nsync/counter.h"
+#include "third_party/nsync/futex.internal.h"
 
 /**
  * Waits for all threads to arrive at barrier.
  *
  * When the barrier is broken, the state is reset to what it was when
 * pthread_barrier_init() was called, so that the barrier may be
- * used again in the same way. The last thread to arrive shall be the
- * last to leave and it returns a magic value.
+ * used again in the same way.
+ *
+ * Unlike pthread_cond_timedwait() this function is not a cancelation
+ * point. Cleanup handlers aren't needed, since cancelation is blocked
+ * while waiting.
 *
  * @return 0 on success, `PTHREAD_BARRIER_SERIAL_THREAD` to one lucky
  * thread which was the last arrival, or an errno on error
+ * @raise EINVAL if barrier is used incorrectly
  */
 errno_t pthread_barrier_wait(pthread_barrier_t *barrier) {
-  if (nsync_counter_add(barrier->_nsync, -1)) {
-    nsync_counter_wait(barrier->_nsync, nsync_time_no_deadline);
-    return 0;
-  } else {
+  int n;
+
+  // enter barrier
+  atomic_fetch_add_explicit(&barrier->_waiters, 1, memory_order_acq_rel);
+  n = atomic_fetch_sub_explicit(&barrier->_counter, 1, memory_order_acq_rel);
+  n = n - 1;
+
+  // this can only happen on invalid usage
+  if (n < 0)
+    return EINVAL;
+
+  // reset count and wake waiters if we're last at barrier
+  if (!n) {
+    atomic_store_explicit(&barrier->_counter, barrier->_count,
+                          memory_order_release);
+    atomic_store_explicit(&barrier->_waiters, 0, memory_order_release);
+    nsync_futex_wake_(&barrier->_waiters, INT_MAX, barrier->_pshared);
     return PTHREAD_BARRIER_SERIAL_THREAD;
   }
+
+  // wait for everyone else to arrive at barrier
+  BLOCK_CANCELATION;
+  while ((n = atomic_load_explicit(&barrier->_waiters, memory_order_acquire)))
+    nsync_futex_wait_(&barrier->_waiters, n, barrier->_pshared, 0);
+  ALLOW_CANCELATION;
+
+  return 0;
 }
diff --git a/libc/thread/pthread_barrierattr_getpshared.c b/libc/thread/pthread_barrierattr_getpshared.c
index 8c0725963..e71846dd2 100644
--- a/libc/thread/pthread_barrierattr_getpshared.c
+++ b/libc/thread/pthread_barrierattr_getpshared.c
@@ -23,7 +23,7 @@
  *
  * @param pshared is set to one of the following
  *     - `PTHREAD_PROCESS_PRIVATE` (default)
- *     - `PTHREAD_PROCESS_SHARED` (unsupported)
+ *     - `PTHREAD_PROCESS_SHARED`
  * @return 0 on success, or error on failure
  */
 errno_t pthread_barrierattr_getpshared(const pthread_barrierattr_t *attr,
diff --git a/libc/thread/pthread_barrierattr_init.c b/libc/thread/pthread_barrierattr_init.c
index 473038114..f4abec468 100644
--- a/libc/thread/pthread_barrierattr_init.c
+++ b/libc/thread/pthread_barrierattr_init.c
@@ -24,6 +24,6 @@
  * @return 0 on success, or error on failure
  */
 errno_t pthread_barrierattr_init(pthread_barrierattr_t *attr) {
-  *attr = 0;
+  *attr = PTHREAD_PROCESS_PRIVATE;
   return 0;
 }
diff --git a/libc/thread/pthread_barrierattr_setpshared.c b/libc/thread/pthread_barrierattr_setpshared.c
index 05dcee1c1..0f74e0436 100644
--- a/libc/thread/pthread_barrierattr_setpshared.c
+++ b/libc/thread/pthread_barrierattr_setpshared.c
@@ -24,13 +24,14 @@
  *
  * @param pshared can be one of
  *     - `PTHREAD_PROCESS_PRIVATE` (default)
- *     - `PTHREAD_PROCESS_SHARED` (unsupported)
+ *     - `PTHREAD_PROCESS_SHARED`
  * @return 0 on success, or error on failure
  * @raises EINVAL if `pshared` is invalid
  */
 errno_t pthread_barrierattr_setpshared(pthread_barrierattr_t *attr,
                                        int pshared) {
   switch (pshared) {
+    case PTHREAD_PROCESS_SHARED:
     case PTHREAD_PROCESS_PRIVATE:
       *attr = pshared;
       return 0;
diff --git a/libc/thread/thread.h b/libc/thread/thread.h
index 406c19ff8..1c3ff8eff 100644
--- a/libc/thread/thread.h
+++ b/libc/thread/thread.h
@@ -46,7 +46,7 @@ COSMOPOLITAN_C_START_
 
 #define PTHREAD_RWLOCK_INITIALIZER {0}
 #define PTHREAD_MUTEX_INITIALIZER  {0}
-#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, 0, PTHREAD_MUTEX_RECURSIVE}
+#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, {}, PTHREAD_MUTEX_RECURSIVE}
 
 typedef uintptr_t pthread_t;
 typedef int pthread_id_np_t;
@@ -66,7 +66,10 @@ typedef struct pthread_spinlock_s {
 
 typedef struct pthread_mutex_s {
   uint32_t _nsync;
-  int32_t _pid;
+  union {
+    int32_t _pid;
+    _Atomic(int32_t) _futex;
+  };
   _Atomic(uint64_t) _word;
 } pthread_mutex_t;
 
@@ -92,7 +95,10 @@ typedef struct pthread_rwlock_s {
 } pthread_rwlock_t;
 
 typedef struct pthread_barrier_s {
-  void *_nsync;
+  int _count;
+  char _pshared;
+  _Atomic(int) _counter;
+  _Atomic(int) _waiters;
 } pthread_barrier_t;
 
 typedef struct pthread_attr_s {
diff --git a/test/libc/thread/footek_test.c b/test/libc/thread/footek_test.c
new file mode 100644
index 000000000..98e07e5e9
--- /dev/null
+++ b/test/libc/thread/footek_test.c
@@ -0,0 +1,236 @@
+#include <cosmo.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdio.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include "third_party/nsync/futex.internal.h"
+
+// THIS IS AN EXAMPLE OF HOW TO USE COSMOPOLITAN FUTEXES TO IMPLEMENT
+// YOUR OWN MUTEXES FROM SCRATCH. LOOK AT HOW MUCH BETTER IT CAN MAKE
+// THINGS COMPARED TO SPIN LOCKS. ALGORITHM FROM ULRICH DREPPER.
+
+// arm fleet
+// with futexes
+// 30 threads / 100000 iterations
+//
+//          242,604 us real
+//        4,222,946 us user
+//        1,079,229 us sys
+// footek_test on studio.test.          630 µs    17'415 µs   256'782 µs
+//        1,362,557 us real
+//        3,232,978 us user
+//        2,104,824 us sys
+// footek_test on pi.test.              611 µs    21'708 µs 1'385'129 µs
+//        1,346,482 us real
+//        3,370,513 us user
+//        1,992,383 us sys
+// footek_test on freebsdarm.test.      427 µs    19'967 µs 1'393'476 µs
+
+// arm fleet
+// without futexes
+// 30 threads / 100000 iterations
+//
+//        1,282,084 us real
+//       29,359,582 us user
+//           34,553 us sys
+// footek_test on studio.test.          961 µs    12'907 µs 1'287'983 µs
+//        4,070,988 us real
+//       16,203,990 us user
+//            7,999 us sys
+// footek_test on pi.test.              459 µs    16'376 µs 4'095'512 µs
+//        7,012,493 us real
+//       27,936,725 us user
+//            7,871 us sys
+// footek_test on freebsdarm.test.      502 µs    16'446 µs 7'051'545 µs
+
+// x86 fleet
+// with futexes
+// 30 threads / 100000 iterations
+//
+//          146,015 us real
+//          169,427 us user
+//           68,939 us sys
+// footek_test on rhel7.test.           376 µs     2'259 µs   153'024 µs
+//          144,917 us real
+//          383,317 us user
+//          191,203 us sys
+// footek_test on xnu.test.          11'143 µs     9'159 µs   164'865 µs
+//          244,286 us real
+//          405,395 us user
+//          956,122 us sys
+// footek_test on freebsd.test.         394 µs     2'165 µs   256'227 µs
+//          209,095 us real
+//          616,634 us user
+//            9,945 us sys
+// footek_test on netbsd.test.          502 µs     2'020 µs   261'895 µs
+//          344,876 us real
+//           50,000 us user
+//        1,240,000 us sys
+// footek_test on openbsd.test.         457 µs     2'737 µs   396'342 µs
+//        1,193,906 us real
+//       17,546,875 us user
+//        3,000,000 us sys
+// footek_test on win10.test.           462 µs    59'528 µs 1'348'265 µs
+
+// x86 fleet
+// without futexes
+// 30 threads / 100000 iterations
+//
+//          897,815 us real
+//        1,763,705 us user
+//            9,696 us sys
+// footek_test on rhel7.test.           423 µs     2'638 µs   912'241 µs
+//          790,332 us real
+//        2,359,967 us user
+//                0 us sys
+// footek_test on netbsd.test.        1'151 µs     2'634 µs 1'014'867 µs
+//        2,332,724 us real
+//        9,150,000 us user
+//           10,000 us sys
+// footek_test on openbsd.test.         557 µs     3'020 µs 2'554'648 µs
+//        2,528,863 us real
+//       56,546,875 us user
+//        1,671,875 us sys
+// footek_test on win10.test.           962 µs     9'698 µs 2'751'905 µs
+//        2,916,033 us real
+//       17,236,103 us user
+//                0 us sys
+// footek_test on freebsd.test.         690 µs     3'011 µs 2'925'997 µs
+//        4,225,726 us real
+//       16,679,456 us user
+//           16,265 us sys
+// footek_test on xnu.test.          98'468 µs     5'242 µs 5'191'724 µs
+
+#define USE_FUTEX  1
+#define THREADS    30
+#define ITERATIONS 30000
+
+#define MUTEX_LOCKED(word)  ((word) & 8)
+#define MUTEX_WAITING(word) ((word) & 16)
+
+#define MUTEX_LOCK(word)        ((word) | 8)
+#define MUTEX_SET_WAITING(word) ((word) | 16)
+#define MUTEX_UNLOCK(word)      ((word) & ~(8 | 16))
+
+void lock(atomic_int *futex) {
+  int word, cs;
+  for (int i = 0; i < 4; ++i) {
+    word = 0;
+    if (atomic_compare_exchange_strong_explicit(
+            futex, &word, 1, memory_order_acquire, memory_order_acquire))
+      return;
+    pthread_pause_np();
+  }
+  if (word == 1)
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  while (word > 0) {
+    pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
+#if USE_FUTEX
+    nsync_futex_wait_(futex, 2, 0, 0);
+#endif
+    pthread_setcancelstate(cs, 0);
+    word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
+  }
+}
+
+void unlock(atomic_int *futex) {
+  int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
+  if (word == 2) {
+    atomic_store_explicit(futex, 0, memory_order_release);
+#if USE_FUTEX
+    nsync_futex_wake_(futex, 1, 0);
+#endif
+  }
+}
+
+int g_chores;
+atomic_int g_lock;
+pthread_mutex_t g_locker;
+
+void *worker(void *arg) {
+  for (int i = 0; i < ITERATIONS; ++i) {
+    lock(&g_lock);
+    ++g_chores;
+    unlock(&g_lock);
+  }
+  return 0;
+}
+
+int main() {
+  struct timeval start;
+  gettimeofday(&start, 0);
+
+  pthread_t th[THREADS];
+  for (int i = 0; i < THREADS; ++i)
+    pthread_create(&th[i], 0, worker, 0);
+  for (int i = 0; i < THREADS; ++i)
+    pthread_join(th[i], 0);
+  npassert(g_chores == THREADS * ITERATIONS);
+
+  struct rusage ru;
+  struct timeval end;
+  gettimeofday(&end, 0);
+  getrusage(RUSAGE_SELF, &ru);
+  printf("%,16ld us real\n"
+         "%,16ld us user\n"
+         "%,16ld us sys\n",
+         timeval_tomicros(timeval_sub(end, start)),  //
+         timeval_tomicros(ru.ru_utime),              //
+         timeval_tomicros(ru.ru_stime));
+
+  CheckForMemoryLeaks();
+}
+
+// COMPARE ULRICH DREPPER'S LOCKING ALGORITHM WITH MIKE BURROWS *NSYNC
+// WHICH IS WHAT COSMOPOLITAN LIBC USES FOR YOUR POSIX THREADS MUTEXES
+
+// x86 fleet
+// with pthread_mutex_t
+// 30 threads / 100000 iterations
+//
+//          186,976 us real
+//           43,609 us user
+//          205,585 us sys
+// footek_test on freebsd.test.         410 µs     2'054 µs   195'339 µs
+//          238,902 us real
+//          235,743 us user
+//           97,881 us sys
+// footek_test on rhel7.test.           343 µs     2'339 µs   246'926 µs
+//          201,285 us real
+//          249,612 us user
+//          141,230 us sys
+// footek_test on xnu.test.           1'960 µs     5'350 µs   265'758 µs
+//          303,363 us real
+//           60,000 us user
+//          410,000 us sys
+// footek_test on openbsd.test.         545 µs     3'023 µs   326'200 µs
+//          386,085 us real
+//          586,455 us user
+//          466,991 us sys
+// footek_test on netbsd.test.          344 µs     2'421 µs   413'440 µs
+//          245,010 us real
+//          437,500 us user
+//          140,625 us sys
+// footek_test on win10.test.           300 µs    18'574 µs   441'225 µs
+
+// arm fleet
+// with pthread_mutex_t
+// 30 threads / 100000 iterations
+//
+//           87,132 us real
+//          183,517 us user
+//           20,020 us sys
+// footek_test on studio.test.          560 µs    12'418 µs    92'825 µs
+//          679,374 us real
+//          957,678 us user
+//          605,078 us sys
+// footek_test on pi.test.              462 µs    16'574 µs   702'833 µs
+//          902,343 us real
+//        1,459,706 us user
+//          781,140 us sys
+// footek_test on freebsdarm.test.      400 µs    16'261 µs   970'022 µs
diff --git a/third_party/nsync/common.c b/third_party/nsync/common.c
index 2b150401c..d7fa348cd 100644
--- a/third_party/nsync/common.c
+++ b/third_party/nsync/common.c
@@ -37,6 +37,7 @@
 #include "third_party/nsync/atomic.internal.h"
 #include "third_party/nsync/common.internal.h"
 #include "third_party/nsync/mu_semaphore.h"
+#include "third_party/nsync/mu_semaphore.internal.h"
 #include "third_party/nsync/wait_s.internal.h"
 
 __static_yoink("nsync_notice");
@@ -147,9 +148,9 @@ static void free_waiters_push (waiter *w) {
 
 static void free_waiters_populate (void) {
         int n;
-        if (IsNetbsd () || IsXnuSilicon ()) {
-                // netbsd needs one file descriptor per semaphore (!!)
-                // tim cook wants us to use his grand central dispatch
+        if (IsNetbsd () || (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ())) {
+                // netbsd needs a real file descriptor per semaphore
+                // tim cook wants us to use his lol central dispatch
                 n = 1;
         } else {
                 n = getpagesize() / sizeof(waiter);
diff --git a/third_party/nsync/futex.c b/third_party/nsync/futex.c
index 971501901..9a0f264a5 100644
--- a/third_party/nsync/futex.c
+++ b/third_party/nsync/futex.c
@@ -52,6 +52,7 @@
 #include "third_party/nsync/atomic.h"
 #include "third_party/nsync/common.internal.h"
 #include "third_party/nsync/futex.internal.h"
+#include "libc/intrin/kprintf.h"
 #include "third_party/nsync/time.h"
 
 #define FUTEX_WAIT_BITS_ FUTEX_BITSET_MATCH_ANY
@@ -138,7 +139,7 @@ static int nsync_futex_polyfill_ (atomic_int *w, int expect, struct timespec *ab
                 }
                 if (_weaken (pthread_testcancel_np) &&
                     _weaken (pthread_testcancel_np) ()) {
-                        return -ETIMEDOUT;
+                        return -ECANCELED;
                 }
                 if (abstime && timespec_cmp (timespec_real (), *abstime) >= 0) {
                         return -ETIMEDOUT;
                 }
@@ -163,7 +164,7 @@ static int nsync_futex_wait_win32_ (atomic_int *w, int expect, char pshare,
         for (;;) {
                 now = timespec_real ();
-                if (timespec_cmp (now, deadline) >= 0) {
+                if (timespec_cmp (now, deadline) >= 0) {
                         return etimedout();
                 }
                 wait = timespec_sub (deadline, now);
diff --git a/third_party/nsync/mu_semaphore.c b/third_party/nsync/mu_semaphore.c
index fd9d855ce..274b4e75b 100644
--- a/third_party/nsync/mu_semaphore.c
+++ b/third_party/nsync/mu_semaphore.c
@@ -21,14 +21,9 @@
 #include "third_party/nsync/mu_semaphore.internal.h"
 
 __static_yoink("nsync_notice");
 
-/* Apple's ulock (used by Cosmo futexes) is an internal API, but:
-   1. Unlike GCD it's cancellable, i.e. can be EINTR'd by signals
-   2. We currently always use ulock anyway for joining threads */
-#define PREFER_GCD_OVER_ULOCK 1
-
 /* Initialize *s; the initial value is 0.  */
 bool nsync_mu_semaphore_init (nsync_semaphore *s) {
-        if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+        if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
                 return nsync_mu_semaphore_init_gcd (s);
         } else if (IsNetbsd ()) {
                 return nsync_mu_semaphore_init_sem (s);
@@ -44,7 +39,7 @@ bool nsync_mu_semaphore_init (nsync_semaphore *s) {
 errno_t nsync_mu_semaphore_p (nsync_semaphore *s) {
         errno_t err;
         BEGIN_CANCELATION_POINT;
-        if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+        if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
                 err = nsync_mu_semaphore_p_gcd (s);
         } else if (IsNetbsd ()) {
                 err = nsync_mu_semaphore_p_sem (s);
@@ -62,7 +57,7 @@ errno_t nsync_mu_semaphore_p (nsync_semaphore *s) {
 errno_t nsync_mu_semaphore_p_with_deadline (nsync_semaphore *s, nsync_time abs_deadline) {
         errno_t err;
         BEGIN_CANCELATION_POINT;
-        if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+        if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
                 err = nsync_mu_semaphore_p_with_deadline_gcd (s, abs_deadline);
         } else if (IsNetbsd ()) {
                 err = nsync_mu_semaphore_p_with_deadline_sem (s, abs_deadline);
@@ -75,7 +70,7 @@ errno_t nsync_mu_semaphore_p_with_deadline (nsync_semaphore *s, nsync_time abs_d
 
 /* Ensure that the count of *s is at least 1. */
 void nsync_mu_semaphore_v (nsync_semaphore *s) {
-        if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
+        if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
                 return nsync_mu_semaphore_v_gcd (s);
         } else if (IsNetbsd ()) {
                 return nsync_mu_semaphore_v_sem (s);
diff --git a/third_party/nsync/mu_semaphore.internal.h b/third_party/nsync/mu_semaphore.internal.h
index 31b51975d..8795fe349 100755
--- a/third_party/nsync/mu_semaphore.internal.h
+++ b/third_party/nsync/mu_semaphore.internal.h
@@ -4,6 +4,20 @@
 #include "third_party/nsync/time.h"
 COSMOPOLITAN_C_START_
 
+/* XNU ulock (used by cosmo futexes) is an internal API, however:
+
+   1. Unlike GCD it's cancelable i.e. can be EINTR'd by signals
+   2. We have no choice but to use ulock for joining threads
+   3. Grand Central Dispatch requires a busy loop workaround
+   4. ulock makes our mutexes use 20% more system time (meh)
+   5. ulock makes our mutexes use 40% less wall time (good)
+   6. ulock makes our mutexes use 64% less user time (woop)
+
+   ulock is an outstanding system call that must be used.
+   gcd is not an acceptable alternative to ulock. */
+
+#define NSYNC_USE_GRAND_CENTRAL 0
+
 bool nsync_mu_semaphore_init_futex(nsync_semaphore *);
 errno_t nsync_mu_semaphore_p_futex(nsync_semaphore *);
 errno_t nsync_mu_semaphore_p_with_deadline_futex(nsync_semaphore *, nsync_time);
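
Two notes for readers studying this patch.

First, the locking scheme behind pthread_mutex_lock_drepper() above can be
prototyped outside this tree. The sketch below is illustrative rather than
part of this change: it assumes Linux only, calling raw futex(2) through
syscall(2) where the patch goes through Cosmopolitan's portable
nsync_futex_wait_() / nsync_futex_wake_() wrappers, and the futex_mutex
name is invented for the example. The state encoding matches the patch:
0 means unlocked, 1 means locked with no waiters, 2 means locked with
possible waiters.

    // hypothetical standalone sketch of drepper's "take 3" mutex (linux-only)
    #include <linux/futex.h>
    #include <stdatomic.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    typedef struct {
      atomic_int word;  // 0 = unlocked, 1 = locked, 2 = locked w/ waiters
    } futex_mutex;

    static void futex_mutex_lock(futex_mutex *m) {
      int word = 0;
      // fast path: 0 -> 1 acquires an uncontended lock with no syscall
      if (atomic_compare_exchange_strong_explicit(&m->word, &word, 1,
                                                  memory_order_acquire,
                                                  memory_order_acquire))
        return;
      // slow path: advertise contention, then sleep until handed the lock
      if (word == 1)
        word = atomic_exchange_explicit(&m->word, 2, memory_order_acquire);
      while (word > 0) {
        syscall(SYS_futex, &m->word, FUTEX_WAIT_PRIVATE, 2, 0, 0, 0);
        word = atomic_exchange_explicit(&m->word, 2, memory_order_acquire);
      }
    }

    static void futex_mutex_unlock(futex_mutex *m) {
      // a previous value of 2 means a waiter may be sleeping; wake one
      if (atomic_fetch_sub_explicit(&m->word, 1, memory_order_release) == 2) {
        atomic_store_explicit(&m->word, 0, memory_order_release);
        syscall(SYS_futex, &m->word, FUTEX_WAKE_PRIVATE, 1, 0, 0, 0);
      }
    }

The version in the patch additionally spins a few times with
pthread_pause_np() before touching the futex, which is what the "slightly
improved" comment in pthread_mutex_lock.c refers to.

Second, now that barriers honor PTHREAD_PROCESS_SHARED, the intended usage
is the standard POSIX pattern: place the barrier in MAP_SHARED memory and
set the attribute before initializing. This is a minimal usage sketch, with
error handling elided, using only portable POSIX calls rather than anything
introduced by this patch:

    // hedged example: rendezvous of parent and child across fork()
    #include <pthread.h>
    #include <sys/mman.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void) {
      pthread_barrier_t *b =
          mmap(0, sizeof(*b), PROT_READ | PROT_WRITE,
               MAP_SHARED | MAP_ANONYMOUS, -1, 0);  // visible across fork
      pthread_barrierattr_t attr;
      pthread_barrierattr_init(&attr);
      pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
      pthread_barrier_init(b, &attr, 2);  // two parties: parent + child
      if (!fork()) {
        pthread_barrier_wait(b);  // child arrives
        _exit(0);
      }
      pthread_barrier_wait(b);  // parent arrives; both proceed together
      wait(0);
      pthread_barrier_destroy(b);
      pthread_barrierattr_destroy(&attr);
      munmap(b, sizeof(*b));
    }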