Make more improvements to threads and mappings

- NetBSD should now have faster synchronization
- POSIX barriers may now be shared across processes
- An edge case with memory map tracking has been fixed
- Grand Central Dispatch is no longer used on macOS ARM64
- POSIX mutexes in normal mode now use futexes across processes (a sketch of what this enables follows below)
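
For example, here is a minimal illustrative sketch (not part of this diff) of what the last bullet enables: a normal-mode mutex placed in shared anonymous memory and contended across fork(), where contended waiters now sleep on a futex instead of spinning:

// hedged sketch: a normal-mode mutex in MAP_SHARED memory,
// contended by parent and child processes
#include <pthread.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main() {
  pthread_mutex_t *mu =
      mmap(0, sizeof(pthread_mutex_t), PROT_READ | PROT_WRITE,
           MAP_SHARED | MAP_ANONYMOUS, -1, 0);
  pthread_mutexattr_t attr;
  pthread_mutexattr_init(&attr);
  pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
  pthread_mutex_init(mu, &attr);
  pthread_mutexattr_destroy(&attr);
  if (!fork()) {
    pthread_mutex_lock(mu);    // child contends with parent
    pthread_mutex_unlock(mu);
    _exit(0);
  }
  pthread_mutex_lock(mu);
  pthread_mutex_unlock(mu);
  wait(0);
}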
Justine Tunney 2024-07-24 01:05:00 -07:00
parent 2187d6d2dd
commit e398f3887c
20 changed files with 566 additions and 171 deletions

View file

@ -26,11 +26,9 @@
int begin_cancelation_point(void) {
int state = 0;
struct CosmoTib *tib;
struct PosixThread *pt;
if (__tls_enabled) {
tib = __get_tls();
if ((pt = (struct PosixThread *)tib->tib_pthread)) {
struct PosixThread *pt;
if ((pt = _pthread_self())) {
state = pt->pt_flags & PT_INCANCEL;
pt->pt_flags |= PT_INCANCEL;
}
@ -39,11 +37,9 @@ int begin_cancelation_point(void) {
}
void end_cancelation_point(int state) {
struct CosmoTib *tib;
struct PosixThread *pt;
if (__tls_enabled) {
tib = __get_tls();
if ((pt = (struct PosixThread *)tib->tib_pthread)) {
struct PosixThread *pt;
if ((pt = _pthread_self())) {
pt->pt_flags &= ~PT_INCANCEL;
pt->pt_flags |= state;
}

View file

@ -6,6 +6,8 @@
#include "libc/thread/tls2.internal.h"
COSMOPOLITAN_C_START_
#define MAPS_RETRY ((void *)-1)
#define MAP_TREE_CONTAINER(e) TREE_CONTAINER(struct Map, tree, e)
struct Map {

View file

@ -120,6 +120,7 @@ static int __muntrack(char *addr, size_t size, int pagesz,
struct Map *map;
struct Map *next;
struct Map *floor;
StartOver:
floor = __maps_floor(addr);
for (map = floor; map && map->addr <= addr + size; map = next) {
next = __maps_next(map);
@ -148,6 +149,8 @@ static int __muntrack(char *addr, size_t size, int pagesz,
ASSERT(left > 0);
struct Map *leftmap;
if ((leftmap = __maps_alloc())) {
if (leftmap == MAPS_RETRY)
goto StartOver;
map->addr += left;
map->size = right;
if (!(map->flags & MAP_ANONYMOUS))
@ -167,6 +170,8 @@ static int __muntrack(char *addr, size_t size, int pagesz,
size_t right = map_addr + map_size - addr;
struct Map *rightmap;
if ((rightmap = __maps_alloc())) {
if (rightmap == MAPS_RETRY)
goto StartOver;
map->size = left;
__maps.pages -= (right + pagesz - 1) / pagesz;
rightmap->addr = addr;
@ -184,8 +189,14 @@ static int __muntrack(char *addr, size_t size, int pagesz,
size_t right = map_size - middle - left;
struct Map *leftmap;
if ((leftmap = __maps_alloc())) {
if (leftmap == MAPS_RETRY)
goto StartOver;
struct Map *middlemap;
if ((middlemap = __maps_alloc())) {
if (middlemap == MAPS_RETRY) {
__maps_free(leftmap);
goto StartOver;
}
leftmap->addr = map_addr;
leftmap->size = left;
leftmap->off = map->off;
@ -204,6 +215,7 @@ static int __muntrack(char *addr, size_t size, int pagesz,
*deleted = middlemap;
__maps_check();
} else {
__maps_free(leftmap);
rc = -1;
}
} else {
@ -304,12 +316,11 @@ struct Map *__maps_alloc(void) {
map->flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NOFORK;
map->hand = sys.maphandle;
__maps_lock();
__maps_insert(map++);
__maps_insert(map);
__maps_unlock();
map->addr = MAP_FAILED;
for (int i = 1; i < gransz / sizeof(struct Map) - 1; ++i)
for (int i = 1; i < gransz / sizeof(struct Map); ++i)
__maps_free(map + i);
return map;
return MAPS_RETRY;
}
static int __munmap(char *addr, size_t size) {
@ -396,21 +407,32 @@ void *__maps_pickaddr(size_t size) {
static void *__mmap_chunk(void *addr, size_t size, int prot, int flags, int fd,
int64_t off, int pagesz, int gransz) {
// allocate Map object
struct Map *map;
do {
if (!(map = __maps_alloc()))
return MAP_FAILED;
} while (map == MAPS_RETRY);
// polyfill nuances of fixed mappings
int sysflags = flags;
bool noreplace = false;
bool should_untrack = false;
if (flags & MAP_FIXED_NOREPLACE) {
if (flags & MAP_FIXED)
if (flags & MAP_FIXED) {
__maps_free(map);
return (void *)einval();
}
sysflags &= ~MAP_FIXED_NOREPLACE;
if (IsLinux()) {
noreplace = true;
sysflags |= MAP_FIXED_NOREPLACE_linux;
} else if (IsFreebsd() || IsNetbsd()) {
sysflags |= MAP_FIXED;
if (__maps_overlaps(addr, size, pagesz))
if (__maps_overlaps(addr, size, pagesz)) {
__maps_free(map);
return (void *)eexist();
}
} else {
noreplace = true;
}
@ -418,11 +440,6 @@ static void *__mmap_chunk(void *addr, size_t size, int prot, int flags, int fd,
should_untrack = true;
}
// allocate Map object
struct Map *map;
if (!(map = __maps_alloc()))
return MAP_FAILED;
// remove mapping we blew away
if (IsWindows() && should_untrack)
__munmap(addr, size);
@ -572,23 +589,27 @@ static void *__mremap_impl(char *old_addr, size_t old_size, size_t new_size,
return (void *)einval();
}
// allocate object for tracking new mapping
struct Map *map;
do {
if (!(map = __maps_alloc()))
return (void *)enomem();
} while (map == MAPS_RETRY);
// check old interval is fully contained within one mapping
struct Map *old_map;
if (!(old_map = __maps_floor(old_addr)) ||
old_addr + old_size > old_map->addr + PGUP(old_map->size) ||
old_addr < old_map->addr)
old_addr < old_map->addr) {
__maps_free(map);
return (void *)efault();
}
// save old properties
int old_off = old_map->off;
int old_prot = old_map->prot;
int old_flags = old_map->flags;
// allocate object for tracking new mapping
struct Map *map;
if (!(map = __maps_alloc()))
return (void *)enomem();
// netbsd mremap fixed returns enoent rather than unmapping old pages
if (IsNetbsd() && (flags & MREMAP_FIXED))
if (__munmap(new_addr, new_size)) {

View file

@ -75,6 +75,7 @@ int __mprotect(char *addr, size_t size, int prot) {
return edeadlk();
}
struct Map *map, *floor;
StartOver:
floor = __maps_floor(addr);
for (map = floor; map && map->addr <= addr + size; map = __maps_next(map)) {
char *map_addr = map->addr;
@ -93,10 +94,12 @@ int __mprotect(char *addr, size_t size, int prot) {
}
} else if (addr <= map_addr) {
// change lefthand side of mapping
size_t left = PGUP(addr + size - map_addr);
size_t left = addr + size - map_addr;
size_t right = map_size - left;
struct Map *leftmap;
if ((leftmap = __maps_alloc())) {
if (leftmap == MAPS_RETRY)
goto StartOver;
if (!__mprotect_chunk(map_addr, left, prot, false)) {
leftmap->addr = map_addr;
leftmap->size = left;
@ -127,6 +130,8 @@ int __mprotect(char *addr, size_t size, int prot) {
size_t right = map_addr + map_size - addr;
struct Map *leftmap;
if ((leftmap = __maps_alloc())) {
if (leftmap == MAPS_RETRY)
goto StartOver;
if (!__mprotect_chunk(map_addr + left, right, prot, false)) {
leftmap->addr = map_addr;
leftmap->size = left;
@ -159,8 +164,14 @@ int __mprotect(char *addr, size_t size, int prot) {
size_t right = map_size - middle - left;
struct Map *leftmap;
if ((leftmap = __maps_alloc())) {
if (leftmap == MAPS_RETRY)
goto StartOver;
struct Map *midlmap;
if ((midlmap = __maps_alloc())) {
if (midlmap == MAPS_RETRY) {
__maps_free(leftmap);
goto StartOver;
}
if (!__mprotect_chunk(map_addr + left, middle, prot, false)) {
leftmap->addr = map_addr;
leftmap->size = left;

View file

@ -27,41 +27,47 @@
#include "libc/runtime/internal.h"
#include "libc/thread/lock.h"
#include "libc/thread/thread.h"
#include "third_party/nsync/futex.internal.h"
#include "third_party/nsync/mu.h"
static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
int me;
static void pthread_mutex_lock_naive(pthread_mutex_t *mutex, uint64_t word) {
int backoff = 0;
uint64_t word, lock;
// get current state of lock
word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
#if PTHREAD_USE_NSYNC
// use fancy nsync mutex if possible
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL && //
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_lock)) {
_weaken(nsync_mu_lock)((nsync_mu *)mutex);
return 0;
}
#endif
// implement barebones normal mutexes
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
uint64_t lock;
for (;;) {
word = MUTEX_UNLOCK(word);
lock = MUTEX_LOCK(word);
if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
memory_order_acquire,
memory_order_relaxed))
return 0;
return;
backoff = pthread_delay_np(mutex, backoff);
}
}
// implement recursive mutexes
me = gettid();
// see "take 3" algorithm in "futexes are tricky" by ulrich drepper
// slightly improved to attempt acquiring multiple times before syscall
static void pthread_mutex_lock_drepper(atomic_int *futex, char pshare) {
int word;
for (int i = 0; i < 4; ++i) {
word = 0;
if (atomic_compare_exchange_strong_explicit(
futex, &word, 1, memory_order_acquire, memory_order_acquire))
return;
pthread_pause_np();
}
if (word == 1)
word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
while (word > 0) {
_weaken(nsync_futex_wait_)(futex, 2, pshare, 0);
word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
}
}
static errno_t pthread_mutex_lock_recursive(pthread_mutex_t *mutex,
uint64_t word) {
uint64_t lock;
int backoff = 0;
int me = gettid();
for (;;) {
if (MUTEX_OWNER(word) == me) {
if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
@ -91,6 +97,36 @@ static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
}
}
static errno_t pthread_mutex_lock_impl(pthread_mutex_t *mutex) {
uint64_t word;
// get current state of lock
word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
#if PTHREAD_USE_NSYNC
// use superior mutexes if possible
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL && //
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_lock)) {
_weaken(nsync_mu_lock)((nsync_mu *)mutex);
return 0;
}
#endif
// handle normal mutexes
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
if (_weaken(nsync_futex_wait_)) {
pthread_mutex_lock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
} else {
pthread_mutex_lock_naive(mutex, word);
}
return 0;
}
// handle recursive and error checking mutexes
return pthread_mutex_lock_recursive(mutex, word);
}
/**
* Locks mutex.
*
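
For readers unfamiliar with the Drepper paper the comments reference, here is a hedged sketch of the "take 3" state machine, assuming raw Linux futexes in place of the portable nsync_futex_wait_()/nsync_futex_wake_() calls this commit actually uses:

#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

// assumed Linux-only shims; cosmo's portable equivalents are
// nsync_futex_wait_() and nsync_futex_wake_()
static void futex_wait(atomic_int *w, int expect) {
  syscall(SYS_futex, w, FUTEX_WAIT, expect, 0, 0, 0);
}
static void futex_wake(atomic_int *w, int count) {
  syscall(SYS_futex, w, FUTEX_WAKE, count, 0, 0, 0);
}

// the futex word holds one of three states
//   0 = unlocked
//   1 = locked, no waiters
//   2 = locked, waiters may be parked in the kernel
void lock(atomic_int *m) {
  int word = 0;
  if (atomic_compare_exchange_strong(m, &word, 1))
    return;                        // fast path: 0 -> 1, no contention
  if (word == 1)
    word = atomic_exchange(m, 2);  // announce that waiters exist
  while (word > 0) {
    futex_wait(m, 2);              // sleep while the word is still 2
    word = atomic_exchange(m, 2);  // if it was 0, the lock is now ours
  }
}

void unlock(atomic_int *m) {
  if (atomic_fetch_sub(m, 1) == 2) {  // was 2: someone may be waiting
    atomic_store(m, 0);
    futex_wake(m, 1);                 // wake exactly one sleeper
  }
}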

View file

@ -24,43 +24,12 @@
#include "libc/runtime/internal.h"
#include "libc/thread/lock.h"
#include "libc/thread/thread.h"
#include "third_party/nsync/futex.internal.h"
#include "third_party/nsync/mu.h"
/**
* Attempts acquiring lock.
*
* Unlike pthread_mutex_lock() this function won't block and instead
* returns an error immediately if the lock couldn't be acquired.
*
* @return 0 if lock was acquired, otherwise an errno
* @raise EAGAIN if maximum number of recursive locks is held
* @raise EBUSY if lock is currently held in read or write mode
* @raise EINVAL if `mutex` doesn't refer to an initialized lock
* @raise EDEADLK if `mutex` is `PTHREAD_MUTEX_ERRORCHECK` and the
* current thread already holds this mutex
*/
errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
int me;
uint64_t word, lock;
// get current state of lock
word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
#if PTHREAD_USE_NSYNC
// delegate to *NSYNC if possible
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_trylock)) {
if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
return 0;
} else {
return EBUSY;
}
}
#endif
// handle normal mutexes
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
static errno_t pthread_mutex_trylock_naive(pthread_mutex_t *mutex,
uint64_t word) {
uint64_t lock;
word = MUTEX_UNLOCK(word);
lock = MUTEX_LOCK(word);
if (atomic_compare_exchange_weak_explicit(&mutex->_word, &word, lock,
@ -70,8 +39,18 @@ errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
return EBUSY;
}
// handle recursive and error check mutexes
me = gettid();
static errno_t pthread_mutex_trylock_drepper(atomic_int *futex) {
int word = 0;
if (atomic_compare_exchange_strong_explicit(
futex, &word, 1, memory_order_acquire, memory_order_acquire))
return 0;
return EBUSY;
}
static errno_t pthread_mutex_trylock_recursive(pthread_mutex_t *mutex,
uint64_t word) {
uint64_t lock;
int me = gettid();
for (;;) {
if (MUTEX_OWNER(word) == me) {
if (MUTEX_TYPE(word) != PTHREAD_MUTEX_ERRORCHECK) {
@ -100,3 +79,47 @@ errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
return EBUSY;
}
}
/**
* Attempts acquiring lock.
*
* Unlike pthread_mutex_lock() this function won't block and instead
* returns an error immediately if the lock couldn't be acquired.
*
* @return 0 if lock was acquired, otherwise an errno
* @raise EAGAIN if maximum number of recursive locks is held
* @raise EBUSY if lock is currently held in read or write mode
* @raise EINVAL if `mutex` doesn't refer to an initialized lock
* @raise EDEADLK if `mutex` is `PTHREAD_MUTEX_ERRORCHECK` and the
* current thread already holds this mutex
*/
errno_t pthread_mutex_trylock(pthread_mutex_t *mutex) {
// get current state of lock
uint64_t word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
#if PTHREAD_USE_NSYNC
// use superior mutexes if possible
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL &&
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_trylock)) {
if (_weaken(nsync_mu_trylock)((nsync_mu *)mutex)) {
return 0;
} else {
return EBUSY;
}
}
#endif
// handle normal mutexes
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
if (_weaken(nsync_futex_wait_)) {
return pthread_mutex_trylock_drepper(&mutex->_futex);
} else {
return pthread_mutex_trylock_naive(mutex, word);
}
}
// handle recursive and error checking mutexes
return pthread_mutex_trylock_recursive(mutex, word);
}
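
A quick usage sketch (illustrative, not from this commit): trylock reports EBUSY immediately rather than parking the caller on the futex:

#include <errno.h>
#include <pthread.h>

static pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;

void do_work_or_skip(void) {
  if (pthread_mutex_trylock(&mu) == EBUSY)
    return;                  // someone else holds it; don't wait
  // ... critical section ...
  pthread_mutex_unlock(&mu);
}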

View file

@ -25,45 +25,26 @@
#include "libc/runtime/internal.h"
#include "libc/thread/lock.h"
#include "libc/thread/thread.h"
#include "third_party/nsync/futex.internal.h"
#include "third_party/nsync/mu.h"
/**
* Releases mutex.
*
* This function does nothing in vfork() children.
*
* @return 0 on success or error number on failure
* @raises EPERM if in error check mode and not owned by caller
* @vforksafe
*/
errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
int me;
uint64_t word, lock;
LOCKTRACE("pthread_mutex_unlock(%t)", mutex);
// get current state of lock
word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
#if PTHREAD_USE_NSYNC
// use fancy nsync mutex if possible
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL && //
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_unlock)) {
_weaken(nsync_mu_unlock)((nsync_mu *)mutex);
return 0;
}
#endif
// implement barebones normal mutexes
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
lock = MUTEX_UNLOCK(word);
static void pthread_mutex_unlock_naive(pthread_mutex_t *mutex, uint64_t word) {
uint64_t lock = MUTEX_UNLOCK(word);
atomic_store_explicit(&mutex->_word, lock, memory_order_release);
return 0;
}
// implement recursive mutex unlocking
me = gettid();
// see "take 3" algorithm in "futexes are tricky" by ulrich drepper
static void pthread_mutex_unlock_drepper(atomic_int *futex, char pshare) {
int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
if (word == 2) {
atomic_store_explicit(futex, 0, memory_order_release);
_weaken(nsync_futex_wake_)(futex, 1, pshare);
}
}
static errno_t pthread_mutex_unlock_recursive(pthread_mutex_t *mutex,
uint64_t word) {
int me = gettid();
for (;;) {
// we allow unlocking an initialized lock that wasn't locked, but we
@ -88,3 +69,44 @@ errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
return 0;
}
}
/**
* Releases mutex.
*
* This function does nothing in vfork() children.
*
* @return 0 on success or error number on failure
* @raises EPERM if in error check mode and not owned by caller
* @vforksafe
*/
errno_t pthread_mutex_unlock(pthread_mutex_t *mutex) {
uint64_t word;
LOCKTRACE("pthread_mutex_unlock(%t)", mutex);
// get current state of lock
word = atomic_load_explicit(&mutex->_word, memory_order_relaxed);
#if PTHREAD_USE_NSYNC
// use superior mutexes if possible
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL && //
MUTEX_PSHARED(word) == PTHREAD_PROCESS_PRIVATE && //
_weaken(nsync_mu_unlock)) {
_weaken(nsync_mu_unlock)((nsync_mu *)mutex);
return 0;
}
#endif
// implement barebones normal mutexes
if (MUTEX_TYPE(word) == PTHREAD_MUTEX_NORMAL) {
if (_weaken(nsync_futex_wake_)) {
pthread_mutex_unlock_drepper(&mutex->_futex, MUTEX_PSHARED(word));
} else {
pthread_mutex_unlock_naive(mutex, word);
}
return 0;
}
// handle recursive and error checking mutexes
return pthread_mutex_unlock_recursive(mutex, word);
}

View file

@ -32,7 +32,7 @@ void sys_sched_yield(void);
int pthread_yield_np(void) {
if (IsXnuSilicon()) {
__syslib->__pthread_yield_np();
} else if (IsOpenbsd() || IsNetbsd()) {
} else if (IsOpenbsd()) {
// sched_yield() is punishingly slow on OpenBSD
// it's so ruinously slow it'll destroy everything
pthread_pause_np();

View file

@ -16,9 +16,10 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/errno.h"
#include "libc/intrin/atomic.h"
#include "libc/str/str.h"
#include "libc/thread/thread.h"
#include "third_party/nsync/counter.h"
/**
* Destroys barrier.
@ -27,9 +28,8 @@
* @raise EINVAL if threads are still inside the barrier
*/
errno_t pthread_barrier_destroy(pthread_barrier_t *barrier) {
if (barrier->_nsync) {
nsync_counter_free(barrier->_nsync);
barrier->_nsync = 0;
}
if (atomic_load_explicit(&barrier->_waiters, memory_order_relaxed))
return EINVAL;
memset(barrier, -1, sizeof(*barrier));
return 0;
}

View file

@ -17,8 +17,9 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/errno.h"
#include "libc/intrin/atomic.h"
#include "libc/limits.h"
#include "libc/thread/thread.h"
#include "third_party/nsync/counter.h"
/**
* Initializes barrier.
@ -28,16 +29,17 @@
* before the barrier is released, which must be greater than zero
* @return 0 on success, or error number on failure
* @raise EINVAL if `count` isn't greater than zero
* @raise ENOMEM if insufficient memory exists
*/
errno_t pthread_barrier_init(pthread_barrier_t *barrier,
const pthread_barrierattr_t *attr,
unsigned count) {
nsync_counter c;
if (!count)
return EINVAL;
if (!(c = nsync_counter_new(count)))
return ENOMEM;
*barrier = (pthread_barrier_t){._nsync = c};
if (count > INT_MAX)
return EINVAL;
barrier->_count = count;
barrier->_pshared = attr ? *attr : PTHREAD_PROCESS_PRIVATE;
atomic_store_explicit(&barrier->_counter, count, memory_order_relaxed);
atomic_store_explicit(&barrier->_waiters, 0, memory_order_relaxed);
return 0;
}

View file

@ -16,25 +16,53 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/calls/blockcancel.internal.h"
#include "libc/errno.h"
#include "libc/intrin/atomic.h"
#include "libc/limits.h"
#include "libc/thread/thread.h"
#include "third_party/nsync/counter.h"
#include "third_party/nsync/futex.internal.h"
/**
* Waits for all threads to arrive at barrier.
*
* When the barrier is broken, the state becomes reset to what it was
* when pthread_barrier_init() was called, so that the barrier may be
* used again in the same way. The last thread to arrive shall be the
* last to leave and it returns a magic value.
* used again in the same way.
*
* Unlike pthread_cond_timedwait() this function is not a cancelation
* point, so there's no need to install cleanup handlers; cancelation
* is blocked while threads wait inside the barrier.
*
* @return 0 on success, `PTHREAD_BARRIER_SERIAL_THREAD` to one lucky
* thread which was the last arrival, or an errno on error
* @raise EINVAL if barrier is used incorrectly
*/
errno_t pthread_barrier_wait(pthread_barrier_t *barrier) {
if (nsync_counter_add(barrier->_nsync, -1)) {
nsync_counter_wait(barrier->_nsync, nsync_time_no_deadline);
return 0;
} else {
int n;
// enter barrier
atomic_fetch_add_explicit(&barrier->_waiters, 1, memory_order_acq_rel);
n = atomic_fetch_sub_explicit(&barrier->_counter, 1, memory_order_acq_rel);
n = n - 1;
// this can only happen on invalid usage
if (n < 0)
return EINVAL;
// reset count and wake waiters if we're last at barrier
if (!n) {
atomic_store_explicit(&barrier->_counter, barrier->_count,
memory_order_release);
atomic_store_explicit(&barrier->_waiters, 0, memory_order_release);
nsync_futex_wake_(&barrier->_waiters, INT_MAX, barrier->_pshared);
return PTHREAD_BARRIER_SERIAL_THREAD;
}
// wait for everyone else to arrive at barrier
BLOCK_CANCELATION;
while ((n = atomic_load_explicit(&barrier->_waiters, memory_order_acquire)))
nsync_futex_wait_(&barrier->_waiters, n, barrier->_pshared, 0);
ALLOW_CANCELATION;
return 0;
}
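
A usage sketch (illustrative, not from this commit): every thread blocks at the barrier, and exactly one of them, the last to arrive, receives PTHREAD_BARRIER_SERIAL_THREAD:

#include <pthread.h>
#include <stdio.h>

static pthread_barrier_t barrier;

void *work(void *arg) {
  // ... phase 1 ...
  int rc = pthread_barrier_wait(&barrier);
  if (rc == PTHREAD_BARRIER_SERIAL_THREAD)
    printf("i was the last to arrive\n");  // exactly one thread
  // ... phase 2 ...
  return 0;
}

int main() {
  pthread_t th[4];
  pthread_barrier_init(&barrier, 0, 4);
  for (int i = 0; i < 4; ++i) pthread_create(&th[i], 0, work, 0);
  for (int i = 0; i < 4; ++i) pthread_join(th[i], 0);
  pthread_barrier_destroy(&barrier);
}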

View file

@ -23,7 +23,7 @@
*
* @param pshared is set to one of the following
* - `PTHREAD_PROCESS_PRIVATE` (default)
* - `PTHREAD_PROCESS_SHARED` (unsupported)
* - `PTHREAD_PROCESS_SHARED`
* @return 0 on success, or error on failure
*/
errno_t pthread_barrierattr_getpshared(const pthread_barrierattr_t *attr,

View file

@ -24,6 +24,6 @@
* @return 0 on success, or error on failure
*/
errno_t pthread_barrierattr_init(pthread_barrierattr_t *attr) {
*attr = 0;
*attr = PTHREAD_PROCESS_PRIVATE;
return 0;
}

View file

@ -24,13 +24,14 @@
*
* @param pshared can be one of
* - `PTHREAD_PROCESS_PRIVATE` (default)
* - `PTHREAD_PROCESS_SHARED` (unsupported)
* - `PTHREAD_PROCESS_SHARED`
* @return 0 on success, or error on failure
* @raises EINVAL if `pshared` is invalid
*/
errno_t pthread_barrierattr_setpshared(pthread_barrierattr_t *attr,
int pshared) {
switch (pshared) {
case PTHREAD_PROCESS_SHARED:
case PTHREAD_PROCESS_PRIVATE:
*attr = pshared;
return 0;
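
Since PTHREAD_PROCESS_SHARED is now accepted here, a barrier can live in shared memory and synchronize related processes; a hedged sketch under that assumption:

// hedged sketch: barrier in MAP_SHARED memory, crossed by parent+child
#include <pthread.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main() {
  pthread_barrier_t *b =
      mmap(0, sizeof(pthread_barrier_t), PROT_READ | PROT_WRITE,
           MAP_SHARED | MAP_ANONYMOUS, -1, 0);
  pthread_barrierattr_t attr;
  pthread_barrierattr_init(&attr);
  pthread_barrierattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
  pthread_barrier_init(b, &attr, 2);
  pthread_barrierattr_destroy(&attr);
  if (!fork()) {
    pthread_barrier_wait(b);  // child rendezvous with parent
    _exit(0);
  }
  pthread_barrier_wait(b);
  wait(0);
}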

View file

@ -46,7 +46,7 @@ COSMOPOLITAN_C_START_
#define PTHREAD_RWLOCK_INITIALIZER {0}
#define PTHREAD_MUTEX_INITIALIZER {0}
#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, 0, PTHREAD_MUTEX_RECURSIVE}
#define PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP {0, {}, PTHREAD_MUTEX_RECURSIVE}
typedef uintptr_t pthread_t;
typedef int pthread_id_np_t;
@ -66,7 +66,10 @@ typedef struct pthread_spinlock_s {
typedef struct pthread_mutex_s {
uint32_t _nsync;
union {
int32_t _pid;
_Atomic(int32_t) _futex;
};
_Atomic(uint64_t) _word;
} pthread_mutex_t;
@ -92,7 +95,10 @@ typedef struct pthread_rwlock_s {
} pthread_rwlock_t;
typedef struct pthread_barrier_s {
void *_nsync;
int _count;
char _pshared;
_Atomic(int) _counter;
_Atomic(int) _waiters;
} pthread_barrier_t;
typedef struct pthread_attr_s {

View file

@ -0,0 +1,236 @@
#include <assert.h>
#include <cosmo.h>
#include <linux/futex.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>
#include "third_party/nsync/futex.internal.h"
// THIS IS AN EXAMPLE OF HOW TO USE COSMOPOLITAN FUTEXES TO IMPLEMENT
// YOUR OWN MUTEXES FROM SCRATCH. LOOK AT HOW MUCH BETTER IT CAN
// MAKE THINGS COMPARED TO SPIN LOCKS. ALGORITHM FROM ULRICH DREPPER.
// arm fleet
// with futexes
// 30 threads / 100000 iterations
//
// 242,604 us real
// 4,222,946 us user
// 1,079,229 us sys
// footek_test on studio.test. 630 µs 17'415 µs 256'782 µs
// 1,362,557 us real
// 3,232,978 us user
// 2,104,824 us sys
// footek_test on pi.test. 611 µs 21'708 µs 1'385'129 µs
// 1,346,482 us real
// 3,370,513 us user
// 1,992,383 us sys
// footek_test on freebsdarm.test. 427 µs 19'967 µs 1'393'476 µs
// arm fleet
// without futexes
// 30 threads / 100000 iterations
//
// 1,282,084 us real
// 29,359,582 us user
// 34,553 us sys
// footek_test on studio.test. 961 µs 12'907 µs 1'287'983 µs
// 4,070,988 us real
// 16,203,990 us user
// 7,999 us sys
// footek_test on pi.test. 459 µs 16'376 µs 4'095'512 µs
// 7,012,493 us real
// 27,936,725 us user
// 7,871 us sys
// footek_test on freebsdarm.test. 502 µs 16'446 µs 7'051'545 µs
// x86 fleet
// with futexes
// 30 threads / 100000 iterations
//
// 146,015 us real
// 169,427 us user
// 68,939 us sys
// footek_test on rhel7.test. 376 µs 2'259 µs 153'024 µs
// 144,917 us real
// 383,317 us user
// 191,203 us sys
// footek_test on xnu.test. 11'143 µs 9'159 µs 164'865 µs
// 244,286 us real
// 405,395 us user
// 956,122 us sys
// footek_test on freebsd.test. 394 µs 2'165 µs 256'227 µs
// 209,095 us real
// 616,634 us user
// 9,945 us sys
// footek_test on netbsd.test. 502 µs 2'020 µs 261'895 µs
// 344,876 us real
// 50,000 us user
// 1,240,000 us sys
// footek_test on openbsd.test. 457 µs 2'737 µs 396'342 µs
// 1,193,906 us real
// 17,546,875 us user
// 3,000,000 us sys
// footek_test on win10.test. 462 µs 59'528 µs 1'348'265 µs
// x86 fleet
// without futexes
// 30 threads / 100000 iterations
//
// 897,815 us real
// 1,763,705 us user
// 9,696 us sys
// footek_test on rhel7.test. 423 µs 2'638 µs 912'241 µs
// 790,332 us real
// 2,359,967 us user
// 0 us sys
// footek_test on netbsd.test. 1'151 µs 2'634 µs 1'014'867 µs
// 2,332,724 us real
// 9,150,000 us user
// 10,000 us sys
// footek_test on openbsd.test. 557 µs 3'020 µs 2'554'648 µs
// 2,528,863 us real
// 56,546,875 us user
// 1,671,875 us sys
// footek_test on win10.test. 962 µs 9'698 µs 2'751'905 µs
// 2,916,033 us real
// 17,236,103 us user
// 0 us sys
// footek_test on freebsd.test. 690 µs 3'011 µs 2'925'997 µs
// 4,225,726 us real
// 16,679,456 us user
// 16,265 us sys
// footek_test on xnu.test. 98'468 µs 5'242 µs 5'191'724 µs
#define USE_FUTEX 1
#define THREADS 30
#define ITERATIONS 30000
#define MUTEX_LOCKED(word) ((word) & 8)
#define MUTEX_WAITING(word) ((word) & 16)
#define MUTEX_LOCK(word) ((word) | 8)
#define MUTEX_SET_WAITING(word) ((word) | 16)
#define MUTEX_UNLOCK(word) ((word) & ~(8 | 16))
void lock(atomic_int *futex) {
int word, cs;
for (int i = 0; i < 4; ++i) {
word = 0;
if (atomic_compare_exchange_strong_explicit(
futex, &word, 1, memory_order_acquire, memory_order_acquire))
return;
pthread_pause_np();
}
if (word == 1)
word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
while (word > 0) {
pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
#if USE_FUTEX
nsync_futex_wait_(futex, 2, 0, 0);
#endif
pthread_setcancelstate(cs, 0);
word = atomic_exchange_explicit(futex, 2, memory_order_acquire);
}
}
void unlock(atomic_int *futex) {
int word = atomic_fetch_sub_explicit(futex, 1, memory_order_release);
if (word == 2) {
atomic_store_explicit(futex, 0, memory_order_release);
#if USE_FUTEX
nsync_futex_wake_(futex, 1, 0);
#endif
}
}
int g_chores;
atomic_int g_lock;
pthread_mutex_t g_locker;
void *worker(void *arg) {
for (int i = 0; i < ITERATIONS; ++i) {
lock(&g_lock);
++g_chores;
unlock(&g_lock);
}
return 0;
}
int main() {
struct timeval start;
gettimeofday(&start, 0);
pthread_t th[THREADS];
for (int i = 0; i < THREADS; ++i)
pthread_create(&th[i], 0, worker, 0);
for (int i = 0; i < THREADS; ++i)
pthread_join(th[i], 0);
npassert(g_chores == THREADS * ITERATIONS);
struct rusage ru;
struct timeval end;
gettimeofday(&end, 0);
getrusage(RUSAGE_SELF, &ru);
printf("%,16ld us real\n"
"%,16ld us user\n"
"%,16ld us sys\n",
timeval_tomicros(timeval_sub(end, start)), //
timeval_tomicros(ru.ru_utime), //
timeval_tomicros(ru.ru_stime));
CheckForMemoryLeaks();
}
// COMPARE ULRICH DREPPER'S LOCKING ALGORITHM WITH MIKE BURROWS *NSYNC
// WHICH IS WHAT COSMOPOLITAN LIBC USES FOR YOUR POSIX THREADS MUTEXES
// x86 fleet
// with pthread_mutex_t
// 30 threads / 100000 iterations
//
// 186,976 us real
// 43,609 us user
// 205,585 us sys
// footek_test on freebsd.test. 410 µs 2'054 µs 195'339 µs
// 238,902 us real
// 235,743 us user
// 97,881 us sys
// footek_test on rhel7.test. 343 µs 2'339 µs 246'926 µs
// 201,285 us real
// 249,612 us user
// 141,230 us sys
// footek_test on xnu.test. 1'960 µs 5'350 µs 265'758 µs
// 303,363 us real
// 60,000 us user
// 410,000 us sys
// footek_test on openbsd.test. 545 µs 3'023 µs 326'200 µs
// 386,085 us real
// 586,455 us user
// 466,991 us sys
// footek_test on netbsd.test. 344 µs 2'421 µs 413'440 µs
// 245,010 us real
// 437,500 us user
// 140,625 us sys
// footek_test on win10.test. 300 µs 18'574 µs 441'225 µs
// arm fleet
// with pthread_mutex_t
// 30 threads / 100000 iterations
//
// 87,132 us real
// 183,517 us user
// 20,020 us sys
// footek_test on studio.test. 560 µs 12'418 µs 92'825 µs
// 679,374 us real
// 957,678 us user
// 605,078 us sys
// footek_test on pi.test. 462 µs 16'574 µs 702'833 µs
// 902,343 us real
// 1,459,706 us user
// 781,140 us sys
// footek_test on freebsdarm.test. 400 µs 16'261 µs 970'022 µs

View file

@ -37,6 +37,7 @@
#include "third_party/nsync/atomic.internal.h"
#include "third_party/nsync/common.internal.h"
#include "third_party/nsync/mu_semaphore.h"
#include "third_party/nsync/mu_semaphore.internal.h"
#include "third_party/nsync/wait_s.internal.h"
__static_yoink("nsync_notice");
@ -147,9 +148,9 @@ static void free_waiters_push (waiter *w) {
static void free_waiters_populate (void) {
int n;
if (IsNetbsd () || IsXnuSilicon ()) {
// netbsd needs one file descriptor per semaphore (!!)
// tim cook wants us to use his grand central dispatch
if (IsNetbsd () || (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ())) {
// netbsd needs a real file descriptor per semaphore
// tim cook wants us to use his lol central dispatch
n = 1;
} else {
n = getpagesize() / sizeof(waiter);

View file

@ -52,6 +52,7 @@
#include "third_party/nsync/atomic.h"
#include "third_party/nsync/common.internal.h"
#include "third_party/nsync/futex.internal.h"
#include "libc/intrin/kprintf.h"
#include "third_party/nsync/time.h"
#define FUTEX_WAIT_BITS_ FUTEX_BITSET_MATCH_ANY
@ -138,7 +139,7 @@ static int nsync_futex_polyfill_ (atomic_int *w, int expect, struct timespec *ab
}
if (_weaken (pthread_testcancel_np) &&
_weaken (pthread_testcancel_np) ()) {
return -ETIMEDOUT;
return -ECANCELED;
}
if (abstime && timespec_cmp (timespec_real (), *abstime) >= 0) {
return -ETIMEDOUT;
@ -163,7 +164,7 @@ static int nsync_futex_wait_win32_ (atomic_int *w, int expect, char pshare,
for (;;) {
now = timespec_real ();
if (timespec_cmp (now, deadline) > 0) {
if (timespec_cmp (now, deadline) >= 0) {
return etimedout();
}
wait = timespec_sub (deadline, now);

View file

@ -21,14 +21,9 @@
#include "third_party/nsync/mu_semaphore.internal.h"
__static_yoink("nsync_notice");
/* Apple's ulock (used by Cosmo futexes) is an internal API, but:
1. Unlike GCD it's cancellable, i.e. can be EINTR'd by signals
2. We currently always use ulock anyway for joining threads */
#define PREFER_GCD_OVER_ULOCK 1
/* Initialize *s; the initial value is 0. */
bool nsync_mu_semaphore_init (nsync_semaphore *s) {
if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
return nsync_mu_semaphore_init_gcd (s);
} else if (IsNetbsd ()) {
return nsync_mu_semaphore_init_sem (s);
@ -44,7 +39,7 @@ bool nsync_mu_semaphore_init (nsync_semaphore *s) {
errno_t nsync_mu_semaphore_p (nsync_semaphore *s) {
errno_t err;
BEGIN_CANCELATION_POINT;
if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
err = nsync_mu_semaphore_p_gcd (s);
} else if (IsNetbsd ()) {
err = nsync_mu_semaphore_p_sem (s);
@ -62,7 +57,7 @@ errno_t nsync_mu_semaphore_p (nsync_semaphore *s) {
errno_t nsync_mu_semaphore_p_with_deadline (nsync_semaphore *s, nsync_time abs_deadline) {
errno_t err;
BEGIN_CANCELATION_POINT;
if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
err = nsync_mu_semaphore_p_with_deadline_gcd (s, abs_deadline);
} else if (IsNetbsd ()) {
err = nsync_mu_semaphore_p_with_deadline_sem (s, abs_deadline);
@ -75,7 +70,7 @@ errno_t nsync_mu_semaphore_p_with_deadline (nsync_semaphore *s, nsync_time abs_d
/* Ensure that the count of *s is at least 1. */
void nsync_mu_semaphore_v (nsync_semaphore *s) {
if (PREFER_GCD_OVER_ULOCK && IsXnuSilicon ()) {
if (NSYNC_USE_GRAND_CENTRAL && IsXnuSilicon ()) {
return nsync_mu_semaphore_v_gcd (s);
} else if (IsNetbsd ()) {
return nsync_mu_semaphore_v_sem (s);

View file

@ -4,6 +4,20 @@
#include "third_party/nsync/time.h"
COSMOPOLITAN_C_START_
/* XNU ulock (used by cosmo futexes) is an internal API, however:
1. Unlike GCD it's cancelable i.e. can be EINTR'd by signals
2. We have no choice but to use ulock for joining threads
3. Grand Central Dispatch requires a busy loop workaround
4. ulock makes our mutexes use 20% more system time (meh)
5. ulock makes our mutexes use 40% less wall time (good)
6. ulock makes our mutexes use 64% less user time (woop)
ulock is an outstanding system call that must be used.
gcd is not an acceptable alternative to ulock. */
#define NSYNC_USE_GRAND_CENTRAL 0
bool nsync_mu_semaphore_init_futex(nsync_semaphore *);
errno_t nsync_mu_semaphore_p_futex(nsync_semaphore *);
errno_t nsync_mu_semaphore_p_with_deadline_futex(nsync_semaphore *, nsync_time);