Get threads working well on MacOS Arm64

- Now using 10x better GCD semaphores
- We now generate Linux-like thread ids
- We now use fast system clock / sleep libraries
- The APE M1 loader now generates Linux-like stacks
This commit is contained in:
Justine Tunney 2023-06-04 01:57:10 -07:00
parent b5eab2b0b7
commit bcf9af94bf
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
2037 changed files with 4664 additions and 4451 deletions

View file

@ -27,7 +27,6 @@
#include "libc/intrin/asan.internal.h"
#include "libc/intrin/atomic.h"
#include "libc/intrin/describeflags.internal.h"
#include "libc/intrin/kprintf.h"
#include "libc/intrin/strace.internal.h"
#include "libc/limits.h"
#include "libc/macros.internal.h"
@ -53,6 +52,9 @@
#include "libc/thread/tls2.h"
#include "libc/thread/xnu.internal.h"
#define kMaxThreadIds 32768
#define kMinThreadId 262144
#define __NR_thr_new 455
#define __NR_clone_linux 56
#define __NR__lwp_create 309
@ -64,8 +66,11 @@
#define LWP_SUSPENDED 0x00000080
struct CloneArgs {
union {
int tid;
_Alignas(16) union {
struct {
int tid;
int this;
};
uint32_t utid;
int64_t tid64;
};
@ -77,6 +82,12 @@ struct CloneArgs {
void *arg;
};
/**
 * Picks the spot for a thread's `struct CloneArgs` near the top of the
 * stack memory the caller supplied.
 *
 * The result is the end of the stack (`stk + stksz`) minus the size of
 * the structure, rounded down to a multiple of sixteen.
 *
 * @param stk is the lowest address of the stack memory
 * @param stksz is the byte length of the memory at `stk`
 * @return 16-byte aligned address for the clone arguments; no check is
 *     made here that `stksz` is large enough to hold the structure
 */
static struct CloneArgs *AllocateCloneArgs(char *stk, size_t stksz) {
  uintptr_t top = (uintptr_t)(stk + stksz);
  uintptr_t spot = top - sizeof(struct CloneArgs);
  spot &= ~(uintptr_t)15;  // round down to 16-byte boundary
  return (struct CloneArgs *)spot;
}
#ifdef __x86_64__
////////////////////////////////////////////////////////////////////////////////
@ -108,7 +119,7 @@ WinThreadEntry(int rdi, // rcx
int rc;
if (wt->tls) __set_tls_win32(wt->tls);
*wt->ctid = wt->tid;
rc = WinThreadLaunch(wt->arg, wt->tid, wt->func, (intptr_t)wt & -16);
rc = WinThreadLaunch(wt->arg, wt->tid, wt->func, (intptr_t)wt);
// we can now clear ctid directly since we're no longer using our own
// stack memory, which can now be safely free'd by the parent thread.
*wt->ztid = 0;
@ -124,9 +135,7 @@ static textwindows errno_t CloneWindows(int (*func)(void *, int), char *stk,
void *tls, int *ptid, int *ctid) {
int64_t h;
struct CloneArgs *wt;
wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) -
sizeof(struct CloneArgs)) &
-alignof(struct CloneArgs));
wt = AllocateCloneArgs(stk, stksz);
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
wt->func = func;
@ -155,7 +164,6 @@ void XnuThreadThunk(void *pthread, // rdi x0
asm("XnuThreadThunk:\n\t"
"xor\t%ebp,%ebp\n\t"
"mov\t%r8,%rsp\n\t"
"and\t$-16,%rsp\n\t"
"push\t%rax\n\t"
"jmp\tXnuThreadMain\n\t"
".size\tXnuThreadThunk,.-XnuThreadThunk");
@ -209,9 +217,7 @@ static errno_t CloneXnu(int (*fn)(void *), char *stk, size_t stksz, int flags,
_npassert(sys_bsdthread_register(XnuThreadThunk, 0, 0, 0, 0, 0, 0) != -1);
once = true;
}
wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) -
sizeof(struct CloneArgs)) &
-alignof(struct CloneArgs));
wt = AllocateCloneArgs(stk, stksz);
wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
@ -248,9 +254,7 @@ static errno_t CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
bool failed;
int64_t tid;
struct CloneArgs *wt;
wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) -
sizeof(struct CloneArgs)) &
-alignof(struct CloneArgs));
wt = AllocateCloneArgs(stk, stksz);
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
wt->tls = tls;
@ -260,7 +264,7 @@ static errno_t CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
.start_func = FreebsdThreadMain,
.arg = wt,
.stack_base = stk,
.stack_size = (((intptr_t)wt - (intptr_t)stk) & -16) - 8,
.stack_size = (uintptr_t)wt - (uintptr_t)stk,
.tls_base = flags & CLONE_SETTLS ? tls : 0,
.tls_size = 64,
.child_tid = &wt->tid64,
@ -346,8 +350,7 @@ static wontreturn void NetbsdThreadMain(void *arg, // rdi
// we no longer use the stack after this point
// %eax = int __lwp_exit(void);
asm volatile("movl\t$0,%2\n\t" // *wt->ztid = 0
"syscall\n\t" // __lwp_exit()
"ud2"
"syscall" // __lwp_exit()
: "=a"(ax), "=d"(dx), "=m"(*ztid)
: "0"(310)
: "rcx", "r11", "memory");
@ -440,20 +443,18 @@ static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz,
static void *SiliconThreadMain(void *arg) {
register struct CloneArgs *wt asm("x21") = arg;
asm volatile("ldr\tx28,%0" : /* no outputs */ : "m"(wt->tls));
int tid = sys_gettid();
*wt->ctid = tid;
*wt->ptid = tid;
*wt->ctid = wt->this;
register long x0 asm("x0") = (long)wt->arg;
register long x1 asm("x1") = (long)tid;
register long x1 asm("x1") = (long)wt->tid;
asm volatile("mov\tx19,x29\n\t" // save frame pointer
"mov\tx20,sp\n\t" // save stack pointer
"mov\tx29,#0\n\t" // reset backtrace
"mov\tsp,x21\n\t" // switch stack
"mov\tsp,%3\n\t" // switch stack
"blr\t%2\n\t" // wt->func(wt->arg, tid)
"mov\tx29,x19\n\t" // restore frame pointer
"mov\tsp,x20" // restore stack pointer
: "+r"(x0)
: "r"(x1), "r"(wt->func)
: "r"(x1), "r"(wt->func), "r"(wt)
: "x19", "x20", "memory");
*wt->ztid = 0;
return 0;
@ -462,18 +463,24 @@ static void *SiliconThreadMain(void *arg) {
/**
 * Spawns a thread through the `pthread_create` entry of `__syslib`.
 *
 * The clone arguments are placed near the top of the caller's stack
 * memory and handed to `SiliconThreadMain` as its argument.
 *
 * NOTE(review): this listing carries both the superseded and the
 * replacement statements of the change without +/- markers (there are
 * two `wt =` assignments and two `pthread_create` calls below); confirm
 * against the committed file which of each pair is live.
 *
 * @param fn is stored in `wt->func`
 * @param stk is the base of the stack memory for the new thread
 * @param stksz is the byte length of `stk`
 * @param flags is tested for CLONE_CHILD_SETTID, CLONE_PARENT_SETTID,
 *     CLONE_CHILD_CLEARTID, and CLONE_SETTLS
 * @param arg is stored in `wt->arg`
 * @param tls is stored in `wt->tls` only when CLONE_SETTLS is given
 * @param ptid receives the generated id when CLONE_PARENT_SETTID is
 *     given and thread creation reported zero
 * @param ctid is recorded as the child / clear tid pointer when the
 *     matching flag is given
 * @return whatever `pthread_create` returned (zero is treated as success)
 */
static errno_t CloneSilicon(int (*fn)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, int *ptid,
int *ctid) {
errno_t res;
unsigned tid;
pthread_t th;
struct CloneArgs *wt;
wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) -
sizeof(struct CloneArgs)) &
-MAX(16, alignof(struct CloneArgs)));
// counter shared by every call; each call takes the next value
static atomic_uint tids;
wt = AllocateCloneArgs(stk, stksz);
tid = atomic_fetch_add_explicit(&tids, 1, memory_order_acq_rel);
// fold the counter into [kMinThreadId, kMinThreadId + kMaxThreadIds), so
// ids repeat once kMaxThreadIds threads have been created
wt->this = tid = (tid & (kMaxThreadIds - 1)) + kMinThreadId;
// when a flag is absent its pointer is aimed at wt->tid instead, which
// lets the thread store through it unconditionally
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
wt->tls = flags & CLONE_SETTLS ? tls : 0;
wt->func = fn;
wt->arg = arg;
return __syslib->pthread_create(&th, 0, SiliconThreadMain, wt);
// the parent publishes the id itself, and only if creation succeeded
if (!(res = __syslib->pthread_create(&th, 0, SiliconThreadMain, wt)) &&
(flags & CLONE_PARENT_SETTID)) {
*ptid = tid;
}
return res;
}
#endif /* __aarch64__ */

View file

@ -16,6 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/calls/calls.h"
#include "libc/calls/syscall-sysv.internal.h"
#include "libc/dce.h"
#include "libc/intrin/strace.internal.h"
@ -27,11 +28,16 @@
#include "libc/runtime/stack.h"
#include "libc/runtime/syslib.internal.h"
#include "libc/sysv/consts/prot.h"
#include "libc/sysv/consts/sig.h"
#include "libc/thread/thread.h"
#include "libc/thread/tls.h"
#ifndef __x86_64__
void __wipe(uintptr_t);
/**
* @fileoverview Cosmopolitan C Runtime, Second Edition
*/
void __wipe(uintptr_t) _Hide;
int main(int, char **, char **) __attribute__((__weak__));
typedef int init_f(int argc, char **argv, char **envp, unsigned long *auxv);
@ -75,12 +81,14 @@ textstartup void cosmo(long *sp, struct Syslib *m1) {
while (*auxv++) donothing;
// detect apple m1 environment
if ((__syslib = m1)) {
if (SupportsXnu() && (__syslib = m1)) {
hostos = _HOSTXNU;
magnums = syscon_xnu;
} else {
} else if (SupportsLinux()) {
hostos = _HOSTLINUX;
magnums = syscon_linux;
} else {
notpossible;
}
// setup system magic numbers
@ -88,6 +96,18 @@ textstartup void cosmo(long *sp, struct Syslib *m1) {
*mp = *magnums++;
}
// check system call abi compatibility
if (SupportsXnu() && __syslib && __syslib->version < SYSLIB_VERSION) {
sys_write(2, "need newer ape loader\n", 22);
_Exit(127);
}
// disable enosys trapping
if (IsBsd()) {
void *act[6] = {SIG_IGN};
sys_sigaction(SIGSYS, act, 0, 8, 0);
}
// needed by kisdangerous()
__oldstack = (intptr_t)sp;
__pid = sys_getpid().ax;
@ -97,7 +117,7 @@ textstartup void cosmo(long *sp, struct Syslib *m1) {
_mmi.p = _mmi.s;
__mmi_lock_obj._type = PTHREAD_MUTEX_RECURSIVE;
// record system-provided stack to memory manager
// record provided stack to memory manager
_mmi.i = 1;
_mmi.p->x = (uintptr_t)GetStackAddr() >> 16;
_mmi.p->y = (uintptr_t)(GetStackAddr() + (GetStackSize() - FRAMESIZE)) >> 16;
@ -106,6 +126,7 @@ textstartup void cosmo(long *sp, struct Syslib *m1) {
#if 0
#if IsAsan()
// TODO(jart): Figure out ASAN data model on AARCH64.
__asan_init(argc, argv, envp, auxv);
#endif
#endif

View file

@ -66,5 +66,5 @@ void __enable_threads(void) {
STRACE("__enable_threads()");
FixupLockNops();
#endif
__threaded = sys_gettid();
__threaded = __tls_enabled ? __get_tls()->tib_tid : sys_gettid();
}

View file

@ -190,7 +190,7 @@ textstartup void __enable_tls(void) {
tib->tib_strace = __strace;
tib->tib_ftrace = __ftrace;
tib->tib_pthread = (pthread_t)&_pthread_main;
if (IsLinux()) {
if (IsLinux() || IsXnuSilicon()) {
// gnu/systemd guarantees pid==tid for the main thread so we can
// avoid issuing a superfluous system call at startup in program
tid = __pid;

View file

@ -39,28 +39,29 @@ int sys_fork(void) {
#elif defined(__aarch64__)
int flags = 17; // SIGCHLD
void *child_stack = 0;
void *parent_tidptr = 0;
void *newtls = 0;
void *child_tidptr = 0;
register long r0 asm("x0") = (long)flags;
register long r1 asm("x1") = (long)child_stack;
register long r2 asm("x2") = (long)parent_tidptr;
register long r3 asm("x3") = (long)newtls;
register long r4 asm("x4") = (long)child_tidptr;
register int res_x0 asm("x0");
register int res_x1 asm("x1");
asm volatile("mov\tx8,%2\n\t"
"mov\tx16,%3\n\t"
"svc\t0"
: "=r"(res_x0), "=r"(res_x1)
: "i"(220), "i"(2), "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r4)
: "x8", "x16", "memory");
if (IsXnu() && res_x0 != -1) {
res_x0 &= res_x1 - 1;
if (IsLinux()) {
int flags = 17; // SIGCHLD
void *child_stack = 0;
void *parent_tidptr = 0;
void *newtls = 0;
void *child_tidptr = 0;
register long r0 asm("x0") = (long)flags;
register long r1 asm("x1") = (long)child_stack;
register long r2 asm("x2") = (long)parent_tidptr;
register long r3 asm("x3") = (long)newtls;
register long r4 asm("x4") = (long)child_tidptr;
register int res_x0 asm("x0");
asm volatile("mov\tx8,%1\n\t"
"svc\t0"
: "=r"(res_x0)
: "i"(220), "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r4)
: "x8", "x16", "memory");
return _sysret(res_x0);
} else if (__syslib) {
return _sysret(__syslib->fork());
} else {
return enosys();
}
return _sysret(res_x0);
#else

View file

@ -56,7 +56,7 @@ int _fork(uint32_t dwCreationFlags) {
__pid = dx;
if (__tls_enabled) {
tib = __get_tls();
tid = IsLinux() ? dx : sys_gettid();
tid = IsLinux() || IsXnuSilicon() ? dx : sys_gettid();
atomic_store_explicit(&tib->tib_tid, tid, memory_order_relaxed);
if ((pt = (struct PosixThread *)tib->tib_pthread)) {
atomic_store_explicit(&pt->ptid, tid, memory_order_relaxed);

View file

@ -1,6 +1,5 @@
#ifndef COSMOPOLITAN_LIBC_RUNTIME_SYSLIB_H_
#define COSMOPOLITAN_LIBC_RUNTIME_SYSLIB_H_
#include "libc/calls/struct/iovec.h"
#include "libc/calls/struct/sigaction.h"
#include "libc/calls/struct/sigset.h"
#include "libc/calls/struct/timespec.h"
@ -8,43 +7,41 @@
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
/**
* @fileoverview System DSO interfaces provided by APE loader.
*
* These functions are owned by the platform C library. Regardless of
* platform, POSIX APIs returning `long` will follow the Linux Kernel
* `-errno` convention, and hence should be wrapped with `_sysret()`.
*/
#define SYSLIB_MAGIC ('s' | 'l' << 8 | 'i' << 16 | 'b' << 24)
#define SYSLIB_VERSION 0
#define SYSLIB_VERSION 1
typedef uint64_t dispatch_time_t;
typedef uint64_t dispatch_semaphore_t;
/**
 * Table of host C library entry points.
 *
 * The runtime keeps a pointer to one of these in `__syslib` and calls
 * through it, e.g. `__syslib->pthread_create(...)`. Members returning
 * `long` are expected to be wrapped with `_sysret()`.
 *
 * NOTE(review): this listing merges both sides of a change to the
 * struct without +/- markers, so some members shown here may no longer
 * exist at SYSLIB_VERSION 1 — confirm the layout against the committed
 * header, since member order is presumably part of the loader ABI.
 */
struct Syslib {
// presumably compared against SYSLIB_MAGIC — TODO confirm
int magic;
// startup code refuses to run when this is below SYSLIB_VERSION
int version;
void (*exit)(int) wontreturn;
long (*fork)(void);
long (*read)(int, void *, size_t);
long (*pread)(int, void *, size_t, int64_t);
long (*readv)(int, const struct iovec *, int);
long (*write)(int, const void *, size_t);
long (*pwrite)(int, const void *, size_t, int64_t);
long (*writev)(int, const struct iovec *, int);
long (*openat)(int, const char *, int, ...);
long (*pipe)(int[2]);
long (*close)(int);
long (*clock_gettime)(int, struct timespec *);
long (*nanosleep)(const struct timespec *, struct timespec *);
long (*mmap)(void *, size_t, int, int, int, int64_t);
long (*sigaction)(int, const struct sigaction *restrict,
struct sigaction *restrict);
int (*pthread_jit_write_protect_supported_np)(void);
void (*pthread_jit_write_protect_np)(int);
void (*sys_icache_invalidate)(void *, size_t);
// threading entry points; these return `int` rather than `long`
pthread_t (*pthread_self)(void);
int (*pthread_create)(pthread_t *, const pthread_attr_t *, void *(*)(void *),
void *);
int (*pthread_detach)(pthread_t);
int (*pthread_join)(pthread_t, void **);
void (*pthread_exit)(void *);
int (*pthread_kill)(pthread_t, int);
int (*pthread_sigmask)(int, const sigset_t *restrict, sigset_t *restrict);
int (*pthread_setname_np)(const char *);
int (*pthread_key_create)(pthread_key_t *, void (*)(void *));
int (*pthread_setspecific)(pthread_key_t, const void *);
void *(*pthread_getspecific)(pthread_key_t);
// semaphore entry points; handles and times are carried as uint64_t
dispatch_semaphore_t (*dispatch_semaphore_create)(long);
long (*dispatch_semaphore_signal)(dispatch_semaphore_t);
long (*dispatch_semaphore_wait)(dispatch_semaphore_t, dispatch_time_t);
dispatch_time_t (*dispatch_walltime)(const struct timespec *, int64_t);
};
extern struct Syslib *__syslib;