/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │ ╞══════════════════════════════════════════════════════════════════════════════╡ │ Copyright 2021 Justine Alexandra Roberts Tunney │ │ │ │ Permission to use, copy, modify, and/or distribute this software for │ │ any purpose with or without fee is hereby granted, provided that the │ │ above copyright notice and this permission notice appear in all copies. │ │ │ │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. 
│ ╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/atomic.h"
#include "libc/calls/state.internal.h"
#include "libc/calls/struct/ucontext-netbsd.internal.h"
#include "libc/dce.h"
#include "libc/intrin/asmflag.h"
#include "libc/intrin/atomic.h"
#include "libc/intrin/ulock.h"
#include "libc/limits.h"
#include "libc/mem/alloca.h"
#include "libc/nt/enum/processcreationflags.h"
#include "libc/nt/runtime.h"
#include "libc/nt/synchronization.h"
#include "libc/nt/thread.h"
#include "libc/nt/thunk/msabi.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/syslib.internal.h"
#include "libc/sock/internal.h"
#include "libc/sysv/consts/arch.h"
#include "libc/thread/freebsd.internal.h"
#include "libc/thread/openbsd.internal.h"
#include "libc/thread/posixthread.internal.h"
#include "libc/thread/xnu.internal.h"

// thread ids synthesized by CloneSilicon() are drawn from the range
// [kMinThreadId, kMinThreadId + kMaxThreadIds)
#define kMaxThreadIds 32768
#define kMinThreadId  262144

// first argument of sys_set_tls(); see FreebsdThreadMain()
#define AMD64_SET_FSBASE 129
#define AMD64_SET_GSBASE 131

// raw system call numbers and flags used by the backends below
#define __NR_thr_new                      455
#define __NR_clone_linux                  56
#define __NR__lwp_create                  309
#define __NR_getcontext_netbsd            307
#define __NR_bsdthread_create             0x02000168
#define __NR_thread_fast_set_cthread_self 0x03000003
#define PTHREAD_START_CUSTOM_XNU          0x01000000
#define LWP_DETACHED                      0x00000040
#define LWP_SUSPENDED                     0x00000080

/**
 * Startup record shared between a Clone*() backend and the thread entry
 * routine it launches.
 *
 * Every backend in this file that uses this record carves it out of the
 * top of the caller-supplied stack, just below `stk + stksz`.
 */
struct CloneArgs {
  union {
    long sp;        // stack pointer passed to __stack_call() (Windows, Silicon)
    int64_t tid64;  // handed to thr_new() as child_tid (FreeBSD)
  };
  atomic_int *ptid;     // where the new thread's id is published for the parent
  atomic_int *ctid;     // set to the thread id on start, zeroed on thread exit
  char *tls;            // thread local storage pointer for the new thread
  int (*func)(void *);  // user callback
  void *arg;            // argument passed to func
};

int sys_set_tls(uintptr_t, void *);
int __stack_call(void *, int, long, long, int (*)(void *), long);

#ifdef __x86_64__

////////////////////////////////////////////////////////////////////////////////
// THE NEW TECHNOLOGY

__msabi extern typeof(ExitThread) *const __imp_ExitThread;
__msabi extern typeof(GetCurrentThreadId) *const __imp_GetCurrentThreadId;
__msabi extern typeof(WakeByAddressAll) *const __imp_WakeByAddressAll;

/**
 * Thread start routine that CloneWindows() hands to CreateThread().
 *
 * Installs the TLS pointer, stores the WIN32 thread id to *ctid and
 * *ptid, runs the user callback through __stack_call() on `wt->sp`,
 * then zeroes *ctid, wakes anyone waiting on that address, and exits
 * the thread with the callback's return value.
 *
 * NOTE(review): the three leading int parameters are never read; they
 * presumably exist only so `wt` arrives in the register noted in the
 * trailing comments -- confirm against the calling convention in use.
 */
textwindows dontinstrument wontreturn static void  //
WinThreadEntry(int rdi,                            // rcx
               int rsi,                            // rdx
               int rdx,                            // r8
               struct CloneArgs *wt) {             // r9
  __set_tls_win32(wt->tls);
  int tid = __imp_GetCurrentThreadId();
  // keep ctid in a local; wt is not dereferenced again after __stack_call()
  atomic_int *ctid = wt->ctid;
  atomic_init(ctid, tid);
  atomic_init(wt->ptid, tid);
  int rc = __stack_call(wt->arg, tid, 0, 0, wt->func, wt->sp);
  // we can now clear ctid directly since we're no longer using our own
  // stack memory, which can now be safely free'd by the parent thread.
  atomic_store_explicit(ctid, 0, memory_order_release);
  __imp_WakeByAddressAll(ctid);
  // since we didn't indirect this function through NT2SYSV() it's not
  // safe to simply return, and as such, we need ExitThread().
  __imp_ExitThread(rc);
  __builtin_unreachable();
}

/**
 * Creates thread on Windows using CreateThread().
 *
 * A struct CloneArgs is written at the top of `stk` and passed as the
 * thread parameter. On success the thread id reported by CreateThread()
 * is stored to *ptid and the thread handle is stored into the
 * `tib_syshand` field of the struct CosmoTib that `tls` points to.
 *
 * @return 0 on success, or the errno that __dos2errno() derives from
 *     GetLastError() when CreateThread() fails
 */
textwindows static errno_t CloneWindows(int (*func)(void *), char *stk,
                                        size_t stksz, void *arg, void *tls,
                                        atomic_int *ptid, atomic_int *ctid) {
  long sp;
  int64_t h;
  intptr_t tip;
  uint32_t utid;
  struct CloneArgs *wt;
  // tip remembers the unreduced top of stack; sp becomes the address of wt
  sp = tip = (intptr_t)stk + stksz;
  sp -= sizeof(struct CloneArgs);
  sp &= -alignof(struct CloneArgs);
  wt = (struct CloneArgs *)sp;
  wt->ctid = ctid;
  wt->ptid = ptid;
  wt->func = func;
  wt->arg = arg;
  wt->tls = tls;
  // the callback's stack pointer is the 16-byte aligned top of stk
  wt->sp = tip & -16;
  if ((h = CreateThread(0, 65536, (void *)WinThreadEntry, wt,
                        kNtStackSizeParamIsAReservation, &utid))) {
    atomic_init(ptid, utid);
    struct CosmoTib *tib = tls;
    atomic_store_explicit(&tib->tib_syshand, h, memory_order_release);
    return 0;
  } else {
    return __dos2errno(GetLastError());
  }
}

////////////////////////////////////////////////////////////////////////////////
// XNU'S NOT UNIX

// Assembly thunk registered with sys_bsdthread_register() by CloneXnu().
// It zeroes %rbp, loads %rsp from %r8, pushes %rax, and then jumps to
// XnuThreadMain() leaving the argument registers untouched.
void XnuThreadThunk(void *pthread,          // rdi x0
                    int machport,           // rsi x1
                    void *(*func)(void *),  // rdx x2
                    void *arg,              // rcx x3
                    intptr_t *stack,        // r8  x4
                    unsigned xnuflags);     // r9  x5
asm("XnuThreadThunk:\n\t"
    "xor\t%ebp,%ebp\n\t"
    "mov\t%r8,%rsp\n\t"
    "push\t%rax\n\t"
    "jmp\tXnuThreadMain\n\t"
    ".size\tXnuThreadThunk,.-XnuThreadThunk");

/**
 * C half of the XNU x86-64 thread entry, reached via XnuThreadThunk().
 *
 * Stores `tid` to *wt->ctid and *wt->ptid, installs the TLS pointer via
 * the thread_fast_set_cthread_self system call, runs the callback, and
 * finally zeroes *ctid, calls ulock_wake() on it, and terminates the
 * thread with bsdthread_terminate(). The return value of func is not
 * used.
 */
__attribute__((__used__)) dontinstrument wontreturn static void
XnuThreadMain(void *pthread,           // rdi
              int tid,                 // rsi
              int (*func)(void *arg),  // rdx
              void *arg,               // rcx
              struct CloneArgs *wt,    // r8
              unsigned xnuflags) {     // r9
  atomic_init(wt->ctid, tid);
  atomic_init(wt->ptid, tid);

  // XNU uses the same 0x30 offset as the WIN32 TIB x64. They told the
  // Go team at Google that Apple stands by our ability to use it
  // https://github.com/golang/go/issues/23617#issuecomment-376662373
  int ax;
  asm volatile("syscall"
               : "=a"(ax)
               : "0"(__NR_thread_fast_set_cthread_self), "D"(wt->tls - 0x30)
               : "rcx", "rdx", "r8", "r9", "r10", "r11", "memory", "cc");

  func(arg);

  // we no longer use the stack after this point
  // %rax = int bsdthread_terminate(%rdi = void *stackaddr,
  //                                %rsi = size_t freesize,
  //                                %rdx = uint32_t port,
  //                                %r10 = uint32_t sem);
  asm volatile("movl\t$0,(%%rsi)\n\t"        // *wt->ctid = 0
               "mov\t$0x101,%%edi\n\t"       // wake all
               "xor\t%%edx,%%edx\n\t"        // wake_value
               "mov\t$0x02000204,%%eax\n\t"  // ulock_wake()
               "syscall\n\t"                 //
               "xor\t%%edi,%%edi\n\t"        // freeaddr
               "xor\t%%esi,%%esi\n\t"        // freesize
               "xor\t%%edx,%%edx\n\t"        // kport
               "xor\t%%r10d,%%r10d\n\t"      // joinsem
               "mov\t$0x02000169,%%eax\n\t"  // bsdthread_terminate()
               "syscall"
               : /* no outputs */
               : "S"(wt->ctid)
               : "rax", "rcx", "r10", "r11", "memory");
  __builtin_unreachable();
}

/**
 * Creates thread on x86-64 XNU using sys_clone_xnu().
 *
 * Only the ctid, ptid, and tls fields of the struct CloneArgs are
 * filled in; `fn` and `arg` are passed to sys_clone_xnu() directly.
 *
 * @return whatever sys_clone_xnu() returns
 */
static errno_t CloneXnu(int (*fn)(void *), char *stk, size_t stksz, void *arg,
                        void *tls, atomic_int *ptid, atomic_int *ctid) {

  // perform this weird mandatory system call once
  // NOTE(review): `once` is a plain bool and the result of
  // sys_bsdthread_register() is not checked
  static bool once;
  if (!once) {
    sys_bsdthread_register(XnuThreadThunk, 0, 0, 0, 0, 0, 0);
    once = true;
  }

  // setup stack for thread
  long sp;
  struct CloneArgs *wt;
  sp = (intptr_t)stk + stksz;
  sp -= sizeof(struct CloneArgs);
  sp &= -alignof(struct CloneArgs);
  wt = (struct CloneArgs *)sp;
  sp &= -16;  // NOTE(review): sp is not read again after this line

  // pass parameters to new thread via xnu
  wt->ctid = ctid;
  wt->ptid = ptid;
  wt->tls = tls;
  return sys_clone_xnu(fn, arg, wt, 0, PTHREAD_START_CUSTOM_XNU);
}

////////////////////////////////////////////////////////////////////////////////
// OPEN BESIYATA DISHMAYA

// we can't use address sanitizer because:
//   1. __asan_handle_no_return wipes stack [todo?]
relegated dontinstrument wontreturn static void OpenbsdThreadMain(void *p) { struct CloneArgs *wt = p; int tid = atomic_load_explicit(wt->ctid, memory_order_relaxed); atomic_init(wt->ptid, tid); wt->func(wt->arg); asm volatile("mov\t%1,%%rsp\n\t" // so syscall can validate stack exists "movl\t$0,(%2)\n\t" // *wt->ctid = 0 (old stack now free'd) "syscall\n\t" // futex(int*, op, val) will wake wait0 "xor\t%%edi,%%edi\n\t" // so kernel doesn't write to old stack "mov\t$302,%%eax\n\t" // __threxit(int *notdead) doesn't wake "syscall" : /* no outputs */ : "a"(83), "m"(__oldstack), "D"(wt->ctid), "S"(2 /* FUTEX_WAKE */), "d"(INT_MAX) : "rcx", "r11", "memory"); __builtin_unreachable(); } relegated static errno_t CloneOpenbsd(int (*func)(void *), char *stk, size_t stksz, void *arg, void *tls, atomic_int *ptid, atomic_int *ctid) { int rc; intptr_t sp; struct __tfork *tf; struct CloneArgs *wt; sp = (intptr_t)stk + stksz; sp -= sizeof(struct __tfork); sp &= -alignof(struct __tfork); tf = (struct __tfork *)sp; sp -= sizeof(struct CloneArgs); sp &= -alignof(struct CloneArgs); wt = (struct CloneArgs *)sp; sp &= -16; sp -= 8; *(intptr_t *)sp = (intptr_t)CloneOpenbsd + 1; wt->ctid = ctid; wt->ptid = ptid; wt->arg = arg; wt->func = func; tf->tf_stack = (char *)sp; tf->tf_tcb = tls; tf->tf_tid = ctid; if ((rc = __tfork_thread(tf, sizeof(*tf), OpenbsdThreadMain, wt)) >= 0) { atomic_init(ptid, rc); return 0; } else { return -rc; } } //////////////////////////////////////////////////////////////////////////////// // NET BESIYATA DISHMAYA wontreturn dontinstrument static void NetbsdThreadMain( void *arg, // rdi int (*func)(void *), // rsi atomic_int *ctid, // rdx atomic_int *ptid) { // rcx int ax; asm("syscall" : "=a"(ax) // man says always succeeds : "0"(311) // _lwp_self() : "rcx", "rdx", "r8", "r9", "r10", "r11", "memory", "cc"); atomic_init(ctid, ax); atomic_init(ptid, ax); func(arg); // we no longer use the stack after this point // %eax = int __lwp_exit(void); asm 
volatile("movl\t$0,(%2)\n\t" // *ztid = 0 "syscall" // __lwp_exit() : "=a"(ax) : "0"(310), "r"(ctid) : "rcx", "r11", "memory"); __builtin_unreachable(); } static int CloneNetbsd(int (*func)(void *), char *stk, size_t stksz, void *arg, void *tls, atomic_int *ptid, atomic_int *ctid) { // NetBSD has its own clone() and it works, but it's technically a // second-class API, intended to help Linux folks migrate to this. int ax; bool failed; intptr_t dx, sp; static bool once; struct ucontext_netbsd *ctx; static struct ucontext_netbsd netbsd_clone_template; // memoize arbitrary valid processor state structure if (!once) { asm volatile(CFLAG_ASM("syscall") : CFLAG_CONSTRAINT(failed), "=a"(ax) : "1"(__NR_getcontext_netbsd), "D"(&netbsd_clone_template) : "rcx", "rdx", "r8", "r9", "r10", "r11", "memory"); once = true; } sp = (intptr_t)stk + stksz; // align the stack sp &= -16; // simulate call to misalign stack and ensure backtrace looks good sp -= 8; *(intptr_t *)sp = (intptr_t)CloneNetbsd + 1; // place the giant 784 byte ucontext structure in the red zone! 
// it only has to live long enough for the thread to come alive ctx = (struct ucontext_netbsd *)((sp - sizeof(struct ucontext_netbsd)) & -64); // pass parameters in process state memcpy(ctx, &netbsd_clone_template, sizeof(*ctx)); ctx->uc_link = 0; ctx->uc_mcontext.rbp = 0; ctx->uc_mcontext.rsp = sp; ctx->uc_mcontext.rip = (intptr_t)NetbsdThreadMain; ctx->uc_mcontext.rdi = (intptr_t)arg; ctx->uc_mcontext.rsi = (intptr_t)func; ctx->uc_mcontext.rdx = (intptr_t)ctid; ctx->uc_mcontext.rcx = (intptr_t)ptid; ctx->uc_flags |= _UC_STACK; ctx->uc_stack.ss_sp = stk; ctx->uc_stack.ss_size = stksz; ctx->uc_stack.ss_flags = 0; ctx->uc_flags |= _UC_TLSBASE; ctx->uc_mcontext._mc_tlsbase = (intptr_t)tls; // perform the system call int tid = 0; asm volatile(CFLAG_ASM("syscall") : CFLAG_CONSTRAINT(failed), "=a"(ax), "=d"(dx) : "1"(__NR__lwp_create), "D"(ctx), "S"(LWP_DETACHED), "2"(&tid) : "rcx", "r8", "r9", "r10", "r11", "memory"); if (!failed) { atomic_init(ptid, tid); return 0; } else { return ax; } } #endif /* __x86_64__ */ //////////////////////////////////////////////////////////////////////////////// // FREE BESIYATA DISHMAYA wontreturn dontinstrument static void FreebsdThreadMain(void *p) { struct CloneArgs *wt = p; #ifdef __aarch64__ asm volatile("mov\tx28,%0" : /* no outputs */ : "r"(wt->tls)); #elif defined(__x86_64__) sys_set_tls(AMD64_SET_GSBASE, wt->tls); #endif atomic_init(wt->ctid, wt->tid64); atomic_init(wt->ptid, wt->tid64); wt->func(wt->arg); // we no longer use the stack after this point // void thr_exit(%rdi = long *state); #ifdef __x86_64__ asm volatile("movl\t$0,%0\n\t" // *wt->ctid = 0 "syscall\n\t" // _umtx_op(wt->ctid, WAKE, INT_MAX) "movl\t$431,%%eax\n\t" // thr_exit(long *nonzeroes_and_wake) "xor\t%%edi,%%edi\n\t" // sad we can't use this free futex op "syscall\n\t" // thr_exit() fails if thread is orphaned "movl\t$1,%%eax\n\t" // _exit() "syscall" // : "=m"(*wt->ctid) : "a"(454), "D"(wt->ctid), "S"(UMTX_OP_WAKE), "d"(INT_MAX) : "rcx", "r8", "r9", "r10", 
"r11", "memory"); #elif defined(__aarch64__) register long x0 asm("x0") = (long)wt->ctid; register long x1 asm("x1") = UMTX_OP_WAKE; register long x2 asm("x2") = INT_MAX; register long x8 asm("x8") = 454; // _umtx_op asm volatile("str\twzr,%0\n\t" // *wt->ctid = 0 "svc\t0\n\t" // _umtx_op(wt->ctid, WAKE, INT_MAX) "mov\tx0,#0\n\t" // arg0 = 0 "mov\tx8,#431\n\t" // thr_exit "svc\t0\n\t" // thr_exit(long *nonzeroes_and_wake = 0) "mov\tx8,#1\n\t" // _exit "svc\t0" // _exit(long *nonzeroes_and_wake = 0) : "=m"(*wt->ctid) : "r"(x0), "r"(x1), "r"(x2), "r"(x8)); #else #error "unsupported architecture" #endif __builtin_unreachable(); } static errno_t CloneFreebsd(int (*func)(void *), char *stk, size_t stksz, void *arg, void *tls, atomic_int *ptid, atomic_int *ctid) { long sp; int64_t tid64; struct CloneArgs *wt; sp = (intptr_t)stk + stksz; sp -= sizeof(struct CloneArgs); sp &= -alignof(struct CloneArgs); wt = (struct CloneArgs *)sp; sp &= -16; wt->ctid = ctid; wt->ptid = ptid; wt->tls = tls; wt->func = func; wt->arg = arg; struct thr_param params = { .start_func = FreebsdThreadMain, .arg = wt, .stack_base = stk, .stack_size = sp - (long)stk, .tls_base = tls, .tls_size = 64, .child_tid = &wt->tid64, .parent_tid = &tid64, }; #ifdef __x86_64__ int ax; bool failed; asm volatile(CFLAG_ASM("syscall") : CFLAG_CONSTRAINT(failed), "=a"(ax) : "1"(__NR_thr_new), "D"(¶ms), "S"(sizeof(params)) : "rcx", "rdx", "r8", "r9", "r10", "r11", "memory"); if (failed) return ax; #elif defined(__aarch64__) register long x0 asm("x0") = (long)¶ms; register long x1 asm("x1") = sizeof(params); register int x8 asm("x8") = 0x1c7; // thr_new asm volatile("svc\t0" : "+r"(x0) : "r"(x1), "r"(x8) : "memory"); if (x0) return x0; #else #error "unsupported architecture" #endif atomic_init(ptid, tid64); return 0; } #ifdef __aarch64__ //////////////////////////////////////////////////////////////////////////////// // APPLE SILICON dontinstrument static void *SiliconThreadMain(void *arg) { struct CloneArgs *wt = 
arg; atomic_int *ctid = wt->ctid; int tid = atomic_load_explicit(ctid, memory_order_relaxed); asm volatile("mov\tx28,%0" : /* no outputs */ : "r"(wt->tls)); __stack_call(wt->arg, tid, 0, 0, wt->func, wt->sp); atomic_store_explicit(ctid, 0, memory_order_release); ulock_wake(UL_COMPARE_AND_WAIT | ULF_WAKE_ALL, ctid, 0); return 0; } static errno_t CloneSilicon(int (*fn)(void *), char *stk, size_t stksz, void *arg, void *tls, atomic_int *ptid, atomic_int *ctid) { // assign tid to new thread static atomic_uint tids; unsigned tid = atomic_fetch_add_explicit(&tids, 1, memory_order_relaxed); tid %= kMaxThreadIds; tid += kMinThreadId; atomic_init(ctid, tid); atomic_init(ptid, tid); // pass temp data on stack intptr_t sp, tip; struct CloneArgs *wt; sp = tip = (intptr_t)stk + stksz; sp -= sizeof(struct CloneArgs); sp &= -alignof(struct CloneArgs); wt = (struct CloneArgs *)sp; wt->func = fn; wt->arg = arg; wt->tls = tls; wt->ctid = ctid; wt->sp = tip & -16; // ask apple libc to spawn thread errno_t res; pthread_t th; size_t babystack = __syslib->__pthread_stack_min; #pragma GCC push_options #pragma GCC diagnostic ignored "-Walloca-larger-than=" void *attr = alloca(__syslib->__sizeof_pthread_attr_t); #pragma GCC pop_options __syslib->__pthread_attr_init(attr); __syslib->__pthread_attr_setguardsize(attr, 0); __syslib->__pthread_attr_setstacksize(attr, babystack); if (!(res = __syslib->__pthread_create(&th, attr, SiliconThreadMain, wt))) { atomic_init(ptid, tid); struct CosmoTib *tib = tls; atomic_store_explicit(&tib[-1].tib_syshand, th, memory_order_release); } __syslib->__pthread_attr_destroy(attr); return res; } #endif /* __aarch64__ */ //////////////////////////////////////////////////////////////////////////////// // GNU/SYSTEMD struct LinuxCloneArgs { int (*func)(void *); void *arg; char *tls; }; int sys_clone_linux(int flags, // rdi long sp, // rsi atomic_int *ptid, // rdx atomic_int *ctid, // rcx void *tls, // r8 void *func, // r9 void *arg); // 8(rsp) dontinstrument 
static int AmdLinuxThreadEntry(void *arg) { struct LinuxCloneArgs *wt = arg; #if defined(__x86_64__) sys_set_tls(ARCH_SET_GS, wt->tls); #endif return wt->func(wt->arg); } static int CloneLinux(int (*func)(void *), char *stk, size_t stksz, int flags, void *arg, void *tls, atomic_int *ptid, atomic_int *ctid) { long sp = (intptr_t)stk + stksz; #if defined(__x86_64__) sp -= sizeof(struct LinuxCloneArgs); sp &= -alignof(struct LinuxCloneArgs); struct LinuxCloneArgs *wt = (struct LinuxCloneArgs *)sp; sp &= -16; // align the stack wt->arg = arg; wt->tls = tls; wt->func = func; func = AmdLinuxThreadEntry; arg = wt; #elif defined(__aarch64__) sp &= -128; // for kernels <=4.6 #endif int rc; if ((rc = sys_clone_linux(flags, sp, ptid, ctid, tls, func, arg)) >= 0) { // clone() is documented as setting ptid before return return 0; } else { return -rc; } } //////////////////////////////////////////////////////////////////////////////// // COSMOPOLITAN /** * Creates thread without malloc() being linked. * * If you use clone() you're on your own. 
*/
errno_t clone(void *func, void *stk, size_t stksz, int flags, void *arg,
              void *ptid, void *tls, void *ctid) {
  // stays ENOSYS when no backend below matches the host system
  errno_t rc = ENOSYS;

  // count the thread up front; this is undone below if creation fails
  atomic_fetch_add(&_pthread_count, 1);

  // dispatch to the backend for the operating system we're running on
  if (IsLinux()) {
    rc = CloneLinux(func, stk, stksz, flags, arg, tls, ptid, ctid);
  } else if (IsXnu()) {
#if defined(__x86_64__)
    rc = CloneXnu(func, stk, stksz, arg, tls, ptid, ctid);
#elif defined(__aarch64__)
    rc = CloneSilicon(func, stk, stksz, arg, tls, ptid, ctid);
#else
#error "unsupported architecture"
#endif
  } else if (IsFreebsd()) {
    rc = CloneFreebsd(func, stk, stksz, arg, tls, ptid, ctid);
#if defined(__x86_64__)
  } else if (IsWindows()) {
    rc = CloneWindows(func, stk, stksz, arg, tls, ptid, ctid);
  } else if (IsNetbsd()) {
    rc = CloneNetbsd(func, stk, stksz, arg, tls, ptid, ctid);
  } else if (IsOpenbsd()) {
    rc = CloneOpenbsd(func, stk, stksz, arg, tls, ptid, ctid);
#endif /* __x86_64__ */
  }

  if (rc) {
    // on builds that support BSD, EPROCLIM is reported as EAGAIN
    if (rc == EPROCLIM && SupportsBsd()) {
      rc = EAGAIN;
    }
    atomic_fetch_sub(&_pthread_count, 1);
  }
  return rc;
}