Make _Thread_local work across platforms

We now rewrite the binary image at runtime on Windows and XNU to change
mov %fs:0,%reg instructions to use %gs instead. This change also
introduces a simpler threading API, _spawn() and _join(), which has
replaced most clone() usage.
This commit is contained in:
Justine Tunney 2022-07-10 04:01:17 -07:00
parent e4d6e263d4
commit 5f4f6b0e69
51 changed files with 808 additions and 1043 deletions

View file

@ -67,6 +67,7 @@ int chdir(const char *);
int chmod(const char *, uint32_t);
int chown(const char *, uint32_t, uint32_t);
int chroot(const char *);
int clone(void *, void *, size_t, int, void *, int *, void *, size_t, int *);
int close(int);
int creat(const char *, uint32_t);
int dup(int);
@ -196,9 +197,6 @@ ssize_t splice(int, int64_t *, int, int64_t *, size_t, uint32_t);
ssize_t write(int, const void *, size_t);
void sync(void);
int clone(int (*)(void *), void *, size_t, int, void *, int *, void *, size_t,
int *);
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_CALLS_SYSCALLS_H_ */

View file

@ -18,6 +18,7 @@
*/
#include "libc/calls/strace.internal.h"
#include "libc/calls/struct/timespec.h"
#include "libc/errno.h"
#include "libc/fmt/itoa.h"
#include "libc/intrin/describeflags.internal.h"
#include "libc/intrin/futex.internal.h"

View file

@ -108,7 +108,6 @@ o/$(MODE)/libc/intrin/describeprotflags.o: \
OVERRIDE_CFLAGS += \
-fno-sanitize=address
o/$(MODE)/libc/intrin/tls.greg.o \
o/$(MODE)/libc/intrin/exit.greg.o \
o/$(MODE)/libc/intrin/exit1.greg.o \
o/$(MODE)/libc/intrin/getenv.greg.o \

View file

@ -1,125 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2022 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/nexgen32e/threaded.h"
#include "libc/nt/thread.h"
#include "libc/nt/thunk/msabi.h"
#include "libc/sysv/consts/nrlinux.h"
#define __NR_sysarch 0x000000a5 // freebsd+netbsd
#define AMD64_SET_GSBASE 131 // freebsd
#define AMD64_SET_FSBASE 129 // freebsd
#define X86_SET_GSBASE 16 // netbsd
#define X86_SET_FSBASE 17 // netbsd
#define __NR___set_tcb 0x00000149
#define __NR__lwp_setprivate 0x0000013d
#define __NR_thread_fast_set_cthread_self 0x03000003
/**
* Initializes thread information block.
*
* Here's the layout your c library assumes:
*
* offset size description
* 0x0000 0x08 linear address pointer
* 0x0030 0x08 linear address pointer
* 0x0038 0x04 tid
* 0x003c 0x04 errno
*
*/
privileged void *__initialize_tls(char tib[64]) {
  intptr_t self;
  // a null tib is handed straight back without being touched
  if (!tib) return tib;
  self = (intptr_t)tib;
  // both linear address slots point back at the block itself
  *(intptr_t *)(tib + 0x00) = self;
  *(intptr_t *)(tib + 0x30) = self;
  // tid starts out as -1 until the caller fills in the real value
  *(int *)(tib + 0x38) = -1;
  // errno slot starts out cleared
  *(int *)(tib + 0x3c) = 0;
  return tib;
}
/**
* Installs thread information block on main process.
*
* For example, to set up TLS correctly for the main thread, without
* creating any threads, then it's sufficient to say:
*
* __attribute__((__constructor__)) static void InitTls(void) {
* static char tls[64];
* __initialize_tls(tls);
* *(int *)(tls + 0x38) = gettid();
* *(int *)(tls + 0x3c) = __errno;
* __install_tls(tls);
* }
*
* We use a constructor here to make sure it only happens once. Please
* note that calling `clone` will do this automatically.
*
* Installing TLS causes the `__tls_enabled` variable to be set. This
* causes C library features such as `errno` and `gettid()` to use TLS.
* This can help things like recursive mutexes go significantly faster.
*
* To access your TLS storage, you can call `__get_tls()` or
* `__get_tls_inline()` which return the address of the `tib`.
*
* @param tib is your thread information block, which must have at least
* 64 bytes on the righthand side of the tib pointer since those are
* the values your C library reserves for itself. memory on the left
* side of the pointer is reserved by the linker for _Thread_local.
*/
privileged void __install_tls(char tib[64]) {
// ax/dx only exist to receive the registers the syscalls write back;
// NOTE(review): their values are never inspected, so a failing
// syscall goes unnoticed and __tls_enabled is still set below
int ax, dx;
// preconditions: a tib was supplied, tls hasn't been installed yet,
// and the tid slot at 0x38 no longer holds the -1 placeholder
assert(tib);
assert(!__tls_enabled);
assert(*(int *)(tib + 0x38) != -1);
if (IsWindows()) {
// allocate a tls index and store tib in the %gs-relative slot array
// that begins at offset 0x1480, indexed by __tls_index
__tls_index = TlsAlloc();
asm("mov\t%1,%%gs:%0" : "=m"(*((long *)0x1480 + __tls_index)) : "r"(tib));
} else if (IsFreebsd()) {
// sysarch(AMD64_SET_FSBASE, tib)
asm volatile("syscall"
: "=a"(ax)
: "0"(__NR_sysarch), "D"(AMD64_SET_FSBASE), "S"(tib)
: "rcx", "r11", "memory", "cc");
} else if (IsNetbsd()) {
// sysarch(X86_SET_FSBASE, tib); %rdx is declared as an output too
asm volatile("syscall"
: "=a"(ax), "=d"(dx)
: "0"(__NR_sysarch), "D"(X86_SET_FSBASE), "S"(tib)
: "rcx", "r11", "memory", "cc");
} else if (IsXnu()) {
// thread_fast_set_cthread_self(tib - 0x30); the pointer is biased
// downward by 0x30 before being handed to the kernel
asm volatile("syscall"
: "=a"(ax)
: "0"(__NR_thread_fast_set_cthread_self),
"D"((intptr_t)tib - 0x30)
: "rcx", "r11", "memory", "cc");
} else if (IsOpenbsd()) {
// __set_tcb(tib)
asm volatile("syscall"
: "=a"(ax)
: "0"(__NR___set_tcb), "D"(tib)
: "rcx", "r11", "memory", "cc");
} else {
// fallback branch uses the linux arch_prctl(ARCH_SET_FS, tib) number;
// NOTE(review): unlike the branches above, "cc" isn't in the clobber
// list here
asm volatile("syscall"
: "=a"(ax)
: "0"(__NR_linux_arch_prctl), "D"(ARCH_SET_FS), "S"(tib)
: "rcx", "r11", "memory");
}
// record that tls is now installed
__tls_enabled = true;
}

View file

@ -16,10 +16,10 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/assert.h"
#include "libc/bits/atomic.h"
#include "libc/calls/calls.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/intrin/futex.internal.h"
#include "libc/intrin/wait0.internal.h"
#include "libc/linux/futex.h"
@ -31,13 +31,13 @@
* by the clone() system call when a thread terminates. The purpose of
* this operation is to know when it's safe to munmap() a thread stack.
*/
void _wait0(const int *ptid) {
void _wait0(const int *ctid) {
int x;
for (;;) {
if (!(x = atomic_load_explicit(ptid, memory_order_acquire))) {
if (!(x = atomic_load_explicit(ctid, memory_order_acquire))) {
break;
} else if (IsLinux() /* || IsOpenbsd() */) {
_futex_wait(ptid, x, &(struct timespec){2});
_futex_wait(ctid, x, &(struct timespec){2});
} else {
sched_yield();
}

View file

@ -245,9 +245,12 @@ static wontreturn relegated noinstrument void __minicrash(int sig,
"RIP %x\n"
"RSP %x\n"
"RBP %x\n"
"PID %d\n"
"TID %d\n"
"\n",
kind, sig, __argv[0], ctx ? ctx->uc_mcontext.rip : 0,
ctx ? ctx->uc_mcontext.rsp : 0, ctx ? ctx->uc_mcontext.rbp : 0);
ctx ? ctx->uc_mcontext.rsp : 0, ctx ? ctx->uc_mcontext.rbp : 0, __pid,
sys_gettid());
__restorewintty();
_Exit(119);
}

View file

@ -26,7 +26,7 @@
// @param rdx is ptid
// @param rcx is ctid
// @param r8 is tls
// @param r9 is func
// @param r9 is func(void*,int)→int
// @param 8(rsp) is arg
// @return tid of child on success, or -1 w/ errno
sys_clone_linux:
@ -48,8 +48,9 @@ sys_clone_linux:
jmp 0b
2: xor %ebp,%ebp # child thread
mov %rbx,%rdi # arg
call *%r9 # func(arg)
xchg %eax,%edi # func(arg) exitcode
mov (%r10),%esi # tid
call *%r9 # func(arg,tid)
xchg %eax,%edi # func(arg,tid) exitcode
mov $60,%eax # __NR_exit(exitcode)
syscall
.endfn sys_clone_linux,globl,hidden

View file

@ -16,7 +16,6 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/calls/strace.internal.h"
#include "libc/calls/struct/ucontext-netbsd.internal.h"
@ -24,7 +23,6 @@
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/intrin/asan.internal.h"
#include "libc/intrin/kprintf.h"
#include "libc/intrin/spinlock.h"
#include "libc/limits.h"
#include "libc/macros.internal.h"
@ -68,19 +66,23 @@ struct CloneArgs {
};
union {
char lock;
void *pstack;
void *oldrsp;
};
int *ptid;
int *ctid;
int *ztid;
char *tls;
int (*func)(void *);
int (*func)(void *, int);
void *arg;
};
////////////////////////////////////////////////////////////////////////////////
// THE NEW TECHNOLOGY
int WinThreadLaunch(void *arg, int (*func)(void *), intptr_t rsp);
int WinThreadLaunch(void *arg, // rdi
int tid, // rsi
int (*func)(void *, int), // rdx
intptr_t rsp); // rcx
// we can't log this function because:
// 1. windows owns the backtrace pointer right now
@ -90,16 +92,20 @@ int WinThreadLaunch(void *arg, int (*func)(void *), intptr_t rsp);
// 2. windows owns the stack memory right now
// we need win32 raw imports because:
// 1. generated thunks are function logged
noasan noinstrument static textwindows wontreturn void WinThreadEntry(
int rdi, int rsi, int rdx, struct CloneArgs *wt) {
noasan noinstrument static textwindows wontreturn void //
WinThreadEntry(int rdi, // rcx
int rsi, // rdx
int rdx, // r8
struct CloneArgs *wt) { // r9
int rc;
if (wt->tls) {
asm("mov\t%1,%%gs:%0"
: "=m"(*((long *)0x1480 + __tls_index))
: "r"(wt->tls));
}
*wt->ptid = wt->tid;
*wt->ctid = wt->tid;
rc = WinThreadLaunch(wt->arg, wt->func, (intptr_t)wt & -16);
rc = WinThreadLaunch(wt->arg, wt->tid, wt->func, (intptr_t)wt & -16);
// we can now clear ctid directly since we're no longer using our own
// stack memory, which can now be safely free'd by the parent thread.
*wt->ztid = 0;
@ -109,14 +115,16 @@ noasan noinstrument static textwindows wontreturn void WinThreadEntry(
unreachable;
}
static textwindows int CloneWindows(int (*func)(void *), char *stk,
static textwindows int CloneWindows(int (*func)(void *, int), char *stk,
size_t stksz, int flags, void *arg,
void *tls, size_t tlssz, int *ctid) {
void *tls, size_t tlssz, int *ptid,
int *ctid) {
int64_t h;
struct CloneArgs *wt;
wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) -
sizeof(struct CloneArgs)) &
-alignof(struct CloneArgs));
wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
wt->func = func;
@ -133,8 +141,12 @@ static textwindows int CloneWindows(int (*func)(void *), char *stk,
////////////////////////////////////////////////////////////////////////////////
// XNU'S NOT UNIX
void XnuThreadThunk(void *pthread, int machport, void *(*func)(void *),
void *arg, intptr_t *stack, unsigned xnuflags);
void XnuThreadThunk(void *pthread, // rdi
int machport, // rsi
void *(*func)(void *), // rdx
void *arg, // rcx
intptr_t *stack, // r8
unsigned xnuflags); // r9
asm("XnuThreadThunk:\n\t"
"xor\t%ebp,%ebp\n\t"
"mov\t%r8,%rsp\n\t"
@ -145,11 +157,18 @@ asm("XnuThreadThunk:\n\t"
__attribute__((__used__, __no_reorder__))
static wontreturn void
XnuThreadMain(void *pthread, int tid, int (*func)(void *arg), void *arg,
struct CloneArgs *wt, unsigned xnuflags) {
XnuThreadMain(void *pthread, // rdi
int tid, // rsi
int (*func)(void *arg, int tid), // rdx
void *arg, // rcx
struct CloneArgs *wt, // r8
unsigned xnuflags) { // r9
int ax;
wt->tid = tid;
*wt->ptid = tid;
*wt->ctid = tid;
_spunlock(&wt->lock);
if (wt->tls) {
// XNU uses the same 0x30 offset as the WIN32 TIB x64. They told the
// Go team at Google that Apple stands by our ability to use it
@ -159,10 +178,9 @@ XnuThreadMain(void *pthread, int tid, int (*func)(void *arg), void *arg,
: "0"(__NR_thread_fast_set_cthread_self), "D"(wt->tls - 0x30)
: "rcx", "r11", "memory", "cc");
}
if (wt->ctid) {
*wt->ctid = tid;
}
func(arg);
func(arg, tid);
// we no longer use the stack after this point
// %rax = int bsdthread_terminate(%rdi = void *stackaddr,
// %rsi = size_t freesize,
@ -179,7 +197,7 @@ XnuThreadMain(void *pthread, int tid, int (*func)(void *arg), void *arg,
}
static int CloneXnu(int (*fn)(void *), char *stk, size_t stksz, int flags,
void *arg, void *tls, size_t tlssz, int *ctid) {
void *arg, void *tls, size_t tlssz, int *ptid, int *ctid) {
int rc;
bool failed;
static bool once;
@ -198,6 +216,7 @@ static int CloneXnu(int (*fn)(void *), char *stk, size_t stksz, int flags,
wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) -
sizeof(struct CloneArgs)) &
-alignof(struct CloneArgs));
wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
wt->tls = flags & CLONE_SETTLS ? tls : 0;
@ -215,8 +234,9 @@ static int CloneXnu(int (*fn)(void *), char *stk, size_t stksz, int flags,
static wontreturn void FreebsdThreadMain(void *p) {
struct CloneArgs *wt = p;
*wt->ptid = wt->tid;
*wt->ctid = wt->tid;
wt->func(wt->arg);
wt->func(wt->arg, wt->tid);
// we no longer use the stack after this point
// void thr_exit(%rdi = long *state);
asm volatile("movl\t$0,%0\n\t" // *wt->ztid = 0
@ -227,8 +247,9 @@ static wontreturn void FreebsdThreadMain(void *p) {
unreachable;
}
static int CloneFreebsd(int (*func)(void *), char *stk, size_t stksz, int flags,
void *arg, void *tls, size_t tlssz, int *ctid) {
static int CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz,
int *ptid, int *ctid) {
int ax;
bool failed;
int64_t tid;
@ -236,6 +257,7 @@ static int CloneFreebsd(int (*func)(void *), char *stk, size_t stksz, int flags,
wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) -
sizeof(struct CloneArgs)) &
-alignof(struct CloneArgs));
wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
wt->tls = tls;
@ -267,7 +289,9 @@ static int CloneFreebsd(int (*func)(void *), char *stk, size_t stksz, int flags,
static wontreturn void OpenbsdThreadMain(void *p) {
struct CloneArgs *wt = p;
wt->func(wt->arg);
*wt->ptid = wt->tid;
*wt->ctid = wt->tid;
wt->func(wt->arg, wt->tid);
// we no longer use the stack after this point. however openbsd
// validates the rsp register too so a race condition can still
// happen if the parent tries to free the stack. we'll solve it
@ -279,13 +303,14 @@ static wontreturn void OpenbsdThreadMain(void *p) {
"movl\t$0,(%%rdi)\n\t" // *wt->ztid = 0
"syscall" // __threxit()
: "=m"(*wt->ztid)
: "a"(302), "m"(wt->pstack), "D"(wt->ztid)
: "a"(302), "m"(wt->oldrsp), "D"(wt->ztid)
: "rcx", "r11", "memory");
unreachable;
}
static int CloneOpenbsd(int (*func)(void *), char *stk, size_t stksz, int flags,
void *arg, void *tls, size_t tlssz, int *ctid) {
static int CloneOpenbsd(int (*func)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz,
int *ptid, int *ctid) {
int tid;
intptr_t sp;
struct __tfork *tf;
@ -297,13 +322,15 @@ static int CloneOpenbsd(int (*func)(void *), char *stk, size_t stksz, int flags,
sp -= sizeof(struct CloneArgs);
sp &= -MAX(16, alignof(struct CloneArgs));
wt = (struct CloneArgs *)sp;
wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
wt->pstack = __builtin_frame_address(0);
wt->oldrsp = __builtin_frame_address(0);
wt->arg = arg;
wt->func = func;
tf->tf_stack = (char *)wt - 8;
tf->tf_tcb = flags & CLONE_SETTLS ? tls : 0;
tf->tf_tid = flags & CLONE_CHILD_SETTID ? ctid : 0;
tf->tf_tid = &wt->tid;
if ((tid = __tfork_thread(tf, sizeof(*tf), OpenbsdThreadMain, wt)) < 0) {
errno = -tid;
tid = -1;
@ -314,11 +341,17 @@ static int CloneOpenbsd(int (*func)(void *), char *stk, size_t stksz, int flags,
////////////////////////////////////////////////////////////////////////////////
// NET BESIYATA DISHMAYA
static wontreturn void NetbsdThreadMain(void *arg, int (*func)(void *arg),
int *tid, int *ctid, int *ztid) {
static wontreturn void NetbsdThreadMain(void *arg, // rdi
int (*func)(void *, int), // rsi
int *tid, // rdx
int *ctid, // rcx
int *ztid, // r8
int *ptid) { // r9
int ax, dx;
*ctid = *tid;
func(arg);
ax = *tid;
*ptid = ax;
*ctid = ax;
func(arg, ax);
// we no longer use the stack after this point
// %eax = int __lwp_exit(void);
asm volatile("movl\t$0,%2\n\t" // *wt->ztid = 0
@ -330,8 +363,9 @@ static wontreturn void NetbsdThreadMain(void *arg, int (*func)(void *arg),
unreachable;
}
static int CloneNetbsd(int (*func)(void *), char *stk, size_t stksz, int flags,
void *arg, void *tls, size_t tlssz, int *ctid) {
static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz, int *ptid,
int *ctid) {
// NetBSD has its own clone() and it works, but it's technically a
// second-class API, intended to help Linux folks migrate to this.
bool failed;
@ -341,7 +375,6 @@ static int CloneNetbsd(int (*func)(void *), char *stk, size_t stksz, int flags,
static int broken;
struct ucontext_netbsd *ctx;
static struct ucontext_netbsd netbsd_clone_template;
_Static_assert(sizeof(struct ucontext_netbsd) == 784, "fix assembly");
// memoize arbitrary valid processor state structure
if (!once) {
@ -360,7 +393,7 @@ static int CloneNetbsd(int (*func)(void *), char *stk, size_t stksz, int flags,
}
sp = (intptr_t)(stk + stksz);
// allocate memory for child tid
// allocate memory for tid
sp -= sizeof(int);
sp = sp & -alignof(int);
tid = (int *)sp;
@ -388,6 +421,7 @@ static int CloneNetbsd(int (*func)(void *), char *stk, size_t stksz, int flags,
ctx->uc_mcontext.rdx = (intptr_t)tid;
ctx->uc_mcontext.rcx = (intptr_t)(flags & CLONE_CHILD_SETTID ? ctid : tid);
ctx->uc_mcontext.r8 = (intptr_t)(flags & CLONE_CHILD_CLEARTID ? ctid : tid);
ctx->uc_mcontext.r9 = (intptr_t)(flags & CLONE_PARENT_SETTID ? ptid : tid);
ctx->uc_flags |= _UC_STACK;
ctx->uc_stack.ss_sp = stk;
ctx->uc_stack.ss_size = stksz;
@ -413,8 +447,28 @@ static int CloneNetbsd(int (*func)(void *), char *stk, size_t stksz, int flags,
////////////////////////////////////////////////////////////////////////////////
// GNU/SYSTEMD
int sys_clone_linux(int flags, char *stk, int *ptid, int *ctid, void *tls,
int (*func)(void *), void *arg);
int sys_clone_linux(int flags, // rdi
long sp, // rsi
int *ptid, // rdx
int *ctid, // rcx
void *tls, // r8
void *func, // r9
void *arg); // 8(rsp)
static int CloneLinux(int (*func)(void *arg, int tid), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz, int *ptid,
int *ctid) {
long sp;
sp = (intptr_t)(stk + stksz);
if (~flags & CLONE_CHILD_SETTID) {
flags |= CLONE_CHILD_SETTID;
sp -= sizeof(int);
sp = sp & -alignof(int);
ctid = (int *)sp;
}
sp = sp & -16; // align the stack
return sys_clone_linux(flags, sp, ptid, ctid, tls, func, arg);
}
////////////////////////////////////////////////////////////////////////////////
// COSMOPOLITAN
@ -461,36 +515,69 @@ int sys_clone_linux(int flags, char *stk, int *ptid, int *ctid, void *tls,
* other calls like getpid() may return incorrect values.
*
* @param func is your callback function, which this wrapper requires
* not be null, otherwise EINVAL is raised
* not be null, otherwise EINVAL is raised. It is passed two args
* within the child thread: (1) the caller-supplied `arg` and (2)
* the new tid is always passed in the second arg for convenience
*
* @param stk points to the bottom of a caller allocated stack, which
* must be allocated via mmap() using the MAP_STACK flag, or else
* you won't get optimal performance and it won't work on OpenBSD
*
* @param stksz is the size of that stack in bytes, we recommend
* that this be set to GetStackSize() or else memory safety tools
* like kprintf() can't do as good and quick of a job; this value
* must be 16-aligned plus it must be at least 4192 bytes in size
* and it's advised to have the bottom-most page, be a guard page
* @param flags should have:
* - `CLONE_THREAD|CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND`
* and you may optionally bitwise or any of the following:
* - `CLONE_CHILD_SETTID` is needed too if you use `ctid` which
* is part of the memory the child owns and it'll be set right
* before the callback function is invoked
* - `CLONE_CHILD_CLEARTID` causes `*ctid = 0` upon termination
* which can be used to implement join so that the parent may
* safely free the stack memory that the child is using
* - `CLONE_PARENT_SETTID` is needed too if you use `ptid` and this
* is guaranteed to happen before clone() returns
* - `CLONE_SETTLS` is needed too if you set `tls`. You may get this
* value from the thread by calling __get_tls(). There are a few
* layout expectations imposed by your C library. Those are all
* documented by __initialize_tls() which initializes the parts of
* the first 64 bytes of tls memory that libc cares about. This
* flag will transition the C runtime to the `__tls_enabled` state
* automatically. If it's used for one thread, then it must be
* used for all threads. The first time it's used, it must be used
* from the main thread.
* @param arg will be passed to your callback
*
* @param flags which SHOULD always have all of these flags:
*
* - `CLONE_THREAD`
* - `CLONE_VM`
* - `CLONE_FS`
* - `CLONE_FILES`
* - `CLONE_SIGHAND`
*
* This system call wrapper is intended for threads, and as such, we
* won't polyfill Linux's ability to simulate unrelated calls (e.g.
* fork, vfork) via clone() on other platforms. Please just call
* fork() and vfork() when that's what you want.
*
* Your `flags` may additionally bitwise-OR any combination of the
* following flags:
*
* - `CLONE_PARENT_SETTID` must be specified if you intend to set
* the `ptid` argument, which is guaranteed to be updated with the
* child tid BEFORE BOTH clone() returns and `func` is invoked
*
* - `CLONE_CHILD_SETTID` must be specified if you intend to set the
* `ctid` argument, which is guaranteed to be updated with the
* child tid before `func` is called, however we CAN NOT guarantee
* this will happen BEFORE clone() returns
*
* - `CLONE_CHILD_CLEARTID` causes `*ctid = 0` upon child thread
* termination. This is used to implement join so that the parent
* may know when it's safe to free the child's stack memory, and
* as such, is guaranteed to happen AFTER the child thread has
* either terminated or has finished using its stack memory
*
* - `CLONE_SETTLS` is needed if you intend to specify the `tls`
* argument, which provides a fast-path solution for changing the
* appropriate TLS segment register within the child thread. The
* child thread may then obtain a reference to the TIB address you
* supplied, by calling __get_tls(). Your C library holds certain
* expectations about the layout of your Thread Information Block
* (TIB), which are all documented by __initialize_tls(). That
* function can be used to initialize the first positive 64 bytes
* of your TLS allocation, which is the memory Cosmopolitan Libc
* wants for itself (and negative addresses are reserved by the
* GNU Linker). Using this flag will transition the C runtime to a
* `__tls_enabled` state automatically. If you use TLS for just
* one thread, then you must specify TLS for ALL THREADS. It's
* a good idea to do that since TLS can offer considerable (i.e.
* multiple orders of magnitude) performance improvement for
* TID-dependent C library services, e.g. recursive mutexes.
*
* @param arg is passed as an argument to `func` in the child thread
* @param tls may be used to set the thread local storage segment;
* this parameter is ignored if `CLONE_SETTLS` is not set
* @param tlssz is the size of tls in bytes which must be at least 64
@ -499,8 +586,8 @@ int sys_clone_linux(int flags, char *stk, int *ptid, int *ctid, void *tls,
* @return tid of child on success, or -1 w/ errno
* @threadsafe
*/
int clone(int (*func)(void *), void *stk, size_t stksz, int flags, void *arg,
int *ptid, void *tls, size_t tlssz, int *ctid) {
int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
void *tls, size_t tlssz, int *ctid) {
int rc;
struct CloneArgs *wt;
@ -529,8 +616,7 @@ int clone(int (*func)(void *), void *stk, size_t stksz, int flags, void *arg,
!__asan_is_valid(ctid, sizeof(*ctid))))) {
rc = efault();
} else if (IsLinux()) {
rc =
sys_clone_linux(flags, (char *)stk + stksz, ptid, ctid, tls, func, arg);
rc = CloneLinux(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
} else if (!IsTiny() &&
(flags & ~(CLONE_SETTLS | CLONE_PARENT_SETTID |
CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)) !=
@ -539,19 +625,20 @@ int clone(int (*func)(void *), void *stk, size_t stksz, int flags, void *arg,
STRACE("clone flag unsupported on this platform");
rc = einval();
} else if (IsXnu()) {
rc = CloneXnu(func, stk, stksz, flags, arg, tls, tlssz, ctid);
rc = CloneXnu(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
} else if (IsFreebsd()) {
rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, tlssz, ctid);
rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
} else if (IsNetbsd()) {
rc = CloneNetbsd(func, stk, stksz, flags, arg, tls, tlssz, ctid);
rc = CloneNetbsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
} else if (IsOpenbsd()) {
rc = CloneOpenbsd(func, stk, stksz, flags, arg, tls, tlssz, ctid);
rc = CloneOpenbsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
} else if (IsWindows()) {
rc = CloneWindows(func, stk, stksz, flags, arg, tls, tlssz, ctid);
rc = CloneWindows(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
} else {
rc = enosys();
}
// TODO(jart): do we need it?
if (rc != -1 && (flags & CLONE_PARENT_SETTID)) {
*ptid = rc;
}

View file

@ -64,7 +64,7 @@ static struct SymbolTable *GetSymbolTableFromZip(struct Zipos *zipos) {
lf = GetZipCfileOffset(zipos->map + cf);
size = GetZipLfileUncompressedSize(zipos->map + lf);
size2 = ROUNDUP(size, FRAMESIZE);
if ((res = mapanon(size2))) {
if ((res = _mapanon(size2))) {
switch (ZIP_LFILE_COMPRESSIONMETHOD(zipos->map + lf)) {
case kZipCompressionNone:
memcpy(res, (void *)ZIP_LFILE_CONTENT(zipos->map + lf), size);

View file

@ -54,13 +54,16 @@
* }
*
* That is performed automatically for unit test executables.
*
* @return memory map address on success, or null w/ errno
*/
noasan void *mapanon(size_t size) {
void *_mapanon(size_t size) {
  /* asan runtime depends on this function */
  void *m;
  m = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (m != MAP_FAILED) {
    return m;
  }
  // mmap() signals failure with MAP_FAILED rather than null, so it's
  // translated here; this must happen whether or not an out-of-memory
  // hook got linked, since callers only test the result for null
  if (weaken(__oom_hook)) {
    weaken(__oom_hook)(size);
  }
  return 0;
}

View file

@ -1,7 +1,7 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Copyright 2022 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
@ -16,57 +16,26 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/bits/atomic.h"
#include "libc/calls/calls.h"
#include "libc/calls/strace.internal.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/intrin/asan.internal.h"
#include "libc/runtime/runtime.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/futex.h"
#include "libc/sysv/consts/nr.h"
#include "libc/thread/thread.h"
#include "libc/runtime/stack.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/prot.h"
/**
* Waits for thread to terminate and frees its memory.
* Allocates stack.
*
* @param td is thread descriptor memory
* @param exitcode optionally receives value returned by thread
* @return 0 on success, or error number on failure
* @raises EDEADLK when trying to join this thread
* @raises EINVAL if another thread is joining
* @raises ESRCH if no such thread exists
* @raises EINVAL if not joinable
* @threadsafe
* @return stack bottom address on success, or null w/ errno
*/
int cthread_join(cthread_t td, void **exitcode) {
int x, rc, tid;
// otherwise, tid could be set to 0 even though `state` is not
// finished mark thread as joining
if (!td || (IsAsan() && !__asan_is_valid(td, sizeof(*td)))) {
rc = ESRCH;
tid = -1;
} else if ((tid = td->tid) == gettid()) { // tid must load before lock xadd
rc = EDEADLK;
} else if (atomic_load(&td->state) & (cthread_detached | cthread_joining)) {
rc = EINVAL;
} else {
if (~atomic_fetch_add(&td->state, cthread_joining) & cthread_finished) {
while ((x = atomic_load(&td->tid))) {
cthread_memory_wait32(&td->tid, x, 0);
}
}
if (exitcode) {
*exitcode = td->exitcode;
}
if (!munmap(td->alloc.bottom, td->alloc.top - td->alloc.bottom)) {
rc = 0;
} else {
rc = errno;
}
}
STRACE("cthread_join(%d, [%p]) → %s", tid, !rc && exitcode ? *exitcode : 0,
!rc ? "0" : strerrno(rc));
return rc;
void *_mapstack(void) {
  void *stk;
  // anonymous read/write mapping of GetStackSize() bytes w/ MAP_STACK
  stk = mmap(0, GetStackSize(), PROT_READ | PROT_WRITE,
             MAP_STACK | MAP_ANONYMOUS, -1, 0);
  // mmap() reports failure as MAP_FAILED, which is non-null; callers of
  // this function are promised null on failure, so translate it here
  return stk != MAP_FAILED ? stk : 0;
}
/**
 * Frees stack.
 *
 * This unmaps GetStackSize() bytes starting at `stk`.
 *
 * @param stk was allocated by _mapstack()
 * @return result of munmap(), passed through unchanged
 */
int _freestack(void *stk) {
return munmap(stk, GetStackSize());
}

View file

@ -6,7 +6,7 @@ COSMOPOLITAN_C_START_
cosmopolitan § runtime
*/
typedef long jmp_buf[8] forcealign(CACHELINE);
typedef long jmp_buf[8];
extern char **environ; /* CRT */
extern int __argc; /* CRT */
@ -45,8 +45,10 @@ extern size_t __virtualmax;
extern bool __isworker;
void mcount(void);
int _freestack(void *);
unsigned long getauxval(unsigned long);
void *mapanon(size_t) attributeallocsize((1));
void *_mapanon(size_t) attributeallocsize((1)) mallocesque;
void *_mapstack(void) returnsaligned((FRAMESIZE)) mallocesque;
int setjmp(jmp_buf) libcesque returnstwice paramsnonnull();
void longjmp(jmp_buf, int) libcesque wontreturn paramsnonnull();
axdx_t setlongerjmp(jmp_buf) libcesque returnstwice paramsnonnull();

View file

@ -16,29 +16,206 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/assert.h"
#include "libc/bits/bits.h"
#include "libc/calls/calls.h"
#include "libc/calls/syscall-sysv.internal.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/intrin/kprintf.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/threaded.h"
#include "libc/nt/thread.h"
#include "libc/nt/thunk/msabi.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/runtime.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/nrlinux.h"
#include "libc/thread/thread.h"
#include "third_party/xed/x86.h"
#define __NR_sysarch 0x000000a5 // freebsd+netbsd
#define AMD64_SET_GSBASE 131 // freebsd
#define AMD64_SET_FSBASE 129 // freebsd
#define X86_SET_GSBASE 16 // netbsd
#define X86_SET_FSBASE 17 // netbsd
#define __NR___set_tcb 0x00000149
#define __NR__lwp_setprivate 0x0000013d
#define __NR_thread_fast_set_cthread_self 0x03000003
#define _TLSZ ((intptr_t)_tls_size)
#define _TLDZ ((intptr_t)_tdata_size)
#define _TIBZ sizeof(struct cthread_descriptor_t)
static char tibdefault[64];
extern int __threadcalls_end[];
extern int __threadcalls_start[];
extern unsigned char __get_tls_nt_rax[];
__msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;
void __enable_tls(void) {
__initialize_tls(tibdefault);
*(int *)((char *)tibdefault + 0x38) = sys_gettid();
*(int *)((char *)tibdefault + 0x3c) = __errno;
__install_tls(tibdefault);
/**
 * Enables thread-local storage for the main thread.
 *
 * Obtains memory from _mapanon() that holds the _Thread_local image
 * followed by the thread information block (TIB), asks the operating
 * system to aim the TLS segment register at the TIB, and on Windows
 * and XNU rewrites every `mov %fs:0,%reg` found between _ereal and
 * __privileged_start so that it works through %gs instead. Once that
 * is done __tls_enabled is set to true.
 *
 * Must run before __threaded or __tls_enabled is set (asserted below).
 */
privileged void __enable_tls(void) {
assert(!__threaded);
assert(!__tls_enabled);
// allocate tls memory for main process
//
// %fs Linux/BSDs
// │
// _Thread_local │ __get_tls()
// ┌───┬──────────┬──────────┼───┐
// │pad│ .tdata │ .tbss │tib│
// └───┴──────────┴──────────┼───┘
// │
// Windows/Mac %gs
//
size_t siz;
cthread_t tib;
char *mem, *tls;
siz = ROUNDUP(_TLSZ + _TIBZ, FRAMESIZE);
mem = _mapanon(siz);
// the tib occupies the last _TIBZ bytes of the allocation and the
// _Thread_local image sits immediately below it
tib = (cthread_t)(mem + siz - _TIBZ);
tls = mem + siz - _TIBZ - _TLSZ;
tib->self = tib;
tib->self2 = tib;
tib->err = __errno;
tib->tid = sys_gettid();
// only the initialized (.tdata) part of the image is copied
memmove(tls, _tdata_start, _TLDZ);
// ask the operating system to change the x86 segment register
int ax, dx;
if (IsWindows()) {
// store tib in slot %gs:0x1480+index*8, which is the same location
// the __get_tls_nt_* polyfills read from
__tls_index = __imp_TlsAlloc();
asm("mov\t%1,%%gs:%0" : "=m"(*((long *)0x1480 + __tls_index)) : "r"(tib));
} else if (IsFreebsd()) {
asm volatile("syscall"
: "=a"(ax)
: "0"(__NR_sysarch), "D"(AMD64_SET_FSBASE), "S"(tib)
: "rcx", "r11", "memory", "cc");
} else if (IsNetbsd()) {
asm volatile("syscall"
: "=a"(ax), "=d"(dx)
: "0"(__NR_sysarch), "D"(X86_SET_FSBASE), "S"(tib)
: "rcx", "r11", "memory", "cc");
} else if (IsXnu()) {
// NOTE(review): tib is biased by -0x30 here while the patched code
// below reads %gs:0x30; presumably XNU offsets the gs base by that
// amount, confirm against the kernel's behavior
asm volatile("syscall"
: "=a"(ax)
: "0"(__NR_thread_fast_set_cthread_self),
"D"((intptr_t)tib - 0x30)
: "rcx", "r11", "memory", "cc");
} else if (IsOpenbsd()) {
asm volatile("syscall"
: "=a"(ax)
: "0"(__NR___set_tcb), "D"(tib)
: "rcx", "r11", "memory", "cc");
} else {
asm volatile("syscall"
: "=a"(ax)
: "0"(__NR_linux_arch_prctl), "D"(ARCH_SET_FS), "S"(tib)
: "rcx", "r11", "memory");
}
/*
* We need to rewrite SysV _Thread_local code. You MUST use the
* -mno-tls-direct-seg-refs flag which generates code like this
*
* 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
*
* Which on Mac we can replace with this:
*
* 65 48 8b 0R4 25 30 00 00 00 mov %gs:0x30,%R
*
* Whereas on Windows we'll replace it with this:
*
* 0f 1f 40 00 fatnop4
* e8 xx xx xx xx call __get_tls_nt_%R
*
* Since we have no idea where the TLS instructions exist in the
* binary, we need to disassemble the whole program image. This'll
* potentially take a few milliseconds for some larger programs.
*
* TODO(jart): compute probability this is just overkill
*/
if (IsWindows() || IsXnu()) {
int n, reg, dis;
unsigned char *p;
struct XedDecodedInst xedd;
__morph_begin();
// The most expensive part of this process is we need to compute the
// byte length of each instruction in our program. We'll use Intel's
// disassembler for this purpose.
for (p = _ereal; p < __privileged_start; p += n) {
xed_decoded_inst_zero_set_mode(&xedd, XED_MACHINE_MODE_LONG_64);
if (!xed_instruction_length_decode(&xedd, p, 15)) {
// We now know p[0] is most likely the first byte of an x86 op.
// Let's check and see if it's the GCC linear TIB address load.
// We hope and pray GCC won't load the TIB address into %r8..%r15
// because the rex prefix test below only matches %rax..%rdi.
if (xedd.length == 9 && //
0144 == p[0] && // fs
0110 == p[1] && // rex.w (64-bit operand size)
0213 == p[2] && // mov reg/mem → reg (word-sized)
0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
0000 == p[5] && // displacement (von Neumann endian)
0000 == p[6] && // displacement
0000 == p[7] && // displacement
0000 == p[8]) { // displacement
// Apple is quite straightforward to patch. We basically
// just change the segment register, and the linear slot
if (IsXnu()) {
p[0] = 0145; // this changes fs segment prefix to gs segment prefix
p[5] = 0x30; // tib slot index for tib linear address
}
// Windows is kind of complicated. We need to replace the
// segment mov instruction with a function call, that (a)
// won't clobber registers, and (b) has a return register
// that's the same as the mov destination. When setting
// function displacement, &CALL+5+DISP must equal &FUNC.
else {
// destination register number selects one of the 18-byte
// polyfill routines laid out after __get_tls_nt_rax
reg = (p[3] & 070) >> 3;
dis = (__get_tls_nt_rax + reg * 18) - (p + 9);
p[0] = 0017; // map1
p[1] = 0037; // nopl (only if reg=0)
p[2] = 0100; // mod/rm (%rax)+disp8
p[3] = 0000; // displacement
p[4] = 0350; // call
p[5] = (dis & 0x000000ff) >> 000; // displacement
p[6] = (dis & 0x0000ff00) >> 010; // displacement
p[7] = (dis & 0x00ff0000) >> 020; // displacement
p[8] = (dis & 0xff000000) >> 030; // displacement
}
}
// Move to the next instruction.
n = xedd.length;
} else {
// If Xed failed to decode the instruction, then we'll just plow
// through memory one byte at a time until Xed's morale improves
n = 1;
}
}
__morph_end();
}
// we are now allowed to use tls
__tls_enabled = true;
}
privileged void __enable_threads(void) {
assert(!__threaded);
__threaded = gettid();
__morph_begin();
/*
* _NOPL("__threadcalls", func)
*
* The big ugly macro above is used by Cosmopolitan Libc to ensure
* locking primitives (e.g. flockfile, funlockfile) have zero impact on
* performance and binary size when threads aren't actually in play.
*
* we have this
*
* 0f 1f 05 b1 19 00 00 nopl func(%rip)
@ -46,8 +223,10 @@ privileged void __enable_threads(void) {
* we're going to turn it into this
*
* 67 67 e8 b1 19 00 00 addr32 addr32 call func
*
* This is cheap and fast because the big ugly macro stored in the
* binary the offsets of all the instructions we need to change.
*/
__morph_begin();
for (int *p = __threadcalls_start; p < __threadcalls_end; ++p) {
_base[*p + 0] = 0x67;
_base[*p + 1] = 0x67;

View file

@ -26,8 +26,9 @@
// runtime facilities.
//
// @param %rdi is arg
// @param %rsi is func
// @param %rdx is stack
// @param %rsi is tid
// @param %rdx is func
// @param %rcx is stack
// @return %rax is exit code
// @see clone()
WinThreadLaunch:
@ -35,9 +36,9 @@ WinThreadLaunch:
push %r15
mov %rbp,%r15
mov %rsp,%rbx
mov %rdx,%rsp
mov %rcx,%rsp
xor %rbp,%rbp
call *%rsi
call *%rdx
mov %r15,%rbp
mov %rbx,%rsp
pop %r15

View file

@ -32,7 +32,7 @@
* they are passed in the 64kb bytes preceding src.
*
* @return pointer to end of decoded data, similar to mempcpy()
* @see mapanon(), lz4check()
* @see _mapanon(), lz4check()
*/
void *lz4decode(void *dest, const void *src) {
const unsigned char *frame, *block;

View file

@ -17,6 +17,7 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/nexgen32e/gettls.h"
#include "libc/nexgen32e/threaded.h"
/**
* Returns address of thread information block.

View file

@ -40,6 +40,7 @@ LIBC_SYSV_A_FILES := \
libc/sysv/errno_location.greg.c \
libc/sysv/errno.c \
libc/sysv/gettls.greg.c \
libc/sysv/tlspolyfill.S \
libc/sysv/errfun.S \
libc/sysv/strace.greg.c \
libc/sysv/describeos.greg.c \

90
libc/sysv/tlspolyfill.S Normal file
View file

@ -0,0 +1,90 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2022 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Code morphing TLS polyfills for The New Technology.
//
// On Windows, __enable_tls() overwrites each 9-byte `mov %fs:0,%R`
// instruction with a 4-byte nop followed by a call to the routine
// below that matches destination register R. The call target is
// computed as __get_tls_nt_rax + R * 18, which is why the routines
// must stay in x86 register encoding order and must each assemble
// to exactly 18 bytes.
//
// @note msvc generates this code so it's stable
// @note func ordering follows x86 reg encoding
// @note each function is exactly 18 bytes
// @see __enable_tls()

// Loads the TLS slot selected by __tls_index into %rax.
// %rcx serves as scratch and is restored before returning.
__get_tls_nt_rax:
push %rcx
mov __tls_index(%rip),%ecx
mov %gs:0x1480(,%rcx,8),%rax
pop %rcx
ret
.endfn __get_tls_nt_rax,globl,hidden
// Loads the TLS slot selected by __tls_index into %rcx.
// %rax serves as scratch and is restored before returning.
__get_tls_nt_rcx:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rcx
pop %rax
ret
.endfn __get_tls_nt_rcx
// Loads the TLS slot selected by __tls_index into %rdx.
// %rax serves as scratch and is restored before returning.
__get_tls_nt_rdx:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rdx
pop %rax
ret
.endfn __get_tls_nt_rdx
// Loads the TLS slot selected by __tls_index into %rbx.
// %rax serves as scratch and is restored before returning.
__get_tls_nt_rbx:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rbx
pop %rax
ret
.endfn __get_tls_nt_rbx
// Occupies the %rsp position (register encoding 4) in the table.
// NOTE(review): this overwrites %rsp and then pops and returns on the
// replaced stack, so it presumably exists only to keep the 18-byte
// spacing of the table intact; confirm it is never actually called.
__get_tls_nt_rsp:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rsp
pop %rax
ret
.endfn __get_tls_nt_rsp
// Loads the TLS slot selected by __tls_index into %rbp.
// %rax serves as scratch and is restored before returning.
__get_tls_nt_rbp:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rbp
pop %rax
ret
.endfn __get_tls_nt_rbp
// Loads the TLS slot selected by __tls_index into %rsi.
// %rax serves as scratch and is restored before returning.
__get_tls_nt_rsi:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rsi
pop %rax
ret
.endfn __get_tls_nt_rsi
// Loads the TLS slot selected by __tls_index into %rdi.
// %rax serves as scratch and is restored before returning.
__get_tls_nt_rdi:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rdi
pop %rax
ret
.endfn __get_tls_nt_rdi

View file

@ -1,133 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/bits/atomic.h"
#include "libc/calls/calls.h"
#include "libc/calls/strace.internal.h"
#include "libc/errno.h"
#include "libc/intrin/setjmp.internal.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/threaded.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/runtime.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/clone.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/prot.h"
#include "libc/thread/internal.h"
#include "libc/thread/thread.h"
STATIC_YOINK("_main_thread_ctor");
/**
 * Maps memory for a new thread and fills in its descriptor.
 *
 * One mapping holds everything: the stack starts at the bottom of the
 * mapping, while the _Thread_local image and the thread descriptor
 * occupy the very top.
 *
 * @param attr supplies `stacksize`, `guardsize`, and `mode`
 * @return initialized descriptor, or 0 if mmap() failed
 */
static cthread_t cthread_allocate(const cthread_attr_t *attr) {
  size_t tibz, tlsz, total;
  char *base;
  cthread_t desc;

  // figure out how much to map: stack + page-rounded (tls + descriptor)
  tibz = sizeof(struct cthread_descriptor_t);
  tlsz = ROUNDUP((uintptr_t)_tls_size + tibz, PAGESIZE);
  total = ROUNDUP(attr->stacksize + tlsz, FRAMESIZE);

  base = mmap(0, total, PROT_READ | PROT_WRITE, MAP_STACK | MAP_ANONYMOUS, -1,
              0);
  if (base == MAP_FAILED) {
    return 0;
  }

  // guard region goes at the lowest addresses of the mapping
  if (attr->guardsize > PAGESIZE) {
    mprotect(base, attr->guardsize, PROT_NONE);
  }

  // descriptor lives in the last bytes of the mapping
  desc = (cthread_t)(base + total - tibz);
  desc->self = desc;
  desc->self2 = desc;
  desc->err = errno;
  desc->tid = -1;
  desc->stack.bottom = base;
  desc->stack.top = base + attr->stacksize;
  desc->alloc.bottom = base;
  desc->alloc.top = base + total;
  desc->state = (attr->mode & CTHREAD_CREATE_DETACHED) ? cthread_detached
                                                       : cthread_started;

  // seed the thread's TLS area, located just below the descriptor,
  // with the contents of the .tdata section
  memmove((char *)desc - (intptr_t)_tls_size, _tdata_start,
          (intptr_t)_tdata_size);
  return desc;
}
/**
 * Entry point handed to clone() for threads made by cthread_create().
 *
 * Runs the user callback stored in the descriptor, records its result
 * in `td->exitcode`, runs key destructors, and finally adds
 * cthread_finished to the thread's state.
 *
 * @param arg is the cthread_t descriptor of the new thread
 * @return always 0
 */
static int cthread_start(void *arg) {
axdx_t rc;
void *exitcode;
cthread_t td = arg;
// a zero .ax means we got here directly, so run the callback; a
// nonzero .ax means control came back through td->exiter and the
// exit value is carried in .dx (presumably from a thread-exit
// routine defined elsewhere, verify against its caller)
if (!(rc = setlongerjmp(td->exiter)).ax) {
exitcode = td->func(td->arg);
} else {
exitcode = (void *)rc.dx;
}
td->exitcode = exitcode;
_pthread_key_destruct(td->key);
if (atomic_load(&td->state) & cthread_detached) {
// we're still using the stack
// thus we can't munmap it yet
// kick the can down the road!
cthread_zombies_add(td);
}
// mark the thread finished only after everything above is done
atomic_fetch_add(&td->state, cthread_finished);
return 0;
}
/**
 * Creates thread.
 *
 * @param ptd will receive pointer to new thread descriptor; it is only
 *     written on success
 * @param attr contains special configuration if non-null
 * @param func is thread callback function
 * @param arg is argument supplied to `func`
 * @return 0 on success, or error number on failure
 * @threadsafe
 */
int cthread_create(cthread_t *ptd, const cthread_attr_t *attr,
                   void *(*func)(void *), void *arg) {
  int rc, tid;
  cthread_t td;
  cthread_attr_t default_attr;
  __threaded = true;
  cthread_zombies_reap();
  cthread_attr_init(&default_attr);
  td = cthread_allocate(attr ? attr : &default_attr);
  // capture errno right away so that tearing down the default
  // attributes below can't disturb the reported error
  rc = td ? 0 : errno;
  // every cthread_attr_init() gets its cthread_attr_destroy(), even
  // when the allocation above failed
  cthread_attr_destroy(&default_attr);
  if (td) {
    td->func = func;
    td->arg = arg;
    tid =
        clone(cthread_start, td->stack.bottom, td->stack.top - td->stack.bottom,
              CLONE_THREAD | CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
                  CLONE_SETTLS | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
              td, 0, td, sizeof(struct cthread_descriptor_t), &td->tid);
    if (tid != -1) {
      *ptd = td;
      rc = 0;
    } else {
      // read errno before munmap() has a chance to change it
      rc = errno;
      munmap(td->alloc.bottom, td->alloc.top - td->alloc.bottom);
    }
  } else {
    tid = -1;
  }
  STRACE("cthread_create([%d], %p, %p, %p) → %s", tid, attr, func, arg,
         !rc ? "0" : strerrno(rc));
  return rc;
}

View file

@ -21,7 +21,7 @@
.init.start 400,_main_thread_ctor
push %rdi
push %rsi
call _main_thread_init
call __enable_tls
pop %rsi
pop %rdi
.init.end 400,_main_thread_ctor

View file

@ -1,7 +1,7 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Copyright 2022 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
@ -16,53 +16,41 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/errno.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/threaded.h"
#include "libc/mem/mem.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/stack.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/prot.h"
#include "libc/thread/spawn.h"
#include "libc/thread/thread.h"
textstartup void _main_thread_init(void) {
_Static_assert(offsetof(struct cthread_descriptor_t, self) == 0x00, "");
_Static_assert(offsetof(struct cthread_descriptor_t, self2) == 0x30, "");
_Static_assert(offsetof(struct cthread_descriptor_t, tid) == 0x38, "");
_Static_assert(offsetof(struct cthread_descriptor_t, err) == 0x3c, "");
cthread_t td;
size_t totalsize;
char *mem, *bottom, *top;
#define _TLSZ ((intptr_t)_tls_size)
#define _TLDZ ((intptr_t)_tdata_size)
#define _TIBZ sizeof(struct cthread_descriptor_t)
#define _MEMZ ROUNDUP(_TLSZ + _TIBZ, alignof(struct cthread_descriptor_t))
totalsize = ROUNDUP(
(uintptr_t)_tls_size + sizeof(struct cthread_descriptor_t), FRAMESIZE);
/**
* Allocates thread-local storage memory for new thread.
* @return buffer that must be released with free()
*/
char *_mktls(char **out_tib) {
char *tls;
cthread_t tib;
mem = mmap(0, totalsize, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE,
-1, 0);
assert(mem != MAP_FAILED);
// Allocate enough TLS memory for all the GNU Linker (_tls_size)
// organized _Thread_local data, as well as Cosmopolitan Libc (64)
if (!(tls = calloc(1, _MEMZ))) return 0;
bottom = mem;
top = mem + totalsize;
// set up thread information block
tib = (cthread_t)(tls + _MEMZ - _TIBZ);
tib->self = tib;
tib->self2 = tib;
tib->err = 0;
tib->tid = -1;
memmove(tls, _tdata_start, _TLDZ);
td = (cthread_t)(top - sizeof(struct cthread_descriptor_t));
td->self = td;
td->self2 = td;
td->err = errno;
td->tid = gettid();
td->alloc.bottom = bottom;
td->alloc.top = top;
td->stack.bottom = GetStackAddr(0);
td->stack.top = td->stack.bottom + GetStackSize();
td->state = cthread_main;
// Initialize TLS with content of .tdata section
memmove((void *)((uintptr_t)td - (uintptr_t)_tls_size), _tdata_start,
(uintptr_t)_tdata_size);
// Set FS
__install_tls((char *)td);
if (out_tib) {
*out_tib = (char *)tib;
}
return tls;
}

113
libc/thread/spawn.c Normal file
View file

@ -0,0 +1,113 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2022 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/calls/calls.h"
#include "libc/intrin/kprintf.h"
#include "libc/intrin/wait0.internal.h"
#include "libc/macros.internal.h"
#include "libc/mem/mem.h"
#include "libc/nexgen32e/threaded.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/stack.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/clone.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/prot.h"
#include "libc/thread/spawn.h"
#include "libc/thread/thread.h"
STATIC_YOINK("_main_thread_ctor");
/**
* @fileoverview Simple System Threads API
*/
#define _TLSZ ((intptr_t)_tls_size)
#define _TLDZ ((intptr_t)_tdata_size)
#define _TIBZ sizeof(struct cthread_descriptor_t)
#define _MEMZ ROUNDUP(_TLSZ + _TIBZ, alignof(struct cthread_descriptor_t))
/**
* Spawns thread.
*
* @param fun is thread worker callback, which receives `arg` and `ctid`
* @param arg shall be passed to `fun`
* @param opt_out_thread needn't be initialiized and is always clobbered
* except when it isn't specified, in which case, the thread is kind
* of detached and will leak in stack / tls memory
* @return 0 on success, or -1 w/ errno
*/
int _spawn(int fun(void *, int), void *arg, struct spawn *opt_out_thread) {
struct spawn *th, ths;
// we need to to clobber the output memory before calling clone, since
// there's no guarantee clone() won't suspend the parent, and focus on
// running the child instead; in that case child might want to read it
if (opt_out_thread) {
th = opt_out_thread;
} else {
th = &ths;
}
// Allocate enough TLS memory for all the GNU Linuker (_tls_size)
// organized _Thread_local data, as well as Cosmpolitan Libc (64)
if (!(th->tls = _mktls(&th->tib))) {
return -1;
}
th->ctid = (int *)(th->tib + 0x38);
// We must use _mapstack() to allocate the stack because OpenBSD has
// very strict requirements for what's allowed to be used for stacks
if (!(th->stk = _mapstack())) {
free(th->tls);
return -1;
}
if (clone(fun, th->stk, GetStackSize(),
CLONE_VM | CLONE_THREAD | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
CLONE_CHILD_CLEARTID,
arg, &th->ptid, th->tib, _TIBZ, th->ctid) == -1) {
_freestack(th->stk);
free(th->tls);
return -1;
}
return 0;
}
/**
 * Waits for thread created by _spawn() to terminate.
 *
 * The thread's tls buffer is freed and its stack is unmapped as well.
 * The handle is zeroed before returning, in every case.
 *
 * @param th is handle filled in by _spawn()
 * @return result of unmapping the stack, or 0 if `th->ctid` was null
 */
int _join(struct spawn *th) {
  int status;

  // a null ctid means there is nothing to wait for or release
  if (!th->ctid) {
    bzero(th, sizeof(*th));
    return 0;
  }

  // block until ctid becomes zero
  _wait0(th->ctid);

  // give back the thread's memory
  free(th->tls);
  status = munmap(th->stk, GetStackSize());

  bzero(th, sizeof(*th));
  return status;
}

20
libc/thread/spawn.h Normal file
View file

@ -0,0 +1,20 @@
#ifndef COSMOPOLITAN_LIBC_THREAD_SPAWN_H_
#define COSMOPOLITAN_LIBC_THREAD_SPAWN_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
// Handle for a thread started by _spawn() and reclaimed by _join().
struct spawn {
int ptid;  // passed to clone() as the CLONE_PARENT_SETTID destination
int *ctid; // points at tib + 0x38; _join() waits for it to become zero
char *stk; // stack obtained from _mapstack()
char *tls; // buffer returned by _mktls(), released with free()
char *tib; // thread information block address reported by _mktls()
};
int _spawn(int (*)(void *, int), void *, struct spawn *) hidden;
int _join(struct spawn *) hidden;
char *_mktls(char **) hidden;
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_THREAD_SPAWN_H_ */

View file

@ -15,7 +15,6 @@ enum cthread_state {
cthread_joining = 1,
cthread_finished = 2,
cthread_detached = 4,
cthread_main = 127,
};
struct cthread_descriptor_t {