Remove some legacy cruft

Function trace logs will report stack usage accurately. It won't include
the argv/environ block. Our clone() polyfill is now simpler and does not
use as much stack memory. Function call tracing on x86 is now faster too.
This commit is contained in:
Justine Tunney 2025-01-02 18:44:07 -08:00
parent 8db646f6b2
commit a15958edc6
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
21 changed files with 291 additions and 467 deletions

View file

@ -254,7 +254,7 @@ static bool elf_slurp(struct Loaded *l, int fd, const char *file) {
return true;
}
static dontinline bool elf_load(struct Loaded *l, const char *file, long pagesz,
dontinline static bool elf_load(struct Loaded *l, const char *file, long pagesz,
char *interp_path, size_t interp_size) {
int fd;
if ((fd = open(file, O_RDONLY | O_CLOEXEC)) == -1)
@ -280,7 +280,7 @@ static long *push_strs(long *sp, char **list, int count) {
return sp;
}
static wontreturn dontinstrument void foreign_helper(void **p) {
wontreturn dontinstrument static void foreign_helper(void **p) {
__foreign.dlopen = p[0];
__foreign.dlsym = p[1];
__foreign.dlclose = p[2];
@ -288,7 +288,7 @@ static wontreturn dontinstrument void foreign_helper(void **p) {
_longjmp(__foreign.jb, 1);
}
static dontinline void elf_exec(const char *file, char **envp) {
dontinline static void elf_exec(const char *file, char **envp) {
// get microprocessor page size
long pagesz = __pagesize;
@ -412,7 +412,7 @@ static char *dlerror_set(const char *str) {
return dlerror_buf;
}
static dontinline char *foreign_alloc_block(void) {
dontinline static char *foreign_alloc_block(void) {
char *p = 0;
size_t sz = 65536;
if (!IsWindows()) {
@ -435,7 +435,7 @@ static dontinline char *foreign_alloc_block(void) {
return p;
}
static dontinline void *foreign_alloc(size_t n) {
dontinline static void *foreign_alloc(size_t n) {
void *res;
static char *block;
__dlopen_lock();
@ -548,7 +548,7 @@ static void *foreign_thunk_nt(void *func) {
return code;
}
static dontinline bool foreign_compile(char exe[hasatleast PATH_MAX]) {
dontinline static bool foreign_compile(char exe[hasatleast PATH_MAX]) {
// construct path
strlcpy(exe, get_tmp_dir(), PATH_MAX);

View file

@ -49,6 +49,6 @@
int __vcscanf(int (*)(void *), int (*)(int, void *), void *, const char *,
va_list);
int __fmt(void *, void *, const char *, va_list, int *);
__msabi char16_t *__itoa16(char16_t[21], uint64_t);
char16_t *__itoa16(char16_t[21], uint64_t) __msabi;
#endif /* COSMOPOLITAN_LIBC_FMT_STRTOL_H_ */

View file

@ -89,14 +89,14 @@ __msabi extern typeof(WriteFile) *const __imp_WriteFile;
extern pthread_mutex_t __sig_worker_lock;
HAIRY static bool __sig_ignored_by_default(int sig) {
textwindows static bool __sig_ignored_by_default(int sig) {
return sig == SIGURG || //
sig == SIGCONT || //
sig == SIGCHLD || //
sig == SIGWINCH;
}
HAIRY bool __sig_ignored(int sig) {
textwindows bool __sig_ignored(int sig) {
return __sighandrvas[sig] == (intptr_t)SIG_IGN ||
(__sighandrvas[sig] == (intptr_t)SIG_DFL &&
__sig_ignored_by_default(sig));
@ -532,14 +532,14 @@ textwindows void __sig_generate(int sig, int sic) {
}
}
HAIRY static char *__sig_stpcpy(char *d, const char *s) {
textwindows static char *__sig_stpcpy(char *d, const char *s) {
size_t i;
for (i = 0;; ++i)
if (!(d[i] = s[i]))
return d + i;
}
HAIRY wontreturn static void __sig_death(int sig, const char *thing) {
textwindows wontreturn static void __sig_death(int sig, const char *thing) {
#ifndef TINY
intptr_t hStderr;
char sigbuf[21], s[128], *p;
@ -810,7 +810,7 @@ HAIRY static uint32_t __sig_worker(void *arg) {
_pthread_mutex_unlock(&__sig_worker_lock);
Sleep(POLL_INTERVAL_MS);
}
return 0;
__builtin_unreachable();
}
__attribute__((__constructor__(10))) textstartup void __sig_init(void) {

View file

@ -34,6 +34,8 @@
#include "libc/nt/thunk/msabi.h"
#ifdef __x86_64__
#define ABI __msabi textwindows dontinstrument
// cut back on code size and avoid setting errno
// this code is a mandatory dependency of winmain
__msabi extern typeof(CloseHandle) *const __imp_CloseHandle;
@ -47,8 +49,8 @@ __msabi extern typeof(GetEnvironmentVariable)
*const __imp_GetEnvironmentVariableW;
// Generates C:\ProgramData\cosmo\sig\x\y.pid like path
__msabi textwindows dontinstrument char16_t *__sig_process_path(
char16_t *path, uint32_t pid, int create_directories) {
ABI char16_t *__sig_process_path(char16_t *path, uint32_t pid,
int create_directories) {
char16_t buf[3];
char16_t *p = path;
uint32_t vlen = __imp_GetEnvironmentVariableW(u"SYSTEMDRIVE", buf, 3);
@ -100,7 +102,7 @@ __msabi textwindows dontinstrument char16_t *__sig_process_path(
return path;
}
__msabi textwindows atomic_ulong *__sig_map_process(int pid, int disposition) {
ABI atomic_ulong *__sig_map_process(int pid, int disposition) {
char16_t path[128];
__sig_process_path(path, pid, disposition == kNtOpenAlways);
intptr_t hand = __imp_CreateFileW(path, kNtGenericRead | kNtGenericWrite,

View file

@ -32,8 +32,7 @@ int sys_sigprocmask(int how, const sigset_t *opt_set,
how, opt_set ? (sigset_t *)(intptr_t)(uint32_t)*opt_set : 0, 0, 0);
rc = 0;
}
if (rc != -1 && opt_out_oldset) {
if (rc != -1 && opt_out_oldset)
*opt_out_oldset = old[0];
}
return rc;
}

View file

@ -36,12 +36,10 @@
privileged const char *strsignal_r(int sig, char buf[21]) {
char *p;
const char *s;
if (!sig) {
if (!sig)
return "0";
}
if ((s = GetMagnumStr(kSignalNames, sig))) {
if ((s = GetMagnumStr(kSignalNames, sig)))
return s;
}
if (SIGRTMIN <= sig && sig <= SIGRTMAX) {
sig -= SIGRTMIN;
buf[0] = 'S';

View file

@ -79,7 +79,11 @@ int ulock_wait(uint32_t operation, void *addr, uint64_t value,
// it could also mean another thread calling ulock on this address was
// configured (via operation) in an inconsistent way.
//
int ulock_wake(uint32_t operation, void *addr, uint64_t wake_value) {
// should be dontinstrument because SiliconThreadMain() calls this from
// a stack managed by apple libc.
//
dontinstrument int ulock_wake(uint32_t operation, void *addr,
uint64_t wake_value) {
int rc;
rc = __syscall3i(operation, (long)addr, wake_value, 0x2000000 | 516);
LOCKTRACE("ulock_wake(%#x, %p, %lx) → %s", operation, addr, wake_value,

View file

@ -48,9 +48,8 @@
* @param st is open symbol table for current executable
* @return -1 w/ errno if error happened
*/
dontinstrument int PrintBacktraceUsingSymbols(int fd,
const struct StackFrame *bp,
struct SymbolTable *st) {
int PrintBacktraceUsingSymbols(int fd, const struct StackFrame *bp,
struct SymbolTable *st) {
size_t gi;
char *cxxbuf;
intptr_t addr;

View file

@ -33,14 +33,14 @@ static char __watch_last[4096];
void __watch_hook(void);
static dontinstrument inline void Copy(char *p, char *q, size_t n) {
dontinstrument static inline void Copy(char *p, char *q, size_t n) {
size_t i;
for (i = 0; i < n; ++i) {
p[i] = q[i];
}
}
static dontinstrument inline int Cmp(char *p, char *q, size_t n) {
dontinstrument static inline int Cmp(char *p, char *q, size_t n) {
if (n == 8)
return READ64LE(p) != READ64LE(q);
if (n == 4)

View file

@ -291,21 +291,6 @@
.balign 4
.endm
// Loads address of errno into %rcx
.macro .errno
call __errno_location
.endm
// Post-Initialization Read-Only (PIRO) BSS section.
// @param ss is an optional string, for control image locality
.macro .piro ss
.ifnb \ss
.section .piro.sort.bss.\ss,"aw",@nobits
.else
.section .piro.bss,"aw",@nobits
.endif
.endm
// Helpers for Cosmopolitan _init() amalgamation magic.
// @param name should be consistent across macros for a module
// @see libc/runtime/_init.S

View file

@ -71,7 +71,7 @@ struct Procs __proc = {
.lock = PTHREAD_MUTEX_INITIALIZER,
};
static textwindows void __proc_stats(int64_t h, struct rusage *ru) {
textwindows static void __proc_stats(int64_t h, struct rusage *ru) {
bzero(ru, sizeof(*ru));
struct NtProcessMemoryCountersEx memcount = {sizeof(memcount)};
GetProcessMemoryInfo(h, &memcount, sizeof(memcount));
@ -137,7 +137,7 @@ textwindows int __proc_harvest(struct Proc *pr, bool iswait4) {
return sic;
}
static textwindows dontinstrument uint32_t __proc_worker(void *arg) {
textwindows dontinstrument static uint32_t __proc_worker(void *arg) {
struct CosmoTib tls;
char *sp = __builtin_frame_address(0);
__bootstrap_tls(&tls, __builtin_frame_address(0));
@ -246,7 +246,7 @@ static textwindows dontinstrument uint32_t __proc_worker(void *arg) {
/**
* Lazy initializes process tracker data structures and worker.
*/
static textwindows void __proc_setup(void) {
textwindows static void __proc_setup(void) {
__proc.onbirth = CreateEvent(0, 0, 0, 0); // auto reset
__proc.haszombies = CreateEvent(0, 1, 0, 0); // manual reset
__proc.thread = CreateThread(0, STACK_SIZE, __proc_worker, 0,

View file

@ -26,7 +26,7 @@
// @param rdx x2 is ptid
// @param rcx x3 is ctid
// @param r8 x4 is tls
// @param r9 x5 is func(void*,int)→int
// @param r9 x5 is func(void*)→int
// @param 8(rsp) x6 is arg
// @return tid of child on success, or -errno on error
sys_clone_linux:
@ -45,16 +45,10 @@ sys_clone_linux:
ret
2: xor %ebp,%ebp // child thread
mov %rbx,%rdi // arg
mov %r10,%r15 // experiment
mov (%r10),%esi // tid
call *%r9 // func(arg,tid)
xchg %eax,%edi // func(arg,tid) exitcode
mov (%r15),%eax // experiment
test %eax,%eax // experiment
jz 1f // experiment
mov $60,%eax // __NR_exit(exitcode)
syscall
1: hlt // ctid was corrupted by program!
#elif defined(__aarch64__)
stp x29,x30,[sp,#-16]!
mov x29,sp
@ -69,7 +63,6 @@ sys_clone_linux:
2: mov x29,#0 // wipe backtrace
mov x28,x3 // set cosmo tls
mov x0,x6 // child thread
ldr w1,[x4] // arg2 = *ctid
blr x5
mov x8,#93 // __NR_exit
svc #0

View file

@ -16,50 +16,27 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/sysv/consts/clone.h"
#include "libc/assert.h"
#include "libc/atomic.h"
#include "libc/calls/calls.h"
#include "libc/calls/state.internal.h"
#include "libc/calls/struct/sigset.h"
#include "libc/calls/struct/ucontext-netbsd.internal.h"
#include "libc/calls/syscall-sysv.internal.h"
#include "libc/calls/wincrash.internal.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/intrin/asmflag.h"
#include "libc/intrin/atomic.h"
#include "libc/intrin/describeflags.h"
#include "libc/intrin/strace.h"
#include "libc/intrin/ulock.h"
#include "libc/intrin/weaken.h"
#include "libc/limits.h"
#include "libc/macros.h"
#include "libc/mem/alloca.h"
#include "libc/nt/enum/processcreationflags.h"
#include "libc/nt/runtime.h"
#include "libc/nt/signals.h"
#include "libc/nt/synchronization.h"
#include "libc/nt/thread.h"
#include "libc/nt/thunk/msabi.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/stack.h"
#include "libc/runtime/syslib.internal.h"
#include "libc/sock/internal.h"
#include "libc/stdalign.h"
#include "libc/stdio/sysparam.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/arch.h"
#include "libc/sysv/consts/clone.h"
#include "libc/sysv/consts/futex.h"
#include "libc/sysv/consts/nr.h"
#include "libc/sysv/consts/nrlinux.h"
#include "libc/sysv/errfuns.h"
#include "libc/thread/freebsd.internal.h"
#include "libc/thread/openbsd.internal.h"
#include "libc/thread/posixthread.internal.h"
#include "libc/thread/thread.h"
#include "libc/thread/tls.h"
#include "libc/thread/xnu.internal.h"
#define kMaxThreadIds 32768
@ -79,28 +56,19 @@
#define LWP_SUSPENDED 0x00000080
struct CloneArgs {
alignas(16) union {
struct {
atomic_int tid;
int this;
};
union {
long sp;
int64_t tid64;
};
atomic_int *ptid;
atomic_int *ctid;
atomic_int *ztid;
char *tls;
int (*func)(void *, int);
int (*func)(void *);
void *arg;
long sp;
};
int sys_set_tls(uintptr_t, void *);
int __stack_call(void *, int, long, long, int (*)(void *, int), long);
static long AlignStack(long sp, char *stk, long stksz, int mal) {
return sp & -mal;
}
int __stack_call(void *, int, long, long, int (*)(void *), long);
#ifdef __x86_64__
@ -109,7 +77,6 @@ static long AlignStack(long sp, char *stk, long stksz, int mal) {
__msabi extern typeof(ExitThread) *const __imp_ExitThread;
__msabi extern typeof(GetCurrentThreadId) *const __imp_GetCurrentThreadId;
__msabi extern typeof(TlsSetValue) *const __imp_TlsSetValue;
__msabi extern typeof(WakeByAddressAll) *const __imp_WakeByAddressAll;
textwindows dontinstrument wontreturn static void //
@ -117,51 +84,45 @@ WinThreadEntry(int rdi, // rcx
int rsi, // rdx
int rdx, // r8
struct CloneArgs *wt) { // r9
int rc;
if (wt->tls)
__set_tls_win32(wt->tls);
__set_tls_win32(wt->tls);
int tid = __imp_GetCurrentThreadId();
atomic_int *ctid = wt->ctid;
atomic_init(ctid, tid);
atomic_init(wt->ptid, tid);
atomic_init(wt->ctid, tid);
rc = __stack_call(wt->arg, wt->tid, 0, 0, wt->func, wt->sp);
int rc = __stack_call(wt->arg, tid, 0, 0, wt->func, wt->sp);
// we can now clear ctid directly since we're no longer using our own
// stack memory, which can now be safely free'd by the parent thread.
atomic_store_explicit(wt->ztid, 0, memory_order_release);
__imp_WakeByAddressAll(wt->ztid);
atomic_store_explicit(ctid, 0, memory_order_release);
__imp_WakeByAddressAll(ctid);
// since we didn't indirect this function through NT2SYSV() it's not
// safe to simply return, and as such, we need ExitThread().
__imp_ExitThread(rc);
__builtin_unreachable();
}
static textwindows errno_t CloneWindows(int (*func)(void *, int), char *stk,
size_t stksz, int flags, void *arg,
void *tls, atomic_int *ptid,
atomic_int *ctid) {
textwindows static errno_t CloneWindows(int (*func)(void *), char *stk,
size_t stksz, void *arg, void *tls,
atomic_int *ptid, atomic_int *ctid) {
long sp;
int64_t h;
intptr_t tip;
uint32_t utid;
struct CloneArgs *wt;
sp = (intptr_t)stk + stksz;
sp = AlignStack(sp, stk, stksz, 16);
sp = tip = (intptr_t)stk + stksz;
sp -= sizeof(struct CloneArgs);
sp &= -alignof(struct CloneArgs);
wt = (struct CloneArgs *)sp;
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
wt->ctid = ctid;
wt->ptid = ptid;
wt->func = func;
wt->arg = arg;
wt->tls = flags & CLONE_SETTLS ? tls : 0;
wt->sp = sp;
wt->tls = tls;
wt->sp = tip & -16;
if ((h = CreateThread(&kNtIsInheritable, 65536, (void *)WinThreadEntry, wt,
kNtStackSizeParamIsAReservation, &utid))) {
if (flags & CLONE_PARENT_SETTID)
atomic_init(ptid, utid);
if (flags & CLONE_SETTLS) {
struct CosmoTib *tib = tls;
atomic_store_explicit(&tib->tib_syshand, h, memory_order_release);
}
atomic_init(ptid, utid);
struct CosmoTib *tib = tls;
atomic_store_explicit(&tib->tib_syshand, h, memory_order_release);
return 0;
} else {
return __dos2errno(GetLastError());
@ -185,37 +146,33 @@ asm("XnuThreadThunk:\n\t"
".size\tXnuThreadThunk,.-XnuThreadThunk");
__attribute__((__used__))
static dontinstrument wontreturn void
XnuThreadMain(void *pthread, // rdi
int tid, // rsi
int (*func)(void *arg, int tid), // rdx
void *arg, // rcx
struct CloneArgs *wt, // r8
unsigned xnuflags) { // r9
int ax;
wt->tid = tid;
dontinstrument wontreturn static void
XnuThreadMain(void *pthread, // rdi
int tid, // rsi
int (*func)(void *arg), // rdx
void *arg, // rcx
struct CloneArgs *wt, // r8
unsigned xnuflags) { // r9
atomic_init(wt->ctid, tid);
atomic_init(wt->ptid, tid);
if (wt->tls) {
// XNU uses the same 0x30 offset as the WIN32 TIB x64. They told the
// Go team at Google that Apple stands by our ability to use it
// https://github.com/golang/go/issues/23617#issuecomment-376662373
asm volatile("syscall"
: "=a"(ax)
: "0"(__NR_thread_fast_set_cthread_self), "D"(wt->tls - 0x30)
: "rcx", "rdx", "r8", "r9", "r10", "r11", "memory", "cc");
}
// XNU uses the same 0x30 offset as the WIN32 TIB x64. They told the
// Go team at Google that Apple stands by our ability to use it
// https://github.com/golang/go/issues/23617#issuecomment-376662373
int ax;
asm volatile("syscall"
: "=a"(ax)
: "0"(__NR_thread_fast_set_cthread_self), "D"(wt->tls - 0x30)
: "rcx", "rdx", "r8", "r9", "r10", "r11", "memory", "cc");
func(arg, tid);
func(arg);
// we no longer use the stack after this point
// %rax = int bsdthread_terminate(%rdi = void *stackaddr,
// %rsi = size_t freesize,
// %rdx = uint32_t port,
// %r10 = uint32_t sem);
asm volatile("movl\t$0,(%%rsi)\n\t" // *wt->ztid = 0
asm volatile("movl\t$0,(%%rsi)\n\t" // *wt->ctid = 0
"mov\t$0x101,%%edi\n\t" // wake all
"xor\t%%edx,%%edx\n\t" // wake_value
"mov\t$0x02000204,%%eax\n\t" // ulock_wake()
@ -227,19 +184,18 @@ XnuThreadMain(void *pthread, // rdi
"mov\t$0x02000169,%%eax\n\t" // bsdthread_terminate()
"syscall"
: /* no outputs */
: "S"(wt->ztid)
: "S"(wt->ctid)
: "rax", "rcx", "r10", "r11", "memory");
__builtin_unreachable();
}
static errno_t CloneXnu(int (*fn)(void *), char *stk, size_t stksz, int flags,
void *arg, void *tls, atomic_int *ptid,
atomic_int *ctid) {
static errno_t CloneXnu(int (*fn)(void *), char *stk, size_t stksz, void *arg,
void *tls, atomic_int *ptid, atomic_int *ctid) {
// perform this weird mandatory system call once
static bool once;
if (!once) {
npassert(sys_bsdthread_register(XnuThreadThunk, 0, 0, 0, 0, 0, 0) != -1);
sys_bsdthread_register(XnuThreadThunk, 0, 0, 0, 0, 0, 0);
once = true;
}
@ -247,16 +203,15 @@ static errno_t CloneXnu(int (*fn)(void *), char *stk, size_t stksz, int flags,
long sp;
struct CloneArgs *wt;
sp = (intptr_t)stk + stksz;
sp = AlignStack(sp, stk, stksz, 16);
sp -= sizeof(struct CloneArgs);
sp &= -alignof(struct CloneArgs);
wt = (struct CloneArgs *)sp;
sp &= -16;
// pass parameters to new thread via xnu
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
wt->tls = flags & CLONE_SETTLS ? tls : 0;
wt->ctid = ctid;
wt->ptid = ptid;
wt->tls = tls;
return sys_clone_xnu(fn, arg, wt, 0, PTHREAD_START_CUSTOM_XNU);
}
@ -267,25 +222,25 @@ static errno_t CloneXnu(int (*fn)(void *), char *stk, size_t stksz, int flags,
// 1. __asan_handle_no_return wipes stack [todo?]
relegated dontinstrument wontreturn static void OpenbsdThreadMain(void *p) {
struct CloneArgs *wt = p;
atomic_init(wt->ptid, wt->tid);
atomic_init(wt->ctid, wt->tid);
wt->func(wt->arg, wt->tid);
asm volatile("mov\t%2,%%rsp\n\t" // so syscall can validate stack exists
"movl\t$0,(%%rdi)\n\t" // *wt->ztid = 0 (old stack now free'd)
int tid = atomic_load_explicit(wt->ctid, memory_order_relaxed);
atomic_init(wt->ptid, tid);
wt->func(wt->arg);
asm volatile("mov\t%1,%%rsp\n\t" // so syscall can validate stack exists
"movl\t$0,(%2)\n\t" // *wt->ctid = 0 (old stack now free'd)
"syscall\n\t" // futex(int*, op, val) will wake wait0
"xor\t%%edi,%%edi\n\t" // so kernel doesn't write to old stack
"mov\t$302,%%eax\n\t" // __threxit(int *notdead) doesn't wake
"syscall"
: "=m"(*wt->ztid)
: "a"(83), "m"(__oldstack), "D"(wt->ztid),
: /* no outputs */
: "a"(83), "m"(__oldstack), "D"(wt->ctid),
"S"(2 /* FUTEX_WAKE */), "d"(INT_MAX)
: "rcx", "r11", "memory");
__builtin_unreachable();
}
relegated errno_t CloneOpenbsd(int (*func)(void *, int), char *stk,
size_t stksz, int flags, void *arg, void *tls,
atomic_int *ptid, atomic_int *ctid) {
relegated static errno_t CloneOpenbsd(int (*func)(void *), char *stk,
size_t stksz, void *arg, void *tls,
atomic_int *ptid, atomic_int *ctid) {
int rc;
intptr_t sp;
struct __tfork *tf;
@ -297,18 +252,18 @@ relegated errno_t CloneOpenbsd(int (*func)(void *, int), char *stk,
sp -= sizeof(struct CloneArgs);
sp &= -alignof(struct CloneArgs);
wt = (struct CloneArgs *)sp;
sp = AlignStack(sp, stk, stksz, 16);
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
sp &= -16;
sp -= 8;
*(intptr_t *)sp = (intptr_t)CloneOpenbsd + 1;
wt->ctid = ctid;
wt->ptid = ptid;
wt->arg = arg;
wt->func = func;
tf->tf_stack = (char *)sp - 8;
tf->tf_tcb = flags & CLONE_SETTLS ? tls : 0;
tf->tf_tid = &wt->tid;
tf->tf_stack = (char *)sp;
tf->tf_tcb = tls;
tf->tf_tid = ctid;
if ((rc = __tfork_thread(tf, sizeof(*tf), OpenbsdThreadMain, wt)) >= 0) {
if (flags & CLONE_PARENT_SETTID)
atomic_init(ptid, rc);
atomic_init(ptid, rc);
return 0;
} else {
return -rc;
@ -319,35 +274,30 @@ relegated errno_t CloneOpenbsd(int (*func)(void *, int), char *stk,
// NET BESIYATA DISHMAYA
wontreturn dontinstrument static void NetbsdThreadMain(
void *arg, // rdi
int (*func)(void *, int), // rsi
int flags, // rdx
atomic_int *ctid, // rcx
atomic_int *ptid) { // r8
int ax, dx;
static atomic_int clobber;
atomic_int *ztid = &clobber;
ax = sys_gettid();
if (flags & CLONE_CHILD_SETTID)
atomic_init(ctid, ax);
if (flags & CLONE_PARENT_SETTID)
atomic_init(ptid, ax);
if (flags & CLONE_CHILD_CLEARTID)
ztid = ctid;
func(arg, ax);
void *arg, // rdi
int (*func)(void *), // rsi
atomic_int *ctid, // rdx
atomic_int *ptid) { // rcx
int ax;
asm("syscall"
: "=a"(ax) // man says always succeeds
: "0"(311) // _lwp_self()
: "rcx", "rdx", "r8", "r9", "r10", "r11", "memory", "cc");
atomic_init(ctid, ax);
atomic_init(ptid, ax);
func(arg);
// we no longer use the stack after this point
// %eax = int __lwp_exit(void);
asm volatile("movl\t$0,%2\n\t" // *ztid = 0
"syscall" // __lwp_exit()
: "=a"(ax), "=d"(dx), "=m"(*ztid)
: "0"(310)
asm volatile("movl\t$0,(%2)\n\t" // *ctid = 0
"syscall" // __lwp_exit()
: "=a"(ax)
: "0"(310), "r"(ctid)
: "rcx", "r11", "memory");
__builtin_unreachable();
}
static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, atomic_int *ptid,
atomic_int *ctid) {
static int CloneNetbsd(int (*func)(void *), char *stk, size_t stksz, void *arg,
void *tls, atomic_int *ptid, atomic_int *ctid) {
// NetBSD has its own clone() and it works, but it's technically a
// second-class API, intended to help Linux folks migrate to this.
int ax;
@ -363,13 +313,12 @@ static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz,
: CFLAG_CONSTRAINT(failed), "=a"(ax)
: "1"(__NR_getcontext_netbsd), "D"(&netbsd_clone_template)
: "rcx", "rdx", "r8", "r9", "r10", "r11", "memory");
npassert(!failed);
once = true;
}
sp = (intptr_t)stk + stksz;
// align the stack
sp = AlignStack(sp, stk, stksz, 16);
sp &= -16;
// simulate call to misalign stack and ensure backtrace looks good
sp -= 8;
@ -377,8 +326,7 @@ static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz,
// place the giant 784 byte ucontext structure in the red zone!
// it only has to live long enough for the thread to come alive
ctx = (struct ucontext_netbsd *)((sp - sizeof(struct ucontext_netbsd)) &
-alignof(struct ucontext_netbsd));
ctx = (struct ucontext_netbsd *)((sp - sizeof(struct ucontext_netbsd)) & -64);
// pass parameters in process state
memcpy(ctx, &netbsd_clone_template, sizeof(*ctx));
@ -388,17 +336,14 @@ static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz,
ctx->uc_mcontext.rip = (intptr_t)NetbsdThreadMain;
ctx->uc_mcontext.rdi = (intptr_t)arg;
ctx->uc_mcontext.rsi = (intptr_t)func;
ctx->uc_mcontext.rdx = flags;
ctx->uc_mcontext.rcx = (intptr_t)ctid;
ctx->uc_mcontext.r8 = (intptr_t)ptid;
ctx->uc_mcontext.rdx = (intptr_t)ctid;
ctx->uc_mcontext.rcx = (intptr_t)ptid;
ctx->uc_flags |= _UC_STACK;
ctx->uc_stack.ss_sp = stk;
ctx->uc_stack.ss_size = stksz;
ctx->uc_stack.ss_flags = 0;
if (flags & CLONE_SETTLS) {
ctx->uc_flags |= _UC_TLSBASE;
ctx->uc_mcontext._mc_tlsbase = (intptr_t)tls;
}
ctx->uc_flags |= _UC_TLSBASE;
ctx->uc_mcontext._mc_tlsbase = (intptr_t)tls;
// perform the system call
int tid = 0;
@ -407,9 +352,7 @@ static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz,
: "1"(__NR__lwp_create), "D"(ctx), "S"(LWP_DETACHED), "2"(&tid)
: "rcx", "r8", "r9", "r10", "r11", "memory");
if (!failed) {
unassert(tid);
if (flags & CLONE_PARENT_SETTID)
atomic_init(ptid, tid);
atomic_init(ptid, tid);
return 0;
} else {
return ax;
@ -428,35 +371,35 @@ wontreturn dontinstrument static void FreebsdThreadMain(void *p) {
#elif defined(__x86_64__)
sys_set_tls(AMD64_SET_GSBASE, wt->tls);
#endif
atomic_init(wt->ctid, wt->tid);
atomic_init(wt->ptid, wt->tid);
wt->func(wt->arg, wt->tid);
atomic_init(wt->ctid, wt->tid64);
atomic_init(wt->ptid, wt->tid64);
wt->func(wt->arg);
// we no longer use the stack after this point
// void thr_exit(%rdi = long *state);
#ifdef __x86_64__
asm volatile("movl\t$0,%0\n\t" // *wt->ztid = 0
"syscall\n\t" // _umtx_op(wt->ztid, WAKE, INT_MAX)
asm volatile("movl\t$0,%0\n\t" // *wt->ctid = 0
"syscall\n\t" // _umtx_op(wt->ctid, WAKE, INT_MAX)
"movl\t$431,%%eax\n\t" // thr_exit(long *nonzeroes_and_wake)
"xor\t%%edi,%%edi\n\t" // sad we can't use this free futex op
"syscall\n\t" // thr_exit() fails if thread is orphaned
"movl\t$1,%%eax\n\t" // _exit()
"syscall" //
: "=m"(*wt->ztid)
: "a"(454), "D"(wt->ztid), "S"(UMTX_OP_WAKE), "d"(INT_MAX)
: "=m"(*wt->ctid)
: "a"(454), "D"(wt->ctid), "S"(UMTX_OP_WAKE), "d"(INT_MAX)
: "rcx", "r8", "r9", "r10", "r11", "memory");
#elif defined(__aarch64__)
register long x0 asm("x0") = (long)wt->ztid;
register long x0 asm("x0") = (long)wt->ctid;
register long x1 asm("x1") = UMTX_OP_WAKE;
register long x2 asm("x2") = INT_MAX;
register long x8 asm("x8") = 454; // _umtx_op
asm volatile("str\twzr,%0\n\t" // *wt->ztid = 0
"svc\t0\n\t" // _umtx_op(wt->ztid, WAKE, INT_MAX)
asm volatile("str\twzr,%0\n\t" // *wt->ctid = 0
"svc\t0\n\t" // _umtx_op(wt->ctid, WAKE, INT_MAX)
"mov\tx0,#0\n\t" // arg0 = 0
"mov\tx8,#431\n\t" // thr_exit
"svc\t0\n\t" // thr_exit(long *nonzeroes_and_wake = 0)
"mov\tx8,#1\n\t" // _exit
"svc\t0" // _exit(long *nonzeroes_and_wake = 0)
: "=m"(*wt->ztid)
: "=m"(*wt->ctid)
: "r"(x0), "r"(x1), "r"(x2), "r"(x8));
#else
#error "unsupported architecture"
@ -464,20 +407,19 @@ wontreturn dontinstrument static void FreebsdThreadMain(void *p) {
__builtin_unreachable();
}
static errno_t CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, atomic_int *ptid,
static errno_t CloneFreebsd(int (*func)(void *), char *stk, size_t stksz,
void *arg, void *tls, atomic_int *ptid,
atomic_int *ctid) {
long sp;
int64_t tid;
int64_t tid64;
struct CloneArgs *wt;
sp = (intptr_t)stk + stksz;
sp -= sizeof(struct CloneArgs);
sp &= -alignof(struct CloneArgs);
wt = (struct CloneArgs *)sp;
sp = AlignStack(sp, stk, stksz, 16);
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
sp &= -16;
wt->ctid = ctid;
wt->ptid = ptid;
wt->tls = tls;
wt->func = func;
wt->arg = arg;
@ -486,10 +428,10 @@ static errno_t CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
.arg = wt,
.stack_base = stk,
.stack_size = sp - (long)stk,
.tls_base = flags & CLONE_SETTLS ? tls : 0,
.tls_base = tls,
.tls_size = 64,
.child_tid = &wt->tid64,
.parent_tid = &tid,
.parent_tid = &tid64,
};
#ifdef __x86_64__
int ax;
@ -510,8 +452,7 @@ static errno_t CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
#else
#error "unsupported architecture"
#endif
if (flags & CLONE_PARENT_SETTID)
atomic_init(ptid, tid);
atomic_init(ptid, tid64);
return 0;
}
@ -522,57 +463,57 @@ static errno_t CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
dontinstrument static void *SiliconThreadMain(void *arg) {
struct CloneArgs *wt = arg;
atomic_int *ctid = wt->ctid;
int tid = atomic_load_explicit(ctid, memory_order_relaxed);
asm volatile("mov\tx28,%0" : /* no outputs */ : "r"(wt->tls));
atomic_init(wt->ctid, wt->this);
atomic_init(wt->ptid, wt->this);
__stack_call(wt->arg, wt->this, 0, 0, wt->func, wt->sp);
atomic_store_explicit(wt->ztid, 0, memory_order_release);
ulock_wake(UL_COMPARE_AND_WAIT | ULF_WAKE_ALL, wt->ztid, 0);
__stack_call(wt->arg, tid, 0, 0, wt->func, wt->sp);
atomic_store_explicit(ctid, 0, memory_order_release);
ulock_wake(UL_COMPARE_AND_WAIT | ULF_WAKE_ALL, ctid, 0);
return 0;
}
static errno_t CloneSilicon(int (*fn)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, atomic_int *ptid,
static errno_t CloneSilicon(int (*fn)(void *), char *stk, size_t stksz,
void *arg, void *tls, atomic_int *ptid,
atomic_int *ctid) {
long sp;
void *attr;
errno_t res;
unsigned tid;
pthread_t th;
size_t babystack;
struct CloneArgs *wt;
// assign tid to new thread
static atomic_uint tids;
sp = (intptr_t)stk + stksz;
unsigned tid = atomic_fetch_add_explicit(&tids, 1, memory_order_relaxed);
tid %= kMaxThreadIds;
tid += kMinThreadId;
atomic_init(ctid, tid);
atomic_init(ptid, tid);
// pass temp data on stack
intptr_t sp, tip;
struct CloneArgs *wt;
sp = tip = (intptr_t)stk + stksz;
sp -= sizeof(struct CloneArgs);
sp &= -alignof(struct CloneArgs);
wt = (struct CloneArgs *)sp;
sp = AlignStack(sp, stk, stksz, 16);
tid = atomic_fetch_add_explicit(&tids, 1, memory_order_acq_rel);
wt->this = tid = (tid % kMaxThreadIds) + kMinThreadId;
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
wt->tls = flags & CLONE_SETTLS ? tls : 0;
wt->func = fn;
wt->arg = arg;
wt->sp = sp;
babystack = __syslib->__pthread_stack_min;
wt->tls = tls;
wt->ctid = ctid;
wt->sp = tip & -16;
// ask apple libc to spawn thread
errno_t res;
pthread_t th;
size_t babystack = __syslib->__pthread_stack_min;
#pragma GCC push_options
#pragma GCC diagnostic ignored "-Walloca-larger-than="
attr = alloca(__syslib->__sizeof_pthread_attr_t);
void *attr = alloca(__syslib->__sizeof_pthread_attr_t);
#pragma GCC pop_options
unassert(!__syslib->__pthread_attr_init(attr));
unassert(!__syslib->__pthread_attr_setguardsize(attr, 0));
unassert(!__syslib->__pthread_attr_setstacksize(attr, babystack));
__syslib->__pthread_attr_init(attr);
__syslib->__pthread_attr_setguardsize(attr, 0);
__syslib->__pthread_attr_setstacksize(attr, babystack);
if (!(res = __syslib->__pthread_create(&th, attr, SiliconThreadMain, wt))) {
if (flags & CLONE_PARENT_SETTID)
atomic_init(ptid, tid);
if (flags & CLONE_SETTLS) {
struct CosmoTib *tib = tls;
atomic_store_explicit(&tib[-1].tib_syshand, th, memory_order_release);
}
atomic_init(ptid, tid);
struct CosmoTib *tib = tls;
atomic_store_explicit(&tib[-1].tib_syshand, th, memory_order_release);
}
unassert(!__syslib->__pthread_attr_destroy(attr));
__syslib->__pthread_attr_destroy(attr);
return res;
}
@ -582,10 +523,9 @@ static errno_t CloneSilicon(int (*fn)(void *, int), char *stk, size_t stksz,
// GNU/SYSTEMD
struct LinuxCloneArgs {
int (*func)(void *, int);
int (*func)(void *);
void *arg;
char *tls;
atomic_int ctid;
};
int sys_clone_linux(int flags, // rdi
@ -596,44 +536,32 @@ int sys_clone_linux(int flags, // rdi
void *func, // r9
void *arg); // 8(rsp)
dontinstrument static int LinuxThreadEntry(void *arg, int tid) {
dontinstrument static int AmdLinuxThreadEntry(void *arg) {
struct LinuxCloneArgs *wt = arg;
#if defined(__x86_64__)
sys_set_tls(ARCH_SET_GS, wt->tls);
#endif
return wt->func(wt->arg, tid);
return wt->func(wt->arg);
}
static int CloneLinux(int (*func)(void *arg, int rc), char *stk, size_t stksz,
int flags, void *arg, void *tls, atomic_int *ptid,
static int CloneLinux(int (*func)(void *), char *stk, size_t stksz, int flags,
void *arg, void *tls, atomic_int *ptid,
atomic_int *ctid) {
int rc;
long sp;
struct LinuxCloneArgs *wt;
sp = (intptr_t)stk + stksz;
long sp = (intptr_t)stk + stksz;
#if defined(__x86_64__)
sp -= sizeof(struct LinuxCloneArgs);
sp &= -alignof(struct LinuxCloneArgs);
wt = (struct LinuxCloneArgs *)sp;
// align the stack
#ifdef __aarch64__
sp = AlignStack(sp, stk, stksz, 128); // for kernel <=4.6
#else
sp = AlignStack(sp, stk, stksz, 16);
struct LinuxCloneArgs *wt = (struct LinuxCloneArgs *)sp;
sp &= -16; // align the stack
wt->arg = arg;
wt->tls = tls;
wt->func = func;
func = AmdLinuxThreadEntry;
arg = wt;
#elif defined(__aarch64__)
sp &= -128; // for kernels <=4.6
#endif
#ifdef __x86_64__
if (flags & CLONE_SETTLS) {
flags &= ~CLONE_SETTLS;
wt->arg = arg;
wt->tls = tls;
wt->func = func;
func = LinuxThreadEntry;
arg = wt;
}
#endif
if (~flags & CLONE_CHILD_SETTID) {
flags |= CLONE_CHILD_SETTID;
ctid = &wt->ctid;
}
int rc;
if ((rc = sys_clone_linux(flags, sp, ptid, ctid, tls, func, arg)) >= 0) {
// clone() is documented as setting ptid before return
return 0;
@ -646,110 +574,9 @@ static int CloneLinux(int (*func)(void *arg, int rc), char *stk, size_t stksz,
// COSMOPOLITAN
/**
* Creates thread without malloc being linked.
* Creates thread without malloc() being linked.
*
* If you use clone() you're on your own. Example:
*
* int worker(void *arg) { return 0; }
* struct CosmoTib tib = {.tib_self = &tib, .tib_ctid = -1};
* atomic_int tid;
* char *stk = NewCosmoStack();
* clone(worker, stk, GetStackSize() - 16,
* CLONE_VM | CLONE_THREAD | CLONE_FS | CLONE_FILES |
* CLONE_SYSVSEM | CLONE_SIGHAND | CLONE_PARENT_SETTID |
* CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | CLONE_SETTLS,
* arg, &tid, &tib, &tib.tib_tid);
* while (atomic_load(&tid) == 0) sched_yield();
* // thread is known
* while (atomic_load(&tib.tib_ctid) < 0) sched_yield();
* // thread is running
* while (atomic_load(&tib.tib_ctid) > 0) sched_yield();
* // thread has terminated
* FreeCosmoStack(stk);
*
* Threads are created in a detached manner. They currently can't be
* synchronized using wait() or posix signals. Threads created by this
* function should be synchronized using shared memory operations.
*
* Any memory that's required by this system call wrapper is allocated
* to the top of your stack. This shouldn't be more than 128 bytes.
*
* Your function is called from within the stack you specify. A return
* address is pushed onto your stack, that causes returning to jump to
* _Exit1() which terminates the thread. Even though the callback says
* it supports a return code, that'll only work on Linux and Windows.
*
* This function follows the same ABI convention as the Linux userspace
* libraries, with a few small changes. The varargs has been removed to
* help prevent broken code, and the stack size and tls size parameters
* are introduced for compatibility with FreeBSD.
*
* To keep this system call lightweight, only the thread creation use
* case is polyfilled across platforms. For example, if you want fork
* that works on OpenBSD, don't do it with clone(SIGCHLD); please just
* call fork() instead. Even if you do that on Linux, it will
* effectively work around libc features like atfork(), so that means
* other calls like getpid() may return incorrect values.
*
* @param func is your callback function, which this wrapper requires
* not be null, otherwise EINVAL is raised. It is passed two args
* within the child thread: (1) the caller-supplied `arg` and (2)
* the new tid, which is always passed as the second arg for convenience
*
* @param stk points to the bottom of a caller allocated stack, which
* must be allocated via mmap() using the MAP_STACK flag, or else
* you won't get optimal performance and it won't work on OpenBSD
*
* @param stksz is the size of that stack in bytes, we recommend that
* this be set to GetStackSize() or else memory safety tools
* like kprintf() can't do as good and quick of a job; this value
* must be 16-aligned plus it must be at least 4192 bytes in size
* and it's advised to have the bottom-most page be a guard page
*
* @param flags which SHOULD always have all of these flags:
*
* - `CLONE_THREAD`
* - `CLONE_VM`
* - `CLONE_FS`
* - `CLONE_FILES`
* - `CLONE_SIGHAND`
* - `CLONE_SYSVSEM`
*
* This system call wrapper is intended for threads, and as such, we
* won't polyfill Linux's ability to simulate unrelated calls (e.g.
* fork, vfork) via clone() on other platforms. Please just call
* fork() and vfork() when that's what you want.
*
* Your `flags` may also optionally bitwise-OR any combination of the
* following additional flags:
*
* - `CLONE_CHILD_SETTID` must be specified if you intend to set the
* `ctid` argument, which will be updated with the child tid once the
* child has started.
*
* - `CLONE_PARENT_SETTID` must be specified if you intend to set
* the `ptid` argument, and it is updated at the most opportune
* moment. On all platforms except XNU x86, this happens before
* clone() returns. But since it might not be available yet you
* need to use pthread_getunique_np() to obtain it.
*
* - `CLONE_CHILD_CLEARTID` causes `*ctid = 0` upon child thread
* termination. This is used to implement join so that the parent
* may know when it's safe to free the child's stack memory, and
* as such, is guaranteed to happen AFTER the child thread has
* either terminated or has finished using its stack memory
*
* - `CLONE_SETTLS` is needed if you intend to specify the `tls`
* argument, which after thread creation may be accessed using
* __get_tls(). Doing this means that `errno`, gettid(), etc.
* correctly work. Caveat emptor if you choose not to do this.
*
* @param arg is passed as an argument to `func` in the child thread
* @param tls may be used to set the thread local storage segment;
* this parameter is ignored if `CLONE_SETTLS` is not set
* @param ctid lets the child receive its thread id without having to
* call gettid() and is ignored if `CLONE_CHILD_SETTID` isn't set
* @return 0 on success, or errno on error
* If you use clone() you're on your own.
*/
errno_t clone(void *func, void *stk, size_t stksz, int flags, void *arg,
void *ptid, void *tls, void *ctid) {
@ -757,33 +584,25 @@ errno_t clone(void *func, void *stk, size_t stksz, int flags, void *arg,
atomic_fetch_add(&_pthread_count, 1);
if (!func) {
err = EINVAL;
} else if (IsLinux()) {
if (IsLinux()) {
err = CloneLinux(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (!IsTiny() &&
(flags & ~(CLONE_SETTLS | CLONE_PARENT_SETTID |
CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)) !=
(CLONE_THREAD | CLONE_VM | CLONE_FS | CLONE_FILES |
CLONE_SIGHAND | CLONE_SYSVSEM)) {
err = EINVAL;
} else if (IsXnu()) {
#ifdef __x86_64__
err = CloneXnu(func, stk, stksz, flags, arg, tls, ptid, ctid);
#if defined(__x86_64__)
err = CloneXnu(func, stk, stksz, arg, tls, ptid, ctid);
#elif defined(__aarch64__)
err = CloneSilicon(func, stk, stksz, flags, arg, tls, ptid, ctid);
err = CloneSilicon(func, stk, stksz, arg, tls, ptid, ctid);
#else
#error "unsupported architecture"
#endif
} else if (IsFreebsd()) {
err = CloneFreebsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
#ifdef __x86_64__
} else if (IsNetbsd()) {
err = CloneNetbsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (IsOpenbsd()) {
err = CloneOpenbsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
err = CloneFreebsd(func, stk, stksz, arg, tls, ptid, ctid);
#if defined(__x86_64__)
} else if (IsWindows()) {
err = CloneWindows(func, stk, stksz, flags, arg, tls, ptid, ctid);
err = CloneWindows(func, stk, stksz, arg, tls, ptid, ctid);
} else if (IsNetbsd()) {
err = CloneNetbsd(func, stk, stksz, arg, tls, ptid, ctid);
} else if (IsOpenbsd()) {
err = CloneOpenbsd(func, stk, stksz, arg, tls, ptid, ctid);
#endif /* __x86_64__ */
} else {
err = ENOSYS;
@ -793,7 +612,7 @@ errno_t clone(void *func, void *stk, size_t stksz, int flags, void *arg,
err = EAGAIN;
if (err)
unassert(atomic_fetch_sub(&_pthread_count, 1) > 1);
atomic_fetch_sub(&_pthread_count, 1);
return err;
}

View file

@ -22,18 +22,25 @@
ftrace_hook:
#ifdef __x86_64__
// We need to save saved registers because we have some functions
// like __errno_location which can be called from an inline asm()
// statement. It's nice to have the flexibility anyway.
// save argument registers
// we save %rax because __gc() takes it as an argument.
// we save %r10 because it's used as a syscall argument.
cmpl $0,__ftrace(%rip)
jle 1f
push %rbp
mov %rsp,%rbp
and $-16,%rsp
sub $256,%rsp
sub $128,%rsp
movdqu %xmm0,-0x80(%rbp)
movdqu %xmm1,-0x70(%rbp)
movdqu %xmm2,-0x60(%rbp)
movdqu %xmm3,-0x50(%rbp)
movdqu %xmm4,-0x40(%rbp)
movdqu %xmm5,-0x30(%rbp)
movdqu %xmm6,-0x20(%rbp)
movdqu %xmm7,-0x10(%rbp)
push %rax
push %rbx
push %rcx
push %rdx
push %rdi
@ -41,19 +48,15 @@ ftrace_hook:
push %r8
push %r9
push %r10
push %r11
push %r12
push %r13
push %r14
push %r15
call __xmm_save
call ftracer
call __xmm_load
pop %r15
pop %r14
pop %r13
pop %r12
pop %r11
movdqu -0x80(%rbp),%xmm0
movdqu -0x70(%rbp),%xmm1
movdqu -0x60(%rbp),%xmm2
movdqu -0x50(%rbp),%xmm3
movdqu -0x40(%rbp),%xmm4
movdqu -0x30(%rbp),%xmm5
movdqu -0x20(%rbp),%xmm6
movdqu -0x10(%rbp),%xmm7
pop %r10
pop %r9
pop %r8
@ -61,7 +64,6 @@ ftrace_hook:
pop %rdi
pop %rdx
pop %rcx
pop %rbx
pop %rax
leave
1: ret

View file

@ -31,11 +31,7 @@
#include "libc/thread/tls.h"
/**
* @fileoverview Plain-text function call logging.
*
* Able to log ~2 million function calls per second, which is mostly
* bottlenecked by system call overhead. Log size is reasonable if piped
* into gzip.
* @fileoverview plain-text function call logging
*/
#define MAX_NESTING 512
@ -49,7 +45,7 @@
static struct CosmoFtrace g_ftrace;
__funline int GetNestingLevelImpl(struct StackFrame *frame) {
int nesting = -2;
int nesting = -1;
while (frame && !kisdangerous(frame)) {
++nesting;
frame = frame->next;
@ -82,38 +78,63 @@ privileged void ftracer(void) {
struct StackFrame *sf;
struct CosmoFtrace *ft;
struct PosixThread *pt;
// get interesting values
sf = __builtin_frame_address(0);
st = (uintptr_t)__argv - sizeof(uintptr_t);
if (__ftrace <= 0)
return;
// determine top of stack
// main thread won't consider kernel provided argblock
if (__tls_enabled) {
tib = __get_tls_privileged();
if (tib->tib_ftrace <= 0)
return;
ft = &tib->tib_ftracer;
if ((char *)sf >= tib->tib_sigstack_addr &&
(char *)sf <= tib->tib_sigstack_addr + tib->tib_sigstack_size) {
st = (uintptr_t)tib->tib_sigstack_addr + tib->tib_sigstack_size;
} else if ((pt = (struct PosixThread *)tib->tib_pthread) &&
pt->pt_attr.__stacksize) {
st = (uintptr_t)pt->pt_attr.__stackaddr + pt->pt_attr.__stacksize;
pt = (struct PosixThread *)tib->tib_pthread;
if (pt != &_pthread_static) {
if ((char *)sf >= tib->tib_sigstack_addr &&
(char *)sf <= tib->tib_sigstack_addr + tib->tib_sigstack_size) {
st = (uintptr_t)tib->tib_sigstack_addr + tib->tib_sigstack_size;
} else if (pt && pt->pt_attr.__stacksize) {
st = (uintptr_t)pt->pt_attr.__stackaddr + pt->pt_attr.__stacksize;
}
}
} else {
ft = &g_ftrace;
}
stackuse = st - (intptr_t)sf;
if (_cmpxchg(&ft->ft_once, false, true)) {
// estimate stack pointer of hooked function
uintptr_t usp = (uintptr_t)sf;
usp += sizeof(struct StackFrame); // overhead of this function
#if defined(__x86_64__)
usp += 8; // ftrace_hook() stack aligning
usp += 8 * 8; // ftrace_hook() pushed 8x regs
usp += 8 * 16; // ftrace_hook() pushed 8x xmms
#elif defined(__aarch64__)
usp += 384; // overhead of ftrace_hook()
#else
#error "unsupported architecture"
#endif
// determine how much stack hooked function is using
stackuse = st - usp;
// log function call
//
// FUN $PID $TID $STARTNANOS $STACKUSE $SYMBOL
//
if (!ft->ft_once) {
ft->ft_lastaddr = -1;
ft->ft_skew = GetNestingLevelImpl(sf);
ft->ft_once = true;
}
if (_cmpxchg(&ft->ft_noreentry, false, true)) {
sf = sf->next;
fn = sf->addr + DETOUR_SKEW;
if (fn != ft->ft_lastaddr) {
kprintf("%rFUN %6P %6H %'18T %'*ld %*s%t\n", ftrace_stackdigs, stackuse,
GetNestingLevel(ft, sf) * 2, "", fn);
ft->ft_lastaddr = fn;
}
ft->ft_noreentry = false;
sf = sf->next;
fn = sf->addr + DETOUR_SKEW;
if (fn != ft->ft_lastaddr) {
kprintf("%rFUN %6P %6H %'18T %'*ld %*s%t\n", ftrace_stackdigs, stackuse,
GetNestingLevel(ft, sf) * 2, "", fn);
ft->ft_lastaddr = fn;
}
}

View file

@ -84,7 +84,8 @@ o/$(MODE)/libc/sysv/sysret.o: private \
CFLAGS += \
-ffreestanding \
-fno-stack-protector \
-fno-sanitize=all
-fno-sanitize=all \
-mgeneral-regs-only
ifeq ($(ARCH),aarch64)
o/$(MODE)/libc/sysv/sysv.o: private \

View file

@ -35,8 +35,10 @@ errno_t __errno;
/**
* Returns address of `errno` variable.
*
* This function promises to not clobber argument registers.
*/
errno_t *__errno_location(void) {
nocallersavedregisters errno_t *__errno_location(void) {
if (__tls_enabled) {
return &__get_tls()->tib_errno;
} else {

View file

@ -187,7 +187,7 @@ systemfive_error:
#endif
systemfive_errno:
xchg %eax,%ecx
.errno
call __errno_location
mov %ecx,(%rax) // normalize to c library convention
push $-1 // negative one is only error result
pop %rax // the push pop is to save code size

View file

@ -44,7 +44,7 @@
#define STACK_SIZE 65536
static textwindows dontinstrument uint32_t __itimer_worker(void *arg) {
textwindows dontinstrument static uint32_t __itimer_worker(void *arg) {
struct CosmoTib tls;
char *sp = __builtin_frame_address(0);
__bootstrap_tls(&tls, sp);
@ -87,7 +87,7 @@ static textwindows dontinstrument uint32_t __itimer_worker(void *arg) {
return 0;
}
static textwindows void __itimer_setup(void) {
textwindows static void __itimer_setup(void) {
__itimer.thread = CreateThread(0, STACK_SIZE, __itimer_worker, 0,
kNtStackSizeParamIsAReservation, 0);
}

View file

@ -151,7 +151,7 @@ void _pthread_decimate(enum PosixThreadStatus threshold) {
}
}
dontinstrument static int PosixThread(void *arg, int tid) {
static int PosixThread(void *arg) {
struct PosixThread *pt = arg;
// setup scheduling
@ -162,11 +162,11 @@ dontinstrument static int PosixThread(void *arg, int tid) {
// setup signal stack
if (pt->pt_attr.__sigaltstacksize) {
struct sigaltstack ss;
ss.ss_sp = pt->pt_attr.__sigaltstackaddr;
ss.ss_size = pt->pt_attr.__sigaltstacksize;
ss.ss_flags = 0;
unassert(!sigaltstack(&ss, 0));
struct sigaltstack *ss = alloca(sizeof(struct sigaltstack));
ss->ss_sp = pt->pt_attr.__sigaltstackaddr;
ss->ss_size = pt->pt_attr.__sigaltstacksize;
ss->ss_flags = 0;
unassert(!sigaltstack(ss, 0));
}
// set long jump handler so pthread_exit can bring control back here

View file

@ -10,7 +10,6 @@ COSMOPOLITAN_C_START_
struct CosmoFtrace { /* 16 */
char ft_once; /* 0 */
char ft_noreentry; /* 1 */
int ft_skew; /* 4 */
int64_t ft_lastaddr; /* 8 */
};