cosmopolitan/libc/intrin/stack.c
2024-12-25 20:58:08 -08:00

515 lines
18 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2024 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/intrin/stack.h"
#include "libc/assert.h"
#include "libc/atomic.h"
#include "libc/calls/calls.h"
#include "libc/calls/syscall-sysv.internal.h"
#include "libc/cosmo.h"
#include "libc/dce.h"
#include "libc/dlopen/dlfcn.h"
#include "libc/errno.h"
#include "libc/intrin/describeflags.h"
#include "libc/intrin/dll.h"
#include "libc/intrin/maps.h"
#include "libc/intrin/rlimit.h"
#include "libc/intrin/strace.h"
#include "libc/intrin/weaken.h"
#include "libc/runtime/runtime.h"
#include "libc/sock/internal.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/prot.h"
#include "libc/thread/posixthread.internal.h"
#include "libc/thread/thread.h"
/**
* @fileoverview cosmo stack memory manager
*/
// raw linux mmap() flag bits, spelled out by value because flixmap()
// hands them directly to __sys_mmap()
#define MAP_GROWSDOWN_LINUX 0x00000100
#define MAP_ANONYMOUS_LINUX 0x00000020
// this is MAP_FIXED_NOREPLACE as the linux kernel defines it (4.17+).
// kernels that predate the flag ignore the bit and treat the address
// as a hint, which is the case flixmap() polyfills
#define MAP_NOREPLACE_LINUX 0x00100000
#define MAP_NORESERVE_LINUX 0x00004000

// raw openbsd mmap() flag bits used by TellOpenbsdThisIsStackMemory()
#define MAP_ANON_OPENBSD 0x1000
#define MAP_STACK_OPENBSD 0x4000

// recovers the struct CosmoStack that embeds list element `e`
#define THREADSTACK_CONTAINER(e) DLL_CONTAINER(struct CosmoStack, elem, e)
// bookkeeping record for one stack held in the cache. records that
// aren't describing a stack sit on the cosmo_stacks.objects free list
struct CosmoStack {
struct Dll elem; // links record into cosmo_stacks.stacks or cosmo_stacks.objects
void *stackaddr; // bottom of usable stack; the guard region lies directly beneath
size_t stacksize; // byte length of usable stack starting at stackaddr
size_t guardsize; // byte length of guard region ending at stackaddr
};
// global state of the stack cache
struct CosmoStacks {
atomic_uint once; // cosmo_once() guard for running cosmo_stack_setup()
pthread_mutex_t lock; // held via cosmo_stack_lock() while the lists are edited
struct Dll *stacks; // cached stacks; newest is put first, evictions take the last
struct Dll *objects; // spare struct CosmoStack records available for reuse
unsigned count; // number of entries currently on `stacks`
};
// tunables of the stack cache
struct CosmoStacksConfig {
unsigned maxstacks; // most unused stacks to keep; see cosmo_stack_setmaxstacks()
};
// the cache starts out empty (zero initialized lists and count) with a
// statically initialized mutex
static struct CosmoStacks cosmo_stacks = {
.lock = PTHREAD_MUTEX_INITIALIZER,
};
// by default no more than three unused stacks are kept cached
static struct CosmoStacksConfig cosmo_stacks_config = {
.maxstacks = 3,
};
// acquires the mutex that guards the global stack cache
void cosmo_stack_lock(void) {
_pthread_mutex_lock(&cosmo_stacks.lock);
}
// releases the mutex that guards the global stack cache
void cosmo_stack_unlock(void) {
_pthread_mutex_unlock(&cosmo_stacks.lock);
}
// passes the stack cache mutex to _pthread_mutex_wipe_np()
// NOTE(review): presumably this resets the lock to its unlocked state
// (e.g. in a forked child) -- confirm against that helper
void cosmo_stack_wipe(void) {
_pthread_mutex_wipe_np(&cosmo_stacks.lock);
}
// map_growsdown will not grow more than rlimit_stack
//
// returns the rlim_cur value reported by __rlimit_stack_get() with its
// low bits masked off by -__pagesize, i.e. rounded down to a multiple
// of the page size (given that the page size is a power of two)
static size_t cosmo_stack_maxgrow(void) {
return __rlimit_stack_get().rlim_cur & -__pagesize;
}
// allocates private anonymous fixed noreplace memory on linux
//
// @param addr is the exact address the mapping must land on
// @param size is the byte length of the mapping
// @param prot is the page protection to request
// @param flags holds extra raw linux mmap flags; MAP_PRIVATE,
//     MAP_ANONYMOUS_LINUX, and MAP_NOREPLACE_LINUX are always added
// @return addr on success, or null w/ errno; EEXIST is raised when
//     the kernel handed back a mapping somewhere other than addr
static void *flixmap(void *addr, size_t size, int prot, int flags) {
  flags |= MAP_PRIVATE | MAP_ANONYMOUS_LINUX | MAP_NOREPLACE_LINUX;
  void *res = __sys_mmap(addr, size, prot, flags, -1, 0, 0);
  if (res == MAP_FAILED) {
    res = 0;
  } else if (res != addr) {
    // the kernel treated addr as a hint and mapped us someplace else.
    // we must release the mapping we were actually given. the memory
    // at addr isn't ours, so unmapping it would destroy a foreign
    // mapping while leaking the one at res
    sys_munmap(res, size);
    errno = EEXIST;  // polyfill linux 4.17+ behavior
    res = 0;
  }
  STRACE("mmap(%p, %'zu, %s, %s) → %p% m", addr, size, DescribeProtFlags(prot),
         DescribeMapFlags(flags), res);
  return res;
}
// maps stack on linux
//
// walks the tracked memory maps from high addresses downward looking
// for a hole of at least guardsize + stacksize bytes that sits right
// beneath an existing mapping. both pieces are recorded with
// __maps_track() before the kernel is asked for a PROT_NONE guard
// region at the bottom of the hole and one MAP_GROWSDOWN page at its
// top. the pages in between are never mapped by this function.
//
// @param stacksize is byte length of the stack portion
// @param guardsize is byte length of the guard portion beneath it
// @return address of bottom of stack (region + guardsize), or null w/
//     errno; ENOMEM is raised when no hole is found or tracking fails
// @note callers in this file pass page-rounded sizes
static void *slackmap(size_t stacksize, size_t guardsize) {
int olde = errno;
struct Map *prev, *map;
char *max = (char *)0x7fffffffffff;
size_t need = guardsize + stacksize;
__maps_lock();
for (;;) {
// look for empty space beneath higher mappings
char *region = 0;
for (map = __maps_floor(max); map; map = prev) {
// lowest address we're willing to use is the second page
char *min = (char *)(intptr_t)__pagesize;
if ((prev = __maps_prev(map)))
min = prev->addr + prev->size;
if (map->addr - min >= need) {
region = map->addr - need;
// remember where to resume searching if this hole doesn't pan out
max = region - 1;
break;
}
}
if (!region)
break;
// track intended memory in rbtree
if (!__maps_track(region, guardsize, PROT_NONE,
MAP_PRIVATE | MAP_ANONYMOUS_LINUX))
break;
if (!__maps_track(region + guardsize, stacksize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS_LINUX)) {
__maps_untrack(region, need);
break;
}
__maps_unlock();
// ask kernel to create guard region
// taking special care to not clobber untracked mappings
//
// it's important that this call happen first, since it limits how
// much memory map_growsdown will secretly consume. if there's
// nothing beneath a map_growsdown mapping, then the kernel reserves
// (and this isn't listed in /proc/PID/maps so don't bother looking)
// `rlimit_stack.rlim_cur & -__pagesize` bytes of memory including
// this top-most page, and another 1mb of guard pages beneath that.
// but by mapping our guard pages manually, we ensure the guard
// region and the stack itself will be exactly as big as we want.
//
// you'd think we could mmap(0, pagesz, growsdown) to let the kernel
// pick an address and then we could just upscale the user's stack
// size request to whatever rlimit_stack is if it's bigger. but the
// linux kernel will actually choose addresses between existing maps
// where the hole is smaller than rlimit_stack.
//
// to use map_growsdown, we must use map_fixed. normally when we use
// map_fixed, we reserve an entire kernel-assigned region beforehand
// to ensure there isn't any overlap with existing mappings. however
// since growsdown stops growing when it encounters another mapping,
// you can't map it on top of a reservation mapping. so we must take
// a leap of faith there aren't any mystery mappings twixt the guard
// region and growsdown page below.
char *guard_region =
flixmap(region, guardsize, PROT_NONE, MAP_NORESERVE_LINUX);
if (!guard_region) {
// this label is also the landing spot when mapping the top page fails
RecoverFromMmapFailure:
if (errno != EEXIST) {
// mmap() probably raised enomem due to rlimit_as etc.
__maps_untrack(region, need);
return 0;
} else {
// we've encountered a mystery mapping. it's hard to imagine
// this happening, since we don't use map_growsdown when
// cosmo_dlopen() is linked in the binary. in that case, the
// tracker we created covers at least some of the rogue map,
// therefore this issue should fix itself if we keep going
errno = olde;
__maps_lock();
// the search limit becomes `region` again, so the next pass
// starts from the tracker entry that was just created there
++max;
continue;
}
}
// ask kernel to create stack pages
// taking special care to not clobber untracked mappings
char *top_page = flixmap(region + need - __pagesize, __pagesize,
PROT_READ | PROT_WRITE, MAP_GROWSDOWN_LINUX);
if (!top_page) {
// give back the guard region we mapped a moment ago
sys_munmap(region, guardsize);
goto RecoverFromMmapFailure;
}
// return address to bottom of stack
return region + guardsize;
}
// reached only via `break` above, which always happens with lock held
__maps_unlock();
errno = ENOMEM;
return 0;
}
// unmaps a stack together with the guard region beneath it
//
// @param stackaddr is the bottom of the usable stack
// @param stacksize is the usable stack length in bytes
// @param guardsize is the guard length in bytes beneath stackaddr
// @return 0 on success, or the errno code munmap() raised
// @note the errno variable is left the way the caller had it
static errno_t cosmo_stack_munmap(char *stackaddr, size_t stacksize,
                                  size_t guardsize) {
  errno_t saved = errno;
  // munmap() returns zero on success and nonzero w/ errno on failure
  if (!munmap(stackaddr - guardsize,  //
              guardsize + stacksize))
    return 0;
  errno_t err = errno;
  errno = saved;
  return err;
}
static void cosmo_stack_populate(void) {
errno_t e = errno;
void *map = mmap(0, __pagesize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
errno = e;
if (map != MAP_FAILED) {
struct CosmoStack *ts = map;
int n = __pagesize / sizeof(struct CosmoStack);
for (int i = 0; i < n; ++i) {
dll_init(&ts[i].elem);
dll_make_first(&cosmo_stacks.objects, &ts[i].elem);
}
}
}
// evicts cached stacks until no more than `maxstacks` remain
//
// entries are taken from the end of cosmo_stacks.stacks and moved onto
// a private list, which the caller is expected to hand over to
// cosmo_stack_rehabilitate(). no memory is unmapped here.
//
// @return list of evicted entries, or null if nothing was evicted
static struct Dll *cosmo_stack_decimate(unsigned maxstacks) {
  struct Dll *evicted = 0;
  for (; cosmo_stacks.count > maxstacks; --cosmo_stacks.count) {
    struct Dll *oldest = dll_last(cosmo_stacks.stacks);
    dll_remove(&cosmo_stacks.stacks, oldest);
    dll_make_first(&evicted, oldest);
  }
  return evicted;
}
static void cosmo_stack_rehabilitate(struct Dll *stacks) {
struct Dll *e;
for (e = dll_first(stacks); e; e = dll_next(stacks, e))
cosmo_stack_munmap(THREADSTACK_CONTAINER(e)->stackaddr,
THREADSTACK_CONTAINER(e)->stacksize,
THREADSTACK_CONTAINER(e)->guardsize);
cosmo_stack_lock();
dll_make_first(&cosmo_stacks.objects, stacks);
cosmo_stack_unlock();
}
/**
* Empties unused stack cache.
*
* To make POSIX threads as cheap as possible to spawn, we recycle their
* stacks without zeroing their memory. On Linux for an 80kb stack size,
* that makes launching a thread take 40µs rather than 80µs. However the
* stack cache needs to be cleared in certain cases. This is called upon
* exit() automatically but anyone can clear this at any other time too.
*
* @see pthread_decimate_np()
*/
void cosmo_stack_clear(void) {
cosmo_stack_lock();
struct Dll *stacks = cosmo_stacks.stacks;
cosmo_stacks.stacks = 0;
cosmo_stacks.count = 0;
cosmo_stack_unlock();
cosmo_stack_rehabilitate(stacks);
}
/**
 * Gets maximum number of unused stacks cosmo should cache.
 *
 * The setting is stored as an unsigned number and gets converted to
 * `int` on the way out. It's read without taking the stack cache lock.
 *
 * @return current limit as set by cosmo_stack_setmaxstacks()
 * @see cosmo_stack_setmaxstacks()
 */
int cosmo_stack_getmaxstacks(void) {
return cosmo_stacks_config.maxstacks;
}
/**
 * Sets maximum number of unused stacks cosmo should cache.
 *
 * Use this to bound how much stack memory the cosmo runtime holds in
 * its cache. The limit counts stacks, not the bytes inside them. Once
 * the cache grows past the limit, the least recently used stacks are
 * the ones that get freed.
 *
 * Passing zero turns caching off, in which case cosmo_stack_free()
 * unmaps stacks right away and cosmo_stack_alloc() always hands out
 * freshly mapped memory.
 *
 * Passing a negative number makes the cache size unlimited, since the
 * value is stored as an unsigned number.
 *
 * This limit only concerns stacks that aren't currently in use.
 *
 * The default is that three stacks may be cached at any given moment.
 *
 * When `maxstacks` is below the current cache size, the surplus
 * entries are evicted and freed before this function returns.
 */
void cosmo_stack_setmaxstacks(int maxstacks) {
  struct Dll *surplus;
  cosmo_stack_lock();
  cosmo_stacks_config.maxstacks = maxstacks;
  surplus = cosmo_stack_decimate(cosmo_stacks_config.maxstacks);
  cosmo_stack_unlock();
  cosmo_stack_rehabilitate(surplus);
}
/**
 * Allocates stack memory.
 *
 * This is a caching stack allocator that's used by the POSIX threads
 * runtime but you may also find it useful for setcontext() coroutines
 * or sigaltstack(). Normally you can get away with using malloc() for
 * creating stacks. However some OSes (e.g. OpenBSD) forbid you from
 * doing that for anything except sigaltstack(). This API serves to
 * abstract all the gory details of gaining authorized memory, and
 * additionally implements caching for lightning fast performance.
 *
 * The stack size must be nonzero. It specifies the minimum amount of
 * stack space that will be available for use. The provided value is
 * rounded up to the system page size. Your stack size parameter will
 * be updated with the chosen value upon success.
 *
 * The guard size specifies the minimum amount of memory that should be
 * protected beneath your stack. This helps ensure stack overflows cause
 * a segfault rather than corrupting memory silently. This may be set to
 * zero in which case no guard pages will be made. This value is rounded
 * up to the system page size. The corrected value will be returned upon
 * success. Guard memory is mapped in addition to the stack, so in total
 * `guardsize + stacksize` bytes of address space get used, and the
 * returned address points at the first byte above the guard region.
 *
 * When you're done using your stack, pass it to cosmo_stack_free() so
 * it can be recycled. A cached stack is only reused when both its
 * rounded `stacksize` and rounded `guardsize` match your request
 * exactly. Stacks that don't end up getting reused will be freed
 * eventually, in a least recently used way based upon your
 * cosmo_stack_setmaxstacks() setting.
 *
 * @param inout_stacksize has requested size and receives chosen size
 * @param inout_guardsize has requested size and receives chosen size
 * @param out_addr receives address of the bottom of the stack
 * @return 0 on success, or an errno code on error, in which case none
 *     of the output parameters are modified; the `errno` variable is
 *     restored to its previous value on the failure paths in this
 *     function
 * @raise EINVAL if stack size is zero, or if rounding either size up
 *     or adding the two together would overflow `size_t`
 * @see mmap() for a list of other possible errors that may occur
 */
errno_t cosmo_stack_alloc(size_t *inout_stacksize,  //
                          size_t *inout_guardsize,  //
                          void **out_addr) {

  // validate arguments
  size_t stacksize = *inout_stacksize;
  size_t guardsize = *inout_guardsize;
  stacksize = (stacksize + __pagesize - 1) & -__pagesize;
  guardsize = (guardsize + __pagesize - 1) & -__pagesize;
  if (!stacksize)
    return EINVAL;  // was zero, or so big that rounding wrapped to zero
  if (guardsize < *inout_guardsize)
    return EINVAL;  // rounding wrapped; we'd silently lose the guard
  if (guardsize + stacksize < stacksize)
    return EINVAL;  // total mapping size doesn't fit in size_t

  // recycle stack
  void *stackaddr = 0;
  cosmo_stack_lock();
  for (struct Dll *e = dll_first(cosmo_stacks.stacks); e;
       e = dll_next(cosmo_stacks.stacks, e)) {
    struct CosmoStack *ts = THREADSTACK_CONTAINER(e);
    if (ts->stacksize == stacksize &&  //
        ts->guardsize == guardsize) {
      stackaddr = ts->stackaddr;
      stacksize = ts->stacksize;
      guardsize = ts->guardsize;
      // the record goes back on the free list for a later free() call
      dll_remove(&cosmo_stacks.stacks, e);
      dll_make_first(&cosmo_stacks.objects, e);
      --cosmo_stacks.count;
      break;
    }
  }
  cosmo_stack_unlock();

  // create stack
  if (!stackaddr) {
    errno_t olde = errno;
    if (!IsTiny() && IsLinux() && guardsize && !_weaken(cosmo_dlopen) &&
        stacksize <= cosmo_stack_maxgrow() && !IsQemuUser()) {
      // this special linux-only stack allocator significantly reduces
      // the consumption of virtual memory.
      if (!(stackaddr = slackmap(stacksize, guardsize))) {
        errno_t err = errno;
        errno = olde;
        return err;
      }
    } else {
      // map guard and stack as one region, guard at the low end
      char *map = mmap(0, guardsize + stacksize, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (map == MAP_FAILED) {
        errno_t err = errno;
        errno = olde;
        return err;
      }
      stackaddr = map + guardsize;
      if (IsOpenbsd())
        if (!TellOpenbsdThisIsStackMemory(stackaddr, stacksize))
          notpossible;
      if (guardsize) {
        if (mprotect(map, guardsize, PROT_NONE | PROT_GUARD)) {
          errno_t err = errno;
          munmap(map, guardsize + stacksize);
          errno = olde;
          return err;
        }
      }
    }
  }

  // return stack
  *inout_stacksize = stacksize;
  *inout_guardsize = guardsize;
  *out_addr = stackaddr;
  return 0;
}
// one-time initializer that cosmo_stack_free() runs via cosmo_once().
// registers cosmo_stack_clear() with atexit() so the cache is emptied
// when the process exits. the result of atexit() isn't checked.
static void cosmo_stack_setup(void) {
atexit(cosmo_stack_clear);
}
/**
 * Frees stack memory.
 *
 * The three parameters are assumed (though not strictly required) to
 * be the values an earlier cosmo_stack_alloc() call handed back. All
 * of them need to be page aligned, and the stack size must be nonzero,
 * otherwise EINVAL is returned.
 *
 * When caching is enabled the stack is put at the front of the cache,
 * after evicting older entries to make room for it. If caching is off,
 * or no bookkeeping record could be obtained, the stack's memory is
 * unmapped immediately instead.
 *
 * @param stackaddr is the bottom of the stack
 * @param stacksize is the byte length of the stack
 * @param guardsize is the byte length of the guard beneath the stack
 * @return 0 on success, or an errno code on error; the `errno`
 *     variable is never clobbered; you can only dependably count on
 *     an error being returned upon failure if you've said
 *     `cosmo_stack_setmaxstacks(0)`
 */
errno_t cosmo_stack_free(void *stackaddr, size_t stacksize, size_t guardsize) {

  // validate arguments
  if (!stacksize)
    return EINVAL;
  if ((stacksize | guardsize | (uintptr_t)stackaddr) & (__pagesize - 1))
    return EINVAL;

  // try to stash the stack in our cache
  struct Dll *surplus = 0;
  cosmo_stack_lock();
  if (cosmo_stacks_config.maxstacks) {
    cosmo_once(&cosmo_stacks.once, cosmo_stack_setup);

    // make room for the entry we're about to add
    surplus = cosmo_stack_decimate(cosmo_stacks_config.maxstacks - 1);

    // obtain a bookkeeping record
    if (dll_is_empty(cosmo_stacks.objects))
      cosmo_stack_populate();
    struct Dll *e = dll_first(cosmo_stacks.objects);
    if (e) {
      dll_remove(&cosmo_stacks.objects, e);
      struct CosmoStack *ts = THREADSTACK_CONTAINER(e);
      ts->stackaddr = stackaddr;
      ts->stacksize = stacksize;
      ts->guardsize = guardsize;
      dll_make_first(&cosmo_stacks.stacks, &ts->elem);
      ++cosmo_stacks.count;
      stackaddr = 0;  // cache owns it now
    }
  }
  cosmo_stack_unlock();

  // unmap whatever got evicted, without holding the lock
  cosmo_stack_rehabilitate(surplus);

  // unmap our stack too if it didn't make it into the cache
  return stackaddr ? cosmo_stack_munmap(stackaddr, stacksize, guardsize) : 0;
}
// remaps the given range in place as private anonymous read/write
// memory carrying the openbsd stack flag
//
// @return true if the system call handed back `addr`
relegated bool TellOpenbsdThisIsStackMemory(void *addr, size_t size) {
  int flags = MAP_PRIVATE | MAP_FIXED | MAP_ANON_OPENBSD | MAP_STACK_OPENBSD;
  void *res = __sys_mmap(addr, size, PROT_READ | PROT_WRITE, flags, -1, 0, 0);
  return res == addr;
}
// OpenBSD only permits RSP to occupy memory that's been explicitly
// defined as stack memory, i.e. `lo <= %rsp < hi` must be the case
//
// the caller's stack interval is shrunk inward to page boundaries,
// declared to the os as stack memory, and then written back into the
// attributes so they describe the part that's actually usable
//
// @return 0 on success, or errno code on error, in which case the
//     attributes are untouched and `errno` is restored
relegated errno_t FixupCustomStackOnOpenbsd(pthread_attr_t *attr) {

  // get interval
  uintptr_t bottom = (uintptr_t)attr->__stackaddr;
  uintptr_t top = bottom + attr->__stacksize;

  // squeeze interval
  bottom = (bottom + __pagesize - 1) & -__pagesize;
  top &= -__pagesize;

  // tell os it's stack memory
  errno_t saved = errno;
  if (TellOpenbsdThisIsStackMemory((void *)bottom, top - bottom)) {
    // update attributes with usable stack address
    attr->__stackaddr = (void *)bottom;
    attr->__stacksize = top - bottom;
    return 0;
  }
  errno_t err = errno;
  errno = saved;
  return err;
}