Mirror of https://github.com/jart/cosmopolitan.git, synced 2025-01-31 03:27:39 +00:00
e939659b70
This change fixes a bug where signal_latency_async_test would flake less than 1/1000 of the time. What was happening was that pthread_kill(sender_thr) would return EFAULT, because pthread_create() was not returning the thread object pointer until after clone() had been called. So it was actually possible for the main thread to stall after calling clone(), and during that time the receiver would launch, receive a signal from the sender thread, and then fail when it tried to send a pong. I thought at first I'd use a barrier in the test to synchronize thread creation, but I firmly believe that pthread_create() was to blame, and now that's fixed.
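As a hedged illustration of the race described above (the thread and variable names are made up for exposition; this is not the actual test or libc code), the failure mode is a peer thread reading the pthread_t handle before pthread_create() has written it:

    #include <pthread.h>
    #include <signal.h>

    static pthread_t sender_thr;  // written by pthread_create() in the main thread

    static void *receiver(void *arg) {
      // Before the fix, this could run while sender_thr was still uninitialized,
      // so pthread_kill() could fail with EFAULT instead of delivering the pong.
      pthread_kill(sender_thr, SIGUSR1);
      return 0;
    }

Publishing the handle before clone() starts the child closes that window.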
513 lines
18 KiB
C
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ Permission to use, copy, modify, and/or distribute this software for         │
│ any purpose with or without fee is hereby granted, provided that the         │
│ above copyright notice and this permission notice appear in all copies.      │
│                                                                              │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
│ PERFORMANCE OF THIS SOFTWARE.                                                │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/intrin/stack.h"
|
|
#include "libc/assert.h"
|
|
#include "libc/atomic.h"
|
|
#include "libc/calls/calls.h"
|
|
#include "libc/calls/syscall-sysv.internal.h"
|
|
#include "libc/cosmo.h"
|
|
#include "libc/dce.h"
|
|
#include "libc/dlopen/dlfcn.h"
|
|
#include "libc/errno.h"
|
|
#include "libc/intrin/describeflags.h"
|
|
#include "libc/intrin/dll.h"
|
|
#include "libc/intrin/maps.h"
|
|
#include "libc/intrin/rlimit.h"
|
|
#include "libc/intrin/strace.h"
|
|
#include "libc/intrin/weaken.h"
|
|
#include "libc/limits.h"
|
|
#include "libc/macros.h"
|
|
#include "libc/runtime/runtime.h"
|
|
#include "libc/sock/internal.h"
|
|
#include "libc/sysv/consts/map.h"
|
|
#include "libc/sysv/consts/prot.h"
|
|
#include "libc/thread/posixthread.internal.h"
|
|
#include "libc/thread/thread.h"
|
|
|
|
/**
 * @fileoverview cosmo stack memory manager
 */

#define MAP_GROWSDOWN_LINUX 0x00000100
#define MAP_ANONYMOUS_LINUX 0x00000020
#define MAP_NOREPLACE_LINUX 0x08000000
#define MAP_NORESERVE_LINUX 0x00004000

#define MAP_ANON_OPENBSD 0x1000
#define MAP_STACK_OPENBSD 0x4000

#define THREADSTACK_CONTAINER(e) DLL_CONTAINER(struct CosmoStack, elem, e)

struct CosmoStack {
  struct Dll elem;
  void *stackaddr;
  size_t stacksize;
  size_t guardsize;
};

struct CosmoStacks {
  atomic_uint once;
  pthread_mutex_t lock;
  struct Dll *stacks;
  struct Dll *objects;
  unsigned count;
};

struct CosmoStacksConfig {
  unsigned maxstacks;
};

static struct CosmoStacks cosmo_stacks = {
    .lock = PTHREAD_MUTEX_INITIALIZER,
};

static struct CosmoStacksConfig cosmo_stacks_config = {
    .maxstacks = 3,
};

void cosmo_stack_lock(void) {
  _pthread_mutex_lock(&cosmo_stacks.lock);
}

void cosmo_stack_unlock(void) {
  _pthread_mutex_unlock(&cosmo_stacks.lock);
}

void cosmo_stack_wipe(void) {
  _pthread_mutex_wipe_np(&cosmo_stacks.lock);
}

// map_growsdown will not grow more than rlimit_stack
static size_t cosmo_stack_maxgrow(void) {
  return __rlimit_stack_get().rlim_cur & -__pagesize;
}

// allocates private anonymous fixed noreplace memory on linux
static void *flixmap(void *addr, size_t size, int prot, int flags) {
  flags |= MAP_PRIVATE | MAP_ANONYMOUS_LINUX | MAP_NOREPLACE_LINUX;
  void *res = __sys_mmap(addr, size, prot, flags, -1, 0, 0);
  if (res != MAP_FAILED) {
    if (res != addr) {
      sys_munmap(addr, size);
      errno = EEXIST;  // polyfill linux 4.17+ behavior
      res = 0;
    }
  } else {
    res = 0;
  }
  STRACE("mmap(%p, %'zu, %s, %s) → %p% m", addr, size, DescribeProtFlags(prot),
         DescribeMapFlags(flags), res);
  return res;
}

// maps stack on linux
static void *slackmap(size_t stacksize, size_t guardsize) {
  int olde = errno;
  struct Map *prev, *map;
  char *max = (char *)PTRDIFF_MAX;
  size_t need = guardsize + stacksize;
  __maps_lock();
  for (;;) {

    // look for empty space beneath higher mappings
    char *region = 0;
    for (map = __maps_floor(max); map; map = prev) {
      char *min = (char *)(intptr_t)__gransize;
      if ((prev = __maps_prev(map)))
        min = prev->addr + ROUNDUP(prev->size, __gransize);
      if (map->addr - min >= need) {
        region = map->addr - need;
        max = region - 1;
        break;
      }
    }
    if (!region)
      break;

    // track intended memory in rbtree
    if (!__maps_track(region, guardsize, PROT_NONE,
                      MAP_PRIVATE | MAP_ANONYMOUS_LINUX))
      break;
    if (!__maps_track(region + guardsize, stacksize, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS_LINUX)) {
      __maps_untrack(region, need);
      break;
    }
    __maps_unlock();

    // ask kernel to create guard region
    // taking special care to not clobber untracked mappings
    //
    // it's important that this call happen first, since it limits how
    // much memory map_growsdown will secretly consume. if there's
    // nothing beneath a map_growsdown mapping, then the kernel reserves
    // (and this isn't listed in /proc/PID/maps so don't bother looking)
    // `rlimit_stack.rlim_cur & -__pagesize` bytes of memory including
    // this top-most page, and another 1mb of guard pages beneath that.
    // but by mapping our guard pages manually, we ensure the guard
    // region and the stack itself will be exactly as big as we want.
    //
    // you'd think we could mmap(0, pagesz, growsdown) to let the kernel
    // pick an address and then we could just upscale the user's stack
    // size request to whatever rlimit_stack is if it's bigger. but the
    // linux kernel will actually choose addresses between existing maps
    // where the hole is smaller than rlimit_stack.
    //
    // to use map_growsdown, we must use map_fixed. normally when we use
    // map_fixed, we reserve an entire kernel-assigned region beforehand
    // to ensure there isn't any overlap with existing mappings. however
    // since growsdown stops growing when it encounters another mapping,
    // you can't map it on top of a reservation mapping. so we must take
    // a leap of faith there aren't any mystery mappings twixt the guard
    // region and growsdown page below.
    char *guard_region =
        flixmap(region, guardsize, PROT_NONE, MAP_NORESERVE_LINUX);
    if (!guard_region) {
    RecoverFromMmapFailure:
      if (errno != EEXIST) {
        // mmap() probably raised enomem due to rlimit_as etc.
        __maps_untrack(region, need);
        return 0;
      } else {
        // we've encountered a mystery mapping. it's hard to imagine
        // this happening, since we don't use map_growsdown when
        // cosmo_dlopen() is linked in the binary. in that case, the
        // tracker we created covers at least some of the rogue map,
        // therefore this issue should fix itself if we keep going
        errno = olde;
        __maps_lock();
        ++max;
        continue;
      }
    }

    // ask kernel to create stack pages
    // taking special care to not clobber untracked mappings
    char *top_page = flixmap(region + need - __pagesize, __pagesize,
                             PROT_READ | PROT_WRITE, MAP_GROWSDOWN_LINUX);
    if (!top_page) {
      sys_munmap(region, guardsize);
      goto RecoverFromMmapFailure;
    }

    // return address to bottom of stack
    return region + guardsize;
  }
  __maps_unlock();
  errno = ENOMEM;
  return 0;
}

static errno_t cosmo_stack_munmap(char *stackaddr, size_t stacksize,
                                  size_t guardsize) {
  errno_t r = 0;
  errno_t e = errno;
  // capture the error code if munmap() fails, while leaving the
  // caller's errno untouched either way
  if (munmap(stackaddr - guardsize,  //
             guardsize + stacksize)) {
    r = errno;
    errno = e;
  }
  return r;
}

static void cosmo_stack_populate(void) {
  errno_t e = errno;
  void *map = mmap(0, __pagesize, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  errno = e;
  if (map != MAP_FAILED) {
    struct CosmoStack *ts = map;
    int n = __pagesize / sizeof(struct CosmoStack);
    for (int i = 0; i < n; ++i) {
      dll_init(&ts[i].elem);
      dll_make_first(&cosmo_stacks.objects, &ts[i].elem);
    }
  }
}

static struct Dll *cosmo_stack_decimate(unsigned maxstacks) {
  struct Dll *surplus = 0;
  while (cosmo_stacks.count > maxstacks) {
    struct Dll *e = dll_last(cosmo_stacks.stacks);
    dll_remove(&cosmo_stacks.stacks, e);
    dll_make_first(&surplus, e);
    --cosmo_stacks.count;
  }
  return surplus;
}

static void cosmo_stack_rehabilitate(struct Dll *stacks) {
  struct Dll *e;
  for (e = dll_first(stacks); e; e = dll_next(stacks, e))
    cosmo_stack_munmap(THREADSTACK_CONTAINER(e)->stackaddr,
                       THREADSTACK_CONTAINER(e)->stacksize,
                       THREADSTACK_CONTAINER(e)->guardsize);
  cosmo_stack_lock();
  dll_make_first(&cosmo_stacks.objects, stacks);
  cosmo_stack_unlock();
}

/**
 * Empties unused stack cache.
 *
 * To make POSIX threads as cheap as possible to spawn, we recycle their
 * stacks without zeroing their memory. On Linux for an 80kb stack size,
 * that makes launching a thread take 40µs rather than 80µs. However the
 * stack cache needs to be cleared in certain cases. This is called upon
 * exit() automatically but anyone can clear this at any other time too.
 *
 * @see pthread_decimate_np()
 */
void cosmo_stack_clear(void) {
  cosmo_stack_lock();
  struct Dll *stacks = cosmo_stacks.stacks;
  cosmo_stacks.stacks = 0;
  cosmo_stacks.count = 0;
  cosmo_stack_unlock();
  cosmo_stack_rehabilitate(stacks);
}

/**
 * Gets maximum number of unused stacks cosmo should cache.
 * @see cosmo_stack_setmaxstacks()
 */
int cosmo_stack_getmaxstacks(void) {
  return cosmo_stacks_config.maxstacks;
}

/**
 * Sets maximum number of unused stacks cosmo should cache.
 *
 * This lets you place some limitations on how much stack memory the
 * cosmo runtime will cache. This number is a count of stacks rather
 * than the number of bytes they contain. Old stacks are freed in a
 * least recently used fashion once the cache exceeds this limit.
 *
 * If this is set to zero, then the cosmo stack allocator enters a
 * highly secure hardening mode where cosmo_stack_alloc() zeroes all
 * stack memory that's returned.
 *
 * Setting this to a negative number makes the cache size unlimited.
 *
 * Please note this limit only applies to stacks that aren't in use.
 *
 * By default, up to three stacks may be cached at any given moment.
 *
 * If `maxstacks` is less than the current cache size, then surplus
 * entries will be evicted and freed before this function returns.
 */
void cosmo_stack_setmaxstacks(int maxstacks) {
  cosmo_stack_lock();
  cosmo_stacks_config.maxstacks = maxstacks;
  struct Dll *stacks = cosmo_stack_decimate(maxstacks);
  cosmo_stack_unlock();
  cosmo_stack_rehabilitate(stacks);
}
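
// Illustrative sketch added for exposition (the function below is
// hypothetical and not part of the original file): the three settings
// the documentation above describes.
static void cosmo_stack_setmaxstacks_example(void) {
  cosmo_stack_setmaxstacks(0);   // hardened mode: nothing cached, stacks zeroed
  cosmo_stack_setmaxstacks(-1);  // negative: cache size is unlimited
  cosmo_stack_setmaxstacks(3);   // the default: at most three unused stacks
}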

/**
 * Allocates stack memory.
 *
 * This is a caching stack allocator that's used by the POSIX threads
 * runtime but you may also find it useful for setcontext() coroutines
 * or sigaltstack(). Normally you can get away with using malloc() for
 * creating stacks. However some OSes (e.g. OpenBSD) forbid you from
 * doing that for anything except sigaltstack(). This API serves to
 * abstract all the gory details of gaining authorized memory, and
 * additionally implements caching for lightning fast performance.
 *
 * The stack size must be nonzero. It specifies the minimum amount of
 * stack space that will be available for use. The provided value is
 * rounded up to the system page size. It may be increased further for
 * various reasons. Your stack size parameter will be updated with the
 * chosen value upon success.
 *
 * The guard size specifies the minimum amount of memory that should be
 * protected beneath your stack. This helps ensure stack overflows cause
 * a segfault rather than corrupting memory silently. This may be set to
 * zero, in which case no guard pages will be made. This value is rounded
 * up to the system page size. The corrected value will be returned upon
 * success. Your guard size needs to be small enough to leave room for
 * at least one memory page in your stack size, i.e. `guardsize +
 * pagesize <= stacksize` must be the case. Otherwise this function will
 * return an `EINVAL` error.
 *
 * When you're done using your stack, pass it to cosmo_stack_free() so
 * it can be recycled. Stacks are only recycled when the `stacksize` and
 * `guardsize` parameters match the constraints described above. Stacks
 * that don't end up getting reused will be freed eventually, in a least
 * recently used way based upon your cosmo_stack_setmaxstacks() setting.
 *
 * This function returns 0 on success, or an errno on error. See the
 * documentation of mmap() for a list of possible errors that may occur.
 */
errno_t cosmo_stack_alloc(size_t *inout_stacksize,  //
                          size_t *inout_guardsize,  //
                          void **out_stackaddr) {

  // validate arguments
  size_t stacksize = *inout_stacksize;
  size_t guardsize = *inout_guardsize;
  stacksize = (stacksize + __pagesize - 1) & -__pagesize;
  guardsize = (guardsize + __pagesize - 1) & -__pagesize;
  if (!stacksize)
    return EINVAL;

  // recycle stack
  void *stackaddr = 0;
  cosmo_stack_lock();
  for (struct Dll *e = dll_first(cosmo_stacks.stacks); e;
       e = dll_next(cosmo_stacks.stacks, e)) {
    struct CosmoStack *ts = THREADSTACK_CONTAINER(e);
    if (ts->stacksize == stacksize &&  //
        ts->guardsize == guardsize) {
      stackaddr = ts->stackaddr;
      stacksize = ts->stacksize;
      guardsize = ts->guardsize;
      dll_remove(&cosmo_stacks.stacks, e);
      dll_make_first(&cosmo_stacks.objects, e);
      --cosmo_stacks.count;
      break;
    }
  }
  cosmo_stack_unlock();

  // create stack
  if (!stackaddr) {
    errno_t olde = errno;
    if (!IsTiny() && IsLinux() && guardsize && !_weaken(cosmo_dlopen) &&
        stacksize <= cosmo_stack_maxgrow() && !IsQemuUser()) {
      // this special linux-only stack allocator significantly reduces
      // the consumption of virtual memory.
      if (!(stackaddr = slackmap(stacksize, guardsize))) {
        errno_t err = errno;
        errno = olde;
        return err;
      }
    } else {
      char *map = mmap(0, guardsize + stacksize, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (map == MAP_FAILED) {
        errno_t err = errno;
        errno = olde;
        return err;
      }
      stackaddr = map + guardsize;
      if (IsOpenbsd())
        if (!TellOpenbsdThisIsStackMemory(stackaddr, stacksize))
          notpossible;
      if (guardsize) {
        if (mprotect(map, guardsize, PROT_NONE | PROT_GUARD)) {
          errno_t err = errno;
          munmap(map, guardsize + stacksize);
          errno = olde;
          return err;
        }
      }
    }
  }

  // return stack
  *inout_stacksize = stacksize;
  *inout_guardsize = guardsize;
  *out_stackaddr = stackaddr;
  return 0;
}
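
// Illustrative usage sketch added for exposition (the function and sizes
// below are hypothetical, not part of the original file): request a stack
// with one guard page, use it, then hand it back so it can be recycled.
static void cosmo_stack_alloc_example(void) {
  size_t stacksize = 81920;       // gets rounded up to the page size
  size_t guardsize = __pagesize;  // one guard page beneath the stack
  void *stackaddr;
  if (!cosmo_stack_alloc(&stacksize, &guardsize, &stackaddr)) {
    // stackaddr is the lowest usable byte; the guard region sits beneath it
    // ... use [stackaddr, stackaddr + stacksize) as a thread or signal stack
    cosmo_stack_free(stackaddr, stacksize, guardsize);
  }
}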

static void cosmo_stack_setup(void) {
  atexit(cosmo_stack_clear);
}

/**
 * Frees stack memory.
 *
 * While not strictly required, it's assumed the three parameters are
 * those returned by an earlier call to cosmo_stack_alloc(). If they
 * aren't page aligned and rounded, this function will return EINVAL.
 *
 * This function returns 0 on success, or an errno on error. The `errno`
 * variable is never clobbered. You can only dependably count on this to
 * return an error on failure when you say `cosmo_stack_setmaxstacks(0)`.
 */
errno_t cosmo_stack_free(void *stackaddr, size_t stacksize, size_t guardsize) {
  if (!stacksize)
    return EINVAL;
  if (stacksize & (__pagesize - 1))
    return EINVAL;
  if (guardsize & (__pagesize - 1))
    return EINVAL;
  if ((uintptr_t)stackaddr & (__pagesize - 1))
    return EINVAL;
  cosmo_stack_lock();
  struct Dll *surplus = 0;
  if (cosmo_stacks_config.maxstacks) {
    cosmo_once(&cosmo_stacks.once, cosmo_stack_setup);
    surplus = cosmo_stack_decimate(cosmo_stacks_config.maxstacks - 1);
    struct CosmoStack *ts = 0;
    if (dll_is_empty(cosmo_stacks.objects))
      cosmo_stack_populate();
    struct Dll *e;
    if ((e = dll_first(cosmo_stacks.objects))) {
      dll_remove(&cosmo_stacks.objects, e);
      ts = THREADSTACK_CONTAINER(e);
    }
    if (ts) {
      ts->stackaddr = stackaddr;
      ts->stacksize = stacksize;
      ts->guardsize = guardsize;
      dll_make_first(&cosmo_stacks.stacks, &ts->elem);
      ++cosmo_stacks.count;
      stackaddr = 0;
    }
  }
  cosmo_stack_unlock();
  cosmo_stack_rehabilitate(surplus);
  errno_t err = 0;
  if (stackaddr)
    err = cosmo_stack_munmap(stackaddr, stacksize, guardsize);
  return err;
}

relegated bool TellOpenbsdThisIsStackMemory(void *addr, size_t size) {
  return __sys_mmap(
             addr, size, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_FIXED | MAP_ANON_OPENBSD | MAP_STACK_OPENBSD, -1,
             0, 0) == addr;
}

// OpenBSD only permits RSP to occupy memory that's been explicitly
// defined as stack memory, i.e. `lo <= %rsp < hi` must be the case
relegated bool FixupCustomStackOnOpenbsd(pthread_attr_t *attr) {

  // get interval
  uintptr_t lo = (uintptr_t)attr->__stackaddr;
  uintptr_t hi = lo + attr->__stacksize;

  // squeeze interval
  lo = (lo + __pagesize - 1) & -__pagesize;
  hi = hi & -__pagesize;

  // tell os it's stack memory
  if (!TellOpenbsdThisIsStackMemory((void *)lo, hi - lo))
    return false;

  // update attributes with usable stack address
  attr->__stackaddr = (void *)lo;
  attr->__stacksize = hi - lo;
  return true;
}
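
// Illustrative sketch added for exposition (hypothetical helper, not part of
// the original file): because the fixup above rounds a custom stack inward to
// page boundaries, a caller on OpenBSD keeps its whole buffer only if the
// memory it supplies is already page aligned, e.g. a whole mmap() region.
static void *openbsd_custom_stack_example(pthread_attr_t *attr, size_t size) {
  void *addr = mmap(0, size, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (addr == MAP_FAILED)
    return 0;
  pthread_attr_setstack(attr, addr, size);  // page aligned, so nothing is lost
  return addr;
}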