Reduce stack virtual memory consumption on Linux

This commit is contained in:
Justine Tunney 2024-12-25 19:43:43 -08:00
parent cc8a9eb93c
commit 36e5861b0c
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
31 changed files with 583 additions and 166 deletions

View file

@@ -23,9 +23,16 @@
#include "libc/calls/syscall-sysv.internal.h"
#include "libc/cosmo.h"
#include "libc/dce.h"
#include "libc/dlopen/dlfcn.h"
#include "libc/errno.h"
#include "libc/intrin/describeflags.h"
#include "libc/intrin/dll.h"
#include "libc/intrin/maps.h"
#include "libc/intrin/rlimit.h"
#include "libc/intrin/strace.h"
#include "libc/intrin/weaken.h"
#include "libc/runtime/runtime.h"
#include "libc/sock/internal.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/prot.h"
#include "libc/thread/posixthread.internal.h"
@@ -35,6 +42,11 @@
* @fileoverview cosmo stack memory manager
*/
#define MAP_GROWSDOWN_LINUX 0x00000100
#define MAP_ANONYMOUS_LINUX 0x00000020
#define MAP_NOREPLACE_LINUX 0x08000000
#define MAP_NORESERVE_LINUX 0x00004000
#define MAP_ANON_OPENBSD 0x1000
#define MAP_STACK_OPENBSD 0x4000
@@ -43,8 +55,8 @@
struct CosmoStack {
struct Dll elem;
void *stackaddr;
unsigned stacksize;
unsigned guardsize;
size_t stacksize;
size_t guardsize;
};
struct CosmoStacks {
@@ -79,10 +91,133 @@ void cosmo_stack_wipe(void) {
_pthread_mutex_wipe_np(&cosmo_stacks.lock);
}
static errno_t cosmo_stack_munmap(void *addr, size_t size) {
// map_growsdown will not grow more than rlimit_stack
static size_t cosmo_stack_maxgrow(void) {
  // the kernel stops extending a MAP_GROWSDOWN mapping once it reaches
  // the stack soft limit; truncate it down to a whole page
  size_t limit = __rlimit_stack_get().rlim_cur & -__pagesize;
  return limit;
}
// allocates private anonymous fixed noreplace memory on linux
//
// Returns the mapped address on success, or 0 with errno set. Kernels
// older than Linux 4.17 don't understand MAP_FIXED_NOREPLACE and treat
// `addr` as a mere hint, mapping somewhere else upon collision; we
// detect that case, discard the stray mapping the kernel returned, and
// polyfill the EEXIST behavior of newer kernels.
static void *flixmap(void *addr, size_t size, int prot, int flags) {
  flags |= MAP_PRIVATE | MAP_ANONYMOUS_LINUX | MAP_NOREPLACE_LINUX;
  void *res = __sys_mmap(addr, size, prot, flags, -1, 0, 0);
  if (res != MAP_FAILED) {
    if (res != addr) {
      // pre-4.17 fallback: the kernel chose a different address, so we
      // must unmap the mapping it actually gave us (res) — not the
      // requested region at addr, which belongs to whatever existing
      // mapping caused the collision
      sys_munmap(res, size);
      errno = EEXIST;  // polyfill linux 4.17+ behavior
      res = 0;
    }
  } else {
    res = 0;
  }
  STRACE("mmap(%p, %'zu, %s, %s) → %p% m", addr, size, DescribeProtFlags(prot),
         DescribeMapFlags(flags), res);
  return res;
}
// maps stack on linux
//
// Searches the maps rbtree for a hole big enough for guard + stack,
// reserves it in the tracker, then mmaps the guard region and a single
// MAP_GROWSDOWN top page, so the stack consumes only the virtual
// memory requested rather than rlimit_stack. Returns the address of
// the bottom of the usable stack (just above the guard region), or 0
// with errno set (e.g. ENOMEM) on failure.
static void *slackmap(size_t stacksize, size_t guardsize) {
  int olde = errno;  // preserved across speculative mmap attempts
  struct Map *prev, *map;
  char *max = (char *)0x7fffffffffff;  // top of linux 47-bit user space
  size_t need = guardsize + stacksize;
  __maps_lock();
  for (;;) {
    // look for empty space beneath higher mappings
    char *region = 0;
    for (map = __maps_floor(max); map; map = prev) {
      char *min = (char *)(intptr_t)__pagesize;  // never place at page zero
      if ((prev = __maps_prev(map)))
        min = prev->addr + prev->size;
      if (map->addr - min >= need) {
        region = map->addr - need;  // place flush beneath this mapping
        max = region - 1;  // resume search below here if this attempt fails
        break;
      }
    }
    if (!region)
      break;  // no hole anywhere → ENOMEM below
    // track intended memory in rbtree
    if (!__maps_track(region, guardsize, PROT_NONE,
                      MAP_PRIVATE | MAP_ANONYMOUS_LINUX))
      break;
    if (!__maps_track(region + guardsize, stacksize, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS_LINUX)) {
      __maps_untrack(region, need);
      break;
    }
    __maps_unlock();
    // ask kernel to create guard region
    // taking special care to not clobber untracked mappings
    //
    // it's important that this call happen first, since it limits how
    // much memory map_growsdown will secretly consume. if there's
    // nothing beneath a map_growsdown mapping, then the kernel reserves
    // (and this isn't listed in /proc/PID/maps so don't bother looking)
    // `rlimit_stack.rlim_cur & -__pagesize` bytes of memory including
    // this top-most page, and another 1mb of guard pages beneath that.
    // but by mapping our guard pages manually, we ensure the guard
    // region and the stack itself will be exactly as big as we want.
    //
    // you'd think we could mmap(0, pagesz, growsdown) to let the kernel
    // pick an address and then we could just upscale the user's stack
    // size request to whatever rlimit_stack is if it's bigger. but the
    // linux kernel will actually choose addresses between existing maps
    // where the hole is smaller than rlimit_stack.
    //
    // to use map_growsdown, we must use map_fixed. normally when we use
    // map_fixed, we reserve an entire kernel-assigned region beforehand
    // to ensure there isn't any overlap with existing mappings. however
    // since growsdown stops growing when it encounters another mapping,
    // you can't map it on top of a reservation mapping. so we must take
    // a leap of faith there aren't any mystery mappings twixt the guard
    // region and growsdown page below.
    char *guard_region =
        flixmap(region, guardsize, PROT_NONE, MAP_NORESERVE_LINUX);
    if (!guard_region) {
    RecoverFromMmapFailure:
      if (errno != EEXIST) {
        // mmap() probably raised enomem due to rlimit_as etc.
        __maps_untrack(region, need);
        return 0;
      } else {
        // we've encountered a mystery mapping. it's hard to imagine
        // this happening, since we don't use map_growsdown when
        // cosmo_dlopen() is linked in the binary. in that case, the
        // tracker we created covers at least some of the rogue map,
        // therefore this issue should fix itself if we keep going
        // (note the tracked region is deliberately NOT untracked here;
        // ++max restarts the floor search from the tracked region)
        errno = olde;
        __maps_lock();
        ++max;
        continue;
      }
    }
    // ask kernel to create stack pages
    // taking special care to not clobber untracked mappings
    char *top_page = flixmap(region + need - __pagesize, __pagesize,
                             PROT_READ | PROT_WRITE, MAP_GROWSDOWN_LINUX);
    if (!top_page) {
      sys_munmap(region, guardsize);  // undo guard mapping before retrying
      goto RecoverFromMmapFailure;
    }
    // return address to bottom of stack
    return region + guardsize;
  }
  __maps_unlock();
  errno = ENOMEM;
  return 0;
}
static errno_t cosmo_stack_munmap(char *stackaddr, size_t stacksize,
size_t guardsize) {
errno_t r = 0;
errno_t e = errno;
if (!munmap(addr, size)) {
if (!munmap(stackaddr - guardsize, //
guardsize + stacksize)) {
r = errno;
errno = e;
}
@@ -119,7 +254,8 @@ static void cosmo_stack_rehabilitate(struct Dll *stacks) {
struct Dll *e;
for (e = dll_first(stacks); e; e = dll_next(stacks, e))
cosmo_stack_munmap(THREADSTACK_CONTAINER(e)->stackaddr,
THREADSTACK_CONTAINER(e)->stacksize);
THREADSTACK_CONTAINER(e)->stacksize,
THREADSTACK_CONTAINER(e)->guardsize);
cosmo_stack_lock();
dll_make_first(&cosmo_stacks.objects, stacks);
cosmo_stack_unlock();
@@ -193,39 +329,41 @@ void cosmo_stack_setmaxstacks(int maxstacks) {
* abstract all the gory details of gaining authorized memory, and
* additionally implements caching for lightning fast performance.
*
* The stack size must be nonzero. It is rounded up to the granularity
* of the underlying system allocator, which is normally the page size.
* Your parameter will be updated with the selected value upon success.
* The stack size must be nonzero. It specifies the minimum amount of
* stack space that will be available for use. The provided value is
* rounded up to the system page size. It may be increased further for
* various reasons. Your stack size parameter will be updated with the
* chosen value upon success.
*
* The guard size specifies how much memory should be protected at the
* bottom of your stack. This is helpful for ensuring stack overflows
* will result in a segmentation fault, rather than corrupting memory
* silently. This may be set to zero, in which case no guard pages will
* be protected. This value is rounded up to the system page size. The
* corrected value will be returned upon success. Your guard size needs
* to be small enough to leave room for at least one memory page in your
* stack size i.e. `guardsize + pagesize <= stacksize` must be the case.
* Otherwise this function will return an `EINVAL` error.
* The guard size specifies the minimum amount of memory that should be
* protected beneath your stack. This helps ensure stack overflows cause
* a segfault rather than corrupting memory silently. This may be set to
* zero in which case no guard pages will be made. This value is rounded
* up to the system page size. The corrected value will be returned upon
* success. Your guard size needs to be small enough to leave room for
* at least one memory page in your stack size i.e. `guardsize +
* pagesize <= stacksize` must be the case. Otherwise this function will
* return an `EINVAL` error.
*
* When you're done using your stack, pass it to cosmo_stack_free() so
* it can be recycled. Stacks are only recycled when the `stacksize` and
* `guardsize` parameters are an exact match after correction. Otherwise
* they'll likely be freed eventually, in a least-recently used fashion,
* based upon the configurable cosmo_stack_setmaxstacks() setting.
* `guardsize` parameters match the constraints described above. Stacks
* that don't end up getting reused will be freed eventually, in a least
* recently used way based upon your cosmo_stack_setmaxstacks() setting.
*
* This function returns 0 on success, or an errno on error. See the
* documentation of mmap() for a list possible errors that may occur.
*/
errno_t cosmo_stack_alloc(unsigned *inout_stacksize, //
unsigned *inout_guardsize, //
errno_t cosmo_stack_alloc(size_t *inout_stacksize, //
size_t *inout_guardsize, //
void **out_addr) {
// validate arguments
unsigned stacksize = *inout_stacksize;
unsigned guardsize = *inout_guardsize;
stacksize = (stacksize + __gransize - 1) & -__gransize;
size_t stacksize = *inout_stacksize;
size_t guardsize = *inout_guardsize;
stacksize = (stacksize + __pagesize - 1) & -__pagesize;
guardsize = (guardsize + __pagesize - 1) & -__pagesize;
if (guardsize + __pagesize > stacksize)
if (!stacksize)
return EINVAL;
// recycle stack
@@ -236,8 +374,10 @@ errno_t cosmo_stack_alloc(unsigned *inout_stacksize, //
struct CosmoStack *ts = THREADSTACK_CONTAINER(e);
if (ts->stacksize == stacksize && //
ts->guardsize == guardsize) {
dll_remove(&cosmo_stacks.stacks, e);
stackaddr = ts->stackaddr;
stacksize = ts->stacksize;
guardsize = ts->guardsize;
dll_remove(&cosmo_stacks.stacks, e);
dll_make_first(&cosmo_stacks.objects, e);
--cosmo_stacks.count;
break;
@@ -247,20 +387,37 @@ errno_t cosmo_stack_alloc(unsigned *inout_stacksize, //
// create stack
if (!stackaddr) {
errno_t e = errno;
stackaddr = mmap(0, stacksize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (stackaddr == MAP_FAILED) {
errno_t err = errno;
errno = e;
return err;
errno_t olde = errno;
if (!IsTiny() && IsLinux() && guardsize && !_weaken(cosmo_dlopen) &&
stacksize <= cosmo_stack_maxgrow() && !IsQemuUser()) {
// this special linux-only stack allocator significantly reduces
// the consumption of virtual memory.
if (!(stackaddr = slackmap(stacksize, guardsize))) {
errno_t err = errno;
errno = olde;
return err;
}
} else {
char *map = mmap(0, guardsize + stacksize, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (map == MAP_FAILED) {
errno_t err = errno;
errno = olde;
return err;
}
stackaddr = map + guardsize;
if (IsOpenbsd())
if (!TellOpenbsdThisIsStackMemory(stackaddr, stacksize))
notpossible;
if (guardsize) {
if (mprotect(map, guardsize, PROT_NONE | PROT_GUARD)) {
errno_t err = errno;
munmap(map, guardsize + stacksize);
errno = olde;
return err;
}
}
}
if (IsOpenbsd())
if (!TellOpenbsdThisIsStackMemory(stackaddr, stacksize))
notpossible;
if (guardsize)
if (mprotect(stackaddr, guardsize, PROT_NONE | PROT_GUARD))
notpossible;
}
// return stack
@@ -277,20 +434,22 @@ static void cosmo_stack_setup(void) {
/**
* Frees stack memory.
*
* While not strictly required, it's assumed these three values would be
* those returned by an earlier call to cosmo_stack_alloc().
* While not strictly required, it's assumed the three parameters are
* those returned by an earlier call to cosmo_stack_alloc(). If they
* aren't page aligned and rounded, this function will return EINVAL.
*
* This function returns 0 on success, or an errno on error. The `errno`
* variable is never clobbered. You can only dependably count on this to
* return an error on failure when you say `cosmo_stack_setmaxstacks(0)`
*/
errno_t cosmo_stack_free(void *stackaddr, unsigned stacksize,
unsigned guardsize) {
stacksize = (stacksize + __gransize - 1) & -__gransize;
guardsize = (guardsize + __pagesize - 1) & -__pagesize;
if (guardsize + __pagesize > stacksize)
errno_t cosmo_stack_free(void *stackaddr, size_t stacksize, size_t guardsize) {
if (!stacksize)
return EINVAL;
if ((uintptr_t)stackaddr & (__gransize - 1))
if (stacksize & (__pagesize - 1))
return EINVAL;
if (guardsize & (__pagesize - 1))
return EINVAL;
if ((uintptr_t)stackaddr & (__pagesize - 1))
return EINVAL;
cosmo_stack_lock();
struct Dll *surplus = 0;
@@ -318,7 +477,7 @@ errno_t cosmo_stack_free(void *stackaddr, unsigned stacksize,
cosmo_stack_rehabilitate(surplus);
errno_t err = 0;
if (stackaddr)
err = cosmo_stack_munmap(stackaddr, stacksize);
err = cosmo_stack_munmap(stackaddr, stacksize, guardsize);
return err;
}