mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-06-25 22:08:30 +00:00
Reduce stack virtual memory consumption on Linux
This commit is contained in:
parent
cc8a9eb93c
commit
36e5861b0c
31 changed files with 583 additions and 166 deletions
|
@ -23,9 +23,16 @@
|
|||
#include "libc/calls/syscall-sysv.internal.h"
|
||||
#include "libc/cosmo.h"
|
||||
#include "libc/dce.h"
|
||||
#include "libc/dlopen/dlfcn.h"
|
||||
#include "libc/errno.h"
|
||||
#include "libc/intrin/describeflags.h"
|
||||
#include "libc/intrin/dll.h"
|
||||
#include "libc/intrin/maps.h"
|
||||
#include "libc/intrin/rlimit.h"
|
||||
#include "libc/intrin/strace.h"
|
||||
#include "libc/intrin/weaken.h"
|
||||
#include "libc/runtime/runtime.h"
|
||||
#include "libc/sock/internal.h"
|
||||
#include "libc/sysv/consts/map.h"
|
||||
#include "libc/sysv/consts/prot.h"
|
||||
#include "libc/thread/posixthread.internal.h"
|
||||
|
@ -35,6 +42,11 @@
|
|||
* @fileoverview cosmo stack memory manager
|
||||
*/
|
||||
|
||||
#define MAP_GROWSDOWN_LINUX 0x00000100
|
||||
#define MAP_ANONYMOUS_LINUX 0x00000020
|
||||
#define MAP_NOREPLACE_LINUX 0x08000000
|
||||
#define MAP_NORESERVE_LINUX 0x00004000
|
||||
|
||||
#define MAP_ANON_OPENBSD 0x1000
|
||||
#define MAP_STACK_OPENBSD 0x4000
|
||||
|
||||
|
@ -43,8 +55,8 @@
|
|||
struct CosmoStack {
|
||||
struct Dll elem;
|
||||
void *stackaddr;
|
||||
unsigned stacksize;
|
||||
unsigned guardsize;
|
||||
size_t stacksize;
|
||||
size_t guardsize;
|
||||
};
|
||||
|
||||
struct CosmoStacks {
|
||||
|
@ -79,10 +91,133 @@ void cosmo_stack_wipe(void) {
|
|||
_pthread_mutex_wipe_np(&cosmo_stacks.lock);
|
||||
}
|
||||
|
||||
static errno_t cosmo_stack_munmap(void *addr, size_t size) {
|
||||
// map_growsdown will not grow more than rlimit_stack
|
||||
static size_t cosmo_stack_maxgrow(void) {
|
||||
return __rlimit_stack_get().rlim_cur & -__pagesize;
|
||||
}
|
||||
|
||||
// allocates private anonymous fixed noreplace memory on linux
|
||||
static void *flixmap(void *addr, size_t size, int prot, int flags) {
|
||||
flags |= MAP_PRIVATE | MAP_ANONYMOUS_LINUX | MAP_NOREPLACE_LINUX;
|
||||
void *res = __sys_mmap(addr, size, prot, flags, -1, 0, 0);
|
||||
if (res != MAP_FAILED) {
|
||||
if (res != addr) {
|
||||
sys_munmap(addr, size);
|
||||
errno = EEXIST; // polyfill linux 4.17+ behavior
|
||||
res = 0;
|
||||
}
|
||||
} else {
|
||||
res = 0;
|
||||
}
|
||||
STRACE("mmap(%p, %'zu, %s, %s) → %p% m", addr, size, DescribeProtFlags(prot),
|
||||
DescribeMapFlags(flags), res);
|
||||
return res;
|
||||
}
|
||||
|
||||
// maps stack on linux
//
// walks the tracked memory maps from high addresses downward looking
// for a hole of at least `guardsize + stacksize` bytes. the intended
// guard and stack regions are recorded in the tracker while the maps
// lock is held, then the lock is dropped and the kernel is asked for a
// PROT_NONE guard region plus one map_growsdown page at the very top
// of the stack region.
//
// @param stacksize is byte size of stack region, excluding the guard
// @param guardsize is byte size of guard region placed beneath stack
// @return address of bottom of stack (i.e. just above guard region) or
//     null w/ errno on failure; ENOMEM is raised when no hole is found
//     or when the regions couldn't be tracked
|
||||
static void *slackmap(size_t stacksize, size_t guardsize) {
|
||||
int olde = errno;
|
||||
struct Map *prev, *map;
|
||||
// ceiling for the downward search; lowered each time a hole is chosen
|
||||
char *max = (char *)0x7fffffffffff;
|
||||
size_t need = guardsize + stacksize;
|
||||
__maps_lock();
|
||||
for (;;) {
|
||||
|
||||
// look for empty space beneath higher mappings
|
||||
char *region = 0;
|
||||
for (map = __maps_floor(max); map; map = prev) {
|
||||
// lowest address of the hole beneath `map`: the end of the previous
// mapping, or else one page above zero when nothing is beneath it
char *min = (char *)(intptr_t)__pagesize;
|
||||
if ((prev = __maps_prev(map)))
|
||||
min = prev->addr + prev->size;
|
||||
if (map->addr - min >= need) {
|
||||
// place the candidate flush against the bottom of `map`
region = map->addr - need;
|
||||
max = region - 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!region)
|
||||
break;
|
||||
|
||||
// track intended memory in rbtree
|
||||
if (!__maps_track(region, guardsize, PROT_NONE,
|
||||
MAP_PRIVATE | MAP_ANONYMOUS_LINUX))
|
||||
break;
|
||||
if (!__maps_track(region + guardsize, stacksize, PROT_READ | PROT_WRITE,
|
||||
MAP_PRIVATE | MAP_ANONYMOUS_LINUX)) {
|
||||
// undo the guard tracking made just above
__maps_untrack(region, need);
|
||||
break;
|
||||
}
|
||||
__maps_unlock();
|
||||
|
||||
// ask kernel to create guard region
|
||||
// taking special care to not clobber untracked mappings
|
||||
//
|
||||
// it's important that this call happen first, since it limits how
|
||||
// much memory map_growsdown will secretly consume. if there's
|
||||
// nothing beneath a map_growsdown mapping, then the kernel reserves
|
||||
// (and this isn't listed in /proc/PID/maps so don't bother looking)
|
||||
// `rlimit_stack.rlim_cur & -__pagesize` bytes of memory including
|
||||
// this top-most page, and another 1mb of guard pages beneath that.
|
||||
// but by mapping our guard pages manually, we ensure the guard
|
||||
// region and the stack itself will be exactly as big as we want.
|
||||
//
|
||||
// you'd think we could mmap(0, pagesz, growsdown) to let the kernel
|
||||
// pick an address and then we could just upscale the user's stack
|
||||
// size request to whatever rlimit_stack is if it's bigger. but the
|
||||
// linux kernel will actually choose addresses between existing maps
|
||||
// where the hole is smaller than rlimit_stack.
|
||||
//
|
||||
// to use map_growsdown, we must use map_fixed. normally when we use
|
||||
// map_fixed, we reserve an entire kernel-assigned region beforehand
|
||||
// to ensure there isn't any overlap with existing mappings. however
|
||||
// since growsdown stops growing when it encounters another mapping,
|
||||
// you can't map it on top of a reservation mapping. so we must take
|
||||
// a leap of faith there aren't any mystery mappings twixt the guard
|
||||
// region and growsdown page below.
|
||||
char *guard_region =
|
||||
flixmap(region, guardsize, PROT_NONE, MAP_NORESERVE_LINUX);
|
||||
if (!guard_region) {
|
||||
// this label is also entered via goto when mapping the top stack
// page fails further below
RecoverFromMmapFailure:
|
||||
if (errno != EEXIST) {
|
||||
// mmap() probably raised enomem due to rlimit_as etc.
|
||||
__maps_untrack(region, need);
|
||||
return 0;
|
||||
} else {
|
||||
// we've encountered a mystery mapping. it's hard to imagine
|
||||
// this happening, since we don't use map_growsdown when
|
||||
// cosmo_dlopen() is linked in the binary. in that case, the
|
||||
// tracker we created covers at least some of the rogue map,
|
||||
// therefore this issue should fix itself if we keep going
|
||||
errno = olde;
|
||||
__maps_lock();
|
||||
// max was set to region - 1 above, so the retry searches from region
++max;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// ask kernel to create stack pages
|
||||
// taking special care to not clobber untracked mappings
|
||||
char *top_page = flixmap(region + need - __pagesize, __pagesize,
|
||||
PROT_READ | PROT_WRITE, MAP_GROWSDOWN_LINUX);
|
||||
if (!top_page) {
|
||||
// release the guard region mapped above before recovering
sys_munmap(region, guardsize);
|
||||
goto RecoverFromMmapFailure;
|
||||
}
|
||||
|
||||
// return address to bottom of stack
|
||||
return region + guardsize;
|
||||
}
|
||||
// reached via break, i.e. no hole was found or tracking failed
__maps_unlock();
|
||||
errno = ENOMEM;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static errno_t cosmo_stack_munmap(char *stackaddr, size_t stacksize,
|
||||
size_t guardsize) {
|
||||
errno_t r = 0;
|
||||
errno_t e = errno;
|
||||
if (!munmap(addr, size)) {
|
||||
if (!munmap(stackaddr - guardsize, //
|
||||
guardsize + stacksize)) {
|
||||
r = errno;
|
||||
errno = e;
|
||||
}
|
||||
|
@ -119,7 +254,8 @@ static void cosmo_stack_rehabilitate(struct Dll *stacks) {
|
|||
struct Dll *e;
|
||||
for (e = dll_first(stacks); e; e = dll_next(stacks, e))
|
||||
cosmo_stack_munmap(THREADSTACK_CONTAINER(e)->stackaddr,
|
||||
THREADSTACK_CONTAINER(e)->stacksize);
|
||||
THREADSTACK_CONTAINER(e)->stacksize,
|
||||
THREADSTACK_CONTAINER(e)->guardsize);
|
||||
cosmo_stack_lock();
|
||||
dll_make_first(&cosmo_stacks.objects, stacks);
|
||||
cosmo_stack_unlock();
|
||||
|
@ -193,39 +329,41 @@ void cosmo_stack_setmaxstacks(int maxstacks) {
|
|||
* abstract all the gory details of gaining authorized memory, and
|
||||
* additionally implements caching for lightning fast performance.
|
||||
*
|
||||
* The stack size must be nonzero. It is rounded up to the granularity
|
||||
* of the underlying system allocator, which is normally the page size.
|
||||
* Your parameter will be updated with the selected value upon success.
|
||||
* The stack size must be nonzero. It specifies the minimum amount of
|
||||
* stack space that will be available for use. The provided value is
|
||||
* rounded up to the system page size. It may be increased further for
|
||||
* various reasons. Your stack size parameter will be updated with the
|
||||
* chosen value upon success.
|
||||
*
|
||||
* The guard size specifies how much memory should be protected at the
|
||||
* bottom of your stack. This is helpful for ensuring stack overflows
|
||||
* will result in a segmentation fault, rather than corrupting memory
|
||||
* silently. This may be set to zero, in which case no guard pages will
|
||||
* be protected. This value is rounded up to the system page size. The
|
||||
* corrected value will be returned upon success. Your guard size needs
|
||||
* to be small enough to leave room for at least one memory page in your
|
||||
* stack size i.e. `guardsize + pagesize <= stacksize` must be the case.
|
||||
* Otherwise this function will return an `EINVAL` error.
|
||||
* The guard size specifies the minimum amount of memory that should be
|
||||
* protected beneath your stack. This helps ensure stack overflows cause
|
||||
* a segfault rather than corrupting memory silently. This may be set to
|
||||
* zero in which case no guard pages will be made. This value is rounded
|
||||
* up to the system page size. The corrected value will be returned upon
|
||||
* success. Your guard size needs to be small enough to leave room for
|
||||
* at least one memory page in your stack size i.e. `guardsize +
|
||||
* pagesize <= stacksize` must be the case. Otherwise this function will
|
||||
* return an `EINVAL` error.
|
||||
*
|
||||
* When you're done using your stack, pass it to cosmo_stack_free() so
|
||||
* it can be recycled. Stacks are only recycled when the `stacksize` and
|
||||
* `guardsize` parameters are an exact match after correction. Otherwise
|
||||
* they'll likely be freed eventually, in a least-recently used fashion,
|
||||
* based upon the configurable cosmo_stack_setmaxstacks() setting.
|
||||
* `guardsize` parameters match the constraints described above. Stacks
|
||||
* that don't end up getting reused will be freed eventually, in a least
|
||||
* recently used way based upon your cosmo_stack_setmaxstacks() setting.
|
||||
*
|
||||
* This function returns 0 on success, or an errno on error. See the
|
||||
* documentation of mmap() for a list of possible errors that may occur.
|
||||
*/
|
||||
errno_t cosmo_stack_alloc(unsigned *inout_stacksize, //
|
||||
unsigned *inout_guardsize, //
|
||||
errno_t cosmo_stack_alloc(size_t *inout_stacksize, //
|
||||
size_t *inout_guardsize, //
|
||||
void **out_addr) {
|
||||
|
||||
// validate arguments
|
||||
unsigned stacksize = *inout_stacksize;
|
||||
unsigned guardsize = *inout_guardsize;
|
||||
stacksize = (stacksize + __gransize - 1) & -__gransize;
|
||||
size_t stacksize = *inout_stacksize;
|
||||
size_t guardsize = *inout_guardsize;
|
||||
stacksize = (stacksize + __pagesize - 1) & -__pagesize;
|
||||
guardsize = (guardsize + __pagesize - 1) & -__pagesize;
|
||||
if (guardsize + __pagesize > stacksize)
|
||||
if (!stacksize)
|
||||
return EINVAL;
|
||||
|
||||
// recycle stack
|
||||
|
@ -236,8 +374,10 @@ errno_t cosmo_stack_alloc(unsigned *inout_stacksize, //
|
|||
struct CosmoStack *ts = THREADSTACK_CONTAINER(e);
|
||||
if (ts->stacksize == stacksize && //
|
||||
ts->guardsize == guardsize) {
|
||||
dll_remove(&cosmo_stacks.stacks, e);
|
||||
stackaddr = ts->stackaddr;
|
||||
stacksize = ts->stacksize;
|
||||
guardsize = ts->guardsize;
|
||||
dll_remove(&cosmo_stacks.stacks, e);
|
||||
dll_make_first(&cosmo_stacks.objects, e);
|
||||
--cosmo_stacks.count;
|
||||
break;
|
||||
|
@ -247,20 +387,37 @@ errno_t cosmo_stack_alloc(unsigned *inout_stacksize, //
|
|||
|
||||
// create stack
|
||||
if (!stackaddr) {
|
||||
errno_t e = errno;
|
||||
stackaddr = mmap(0, stacksize, PROT_READ | PROT_WRITE,
|
||||
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||||
if (stackaddr == MAP_FAILED) {
|
||||
errno_t err = errno;
|
||||
errno = e;
|
||||
return err;
|
||||
errno_t olde = errno;
|
||||
if (!IsTiny() && IsLinux() && guardsize && !_weaken(cosmo_dlopen) &&
|
||||
stacksize <= cosmo_stack_maxgrow() && !IsQemuUser()) {
|
||||
// this special linux-only stack allocator significantly reduces
|
||||
// the consumption of virtual memory.
|
||||
if (!(stackaddr = slackmap(stacksize, guardsize))) {
|
||||
errno_t err = errno;
|
||||
errno = olde;
|
||||
return err;
|
||||
}
|
||||
} else {
|
||||
char *map = mmap(0, guardsize + stacksize, PROT_READ | PROT_WRITE,
|
||||
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||||
if (map == MAP_FAILED) {
|
||||
errno_t err = errno;
|
||||
errno = olde;
|
||||
return err;
|
||||
}
|
||||
stackaddr = map + guardsize;
|
||||
if (IsOpenbsd())
|
||||
if (!TellOpenbsdThisIsStackMemory(stackaddr, stacksize))
|
||||
notpossible;
|
||||
if (guardsize) {
|
||||
if (mprotect(map, guardsize, PROT_NONE | PROT_GUARD)) {
|
||||
errno_t err = errno;
|
||||
munmap(map, guardsize + stacksize);
|
||||
errno = olde;
|
||||
return err;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (IsOpenbsd())
|
||||
if (!TellOpenbsdThisIsStackMemory(stackaddr, stacksize))
|
||||
notpossible;
|
||||
if (guardsize)
|
||||
if (mprotect(stackaddr, guardsize, PROT_NONE | PROT_GUARD))
|
||||
notpossible;
|
||||
}
|
||||
|
||||
// return stack
|
||||
|
@ -277,20 +434,22 @@ static void cosmo_stack_setup(void) {
|
|||
/**
|
||||
* Frees stack memory.
|
||||
*
|
||||
* While not strictly required, it's assumed these three values would be
|
||||
* those returned by an earlier call to cosmo_stack_alloc().
|
||||
* While not strictly required, it's assumed the three parameters are
|
||||
* those returned by an earlier call to cosmo_stack_alloc(). If they
|
||||
* aren't page aligned and rounded, this function will return EINVAL.
|
||||
*
|
||||
* This function returns 0 on success, or an errno on error. The `errno`
|
||||
* variable is never clobbered. You can only dependably count on this to
|
||||
* return an error on failure when you say `cosmo_stack_setmaxstacks(0)`
|
||||
*/
|
||||
errno_t cosmo_stack_free(void *stackaddr, unsigned stacksize,
|
||||
unsigned guardsize) {
|
||||
stacksize = (stacksize + __gransize - 1) & -__gransize;
|
||||
guardsize = (guardsize + __pagesize - 1) & -__pagesize;
|
||||
if (guardsize + __pagesize > stacksize)
|
||||
errno_t cosmo_stack_free(void *stackaddr, size_t stacksize, size_t guardsize) {
|
||||
if (!stacksize)
|
||||
return EINVAL;
|
||||
if ((uintptr_t)stackaddr & (__gransize - 1))
|
||||
if (stacksize & (__pagesize - 1))
|
||||
return EINVAL;
|
||||
if (guardsize & (__pagesize - 1))
|
||||
return EINVAL;
|
||||
if ((uintptr_t)stackaddr & (__pagesize - 1))
|
||||
return EINVAL;
|
||||
cosmo_stack_lock();
|
||||
struct Dll *surplus = 0;
|
||||
|
@ -318,7 +477,7 @@ errno_t cosmo_stack_free(void *stackaddr, unsigned stacksize,
|
|||
cosmo_stack_rehabilitate(surplus);
|
||||
errno_t err = 0;
|
||||
if (stackaddr)
|
||||
err = cosmo_stack_munmap(stackaddr, stacksize);
|
||||
err = cosmo_stack_munmap(stackaddr, stacksize, guardsize);
|
||||
return err;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue