From b592716d1ce7f70a5df44f007156d3c68dcff3fd Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Thu, 7 Sep 2023 04:30:44 -0700 Subject: [PATCH] Reduce mandatory stack rss by 256kb --- ape/aarch64.lds | 2 + ape/ape.lds | 3 +- libc/crt/crt.S | 19 ++--- libc/runtime/internal.h | 1 - libc/runtime/jmpstack.S | 37 ---------- libc/runtime/stack.h | 137 +++++++++++++++++------------------- libc/runtime/stackuse.c | 4 ++ libc/runtime/winmain.greg.c | 9 ++- libc/tinymath/coshl.c | 1 + libc/tinymath/sinhl.c | 1 + libc/tinymath/tanhl.c | 1 + third_party/lua/lua.main.c | 2 +- third_party/lua/luac.main.c | 1 - third_party/python/python.c | 3 + tool/net/redbean.c | 2 +- 15 files changed, 98 insertions(+), 125 deletions(-) delete mode 100644 libc/runtime/jmpstack.S diff --git a/ape/aarch64.lds b/ape/aarch64.lds index 824512fa0..5a12ffc0c 100644 --- a/ape/aarch64.lds +++ b/ape/aarch64.lds @@ -280,6 +280,8 @@ SECTIONS { ape_stack_vaddr = DEFINED(ape_stack_vaddr) ? ape_stack_vaddr : 0x700000000000; ape_stack_memsz = DEFINED(ape_stack_memsz) ? ape_stack_memsz : 8 * 1024 * 1024; +ape_stack_align = DEFINED(ape_stack_align) ? ape_stack_align : 16; +ape_stack_round = -ape_stack_align; _tls_size = _tbss_end - _tdata_start; _tdata_size = _tdata_end - _tdata_start; diff --git a/ape/ape.lds b/ape/ape.lds index 944c9829f..f08f9fce9 100644 --- a/ape/ape.lds +++ b/ape/ape.lds @@ -585,7 +585,8 @@ ape_stack_vaddr = DEFINED(ape_stack_vaddr) ? ape_stack_vaddr : 0x700000000000; ape_stack_paddr = ape_ram_paddr + ape_ram_filesz; ape_stack_filesz = 0; ape_stack_memsz = DEFINED(ape_stack_memsz) ? ape_stack_memsz : 8 * 1024 * 1024; -ape_stack_align = 16; +ape_stack_align = DEFINED(ape_stack_align) ? 
ape_stack_align : 16; +ape_stack_round = -ape_stack_align; ape_note_offset = ape_cod_offset + (ape_note - ape_cod_vaddr); ape_note_filesz = ape_note_end - ape_note; diff --git a/libc/crt/crt.S b/libc/crt/crt.S index cc28fbccd..e7af029b2 100644 --- a/libc/crt/crt.S +++ b/libc/crt/crt.S @@ -68,14 +68,12 @@ _start: mov %rsp,__oldstack(%rip) mov %rdx,__envp(%rip) -// setup backtraces +// setup stack xor %ebp,%ebp + and $ape_stack_round,%rsp -// make process stack (8mb) follow thread stack (256kb) alignment - and $-(256*1024),%rsp - -#if SupportsWindows() -// make win32 imps noop +#if SupportsWindows() && !IsTiny() +// make win32 imps crash .weak ape_idata_iat .weak ape_idata_iatend .weak __oops_win32 @@ -122,9 +120,12 @@ _start: // this is the first argument to cosmo() below mov x0,sp -// make process stack (8mb) conform to thread stack (256kb) alignment - mov x1,sp - and sp,x1,-(256*1024) +// setup the stack + mov x29,#0 + mov x30,#0 + ldr x1,=ape_stack_round + and x1,x0,x1 + mov sp,x1 // second arg shall be struct Syslib passed by ape-m1.c // used to talk to apple's authoritarian libraries diff --git a/libc/runtime/internal.h b/libc/runtime/internal.h index 34b9a5f2c..f3ffdb96b 100644 --- a/libc/runtime/internal.h +++ b/libc/runtime/internal.h @@ -38,7 +38,6 @@ void __enable_tls(void); void *__cxa_finalize(void *); void __stack_chk_fail(void) wontreturn relegated; void __stack_chk_fail_local(void) wontreturn relegated; -void _jmpstack(void *, void *, ...) 
wontreturn; long _setstack(void *, void *, ...); int GetDosArgv(const char16_t *, char *, size_t, char **, size_t); int GetDosEnviron(const char16_t *, char *, size_t, char **, size_t); diff --git a/libc/runtime/jmpstack.S b/libc/runtime/jmpstack.S deleted file mode 100644 index 10226f342..000000000 --- a/libc/runtime/jmpstack.S +++ /dev/null @@ -1,37 +0,0 @@ -/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│ -│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│ -╞══════════════════════════════════════════════════════════════════════════════╡ -│ Copyright 2020 Justine Alexandra Roberts Tunney │ -│ │ -│ Permission to use, copy, modify, and/or distribute this software for │ -│ any purpose with or without fee is hereby granted, provided that the │ -│ above copyright notice and this permission notice appear in all copies. │ -│ │ -│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ -│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ -│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ -│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ -│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ -│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ -│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ -│ PERFORMANCE OF THIS SOFTWARE. │ -╚─────────────────────────────────────────────────────────────────────────────*/ -#include "libc/macros.internal.h" - -// Switches stack. 
-// -// @param rdi is new rsp, passed as malloc(size) + size -// @param rsi is function to call in new stack space -// @param rdx,rcx,r8,r9 get passed as args to rsi -// @noreturn -_jmpstack: - mov %rdi,%rsp - mov %rsi,%rax - mov %rdx,%rdi - mov %rcx,%rsi - mov %r8,%rdx - mov %r9,%rcx - xor %ebp,%ebp - call *%rax - .unreachable - .endfn _jmpstack,globl,hidden diff --git a/libc/runtime/stack.h b/libc/runtime/stack.h index 3aaddccf2..438a46e17 100644 --- a/libc/runtime/stack.h +++ b/libc/runtime/stack.h @@ -18,46 +18,18 @@ #define GetGuardSize() 16384 /** - * Tunes APE stack maximum size. + * Align APE main thread stack at startup. * - * The bottom-most page will be protected to ensure your stack does not - * magically grow beyond this value. It's possible to detect stack - * overflows, by calling `ShowCrashReports()`. Your stack size must be a - * power of two; the linker will check this. + * You need this in your main program module: * - * If you want to know how much stack your programs needs, then + * STATIC_STACK_ALIGN(GetStackSize()); * - * __static_yoink("stack_usage_logging"); - * - * will install an atexit() handler that appends to `o/$MODE/stack.log` - * - * @see libc/sysv/systemfive.S - * @see ape/ape.lds + * If you want to use GetStackAddr() and HaveStackMemory() safely on + * your main thread in your process. It causes crt.S to waste a tiny + * amount of memory to ensure those macros go extremely fast. */ -#define STATIC_STACK_SIZE(BYTES) \ - _STACK_SYMBOL("ape_stack_memsz", _STACK_STRINGIFY(BYTES) _STACK_EXTRA) - -/** - * Tunes APE stack virtual address. - * - * This value must be aligned according to your stack size, and that's - * checked by your linker script. This defaults to `0x700000000000` so - * - * 1. It's easy to see how close you are to the bottom - * 2. The linker script error is unlikely to happen - * - * This macro will be respected, with two exceptions - * - * 1. In MODE=tiny the operating system provided stack is used instead - * 2. 
Windows 7 doesn't support 64-bit addresses, so we'll instead use - * `0x10000000 - GetStackSize()` as the stack address - * - * @see libc/sysv/systemfive.S - * @see libc/nt/winmain.greg.c - * @see ape/ape.lds - */ -#define STATIC_STACK_ADDR(ADDR) \ - _STACK_SYMBOL("ape_stack_vaddr", _STACK_STRINGIFY(ADDR)) +#define STATIC_STACK_ALIGN(BYTES) \ + _STACK_SYMBOL("ape_stack_align", _STACK_STRINGIFY(BYTES) _STACK_EXTRA) /** * Makes program stack executable if declared, e.g. @@ -77,9 +49,9 @@ #define STATIC_EXEC_STACK() _STACK_SYMBOL("ape_stack_pf", "7") #define _STACK_STRINGIFY(ADDR) #ADDR -#define _STACK_SYMBOL(NAME, VALUE) \ - asm(".equ\t" NAME "," VALUE "\n\t" \ - ".globl\t" NAME) +#define _STACK_SYMBOL(NAME, VALUE) \ + __asm__(".equ\t" NAME "," VALUE "\n\t" \ + ".globl\t" NAME) #ifdef __SANITIZE_ADDRESS__ #define _STACK_EXTRA "*2" @@ -95,29 +67,71 @@ extern char ape_stack_memsz[] __attribute__((__weak__)); extern char ape_stack_align[] __attribute__((__weak__)); /** - * Returns address of bottom of stack. + * Returns address of bottom of current stack. * - * This takes into consideration threads and sigaltstack. This is - * implemented as a fast pure expression, since we're able to make the - * assumption that stack sizes are two powers and aligned. This is - * thanks to (1) the linker script checks the statically chosen sizes, - * and (2) the mmap() address picker will choose aligned addresses when - * the provided size is a two power. + * This always works on threads. If you want it to work on the main + * process too, then you'll need STATIC_STACK_ALIGN(GetStackSize()) + * which will burn O(256kb) of memory to ensure thread invariants. */ #define GetStackAddr() \ (((intptr_t)__builtin_frame_address(0) - 1) & -GetStackSize()) #define GetStaticStackSize() ((uintptr_t)ape_stack_memsz) +/** + * Returns true if at least `n` bytes of stack are available. + * + * This always works on threads. 
If you want it to work on the main + * process too, then you'll need STATIC_STACK_ALIGN(GetStackSize()) + * which will burn O(256kb) of memory to ensure thread invariants, + * which make this check exceedingly fast. + */ +#define HaveStackMemory(n) \ + ((intptr_t)__builtin_frame_address(0) >= \ + GetStackAddr() + GetGuardSize() + (n)) + +/** + * Extends stack memory by poking large allocations. + * + * This can be particularly useful depending on how your system + * implements guard pages. For example, Windows can make stacks + * that aren't fully committed, in which case there's only 4096 + * bytes of grows-down guard pages made by portable executable. + * If you alloca() more memory than that, you should call this, + * since it'll not only ensure stack overflows are detected, it + * will also trigger the stack to grow down safely. + */ +__funline void CheckLargeStackAllocation(void *p, ssize_t n) { + for (; n > 0; n -= 4096) { + ((char *)p)[n - 1] = 0; + } +} + +void *NewCosmoStack(void) vallocesque; +int FreeCosmoStack(void *) libcesque; + +/** + * Tunes stack size of main thread on Windows. + * + * On UNIX systems use `RLIMIT_STACK` to tune the main thread size. + */ +#define STATIC_STACK_SIZE(BYTES) \ + _STACK_SYMBOL("ape_stack_memsz", _STACK_STRINGIFY(BYTES) _STACK_EXTRA) + +/** + * Tunes main thread stack address on Windows. + */ +#define STATIC_STACK_ADDR(ADDR) \ + _STACK_SYMBOL("ape_stack_vaddr", _STACK_STRINGIFY(ADDR)) + #ifdef __x86_64__ /** - * Returns preferred bottom address of stack. + * Returns preferred bottom address of main thread stack. * - * This is the stakc address of the main process. The only time that - * isn't guaranteed to be the case is in MODE=tiny, since it doesn't - * link the code for stack creation at startup. This generally isn't - * problematic, since MODE=tiny doesn't use any of the runtime codes - * which want the stack to be cheaply knowable, e.g. 
ftrace, kprintf + * On UNIX systems we favor the system provided stack, so this only + * really applies to Windows. It's configurable at link time. It is + * needed because polyfilling fork requires that we know precisely + * where the stack memory begins and ends. */ #define GetStaticStackAddr(ADDEND) \ ({ \ @@ -132,25 +146,6 @@ extern char ape_stack_align[] __attribute__((__weak__)); #define GetStaticStackAddr(ADDEND) (GetStackAddr() + ADDEND) #endif -/** - * Returns true if at least `n` bytes of stack are available. - */ -#define HaveStackMemory(n) \ - ((intptr_t)__builtin_frame_address(0) >= \ - GetStackAddr() + GetGuardSize() + (n)) - -/** - * Extends stack memory by poking large allocations. - */ -forceinline void CheckLargeStackAllocation(void *p, ssize_t n) { - for (; n > 0; n -= 4096) { - ((char *)p)[n - 1] = 0; - } -} - -void *NewCosmoStack(void) vallocesque; -int FreeCosmoStack(void *) libcesque; - COSMOPOLITAN_C_END_ #endif /* GNU ELF */ #endif /* _COSMO_SOURCE */ diff --git a/libc/runtime/stackuse.c b/libc/runtime/stackuse.c index 606820935..e4357e9e0 100644 --- a/libc/runtime/stackuse.c +++ b/libc/runtime/stackuse.c @@ -27,6 +27,10 @@ #include "libc/str/str.h" #include "libc/sysv/consts/o.h" +// TODO(jart): Delete? 
+ +STATIC_STACK_ALIGN(GetStackSize()); + static char stacklog[1024]; dontasan size_t GetStackUsage(char *s, size_t n) { diff --git a/libc/runtime/winmain.greg.c b/libc/runtime/winmain.greg.c index d89cbb65e..02a81e429 100644 --- a/libc/runtime/winmain.greg.c +++ b/libc/runtime/winmain.greg.c @@ -77,7 +77,10 @@ __msabi extern typeof(SetStdHandle) *const __imp_SetStdHandle; __msabi extern typeof(VirtualProtect) *const __imp_VirtualProtect; // clang-format on -extern void cosmo(int, char **, char **, long (*)[2]) wontreturn; +void cosmo(int, char **, char **, long (*)[2]) wontreturn; +void __switch_stacks(int, char **, char **, long (*)[2], + void (*)(int, char **, char **, long (*)[2]), + intptr_t) wontreturn; static const signed char kNtStdio[3] = { (signed char)kNtStdInputHandle, @@ -211,8 +214,8 @@ __msabi static textwindows wontreturn void WinInit(const char16_t *cmdline) { __envp = &wa->envp[0]; // handover control to cosmopolitan runtime - _jmpstack((char *)(stackaddr + (stacksize - sizeof(struct WinArgs))), cosmo, - count, wa->argv, wa->envp, wa->auxv); + __switch_stacks(count, wa->argv, wa->envp, wa->auxv, cosmo, + stackaddr + (stacksize - sizeof(struct WinArgs))); } __msabi textwindows int64_t WinMain(int64_t hInstance, int64_t hPrevInstance, diff --git a/libc/tinymath/coshl.c b/libc/tinymath/coshl.c index 5de30ed2d..51a0be5e5 100644 --- a/libc/tinymath/coshl.c +++ b/libc/tinymath/coshl.c @@ -112,6 +112,7 @@ long double coshl(long double x) { long double hi,lo,x2,x4; + (void)x4; #if LDBL_MANT_DIG == 113 double dx2; #endif diff --git a/libc/tinymath/sinhl.c b/libc/tinymath/sinhl.c index fdc274696..ec70b6123 100644 --- a/libc/tinymath/sinhl.c +++ b/libc/tinymath/sinhl.c @@ -108,6 +108,7 @@ long double sinhl(long double x) { long double hi,lo,x2,x4; + (void)x4; #if LDBL_MANT_DIG == 113 double dx2; #endif diff --git a/libc/tinymath/tanhl.c b/libc/tinymath/tanhl.c index a409fe0ea..c231857bd 100644 --- a/libc/tinymath/tanhl.c +++ b/libc/tinymath/tanhl.c @@ -135,6 
+135,7 @@ long double tanhl(long double x) { long double hi,lo,s,x2,x4,z; + (void)x4; #if LDBL_MANT_DIG == 113 double dx2; #endif diff --git a/third_party/lua/lua.main.c b/third_party/lua/lua.main.c index c0f533542..1375c6d4e 100644 --- a/third_party/lua/lua.main.c +++ b/third_party/lua/lua.main.c @@ -57,7 +57,7 @@ Lua 5.4.3 (MIT License)\\n\ Copyright 1994–2021 Lua.org, PUC-Rio.\""); asm(".include \"libc/disclaimer.inc\""); -STATIC_STACK_SIZE(0x80000); +STATIC_STACK_ALIGN(GetStackSize()); #if !defined(LUA_PROGNAME) #define LUA_PROGNAME "lua" diff --git a/third_party/lua/luac.main.c b/third_party/lua/luac.main.c index c002daf1d..9548a57db 100644 --- a/third_party/lua/luac.main.c +++ b/third_party/lua/luac.main.c @@ -50,7 +50,6 @@ Lua 5.4.3 (MIT License)\\n\ Copyright 1994–2021 Lua.org, PUC-Rio.\""); asm(".include \"libc/disclaimer.inc\""); - static void PrintFunction(const Proto* f, int full); #define luaU_print PrintFunction diff --git a/third_party/python/python.c b/third_party/python/python.c index 3ae9cecc1..0e99c2aeb 100644 --- a/third_party/python/python.c +++ b/third_party/python/python.c @@ -4,11 +4,14 @@ │ Python 3 │ │ https://docs.python.org/3/license.html │ ╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/runtime/stack.h" #include "third_party/python/Include/yoink.h" #include "third_party/python/runpythonmodule.h" #include "tool/args/args.h" // clang-format off +STATIC_STACK_ALIGN(GetStackSize()); + PYTHON_YOINK("xed"); PYTHON_YOINK("xterm"); diff --git a/tool/net/redbean.c b/tool/net/redbean.c index d216e7c76..5e8f89f43 100644 --- a/tool/net/redbean.c +++ b/tool/net/redbean.c @@ -142,7 +142,7 @@ #pragma GCC diagnostic ignored "-Wunused-variable" -STATIC_STACK_SIZE(0x80000); +STATIC_STACK_ALIGN(GetStackSize()); __static_yoink("zipos");