Rewrite memory manager

Actually Portable Executable now supports Android. Cosmo's old mmap code
required a 47-bit address space. The new implementation is agnostic to
address-space size: it supports both smaller address spaces (e.g. embedded)
and even the modern 56-bit user address space of x86 PML5 paging, which
finally came true on Zen4 Threadripper.
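
For illustration, here is one way a program could observe how much virtual
address space the host actually grants it, whether that's 32, 47, or 56
bits. This probe is a hypothetical sketch written for this changelog (it
assumes a 64-bit build), not an API introduced by the commit:

#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>

/* Probe usable virtual address bits by offering mmap() progressively
   higher hint addresses. Kernels that ignore hints will make this
   underestimate, which is fine for illustration. */
int main(void) {
  int bits = 0;
  for (int b = 32; b <= 56; ++b) {
    void *hint = (void *)((uintptr_t)1 << b);
    void *p = mmap(hint, 4096, PROT_NONE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) continue;
    if ((uintptr_t)p >= (uintptr_t)hint) bits = b + 1;
    munmap(p, 4096);
  }
  printf("usable address bits: ~%d\n", bits);
}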

Cosmopolitan no longer requires UNIX systems to observe the Windows 64kb
allocation granularity; i.e. sysconf(_SC_PAGE_SIZE) now reports the host's
native page size. This fixes a longstanding POSIX conformance issue
concerning file mappings that overlap the end of file. Other aspects of
conformance have been improved too, such as address assignment and the
subtleties surrounding MAP_FIXED and MAP_FIXED_NOREPLACE.
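
As a quick sketch of what this means in practice (standard POSIX and Linux
calls on the host; nothing here is new API from this commit):

#define _GNU_SOURCE /* MAP_FIXED_NOREPLACE on glibc */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void) {
  /* Now reports the host's native page size, e.g. 4096 on most x86
     systems or 16384 on Apple Silicon, rather than Windows' 64kb
     allocation granularity. */
  long pagesz = sysconf(_SC_PAGE_SIZE);
  printf("page size: %ld\n", pagesz);
  /* MAP_FIXED_NOREPLACE fails with EEXIST rather than silently
     clobbering an existing mapping, unlike MAP_FIXED. */
  void *p = mmap((void *)0x40000000, pagesz, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);
  if (p != MAP_FAILED)
    munmap(p, pagesz);
  return 0;
}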

On Windows, mappings larger than 100 megabytes are no longer broken down
into thousands of independent 64kb mappings. Support for MAP_STACK has been
removed by this change; please use NewCosmoStack() instead.
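
A minimal sketch of the replacement pattern, assuming NewCosmoStack()
returns the base of a GetStackSize()-byte stack with guard pages per
cosmo's stack.h; SpawnWorker() and Worker() are illustration names:

#include <pthread.h>
#include "libc/runtime/stack.h"

static void *Worker(void *arg) {
  return 0;
}

/* Instead of passing MAP_STACK to mmap(), hand pthread_create() a
   stack allocated by NewCosmoStack(). */
int SpawnWorker(pthread_t *th) {
  pthread_attr_t attr;
  void *stack = NewCosmoStack();
  if (!stack) return -1; /* assumes NULL on failure */
  pthread_attr_init(&attr);
  pthread_attr_setstack(&attr, stack, GetStackSize());
  int rc = pthread_create(th, &attr, Worker, 0);
  pthread_attr_destroy(&attr);
  return rc;
}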

Stack overflow avoidance is now implemented using the POSIX thread APIs.
Please use GetStackBottom() and GetStackPointer() instead of the old
error-prone GetStackAddr() and HaveStackMemory() APIs, which have been
removed.
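
The new pattern, visible in the Lua and Python diffs below, is to compare
the live stack pointer against a precomputed boundary. A minimal sketch
(HaveEnoughStack() is an illustration name; the one-page cushion matches
what the diffs use):

#include <stddef.h>
#include <stdint.h>
#include "libc/runtime/stack.h"

/* Returns nonzero if at least `need` bytes of stack remain above a
   one-page cushion at the bottom of the stack. Stacks grow downward,
   so the stack pointer must stay above bottom + cushion + need. */
static int HaveEnoughStack(size_t need) {
  uintptr_t bottom = (uintptr_t)GetStackBottom();
  uintptr_t sp = (uintptr_t)GetStackPointer();
  return sp > bottom + 4096 + need;
}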

Author: Justine Tunney
Date:   2024-06-20 20:46:42 -07:00
Parent: 7f6d0b8709
Commit: 6ffed14b9c
GPG key ID: BE714B4575D6E328
150 changed files with 1893 additions and 5634 deletions

---

@@ -2,9 +2,6 @@
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
__static_yoink("sys_mmap"); /* asan needs it */
__static_yoink("__track_memory"); /* asan needs it */
#define ASSERT(x, y) Assert2(x, y, #y, __FILE__, __LINE__)
#define ASSERT128(x, y) Assert128(x, y, #y, __FILE__, __LINE__)

---

@@ -9,9 +9,9 @@ LICENSE
LOCAL CHANGES
- Use thread-local freelist from cosmo tib
- Define dlmalloc_requires_more_vespene_gas()
- Make dlmalloc scalable using sched_getcpu()
- Use faster two power roundup for memalign()
- Poison maps to integrate with Address Sanitizer
- Introduce __oom_hook() by using _mapanon() vs. mmap()
- Wrap locks with __threaded check to improve perf lots
- Use assembly init rather than ensure_initialization()
- Implemented the locking functions dlmalloc wants
- Use assembly _init() rather than ensure_initialization()
- Rather than calling mmap() 96 times from _start() just use .bss

---

@@ -24,6 +24,7 @@
#include "libc/thread/tls.h"
#include "third_party/dlmalloc/vespene.internal.h"
#include "libc/thread/tls.h"
#include "libc/intrin/kprintf.h"
#include "third_party/nsync/mu.h"
#if !IsTiny()

---

@@ -1,3 +1,5 @@
#include "libc/sysv/consts/auxv.h"
#include "libc/runtime/runtime.h"
/* ---------------------------- setting mparams -------------------------- */
@@ -5,24 +7,18 @@
#if ONLY_MSPACES
static void dlmalloc_pre_fork(void) {
mstate h;
for (unsigned i = 0; i < ARRAYLEN(g_heaps); ++i)
if ((h = atomic_load_explicit(&g_heaps[i], memory_order_acquire)))
ACQUIRE_LOCK(&h->mutex);
ACQUIRE_LOCK(&g_heaps[i].m.mutex);
}
static void dlmalloc_post_fork_parent(void) {
mstate h;
for (unsigned i = 0; i < ARRAYLEN(g_heaps); ++i)
if ((h = atomic_load_explicit(&g_heaps[i], memory_order_acquire)))
RELEASE_LOCK(&h->mutex);
RELEASE_LOCK(&g_heaps[i].m.mutex);
}
static void dlmalloc_post_fork_child(void) {
mstate h;
for (unsigned i = 0; i < ARRAYLEN(g_heaps); ++i)
if ((h = atomic_load_explicit(&g_heaps[i], memory_order_acquire)))
(void)INITIAL_LOCK(&h->mutex);
(void)INITIAL_LOCK(&g_heaps[i].m.mutex);
}
#else
@@ -46,8 +42,8 @@ __attribute__((__constructor__(49))) int init_mparams(void) {
size_t gsize;
#if defined(__COSMOPOLITAN__)
psize = FRAMESIZE;
gsize = FRAMESIZE;
psize = getauxval(AT_PAGESZ);
gsize = DEFAULT_GRANULARITY ? DEFAULT_GRANULARITY : psize;
#elif !defined(WIN32)
psize = malloc_getpagesize;
gsize = ((DEFAULT_GRANULARITY != 0)? DEFAULT_GRANULARITY : psize);

---

@@ -23,6 +23,13 @@ static mstate init_user_mstate(char* tbase, size_t tsize) {
return m;
}
// [jart] rather than calling mmap() 96 times from _start() just use .bss
static void init_heap(union Heap *heap, int locked) {
mstate m = init_user_mstate(heap->mspace, sizeof(*heap));
m->seg.sflags = USE_MMAP_BIT;
set_lock(m, locked);
}
mspace create_mspace(size_t capacity, int locked) {
mstate m = 0;
size_t msize;

---

@@ -111,7 +111,7 @@
#if (MORECORE_CONTIGUOUS || defined(WIN32))
#define DEFAULT_GRANULARITY (0) /* 0 means to compute in init_mparams */
#else /* MORECORE_CONTIGUOUS */
#define DEFAULT_GRANULARITY ((size_t)256U * (size_t)1024U)
#define DEFAULT_GRANULARITY ((size_t)64U * (size_t)1024U)
#endif /* MORECORE_CONTIGUOUS */
#endif /* DEFAULT_GRANULARITY */
#ifndef DEFAULT_TRIM_THRESHOLD

---

@@ -31,9 +31,20 @@
#error "threaded dlmalloc needs footers and mspaces"
#endif
union Heap {
struct malloc_state mstate;
struct {
size_t top_foot[2];
struct malloc_state m;
};
_Alignas(16) char mspace[DEFAULT_GRANULARITY];
};
static void init_heap(union Heap *heap, int locked);
static struct magicu magiu;
static unsigned g_heapslen;
static mstate g_heaps[128];
static union Heap g_heaps[128];
void dlfree(void *p) {
return mspace_free(0, p);
@@ -54,7 +65,7 @@ int dlmallopt(int param_number, int value) {
int dlmalloc_trim(size_t pad) {
int got_some = 0;
for (unsigned i = 0; i < g_heapslen; ++i)
got_some |= mspace_trim(g_heaps[i], pad);
got_some |= mspace_trim(&g_heaps[i].m, pad);
return got_some;
}
@@ -68,7 +79,7 @@ void dlmalloc_inspect_all(void handler(void *start, void *end,
size_t used_bytes, void *callback_arg),
void *arg) {
for (unsigned i = 0; i < g_heapslen; ++i)
mspace_inspect_all(g_heaps[i], handler, arg);
mspace_inspect_all(&g_heaps[i].m, handler, arg);
}
forceinline mstate get_arena(void) {
@@ -82,11 +93,11 @@ forceinline mstate get_arena(void) {
asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
cpu = tpidr_el0 & 255;
#endif
return g_heaps[__magicu_div(cpu, magiu) % g_heapslen];
return &g_heaps[__magicu_div(cpu, magiu) % g_heapslen].m;
}
static void *dlmalloc_single(size_t n) {
return mspace_malloc(g_heaps[0], n);
return mspace_malloc(&g_heaps[0].m, n);
}
static void *dlmalloc_threaded(size_t n) {
@@ -94,7 +105,7 @@ static void *dlmalloc_threaded(size_t n) {
}
static void *dlcalloc_single(size_t n, size_t z) {
return mspace_calloc(g_heaps[0], n, z);
return mspace_calloc(&g_heaps[0].m, n, z);
}
static void *dlcalloc_threaded(size_t n, size_t z) {
@@ -102,7 +113,7 @@ static void *dlcalloc_threaded(size_t n, size_t z) {
}
static void *dlrealloc_single(void *p, size_t n) {
return mspace_realloc(g_heaps[0], p, n);
return mspace_realloc(&g_heaps[0].m, p, n);
}
static void *dlrealloc_threaded(void *p, size_t n) {
@@ -113,7 +124,7 @@ static void *dlrealloc_threaded(void *p, size_t n) {
}
static void *dlmemalign_single(size_t a, size_t n) {
return mspace_memalign(g_heaps[0], a, n);
return mspace_memalign(&g_heaps[0].m, a, n);
}
static void *dlmemalign_threaded(size_t a, size_t n) {
@@ -121,7 +132,7 @@ static void *dlmemalign_threaded(size_t a, size_t n) {
}
static struct mallinfo dlmallinfo_single(void) {
return mspace_mallinfo(g_heaps[0]);
return mspace_mallinfo(&g_heaps[0].m);
}
static struct mallinfo dlmallinfo_threaded(void) {
@@ -144,8 +155,7 @@ static void use_single_heap(bool uses_locks) {
dlrealloc = dlrealloc_single;
dlmemalign = dlmemalign_single;
dlmallinfo = dlmallinfo_single;
if (!(g_heaps[0] = create_mspace(0, uses_locks)))
__builtin_trap();
init_heap(&g_heaps[0], uses_locks);
}
static void threaded_dlmalloc(void) {
@@ -180,10 +190,9 @@ static void threaded_dlmalloc(void) {
// we need this too due to linux's cpu count affinity hack
g_heapslen = heaps;
// create the arenas
// create the heaps
for (size_t i = 0; i < g_heapslen; ++i)
if (!(g_heaps[i] = create_mspace(0, true)))
__builtin_trap();
init_heap(&g_heaps[i], true);
// install function pointers
dlmalloc = dlmalloc_threaded;
@@ -192,5 +201,5 @@ static void threaded_dlmalloc(void) {
dlmemalign = dlmemalign_threaded;
dlmallinfo = dlmallinfo_threaded;
STRACE("created %d dlmalloc arenas for %d cpus", heaps, cpus);
STRACE("created %d dlmalloc heaps for %d cpus", heaps, cpus);
}

---

@@ -28,10 +28,8 @@
*/
void *dlmalloc_requires_more_vespene_gas(size_t size) {
char *p;
if ((p = _mapanon(size))) {
if (IsAsan()) {
if ((p = _mapanon(size)))
if (IsAsan())
__asan_poison(p, size, kAsanHeapFree);
}
}
return p;
}

---

@@ -21,7 +21,7 @@
#endif
#if defined(__NetBSD__)
#pragma weak pthread_create // Do not create libpthread dependency
//#pragma weak pthread_create // Do not create libpthread dependency
#endif
#if defined(_LIBCPP_WIN32API)

---

@@ -124,6 +124,7 @@ THIRD_PARTY_LUA_A_DIRECTDEPS = \
LIBC_CALLS \
LIBC_FMT \
LIBC_INTRIN \
LIBC_LOG \
LIBC_MEM \
LIBC_NEXGEN32E \
LIBC_PROC \
@@ -131,13 +132,13 @@ THIRD_PARTY_LUA_A_DIRECTDEPS = \
LIBC_STDIO \
LIBC_STR \
LIBC_SYSV \
LIBC_LOG \
LIBC_X \
LIBC_THREAD \
LIBC_TINYMATH \
LIBC_X \
NET_HTTP \
THIRD_PARTY_LINENOISE \
THIRD_PARTY_DOUBLECONVERSION \
THIRD_PARTY_GDTOA \
THIRD_PARTY_LINENOISE \
THIRD_PARTY_TZ
THIRD_PARTY_LUA_A_DEPS := \

---

@@ -19,6 +19,7 @@ struct Serializer {
const char *reason;
char *strbuf;
size_t strbuflen;
uintptr_t bsp;
};
bool LuaHasMultipleItems(lua_State *);

---

@@ -35,6 +35,7 @@
#include "third_party/lua/cosmo.h"
#include "third_party/lua/lauxlib.h"
#include "third_party/lua/lua.h"
#include "third_party/lua/cosmo.h"
#include "third_party/lua/visitor.h"
static int Serialize(lua_State *, char **, int, struct Serializer *, int);
@@ -171,7 +172,7 @@ static int SerializeTable(lua_State *L, char **buf, int idx,
bool multi;
bool isarray;
lua_Unsigned n;
if (UNLIKELY(!HaveStackMemory(getauxval(AT_PAGESZ)))) {
if (UNLIKELY(GetStackPointer() < z->bsp)) {
z->reason = "out of stack";
return -1;
}
@@ -264,7 +265,11 @@ static int Serialize(lua_State *L, char **buf, int idx, struct Serializer *z,
int LuaEncodeJsonData(lua_State *L, char **buf, int idx,
struct EncoderConfig conf) {
int rc;
struct Serializer z = {.reason = "out of memory", .conf = conf};
struct Serializer z = {
.reason = "out of memory",
.bsp = GetStackBottom() + 4096,
.conf = conf,
};
if (lua_checkstack(L, conf.maxdepth * 3 + LUA_MINSTACK)) {
rc = Serialize(L, buf, idx, &z, 0);
free(z.visited.p);

---

@@ -351,7 +351,7 @@ static int SerializeTable(lua_State *L, char **buf, int idx,
struct Serializer *z, int depth) {
int rc;
bool multi;
if (UNLIKELY(!HaveStackMemory(getauxval(AT_PAGESZ)))) {
if (UNLIKELY(GetStackPointer() < z->bsp)) {
z->reason = "out of stack";
return -1;
}
@@ -424,7 +424,11 @@ static int Serialize(lua_State *L, char **buf, int idx, struct Serializer *z,
int LuaEncodeLuaData(lua_State *L, char **buf, int idx,
struct EncoderConfig conf) {
int rc;
struct Serializer z = {.reason = "out of memory", .conf = conf};
struct Serializer z = {
.reason = "out of memory",
.bsp = GetStackBottom() + 4096,
.conf = conf,
};
if (lua_checkstack(L, conf.maxdepth * 3 + LUA_MINSTACK)) {
rc = Serialize(L, buf, idx, &z, 0);
free(z.visited.p);

---

@@ -55,7 +55,6 @@
#include "libc/nt/runtime.h"
#include "libc/nt/synchronization.h"
#include "libc/runtime/clktck.h"
#include "libc/runtime/memtrack.internal.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/sysconf.h"
#include "libc/sock/sock.h"
@@ -127,7 +126,7 @@ static void *LuaRealloc(lua_State *L, void *p, size_t n) {
if ((p2 = realloc(p, n))) {
return p2;
}
if (IsLegalSize(n)) {
if (n < 0x100000000000) {
WARNF("reacting to malloc() failure by running lua garbage collector...");
luaC_fullgc(L, 1);
p2 = realloc(p, n);
@@ -2934,7 +2933,7 @@ static int LuaUnixMapshared(lua_State *L) {
luaL_error(L, "size must be multiple of word size");
__builtin_unreachable();
}
if (!IsLegalSize(n)) {
if (n >= 0x100000000000) {
luaL_error(L, "map size too big");
__builtin_unreachable();
}

---

@@ -33,6 +33,9 @@
#include "third_party/nsync/common.internal.h"
#include "third_party/nsync/mu_semaphore.h"
#include "third_party/nsync/races.internal.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/runtime.h"
#include "libc/sysv/consts/map.h"
#include "third_party/nsync/wait_s.internal.h"
__static_yoink("nsync_notice");
@@ -149,28 +152,14 @@ waiter *nsync_dll_waiter_samecond_ (struct Dll *e) {
/* -------------------------------- */
static struct {
nsync_atomic_uint32_ mu;
size_t used;
char *p, *e;
} malloc;
static void *nsync_malloc (size_t size) {
void *res = 0;
nsync_spin_test_and_set_ (&malloc.mu, 1, 1, 0);
if (malloc.p + malloc.used + size > malloc.e) {
if (!malloc.p) {
malloc.p = malloc.e = (char *)kMemtrackNsyncStart;
}
malloc.e = _extend (malloc.p, malloc.used + size, malloc.e, MAP_PRIVATE,
kMemtrackNsyncStart + kMemtrackNsyncSize);
if (!malloc.e) {
nsync_panic_ ("out of memory\n");
}
}
res = malloc.p + malloc.used;
malloc.used = (malloc.used + size + 15) & -16;
ATM_STORE_REL (&malloc.mu, 0);
void *res;
res = mmap (0, size,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1, 0);
if (res == MAP_FAILED)
nsync_panic_ ("out of memory\n");
return res;
}

---

@@ -1,8 +1,6 @@
#ifndef Py_CEVAL_H
#define Py_CEVAL_H
#include "libc/dce.h"
#include "libc/intrin/likely.h"
#include "libc/runtime/stack.h"
#include "third_party/python/Include/object.h"
#include "third_party/python/Include/pyerrors.h"
#include "third_party/python/Include/pystate.h"
@@ -108,23 +106,9 @@ void Py_LeaveRecursiveCall(void);
extern int _Py_CheckRecursionLimit;
int _Py_CheckRecursiveCall(const char *);
#define Py_LeaveRecursiveCall() PyThreadState_GET()->recursion_depth--
#define Py_EnterRecursiveCall(where) \
({ \
int rc = 0; \
intptr_t rsp, bot; \
if (IsModeDbg()) { \
PyThreadState_GET()->recursion_depth++; \
rc = _Py_CheckRecursiveCall(where); \
} else { \
rsp = (intptr_t)__builtin_frame_address(0); \
bot = GetStackAddr() + 32768; \
if (UNLIKELY(rsp < bot)) { \
PyErr_Format(PyExc_MemoryError, "Stack overflow%s", where); \
rc = -1; \
} \
} \
rc; \
})
#define Py_EnterRecursiveCall(where) \
(PyThreadState_GET()->recursion_depth++, \
_Py_CheckRecursiveCall(where))
#endif
#define Py_ALLOW_RECURSION \

---

@@ -35,6 +35,9 @@
#include "third_party/python/Include/sysmodule.h"
#include "third_party/python/Include/traceback.h"
#include "third_party/python/Include/tupleobject.h"
#include "libc/thread/thread.h"
#include "libc/thread/thread.h"
#include "libc/thread/thread.h"
#include "third_party/python/Include/warnings.h"
/* Execute compiled code */
@@ -654,10 +657,9 @@ int
_Py_CheckRecursiveCall(const char *where)
{
PyThreadState *t;
const char *rsp, *bot;
rsp = __builtin_frame_address(0);
bot = (const char *)GetStackAddr() + 32768;
if (rsp > bot) {
uintptr_t bottom = GetStackBottom();
uintptr_t pointer = GetStackPointer();
if (pointer > bottom + 32768) {
t = PyThreadState_GET();
_Py_CheckRecursionLimit = recursion_limit;
if (t->recursion_depth > recursion_limit && !t->recursion_critical) {
@@ -669,7 +671,7 @@ _Py_CheckRecursiveCall(const char *where)
return -1;
}
return 0;
} else if (rsp > bot - 20480) {
} else if (pointer > bottom + 12288) {
PyErr_Format(PyExc_MemoryError, "Stack overflow%s", where);
return -1;
} else {

---

@@ -48,6 +48,7 @@
#include "third_party/python/Include/pythonrun.h"
#include "third_party/python/Include/unicodeobject.h"
#include "third_party/python/Include/yoink.h"
#include "libc/runtime/stack.h"
#include "third_party/xed/x86.h"
STATIC_STACK_SIZE(0x100000);

---

@@ -24,6 +24,7 @@
#include "third_party/bzip2/bzlib.h"
#include "libc/calls/typedef/u.h"
#include "libc/runtime/sysconf.h"
#include "libc/runtime/runtime.h"
#include "libc/errno.h"
#ifndef UTIL /* This module contains no code for Zip Utilities */
@@ -47,7 +48,7 @@
#endif
#undef PAGESIZE
#define PAGESIZE FRAMESIZE
#define PAGESIZE __granularity()
#if defined(MMAP)
#include "libc/calls/calls.h"