Rewrite memory manager

Actually Portable Executable now supports Android. Cosmo's old mmap code
required a 47-bit address space. The new implementation is agnostic about
address space size: it supports both smaller address spaces (e.g. embedded)
and even modern 56-bit PML5T paging for x86, which finally became a reality
on Zen4 Threadripper.

Cosmopolitan no longer requires UNIX systems to observe the Windows 64kb
allocation granularity; i.e. sysconf(_SC_PAGE_SIZE) will now report the
host's native page size. This fixes a longstanding POSIX conformance issue
concerning file mappings that overlap the end of a file. Other aspects of
conformance have been improved too, such as the subtleties of address
assignment and the semantics of MAP_FIXED and MAP_FIXED_NOREPLACE.
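
For example, code that used to assume the 64kb figure should now ask the
system at runtime; a minimal sketch using only standard calls (the fixed
address below is arbitrary, and MAP_FIXED_NOREPLACE requires a kernel and
libc that expose it):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void) {
      // Host-native page size, no longer pinned to the Windows 64kb granularity.
      long pagesz = sysconf(_SC_PAGE_SIZE);
      printf("page size: %ld\n", pagesz);

      // Unlike MAP_FIXED, MAP_FIXED_NOREPLACE refuses to clobber an existing
      // mapping and fails with EEXIST instead.
      void *hint = (void *)0x310000000000;
      void *p = mmap(hint, pagesz, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);
      if (p == MAP_FAILED) {
        perror("mmap");
      } else {
        printf("mapped at %p\n", p);
        munmap(p, pagesz);
      }
    }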

On Windows, mappings larger than 100 megabytes won't be broken down into
thousands of independent 64kb mappings. Support for MAP_STACK is removed
by this change; please use NewCosmoStack() instead.
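
A sketch of the replacement pattern, assuming NewCosmoStack() hands back the
lowest address of a ready-made stack mapping and that GetStackSize() and
FreeCosmoStack() exist alongside it in libc/runtime/stack.h (check that
header for the exact contracts):

    #include <pthread.h>
    #include "libc/runtime/stack.h"  // NewCosmoStack, FreeCosmoStack, GetStackSize (assumed)

    static void *worker(void *arg) {
      return arg;
    }

    int main(void) {
      // Instead of mmap(..., MAP_STACK, ...), let cosmo build the stack
      // (guard pages included) and hand it to the pthreads API.
      void *stk = NewCosmoStack();
      if (!stk)
        return 1;
      pthread_attr_t attr;
      pthread_attr_init(&attr);
      pthread_attr_setstack(&attr, stk, GetStackSize());
      pthread_t th;
      pthread_create(&th, &attr, worker, 0);
      pthread_attr_destroy(&attr);
      pthread_join(th, 0);
      FreeCosmoStack(stk);  // a setstack() stack stays owned by the caller
    }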

Stack overflow avoidance is now implemented using the POSIX thread APIs.
Please use GetStackBottom() and GetStackAddr() instead of the old,
error-prone GetStackAddr() and HaveStackMemory() APIs, which have been
removed.
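
Code that needs the current thread's stack bounds can also recover them the
portable way; a sketch using pthread_attr_getstack() (pthread_getattr_np()
is a GNU-style extension, but it is the conventional entry point for an
already-running thread):

    #define _GNU_SOURCE
    #include <pthread.h>
    #include <stdio.h>

    int main(void) {
      pthread_attr_t attr;
      void *stackaddr;   // lowest address of the stack
      size_t stacksize;
      if (!pthread_getattr_np(pthread_self(), &attr)) {
        pthread_attr_getstack(&attr, &stackaddr, &stacksize);
        printf("stack: [%p, %p)\n", stackaddr, (char *)stackaddr + stacksize);
        pthread_attr_destroy(&attr);
      }
    }
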
Justine Tunney 2024-06-20 20:46:42 -07:00
parent 7f6d0b8709
commit 6ffed14b9c
GPG key ID: BE714B4575D6E328
150 changed files with 1893 additions and 5634 deletions

View file

@@ -9,9 +9,9 @@ LICENSE
LOCAL CHANGES
- Use thread-local freelist from cosmo tib
- Define dlmalloc_requires_more_vespene_gas()
- Make dlmalloc scalable using sched_getcpu()
- Use faster two power roundup for memalign()
- Poison maps to integrate with Address Sanitizer
- Introduce __oom_hook() by using _mapanon() vs. mmap()
- Wrap locks with __threaded check to improve perf lots
- Use assembly init rather than ensure_initialization()
- Implemented the locking functions dlmalloc wants
- Use assembly _init() rather than ensure_initialization()
- Rather than calling mmap() 96 times from _start() just use .bss
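
The 'faster two power roundup' note above refers to the usual bit trick for
snapping a request up to the next power of two; a sketch of the idea
(cosmo's actual helper may be named and shaped differently):

    #include <stdint.h>
    #include <stdio.h>

    // Round n up to the next power of two: 24 -> 32, 32 -> 32, 33 -> 64.
    // Branch-free except for the n <= 1 guard.
    static uint64_t roundup2pow(uint64_t n) {
      return n > 1 ? (uint64_t)2 << (63 - __builtin_clzll(n - 1)) : 1;
    }

    int main(void) {
      printf("%llu %llu %llu\n",
             (unsigned long long)roundup2pow(24),
             (unsigned long long)roundup2pow(32),
             (unsigned long long)roundup2pow(33));
    }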

View file

@@ -24,6 +24,7 @@
#include "libc/thread/tls.h"
#include "third_party/dlmalloc/vespene.internal.h"
#include "libc/thread/tls.h"
#include "libc/intrin/kprintf.h"
#include "third_party/nsync/mu.h"
#if !IsTiny()

View file

@@ -1,3 +1,5 @@
#include "libc/sysv/consts/auxv.h"
#include "libc/runtime/runtime.h"
/* ---------------------------- setting mparams -------------------------- */
@@ -5,24 +7,18 @@
#if ONLY_MSPACES
static void dlmalloc_pre_fork(void) {
mstate h;
for (unsigned i = 0; i < ARRAYLEN(g_heaps); ++i)
if ((h = atomic_load_explicit(&g_heaps[i], memory_order_acquire)))
ACQUIRE_LOCK(&h->mutex);
ACQUIRE_LOCK(&g_heaps[i].m.mutex);
}
static void dlmalloc_post_fork_parent(void) {
mstate h;
for (unsigned i = 0; i < ARRAYLEN(g_heaps); ++i)
if ((h = atomic_load_explicit(&g_heaps[i], memory_order_acquire)))
RELEASE_LOCK(&h->mutex);
RELEASE_LOCK(&g_heaps[i].m.mutex);
}
static void dlmalloc_post_fork_child(void) {
mstate h;
for (unsigned i = 0; i < ARRAYLEN(g_heaps); ++i)
if ((h = atomic_load_explicit(&g_heaps[i], memory_order_acquire)))
(void)INITIAL_LOCK(&h->mutex);
(void)INITIAL_LOCK(&g_heaps[i].m.mutex);
}
#else
@@ -46,8 +42,8 @@ __attribute__((__constructor__(49))) int init_mparams(void) {
size_t gsize;
#if defined(__COSMOPOLITAN__)
psize = FRAMESIZE;
gsize = FRAMESIZE;
psize = getauxval(AT_PAGESZ);
gsize = DEFAULT_GRANULARITY ? DEFAULT_GRANULARITY : psize;
#elif !defined(WIN32)
psize = malloc_getpagesize;
gsize = ((DEFAULT_GRANULARITY != 0)? DEFAULT_GRANULARITY : psize);
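
The pre/post fork handlers above follow the standard pthread_atfork()
discipline: take every heap lock before fork(), release it in the parent,
and reinitialize it in the child so the new process starts unlocked. A
minimal sketch of the same pattern with an ordinary mutex (the registration
point cosmo actually uses may differ):

    #include <pthread.h>
    #include <unistd.h>

    static pthread_mutex_t heap_lock = PTHREAD_MUTEX_INITIALIZER;

    static void prepare(void) { pthread_mutex_lock(&heap_lock); }    // before fork()
    static void parent(void)  { pthread_mutex_unlock(&heap_lock); }  // in the parent
    static void child(void)   { pthread_mutex_init(&heap_lock, 0); } // in the child

    __attribute__((constructor)) static void wire_up_fork_handlers(void) {
      pthread_atfork(prepare, parent, child);
    }

    int main(void) {
      // fork() after the constructor ran; both processes see a usable lock.
      return fork() < 0;
    }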

View file

@@ -23,6 +23,13 @@ static mstate init_user_mstate(char* tbase, size_t tsize) {
return m;
}
// [jart] rather than calling mmap() 96 times from _start() just use .bss
static void init_heap(union Heap *heap, int locked) {
mstate m = init_user_mstate(heap->mspace, sizeof(*heap));
m->seg.sflags = USE_MMAP_BIT;
set_lock(m, locked);
}
mspace create_mspace(size_t capacity, int locked) {
mstate m = 0;
size_t msize;
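
The '.bss instead of 96 mmap() calls' comment above is the same idea that
dlmalloc already exposes through create_mspace_with_base(): seed an mspace
from memory the program owns statically. A sketch under that assumption
(the include path and sizes here are illustrative):

    #include <stddef.h>
    #include "third_party/dlmalloc/dlmalloc.h"  // mspace API; path assumed

    // Arena storage that lives in .bss, so bootstrapping it costs no mmap().
    static _Alignas(16) char heap_storage[64 * 1024];
    static mspace ms;

    __attribute__((constructor)) static void init_static_heap(void) {
      ms = create_mspace_with_base(heap_storage, sizeof(heap_storage), /*locked=*/1);
    }

    void *example_alloc(size_t n) {
      return ms ? mspace_malloc(ms, n) : 0;
    }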

View file

@@ -111,7 +111,7 @@
#if (MORECORE_CONTIGUOUS || defined(WIN32))
#define DEFAULT_GRANULARITY (0) /* 0 means to compute in init_mparams */
#else /* MORECORE_CONTIGUOUS */
#define DEFAULT_GRANULARITY ((size_t)256U * (size_t)1024U)
#define DEFAULT_GRANULARITY ((size_t)64U * (size_t)1024U)
#endif /* MORECORE_CONTIGUOUS */
#endif /* DEFAULT_GRANULARITY */
#ifndef DEFAULT_TRIM_THRESHOLD

View file

@@ -31,9 +31,20 @@
#error "threaded dlmalloc needs footers and mspaces"
#endif
union Heap {
struct malloc_state mstate;
struct {
size_t top_foot[2];
struct malloc_state m;
};
_Alignas(16) char mspace[DEFAULT_GRANULARITY];
};
static void init_heap(union Heap *heap, int locked);
static struct magicu magiu;
static unsigned g_heapslen;
static mstate g_heaps[128];
static union Heap g_heaps[128];
void dlfree(void *p) {
return mspace_free(0, p);
@@ -54,7 +65,7 @@ int dlmallopt(int param_number, int value) {
int dlmalloc_trim(size_t pad) {
int got_some = 0;
for (unsigned i = 0; i < g_heapslen; ++i)
got_some |= mspace_trim(g_heaps[i], pad);
got_some |= mspace_trim(&g_heaps[i].m, pad);
return got_some;
}
@@ -68,7 +79,7 @@ void dlmalloc_inspect_all(void handler(void *start, void *end,
size_t used_bytes, void *callback_arg),
void *arg) {
for (unsigned i = 0; i < g_heapslen; ++i)
mspace_inspect_all(g_heaps[i], handler, arg);
mspace_inspect_all(&g_heaps[i].m, handler, arg);
}
forceinline mstate get_arena(void) {
@@ -82,11 +93,11 @@ forceinline mstate get_arena(void) {
asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
cpu = tpidr_el0 & 255;
#endif
return g_heaps[__magicu_div(cpu, magiu) % g_heapslen];
return &g_heaps[__magicu_div(cpu, magiu) % g_heapslen].m;
}
static void *dlmalloc_single(size_t n) {
return mspace_malloc(g_heaps[0], n);
return mspace_malloc(&g_heaps[0].m, n);
}
static void *dlmalloc_threaded(size_t n) {
@@ -94,7 +105,7 @@ static void *dlmalloc_threaded(size_t n) {
}
static void *dlcalloc_single(size_t n, size_t z) {
return mspace_calloc(g_heaps[0], n, z);
return mspace_calloc(&g_heaps[0].m, n, z);
}
static void *dlcalloc_threaded(size_t n, size_t z) {
@@ -102,7 +113,7 @@ static void *dlcalloc_threaded(size_t n, size_t z) {
}
static void *dlrealloc_single(void *p, size_t n) {
return mspace_realloc(g_heaps[0], p, n);
return mspace_realloc(&g_heaps[0].m, p, n);
}
static void *dlrealloc_threaded(void *p, size_t n) {
@@ -113,7 +124,7 @@ static void *dlrealloc_threaded(void *p, size_t n) {
}
static void *dlmemalign_single(size_t a, size_t n) {
return mspace_memalign(g_heaps[0], a, n);
return mspace_memalign(&g_heaps[0].m, a, n);
}
static void *dlmemalign_threaded(size_t a, size_t n) {
@@ -121,7 +132,7 @@ static void *dlmemalign_threaded(size_t a, size_t n) {
}
static struct mallinfo dlmallinfo_single(void) {
return mspace_mallinfo(g_heaps[0]);
return mspace_mallinfo(&g_heaps[0].m);
}
static struct mallinfo dlmallinfo_threaded(void) {
@@ -144,8 +155,7 @@ static void use_single_heap(bool uses_locks) {
dlrealloc = dlrealloc_single;
dlmemalign = dlmemalign_single;
dlmallinfo = dlmallinfo_single;
if (!(g_heaps[0] = create_mspace(0, uses_locks)))
__builtin_trap();
init_heap(&g_heaps[0], uses_locks);
}
static void threaded_dlmalloc(void) {
@@ -180,10 +190,9 @@ static void threaded_dlmalloc(void) {
// we need this too due to linux's cpu count affinity hack
g_heapslen = heaps;
// create the arenas
// create the heaps
for (size_t i = 0; i < g_heapslen; ++i)
if (!(g_heaps[i] = create_mspace(0, true)))
__builtin_trap();
init_heap(&g_heaps[i], true);
// install function pointers
dlmalloc = dlmalloc_threaded;
@@ -192,5 +201,5 @@ static void threaded_dlmalloc(void) {
dlmemalign = dlmemalign_threaded;
dlmallinfo = dlmallinfo_threaded;
STRACE("created %d dlmalloc arenas for %d cpus", heaps, cpus);
STRACE("created %d dlmalloc heaps for %d cpus", heaps, cpus);
}
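
The get_arena() change above boils down to hashing the calling CPU onto one
of the heaps (tpidr_el0 on aarch64 as shown, an analogous CPU-id read on
x86, then a magic-number division). A simplified sketch of the same idea
using sched_getcpu():

    #define _GNU_SOURCE
    #include <sched.h>

    #define NHEAPS 4  // illustrative; cosmo derives the real count from the cpu count

    // Threads running on different cores usually pick different heaps,
    // so they seldom contend on the same dlmalloc lock.
    static int pick_heap(void) {
      int cpu = sched_getcpu();
      if (cpu < 0)
        cpu = 0;  // syscall unavailable: fall back to the first heap
      return cpu % NHEAPS;
    }

    int main(void) {
      return pick_heap() >= 0 ? 0 : 1;
    }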

View file

@@ -28,10 +28,8 @@
*/
void *dlmalloc_requires_more_vespene_gas(size_t size) {
char *p;
if ((p = _mapanon(size))) {
if (IsAsan()) {
if ((p = _mapanon(size)))
if (IsAsan())
__asan_poison(p, size, kAsanHeapFree);
}
}
return p;
}
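
For reference, the _mapanon() call above amounts to a plain anonymous
private mapping, with the allocator's out-of-memory hook layered on top; a
sketch of the underlying mmap() call (hook behavior omitted, and the
NULL-on-failure convention matches how the result is tested above):

    #include <stddef.h>
    #include <sys/mman.h>

    // Roughly what _mapanon(size) asks the kernel for: fresh, zeroed,
    // private pages backed by no file. The real helper is also where
    // __oom_hook() gets a chance to run when the mapping fails.
    static void *mapanon_sketch(size_t size) {
      void *p = mmap(0, size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      return p == MAP_FAILED ? 0 : p;
    }

    int main(void) {
      void *p = mapanon_sketch(4096);
      return p ? 0 : 1;
    }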