cosmopolitan/third_party/dlmalloc/threaded.inc
Justine Tunney 6ffed14b9c
Rewrite memory manager
Actually Portable Executable now supports Android. Cosmo's old mmap code
required a 47 bit address space. The new implementation is very agnostic
and supports both smaller address spaces (e.g. embedded) and even modern
56-bit PML5T paging for x86 which finally came true on Zen4 Threadripper

Cosmopolitan no longer requires UNIX systems to observe the Windows 64kb
granularity; i.e. sysconf(_SC_PAGE_SIZE) will now report the host native
page size. This fixes a longstanding POSIX conformance issue, concerning
file mappings that overlap the end of file. Other aspects of conformance
have been improved too, such as the subtleties of address assignment and
the various subtleties surrounding MAP_FIXED and MAP_FIXED_NOREPLACE

On Windows, mappings larger than 100 megabytes won't be broken down into
thousands of independent 64kb mappings. Support for MAP_STACK is removed
by this change; please use NewCosmoStack() instead.

Stack overflow avoidance is now being implemented using the POSIX thread
APIs. Please use GetStackBottom() and GetStackAddr(), instead of the old
error-prone GetStackAddr() and HaveStackMemory() APIs which are removed.
2024-06-22 05:45:11 -07:00

205 lines
6.4 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2024 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/dce.h"
#include "libc/intrin/magicu.h"
#include "libc/intrin/strace.internal.h"
#include "libc/intrin/weaken.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/rdtscp.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/runtime/runtime.h"
#include "libc/thread/thread.h"
#include "third_party/dlmalloc/dlmalloc.h"
#if !FOOTERS || !MSPACES
#error "threaded dlmalloc needs footers and mspaces"
#endif
// Backing storage for one dlmalloc heap.
//
// All three members overlay the same bytes. The functions in this file
// only ever touch the `m` member, whose address they hand to the
// mspace_*() routines as the heap handle.
union Heap {
// a malloc_state viewed at offset zero; not referenced by the code
// visible in this file
struct malloc_state mstate;
// a malloc_state placed two size_t words into the storage
// NOTE(review): the two leading words presumably stand in for the
// chunk header that precedes the state in a regular mspace; confirm
// against init_heap()
struct {
size_t top_foot[2];
struct malloc_state m;
};
// raw bytes that make the union at least DEFAULT_GRANULARITY bytes
// large and give it (at least) 16-byte alignment
_Alignas(16) char mspace[DEFAULT_GRANULARITY];
};
// Prepares one heap for use; its definition is outside this view. In
// this file `locked` is false only for the single heap that's set up
// when pthread_create() isn't linked, and true everywhere else.
static void init_heap(union Heap *heap, int locked);
// Precomputed divisor that get_arena() uses to turn a cpu number into a
// heap index. Only assigned by threaded_dlmalloc().
static struct magicu magiu;
// How many leading entries of g_heaps are in use.
static unsigned g_heapslen;
// Statically reserved storage for every heap; its length (128) is the
// upper bound on the number of heaps.
static union Heap g_heaps[128];
/**
 * Releases memory by forwarding to mspace_free().
 *
 * No heap is named here: a null mspace is passed along with `p`.
 * NOTE(review): presumably mspace_free() recovers the owning heap from
 * the chunk footer, which is why this file insists on FOOTERS; confirm
 * against the mspace_free() implementation.
 *
 * @param p is the pointer handed to mspace_free()
 */
void dlfree(void *p) {
  // plain call rather than `return expr;` since ISO C forbids returning
  // an expression from a function whose return type is void
  mspace_free(0, p);
}
/**
 * Reports the usable size of an allocation.
 *
 * This is a thin forwarder to mspace_usable_size(), which needs only
 * the pointer itself.
 *
 * @param mem is the pointer to query
 * @return whatever mspace_usable_size() reports for `mem`
 */
size_t dlmalloc_usable_size(void *mem) {
  size_t usable = mspace_usable_size(mem);
  return usable;
}
/**
 * Asks mspace_realloc_in_place() to resize `p` to `n` bytes.
 *
 * A null mspace is passed, so no particular heap is selected here.
 *
 * @param p is the existing allocation
 * @param n is the requested size in bytes
 * @return result of mspace_realloc_in_place()
 */
void *dlrealloc_in_place(void *p, size_t n) {
  void *resized = mspace_realloc_in_place(0, p, n);
  return resized;
}
/**
 * Sets an allocator tuning parameter via mspace_mallopt().
 *
 * @param param_number identifies the parameter
 * @param value is the new setting
 * @return result of mspace_mallopt()
 */
int dlmallopt(int param_number, int value) {
  int rc = mspace_mallopt(param_number, value);
  return rc;
}
/**
 * Runs mspace_trim() on every active heap.
 *
 * @param pad is passed unchanged to each mspace_trim() call
 * @return bitwise OR of all mspace_trim() results, i.e. nonzero if any
 *     heap reported a nonzero result
 */
int dlmalloc_trim(size_t pad) {
  int released = 0;
  unsigned k = 0;
  while (k < g_heapslen) {
    released |= mspace_trim(&g_heaps[k].m, pad);
    ++k;
  }
  return released;
}
/**
 * Frees every pointer in `array` with mspace_free().
 *
 * Each element is passed along with a null mspace. The array itself is
 * left untouched; its elements are not cleared.
 *
 * @param array holds the pointers to release
 * @param nelem is the number of elements in `array`
 * @return always zero
 */
size_t dlbulk_free(void *array[], size_t nelem) {
  size_t k = 0;
  while (k < nelem) {
    mspace_free(0, array[k]);
    ++k;
  }
  return 0;
}
/**
 * Invokes mspace_inspect_all() on every active heap.
 *
 * Heaps are visited in index order, from zero up to `g_heapslen`.
 *
 * @param handler is the callback handed to mspace_inspect_all()
 * @param arg is passed through alongside `handler`
 */
void dlmalloc_inspect_all(void handler(void *start, void *end,
                                       size_t used_bytes, void *callback_arg),
                          void *arg) {
  unsigned k = 0;
  while (k < g_heapslen) {
    mspace_inspect_all(&g_heaps[k].m, handler, arg);
    ++k;
  }
}
/**
 * Picks a heap based on a cpu number sampled at the time of the call.
 *
 * The cpu number is divided by the precomputed `magiu` divisor and the
 * quotient is reduced modulo `g_heapslen`, so the index always lands on
 * an active entry of `g_heaps`.
 *
 * NOTE(review): this requires `g_heapslen` to be nonzero and `magiu` to
 * be initialized. In the code visible here only the *_threaded
 * functions call it, and threaded_dlmalloc() installs those after it
 * has set both.
 *
 * @return address of the chosen heap's `m` member
 */
forceinline mstate get_arena(void) {
unsigned cpu;
#ifdef __x86_64__
// rdtscp() fills in tsc_aux and TSC_AUX_CORE() pulls the core number
// out of it; threaded_dlmalloc() checks X86_HAVE(RDTSCP) beforehand
unsigned tsc_aux;
rdtscp(&tsc_aux);
cpu = TSC_AUX_CORE(tsc_aux);
#else
// every non-x86_64 build takes this path, which reads the aarch64
// tpidr_el0 register and keeps its low eight bits
// NOTE(review): assumes the platform keeps a cpu number in those bits;
// confirm
long tpidr_el0;
asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
cpu = tpidr_el0 & 255;
#endif
// divide by multiplying, then wrap so the index can't exceed g_heapslen
return &g_heaps[__magicu_div(cpu, magiu) % g_heapslen].m;
}
/**
 * Single-heap flavor of malloc: requests `n` bytes from heap zero.
 *
 * Installed as `dlmalloc` by use_single_heap().
 */
static void *dlmalloc_single(size_t n) {
  void *mem = mspace_malloc(&g_heaps->m, n);
  return mem;
}
/**
 * Multi-heap flavor of malloc: requests `n` bytes from whichever heap
 * get_arena() selects.
 *
 * Installed as `dlmalloc` by threaded_dlmalloc().
 */
static void *dlmalloc_threaded(size_t n) {
  void *mem = mspace_malloc(get_arena(), n);
  return mem;
}
/**
 * Single-heap flavor of calloc: forwards `n` and `z` to mspace_calloc()
 * on heap zero.
 *
 * Installed as `dlcalloc` by use_single_heap().
 */
static void *dlcalloc_single(size_t n, size_t z) {
  void *mem = mspace_calloc(&g_heaps->m, n, z);
  return mem;
}
/**
 * Multi-heap flavor of calloc: forwards `n` and `z` to mspace_calloc()
 * on whichever heap get_arena() selects.
 *
 * Installed as `dlcalloc` by threaded_dlmalloc().
 */
static void *dlcalloc_threaded(size_t n, size_t z) {
  void *mem = mspace_calloc(get_arena(), n, z);
  return mem;
}
/**
 * Single-heap flavor of realloc: resizes `p` to `n` bytes through
 * mspace_realloc() on heap zero.
 *
 * Installed as `dlrealloc` by use_single_heap().
 */
static void *dlrealloc_single(void *p, size_t n) {
  void *mem = mspace_realloc(&g_heaps->m, p, n);
  return mem;
}
/**
 * Multi-heap flavor of realloc.
 *
 * A null `p` is treated as a fresh allocation from the heap picked by
 * get_arena(). Anything else goes to mspace_realloc() with a null
 * mspace, so no arena is chosen here for existing memory.
 *
 * Installed as `dlrealloc` by threaded_dlmalloc().
 */
static void *dlrealloc_threaded(void *p, size_t n) {
  if (!p)
    return mspace_malloc(get_arena(), n);
  return mspace_realloc(0, p, n);
}
/**
 * Single-heap flavor of memalign: forwards alignment `a` and size `n`
 * to mspace_memalign() on heap zero.
 *
 * Installed as `dlmemalign` by use_single_heap().
 */
static void *dlmemalign_single(size_t a, size_t n) {
  void *mem = mspace_memalign(&g_heaps->m, a, n);
  return mem;
}
/**
 * Multi-heap flavor of memalign: forwards alignment `a` and size `n`
 * to mspace_memalign() on whichever heap get_arena() selects.
 *
 * Installed as `dlmemalign` by threaded_dlmalloc().
 */
static void *dlmemalign_threaded(size_t a, size_t n) {
  void *mem = mspace_memalign(get_arena(), a, n);
  return mem;
}
/**
 * Single-heap flavor of mallinfo: returns mspace_mallinfo() for heap
 * zero.
 *
 * Installed as `dlmallinfo` by use_single_heap().
 */
static struct mallinfo dlmallinfo_single(void) {
  return mspace_mallinfo(&g_heaps->m);
}
/**
 * Multi-heap flavor of mallinfo.
 *
 * Only one heap is consulted: the one get_arena() selects at the time
 * of the call. The figures are not summed across heaps.
 *
 * Installed as `dlmallinfo` by threaded_dlmalloc().
 */
static struct mallinfo dlmallinfo_threaded(void) {
  return mspace_mallinfo((get_arena()));
}
/**
 * Parses a non-negative decimal integer without relying on libc.
 *
 * Reading stops at the first character that isn't an ASCII digit, so an
 * empty string, a leading sign, or any other leading junk yields zero.
 * Values too large for `int` saturate at the largest `int` instead of
 * overflowing, since signed overflow is undefined behavior.
 *
 * @param s is a NUL-terminated string
 * @return parsed value, in the range [0, INT_MAX]
 */
static int dlmalloc_atoi(const char *s) {
  // largest int, spelled out so no extra header is needed
  const int limit = (int)(~0u >> 1);
  int x = 0;
  for (; *s >= '0' && *s <= '9'; ++s) {
    int digit = *s - '0';
    // x * 10 + digit fits iff x <= (limit - digit) / 10
    if (x > (limit - digit) / 10)
      return limit;
    x = x * 10 + digit;
  }
  return x;
}
/**
 * Configures the allocator to serve every request from g_heaps[0].
 *
 * Sets the active heap count to one, points the dlmalloc function
 * pointers at the *_single implementations, and initializes heap zero.
 *
 * @param uses_locks is forwarded to init_heap() as its `locked` argument
 */
static void use_single_heap(bool uses_locks) {
g_heapslen = 1;
// route the allocator entry points to the single-heap implementations
dlmalloc = dlmalloc_single;
dlcalloc = dlcalloc_single;
dlrealloc = dlrealloc_single;
dlmemalign = dlmemalign_single;
dlmallinfo = dlmallinfo_single;
// the heap is initialized last, after the pointers are in place
init_heap(&g_heaps[0], uses_locks);
}
/**
 * Chooses between one shared heap and several per-cpu heaps, then
 * installs the matching dlmalloc function pointers.
 *
 * A single heap is used when any of these hold:
 *
 * - pthread_create() isn't linked (the heap is set up without locks)
 * - IsAarch64() is false and the cpu lacks RDTSCP, which get_arena()
 *   relies on to learn the current cpu
 * - the cpu count can't be determined
 * - the heap count works out to one or less
 *
 * Otherwise the heap count defaults to half the cpu count, and can be
 * overridden with the COSMOPOLITAN_HEAP_COUNT environment variable. The
 * count is clamped to the cpu count and to the capacity of `g_heaps`.
 */
static void threaded_dlmalloc(void) {
  int heaps, cpus;
  const char *var;

  // a program that can't create threads has nothing to contend over
  if (!_weaken(pthread_create)) {
    use_single_heap(false);
    return;
  }

  // get_arena() can't tell which cpu it's on without this
  if (!IsAarch64() && !X86_HAVE(RDTSCP)) {
    use_single_heap(true);
    return;
  }

  // determine how many independent heaps we should install
  // by default we do an approximation of one heap per core
  // this code makes the c++ stl go 164x faster on my ryzen
  cpus = __get_cpu_count();
  if (cpus == -1)
    heaps = 1;
  else if ((var = getenv("COSMOPOLITAN_HEAP_COUNT")))
    heaps = dlmalloc_atoi(var);
  else
    heaps = cpus >> 1;

  // never use more heaps than cpus; otherwise cpus / heaps below would
  // be zero, which isn't a usable divisor for __magicu_get()
  if (heaps > cpus)
    heaps = cpus;
  if (heaps <= 1) {
    use_single_heap(true);
    return;
  }
  if (heaps > ARRAYLEN(g_heaps))
    heaps = ARRAYLEN(g_heaps);

  // find d such that sched_getcpu() / d is within [0,heaps)
  // turn d into a fast magic that can divide by multiplying
  // here 2 <= heaps <= cpus, so the divisor is at least one
  magiu = __magicu_get(cpus / heaps);

  // get_arena() also wraps its index by this count
  // we need this too due to linux's cpu count affinity hack
  g_heapslen = heaps;

  // create the heaps
  for (size_t i = 0; i < g_heapslen; ++i)
    init_heap(&g_heaps[i], true);

  // install function pointers
  dlmalloc = dlmalloc_threaded;
  dlcalloc = dlcalloc_threaded;
  dlrealloc = dlrealloc_threaded;
  dlmemalign = dlmemalign_threaded;
  dlmallinfo = dlmallinfo_threaded;

  STRACE("created %d dlmalloc heaps for %d cpus", heaps, cpus);
}