cosmopolitan/third_party/dlmalloc/threaded.inc
2024-08-15 23:54:14 -07:00

226 lines
7 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2024 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/dce.h"
#include "libc/intrin/magicu.h"
#include "libc/intrin/strace.h"
#include "libc/intrin/weaken.h"
#include "libc/macros.h"
#include "libc/runtime/runtime.h"
#include "libc/thread/thread.h"
#include "libc/thread/threads.h"
#include "libc/errno.h"
#include "third_party/dlmalloc/dlmalloc.h"
#if !FOOTERS || !MSPACES
#error "threaded dlmalloc needs footers and mspaces"
#endif
// precomputed magic divisor; threaded_dlmalloc() fills it in with
// __magicu_get() and get_arena() feeds it to __magicu_div() in order
// to turn a cpu index into a heap index
static struct magicu magiu;
// cpu count as reported by __get_cpu_count(), or 1 when that call
// returned -1; get_arena() uses it as the modulus for its fallback
// assignment when sched_getcpu() fails
static unsigned g_cpucount;
// how many leading entries of g_heaps are in use
static unsigned g_heapslen;
// the arenas; in-use entries are the results of create_mspace()
static mstate g_heaps[128];
// releases memory through mspace_free()
//
// zero is passed as the mspace argument rather than a specific arena.
// NOTE(review): presumably mspace_free() recovers the owning arena
// from the chunk footer, which would be why this file refuses to
// build without FOOTERS; confirm against mspace_free()
//
// @param p is forwarded unchanged to mspace_free()
void dlfree(void *p) {
  // iso c forbids `return expr;` in a function returning void, even
  // when expr itself is void, so the call is a plain statement
  mspace_free(0, p);
}
// reports the usable size of an allocation
//
// @param mem is forwarded unchanged to mspace_usable_size()
// @return whatever mspace_usable_size() reports for mem
size_t dlmalloc_usable_size(void *mem) {
  size_t usable = mspace_usable_size(mem);
  return usable;
}
// resizes an allocation via mspace_realloc_in_place()
//
// zero is passed as the mspace argument rather than a specific arena
//
// @param p is the allocation to resize
// @param n is the requested size
// @return whatever mspace_realloc_in_place() returns
void *dlrealloc_in_place(void *p, size_t n) {
  void *resized = mspace_realloc_in_place(0, p, n);
  return resized;
}
// forwards a tuning request to mspace_mallopt()
//
// @param param_number selects the option
// @param value is the new setting
// @return whatever mspace_mallopt() returns
int dlmallopt(int param_number, int value) {
  int rc = mspace_mallopt(param_number, value);
  return rc;
}
// asks every arena in use to trim itself
//
// mspace_trim() is called once per entry of g_heaps below g_heapslen
// and the results are combined with bitwise or
//
// @param pad is handed to each mspace_trim() call
// @return bitwise or of all mspace_trim() results, so it's nonzero
//     if any arena reported nonzero
int dlmalloc_trim(size_t pad) {
  int released = 0;
  unsigned idx = 0;
  while (idx < g_heapslen) {
    released |= mspace_trim(g_heaps[idx], pad);
    ++idx;
  }
  return released;
}
// frees and clears every pointer in an array
//
// each element is released with mspace_free() using zero as the
// mspace argument, and is then overwritten with a null pointer. the
// clearing matches what dlmalloc documents for bulk_free() and keeps
// the caller from being left holding dangling pointers.
//
// @param array holds the pointers to release; entries are set to
//     null on return
// @param nelem is the number of entries in array
// @return always 0
size_t dlbulk_free(void *array[], size_t nelem) {
  for (size_t i = 0; i < nelem; ++i) {
    mspace_free(0, array[i]);
    array[i] = 0;
  }
  return 0;
}
// context that dlmalloc_inspect_all() hands to
// threaded_malloc_visitor() through the visitor's arg pointer
//
// field order matters because the struct is filled in positionally
struct ThreadedMallocVisitor {
  // arena being walked; the visitor skips a region that starts here
  mstate heap;

  // the caller's callback, which visited regions are forwarded to
  void (*handler)(void *start, void *end, size_t used_bytes, void *arg);

  // the caller's opaque pointer, passed to handler as its last argument
  void *arg;
};
// adapter between mspace_inspect_all() and the user's handler
//
// a region whose start address equals the arena pointer is dropped;
// every other region is passed to the user's handler together with
// the user's own argument.
// NOTE(review): the dropped region is presumably the arena's own
// bookkeeping; confirm against create_mspace()
//
// @param start is the first byte of the region
// @param end is the end of the region
// @param used_bytes is forwarded to the handler unchanged
// @param arg points to a struct ThreadedMallocVisitor
static void threaded_malloc_visitor(void *start, void *end,
                                    size_t used_bytes, void *arg) {
  struct ThreadedMallocVisitor *ctx = arg;
  if (start != ctx->heap)
    ctx->handler(start, end, used_bytes, ctx->arg);
}
void dlmalloc_inspect_all(void handler(void *start, void *end,
size_t used_bytes, void *arg),
void *arg) {
for (unsigned i = 0; i < g_heapslen; ++i) {
struct ThreadedMallocVisitor tmv = {g_heaps[i], handler, arg};
mspace_inspect_all(g_heaps[i], threaded_malloc_visitor, &tmv);
}
}
// picks the arena the calling thread should allocate from
//
// malloc() is made scalable by basically doing
//
//     return g_heaps[sched_getcpu() / d];
//
// where the division uses the precomputed magic in magiu, and the
// result is cached in thread-local storage so the lookup only gets
// repeated once every 50 calls. on some platforms it's not possible
// to use sched_getcpu(), in which case threads get arbitrary numbers
// from a shared counter; that helps scalability but may not be
// optimal. errno is restored when sched_getcpu() fails.
//
// @return entry of g_heaps chosen for the calling thread
static mstate get_arena(void) {
  static atomic_uint next_ticket;
  static thread_local unsigned slot;
  static thread_local unsigned uses;
  if (uses == 50)
    uses = 0;
  if (uses == 0) {
    int saved_errno = errno;
    int cpu = sched_getcpu();
    if (cpu == -1) {
      // no cpu number available, so take the next ticket instead
      errno = saved_errno;
      slot = atomic_fetch_add_explicit(&next_ticket, 1, memory_order_relaxed);
      slot %= g_cpucount;
    } else {
      slot = cpu;
    }
    // the modulus keeps the index in bounds even if the quotient
    // turns out larger than the number of arenas
    slot = __magicu_div(slot, magiu) % g_heapslen;
  }
  uses++;
  return g_heaps[slot];
}
static void *dlmalloc_single(size_t n) {
return mspace_malloc(g_heaps[0], n);
}
static void *dlmalloc_threaded(size_t n) {
return mspace_malloc(get_arena(), n);
}
static void *dlcalloc_single(size_t n, size_t z) {
return mspace_calloc(g_heaps[0], n, z);
}
static void *dlcalloc_threaded(size_t n, size_t z) {
return mspace_calloc(get_arena(), n, z);
}
static void *dlrealloc_single(void *p, size_t n) {
return mspace_realloc(g_heaps[0], p, n);
}
// realloc() for the multi-heap setup
//
// a null p is treated as a fresh allocation from the arena that
// get_arena() picks. otherwise the pointer is resized through
// mspace_realloc() with zero as the mspace argument, so no arena is
// chosen here for existing memory.
//
// @param p is the allocation to resize, or null
// @param n is the requested size
// @return whatever mspace_malloc() or mspace_realloc() returns
static void *dlrealloc_threaded(void *p, size_t n) {
  if (!p)
    return mspace_malloc(get_arena(), n);
  return mspace_realloc(0, p, n);
}
static void *dlmemalign_single(size_t a, size_t n) {
return mspace_memalign(g_heaps[0], a, n);
}
static void *dlmemalign_threaded(size_t a, size_t n) {
return mspace_memalign(get_arena(), a, n);
}
static struct mallinfo dlmallinfo_single(void) {
return mspace_mallinfo(g_heaps[0]);
}
static struct mallinfo dlmallinfo_threaded(void) {
return mspace_mallinfo(get_arena());
}
// parses a non-negative decimal integer from the front of a string
//
// parsing stops at the first character that isn't an ascii digit, so
// stray characters never get folded into the result. values too big
// for an int saturate at the largest int instead of overflowing,
// because signed overflow is undefined behavior.
//
// @param s is a nul-terminated string
// @return parsed value, or 0 if s doesn't begin with a digit
static int dlmalloc_atoi(const char *s) {
  const int max = __INT_MAX__;
  int x = 0;
  for (; *s >= '0' && *s <= '9'; ++s) {
    int digit = *s - '0';
    // true exactly when x * 10 + digit would exceed max
    if (x > (max - digit) / 10)
      return max;
    x = x * 10 + digit;
  }
  return x;
}
// configures the allocator to run everything out of one arena
//
// the dl* entry points are pointed at their *_single variants and
// g_heaps[0] is created. the process traps if the arena can't be
// created.
//
// @param uses_locks is passed to create_mspace() as its second
//     argument
static void use_single_heap(bool uses_locks) {
  g_heapslen = 1;

  // install function pointers
  dlmalloc = dlmalloc_single;
  dlcalloc = dlcalloc_single;
  dlrealloc = dlrealloc_single;
  dlmemalign = dlmemalign_single;
  dlmallinfo = dlmallinfo_single;

  // create the one and only arena
  g_heaps[0] = create_mspace(0, uses_locks);
  if (g_heaps[0] == 0)
    __builtin_trap();
}
// chooses between one arena and many, then installs the allocator
//
// a single lockless arena is used when pthread_create() isn't linked
// into the program. a single locked arena is used when the cpu count
// is unavailable or when fewer than two heaps are wanted. otherwise
// one arena is created per heap and the dl* entry points are pointed
// at their *_threaded variants.
//
// the heap count defaults to half the cpu count and can be overridden
// with the COSMOPOLITAN_HEAP_COUNT environment variable; it is capped
// at the capacity of g_heaps. the process traps if an arena can't be
// created.
static void threaded_dlmalloc(void) {
  int heaps, cpus;
  unsigned divisor;
  const char *var;

  // the weak reference is null when pthread_create() isn't linked
  // NOTE(review): presumably that means no threads can exist, which
  // is why the arena is created without locks; confirm
  if (!_weaken(pthread_create)) {
    use_single_heap(false);
    return;
  }

  // determine how many independent heaps we should install
  // by default we do an approximation of one heap per core
  // this code makes the c++ stl go 164x faster on my ryzen
  g_cpucount = cpus = __get_cpu_count();
  if (cpus < 1) {
    // covers the -1 failure result and keeps g_cpucount usable as a
    // modulus in get_arena() should a zero ever be reported
    heaps = 1;
    g_cpucount = 1;
  } else if ((var = getenv("COSMOPOLITAN_HEAP_COUNT"))) {
    heaps = dlmalloc_atoi(var);
  } else {
    heaps = cpus >> 1;
  }
  if (heaps <= 1) {
    use_single_heap(true);
    return;
  }
  if (heaps > (int)ARRAYLEN(g_heaps))
    heaps = ARRAYLEN(g_heaps);

  // find d such that sched_getcpu() / d is within [0,heaps)
  // turn d into a fast magic that can divide by multiplying
  //
  // when more heaps are requested than there are cpus the quotient
  // cpus / heaps is zero, which can't be used as a divisor, so we
  // fall back to dividing by one
  divisor = cpus / heaps;
  if (divisor < 1)
    divisor = 1;
  magiu = __magicu_get(divisor);

  // get_arena() also reduces its result modulo this count
  // we need this too due to linux's cpu count affinity hack
  g_heapslen = heaps;

  // create the arenas
  for (unsigned i = 0; i < g_heapslen; ++i) {
    g_heaps[i] = create_mspace(0, true);
    if (!g_heaps[i])
      __builtin_trap();
  }

  // install function pointers
  dlmalloc = dlmalloc_threaded;
  dlcalloc = dlcalloc_threaded;
  dlrealloc = dlrealloc_threaded;
  dlmemalign = dlmemalign_threaded;
  dlmallinfo = dlmallinfo_threaded;

  STRACE("created %d dlmalloc arenas for %d cpus", heaps, cpus);
}