/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ Permission to use, copy, modify, and/or distribute this software for         │
│ any purpose with or without fee is hereby granted, provided that the         │
│ above copyright notice and this permission notice appear in all copies.      │
│                                                                              │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
│ PERFORMANCE OF THIS SOFTWARE.                                                │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/dce.h"
#include "libc/intrin/magicu.h"
#include "libc/intrin/strace.h"
#include "libc/intrin/weaken.h"
#include "libc/macros.h"
#include "libc/runtime/runtime.h"
#include "libc/thread/thread.h"
#include "libc/thread/threads.h"
#include "libc/errno.h"
#include "libc/calls/struct/cpuset.h"
#include "third_party/dlmalloc/dlmalloc.h"

#if !FOOTERS || !MSPACES
#error "threaded dlmalloc needs footers and mspaces"
#endif

static struct magicu magiu;
static unsigned g_cpucount;
static unsigned g_heapslen;
static mstate g_heaps[128];

void dlfree(void *p) {
  return mspace_free(0, p);
}

size_t dlmalloc_usable_size(void *mem) {
  return mspace_usable_size(mem);
}

void *dlrealloc_in_place(void *p, size_t n) {
  return mspace_realloc_in_place(0, p, n);
}

int dlmallopt(int param_number, int value) {
  return mspace_mallopt(param_number, value);
}

int dlmalloc_trim(size_t pad) {
  int got_some = 0;
  for (unsigned i = 0; i < g_heapslen; ++i)
    got_some |= mspace_trim(g_heaps[i], pad);
  return got_some;
}

size_t dlbulk_free(void *array[], size_t nelem) {
  for (size_t i = 0; i < nelem; ++i)
    mspace_free(0, array[i]);
  return 0;
}

struct ThreadedMallocVisitor {
  mstate heap;
  void (*handler)(void *start, void *end, size_t used_bytes, void *arg);
  void *arg;
};

static void threaded_malloc_visitor(void *start, void *end, size_t used_bytes,
                                    void *arg) {
  struct ThreadedMallocVisitor *tmv = arg;
  if (start == tmv->heap)
    return;
  tmv->handler(start, end, used_bytes, tmv->arg);
}

void dlmalloc_inspect_all(void handler(void *start, void *end,
                                       size_t used_bytes, void *arg),
                          void *arg) {
  for (unsigned i = 0; i < g_heapslen; ++i) {
    struct ThreadedMallocVisitor tmv = {g_heaps[i], handler, arg};
    mspace_inspect_all(g_heaps[i], threaded_malloc_visitor, &tmv);
  }
}

// we make malloc() scalable basically by
//
//     return g_heaps[sched_getcpu() / 2];
//
// except we cache the syscall result using thread-local storage. on
// some platforms, it's not possible to use sched_getcpu() so we use
// arbitrary assignments to help scalability, but may not be optimal
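//
// as a rough sketch, consider a hypothetical machine (not any measured
// system) where __get_cpu_count() reports 16 cpus and the
// COSMOPOLITAN_HEAP_COUNT variable isn't set: then heaps = 16 >> 1 = 8,
// the magic divisor is 16 / 8 = 2, and get_arena() below maps
//
//     cpu  0, 1  → g_heaps[0]
//     cpu  2, 3  → g_heaps[1]
//     ...
//     cpu 14,15  → g_heaps[7]
//
// i.e. two logical cpus share each arena, and the % g_heapslen keeps
// the index in range if the kernel reports a cpu number beyond the
// count we sampled at startup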
static mstate get_arena(void) {
  static atomic_uint assign;
  static thread_local unsigned i;
  static thread_local unsigned n;
  // recompute the cached cpu → arena assignment every 50th call
  if (n == 50)
    n = 0;
  if (!n) {
    int e = errno;
    i = sched_getcpu();
    if (i == -1) {
      // sched_getcpu() isn't supported here; hand out cpu numbers
      // round-robin so threads still spread across the arenas
      errno = e;
      i = atomic_fetch_add_explicit(&assign, 1, memory_order_relaxed);
      i %= g_cpucount;
    }
    i = __magicu_div(i, magiu) % g_heapslen;
  }
  ++n;
  return g_heaps[i];
}

static void *dlmalloc_single(size_t n) {
  return mspace_malloc(g_heaps[0], n);
}

static void *dlmalloc_threaded(size_t n) {
  return mspace_malloc(get_arena(), n);
}

static void *dlcalloc_single(size_t n, size_t z) {
  return mspace_calloc(g_heaps[0], n, z);
}

static void *dlcalloc_threaded(size_t n, size_t z) {
  return mspace_calloc(get_arena(), n, z);
}

static void *dlrealloc_single(void *p, size_t n) {
  return mspace_realloc(g_heaps[0], p, n);
}

static void *dlrealloc_threaded(void *p, size_t n) {
  if (p)
    return mspace_realloc(0, p, n);
  else
    return mspace_malloc(get_arena(), n);
}

static void *dlmemalign_single(size_t a, size_t n) {
  return mspace_memalign(g_heaps[0], a, n);
}

static void *dlmemalign_threaded(size_t a, size_t n) {
  return mspace_memalign(get_arena(), a, n);
}

static struct mallinfo dlmallinfo_single(void) {
  return mspace_mallinfo(g_heaps[0]);
}

static struct mallinfo dlmallinfo_threaded(void) {
  return mspace_mallinfo(get_arena());
}

static int dlmalloc_atoi(const char *s) {
  int c, x = 0;
  while ((c = *s++)) {
    x *= 10;
    x += c - '0';
  }
  return x;
}

static void use_single_heap(bool uses_locks) {
  g_heapslen = 1;
  dlmalloc = dlmalloc_single;
  dlcalloc = dlcalloc_single;
  dlrealloc = dlrealloc_single;
  dlmemalign = dlmemalign_single;
  dlmallinfo = dlmallinfo_single;
  if (!(g_heaps[0] = create_mspace(0, uses_locks)))
    __builtin_trap();
}

static void threaded_dlmalloc(void) {
  int heaps, cpus;
  const char *var;

  if (!_weaken(pthread_create))
    return use_single_heap(false);

  // determine how many independent heaps we should install
  // by default we do an approximation of one heap per core
  // this code makes the c++ stl go 164x faster on my ryzen
  g_cpucount = cpus = __get_cpu_count();
  if (cpus == -1) {
    heaps = 1;
    g_cpucount = 1;
  } else if ((var = getenv("COSMOPOLITAN_HEAP_COUNT"))) {
    heaps = dlmalloc_atoi(var);
  } else {
    heaps = cpus >> 1;
  }
  if (heaps <= 1)
    return use_single_heap(true);
  if (heaps > ARRAYLEN(g_heaps))
    heaps = ARRAYLEN(g_heaps);

  // find 𝑑 such that sched_getcpu() / 𝑑 is within [0,heaps)
  // turn 𝑑 into a fast magic that can divide by multiplying
  magiu = __magicu_get(cpus / heaps);

  // we need this too due to linux's cpu count affinity hack
  g_heapslen = heaps;

  // create the arenas
  for (size_t i = 0; i < g_heapslen; ++i)
    if (!(g_heaps[i] = create_mspace(0, true)))
      __builtin_trap();

  // install function pointers
  dlmalloc = dlmalloc_threaded;
  dlcalloc = dlcalloc_threaded;
  dlrealloc = dlrealloc_threaded;
  dlmemalign = dlmemalign_threaded;
  dlmallinfo = dlmallinfo_threaded;

  STRACE("created %d dlmalloc arenas for %d cpus", heaps, cpus);
}
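
// as a usage note (a sketch, not part of the allocator itself): the
// arena count can be pinned from the environment of a hypothetical
// program linked against this allocator, e.g.
//
//     COSMOPOLITAN_HEAP_COUNT=4 ./prog
//
// the variable is only consulted when pthread_create is linked and the
// cpu count is known; values of 1 or less fall back to the single
// locked heap, and values above ARRAYLEN(g_heaps), i.e. 128, get
// clamped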