From 07cef612c3c2b255df0670ae3a5bebd3c897cc18 Mon Sep 17 00:00:00 2001 From: Justine Tunney <jtunney@gmail.com> Date: Tue, 28 May 2024 11:13:12 -0700 Subject: [PATCH] Make dlmalloc 2.4x faster for multithreading This change adds a TLS freelist for small dynamic memory allocations. Cosmopolitan's TIB is now 512 bytes in size. Single-threaded malloc() performance isn't impacted by this, until pthread_create() is called. Single-threaded programs may also want to consider using: #include "libc/mem/tinymalloc.inc" Which will shave 30k off the executable size and sometimes go faster. --- libc/errno.h | 2 +- libc/nexgen32e/gc.S | 2 +- libc/proc/vfork.S | 2 +- libc/thread/pthread_exit.c | 18 +++++++ libc/thread/tls.h | 4 +- test/libc/mem/malloc_test.c | 1 - test/libc/mem/thread_test.cc | 79 +++++++++++++++++++++++++++++++ third_party/dlmalloc/README.cosmo | 1 + third_party/dlmalloc/dlmalloc.c | 47 +++++++++++++++++- 9 files changed, 150 insertions(+), 6 deletions(-) create mode 100644 test/libc/mem/thread_test.cc diff --git a/libc/errno.h b/libc/errno.h index cc063599d..8a3a04f30 100644 --- a/libc/errno.h +++ b/libc/errno.h @@ -29,7 +29,7 @@ COSMOPOLITAN_C_START_ #define errno \ (*__extension__({ \ errno_t *__ep; \ - __asm__("sub\t%0,x28,#192-0x3c" : "=r"(__ep)); \ + __asm__("sub\t%0,x28,#512-0x3c" : "=r"(__ep)); \ __ep; \ })) #else diff --git a/libc/nexgen32e/gc.S b/libc/nexgen32e/gc.S index 302dcd5a2..6b60ae240 100644 --- a/libc/nexgen32e/gc.S +++ b/libc/nexgen32e/gc.S @@ -66,7 +66,7 @@ __gc: .ftrace2 // if this code fails // check if CosmoTib's size changed - sub x8,x28,#192 // __get_tls() + sub x8,x28,#512 // __get_tls() ldr x9,[x8,0x18] // tib::garbages ldr x10,[x9] // g->i ldr x8,[x9,8] // g->p diff --git a/libc/proc/vfork.S b/libc/proc/vfork.S index d43faf4cf..482d3b23a 100644 --- a/libc/proc/vfork.S +++ b/libc/proc/vfork.S @@ -121,7 +121,7 @@ vfork: // } else { // __get_tls()->tib_flags &= ~TIB_FLAG_VFORKED; // } - sub x1,x28,#192 // sizeof(CosmoTib) + sub 
x1,x28,#512 // sizeof(CosmoTib) ldr x2,[x1,64] cbnz x0,2f orr x2,x2,#TIB_FLAG_VFORKED diff --git a/libc/thread/pthread_exit.c b/libc/thread/pthread_exit.c index 6f6c9ad1a..ef40846d1 100644 --- a/libc/thread/pthread_exit.c +++ b/libc/thread/pthread_exit.c @@ -29,6 +29,7 @@ #include "libc/mem/mem.h" #include "libc/runtime/internal.h" #include "libc/runtime/runtime.h" +#include "libc/str/str.h" #include "libc/thread/posixthread.internal.h" #include "libc/thread/thread.h" #include "libc/thread/tls.h" @@ -130,6 +131,23 @@ wontreturn void pthread_exit(void *rc) { } } +#ifndef MODE_DBG + // free tls freelist + // + // 1. set lengths to -1 so free() thinks it's full + // 2. free globally by giving mallocs back to free + // + short freelen[32]; + static_assert(sizeof(freelen) == sizeof(tib->tib_freelen), ""); + memcpy(freelen, tib->tib_freelen, sizeof(freelen)); + memset(tib->tib_freelen, -1, sizeof(freelen)); + for (int i = 0; i < 32; ++i) { + if (freelen[i] > 0) { + free(tib->tib_freemem[i]); + } + } +#endif + // transition the thread to a terminated state status = atomic_load_explicit(&pt->pt_status, memory_order_acquire); do { diff --git a/libc/thread/tls.h b/libc/thread/tls.h index 87c0b0bc5..8b233a4c5 100644 --- a/libc/thread/tls.h +++ b/libc/thread/tls.h @@ -15,6 +15,7 @@ struct CosmoFtrace { /* 16 */ int64_t ft_lastaddr; /* 8 */ }; +/* cosmopolitan thread information block (512 bytes) */ /* NOTE: update aarch64 libc/errno.h if sizeof changes */ /* NOTE: update aarch64 libc/proc/vfork.S if sizeof changes */ /* NOTE: update aarch64 libc/nexgen32e/gc.S if sizeof changes */ @@ -38,7 +39,8 @@ struct CosmoTib { uint32_t tib_sigstack_flags; void **tib_keys; void *tib_nsync; - void *tib_todo[7]; + unsigned short tib_freelen[32]; + void *tib_freemem[32]; } __attribute__((__aligned__(64))); extern int __threaded; diff --git a/test/libc/mem/malloc_test.c b/test/libc/mem/malloc_test.c index 1f94765ab..ab5d1e7dc 100644 --- a/test/libc/mem/malloc_test.c +++ 
b/test/libc/mem/malloc_test.c @@ -26,7 +26,6 @@ #include "libc/intrin/safemacros.internal.h" #include "libc/macros.internal.h" #include "libc/mem/gc.h" -#include "libc/mem/gc.h" #include "libc/mem/mem.h" #include "libc/runtime/internal.h" #include "libc/runtime/memtrack.internal.h" diff --git a/test/libc/mem/thread_test.cc b/test/libc/mem/thread_test.cc new file mode 100644 index 000000000..9cc845985 --- /dev/null +++ b/test/libc/mem/thread_test.cc @@ -0,0 +1,79 @@ +/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│ +│ vi: set et ft=c++ ts=2 sts=2 sw=2 fenc=utf-8 :vi │ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2024 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. 
│ +╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/thread/thread.h"
+#include "libc/assert.h"
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/timespec.h"
+#include "libc/fmt/itoa.h"
+#include "libc/macros.internal.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+
+// BEFORE ADDING TLS FREELIST
+// malloc multithreading torture test
+// with 192 threads and 10000 iterations
+// consumed 0.084721 wall and 0.141747 cpu seconds
+//
+// AFTER ADDING TLS FREELIST
+// malloc multithreading torture test
+// with 192 threads and 10000 iterations
+// consumed 0.035193 wall and 4.34012 cpu seconds
+
+#define ITERATIONS 10000
+
+// Thread body: runs ITERATIONS steps, each either freeing a randomly picked
+// slot of thing[] or filling it with a new 12..43 byte array.
+void *Worker(void *arg) {
+  char *thing[32] = {};
+  for (int i = 0; i < ITERATIONS; ++i) {
+    int r = rand();
+    int j = r % ARRAYLEN(thing);
+    if (thing[j]) {
+      delete[] thing[j];
+      thing[j] = 0;
+    } else {
+      thing[j] = new char[12 + ((r >> 8) % 32)];
+    }
+  }
+  for (size_t k = 0; k < ARRAYLEN(thing); ++k)
+    delete[] thing[k];  // frees leftovers; deleting a null slot is a no-op
+  return 0;
+}
+
+// Runs __get_cpu_count() Worker threads to completion and prints timings.
+int main(int argc, char *argv[]) {
+  int n = __get_cpu_count();
+  pthread_t *t = new pthread_t[n];
+  fprintf(stderr,
+          "\nmalloc multithreading torture test\n"
+          "with %d threads and %d iterations\n",
+          n, ITERATIONS);
+  struct timespec t1 = timespec_real();
+  for (int i = 0; i < n; ++i)
+    unassert(!pthread_create(t + i, 0, Worker, 0));
+  for (int i = 0; i < n; ++i)
+    unassert(!pthread_join(t[i], 0));
+  struct timespec t2 = timespec_real();
+  fprintf(stderr, "consumed %g wall and %g cpu seconds\n",
+          timespec_tomicros(timespec_sub(t2, t1)) * 1e-6,
+          (double)clock() / CLOCKS_PER_SEC);
+  delete[] t;
+}
diff --git a/third_party/dlmalloc/README.cosmo b/third_party/dlmalloc/README.cosmo index 35e1de921..6948927a6 100644 --- a/third_party/dlmalloc/README.cosmo +++ b/third_party/dlmalloc/README.cosmo @@ -9,6 +9,7 @@ LICENSE LOCAL CHANGES + - Use thread-local freelist from cosmo tib - Use faster two power roundup for memalign() - 
Poison maps to integrate with Address Sanitizer - Introduce __oom_hook() by using _mapanon() vs. mmap() diff --git a/third_party/dlmalloc/dlmalloc.c b/third_party/dlmalloc/dlmalloc.c index d13bff0d9..fb7eac2a1 100644 --- a/third_party/dlmalloc/dlmalloc.c +++ b/third_party/dlmalloc/dlmalloc.c @@ -23,6 +23,7 @@ #include "libc/thread/thread.h" #include "libc/thread/tls.h" #include "third_party/dlmalloc/vespene.internal.h" +#include "libc/thread/tls.h" #include "third_party/nsync/mu.h" #define FOOTERS 0 @@ -584,7 +585,30 @@ static void* tmalloc_small(mstate m, size_t nb) { #if !ONLY_MSPACES +#define FREEBIE_COUNT 32 +#define FREEBIE_MAXSIZE 2048 + void* dlmalloc(size_t bytes) { + +#if FREEBIE_COUNT && !defined(MODE_DBG) + /* Allocate from thread-local freelist. */ + if (__threaded && bytes && bytes <= FREEBIE_MAXSIZE) { + unsigned need = bytes; + unsigned best_index = FREEBIE_COUNT; + unsigned best_delta = FREEBIE_MAXSIZE + 1; + struct CosmoTib *tib = __get_tls(); + for (int i = 0; i < FREEBIE_COUNT; ++i) { + unsigned d = tib->tib_freelen[i] - need; + best_index = d < best_delta ? i : best_index; + best_delta = d < best_delta ? d : best_delta; + } + if (best_index < FREEBIE_COUNT) { + tib->tib_freelen[best_index] = 0; + return tib->tib_freemem[best_index]; + } + } +#endif + /* Basic algorithm: If a small request (< 256 bytes minus per-chunk overhead): @@ -733,7 +757,6 @@ void dlfree(void* mem) { free chunks, if they exist, and then place in a bin. Intermixed with special cases for top, dv, mmapped chunks, and usage errors. */ - if (mem != 0) { mchunkptr p = mem2chunk(mem); #if FOOTERS @@ -745,6 +768,28 @@ void dlfree(void* mem) { #else /* FOOTERS */ #define fm gm #endif /* FOOTERS */ + +#if FREEBIE_COUNT && !defined(MODE_DBG) + /* Free small allocations locally. 
*/ + if (__threaded) { + struct CosmoTib *tib = __get_tls(); + for (int i = 0; i < FREEBIE_COUNT; ++i) { + if (!tib->tib_freelen[i]) { + if (is_inuse(p)) { + size_t len = chunksize(p) - overhead_for(p); + if (len && len < FREEBIE_MAXSIZE) { + tib->tib_freelen[i] = len; + tib->tib_freemem[i] = mem; + return; + } + } + break; + } + } + } +#endif + + /* Otherwise free memory globally. */ if (!PREACTION(fm)) { check_inuse_chunk(fm, p); if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) {