From 07cef612c3c2b255df0670ae3a5bebd3c897cc18 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jtunney@gmail.com>
Date: Tue, 28 May 2024 11:13:12 -0700
Subject: [PATCH] Make dlmalloc 2.4x faster for multithreading

This change adds a TLS freelist for small dynamic memory allocations.
Cosmopolitan's TIB is now 512 bytes in size. Single-threaded malloc()
performance isn't impacted by this, until pthread_create() is called.
Single-threaded programs may also want to consider using:

    #include "libc/mem/tinymalloc.inc"

Doing so shaves 30k off the executable size and is sometimes faster.
---
 libc/errno.h                      |  2 +-
 libc/nexgen32e/gc.S               |  2 +-
 libc/proc/vfork.S                 |  2 +-
 libc/thread/pthread_exit.c        | 18 +++++++
 libc/thread/tls.h                 |  4 +-
 test/libc/mem/malloc_test.c       |  1 -
 test/libc/mem/thread_test.cc      | 79 +++++++++++++++++++++++++++++++
 third_party/dlmalloc/README.cosmo |  1 +
 third_party/dlmalloc/dlmalloc.c   | 47 +++++++++++++++++-
 9 files changed, 150 insertions(+), 6 deletions(-)
 create mode 100644 test/libc/mem/thread_test.cc

diff --git a/libc/errno.h b/libc/errno.h
index cc063599d..8a3a04f30 100644
--- a/libc/errno.h
+++ b/libc/errno.h
@@ -29,7 +29,7 @@ COSMOPOLITAN_C_START_
 #define errno                                      \
   (*__extension__({                                \
     errno_t *__ep;                                 \
-    __asm__("sub\t%0,x28,#192-0x3c" : "=r"(__ep)); \
+    __asm__("sub\t%0,x28,#512-0x3c" : "=r"(__ep)); \
     __ep;                                          \
   }))
 #else
diff --git a/libc/nexgen32e/gc.S b/libc/nexgen32e/gc.S
index 302dcd5a2..6b60ae240 100644
--- a/libc/nexgen32e/gc.S
+++ b/libc/nexgen32e/gc.S
@@ -66,7 +66,7 @@ __gc:	.ftrace2
 
 //	if this code fails
 //	check if CosmoTib's size changed
-	sub	x8,x28,#192			// __get_tls()
+	sub	x8,x28,#512			// __get_tls()
 	ldr	x9,[x8,0x18]			// tib::garbages
 	ldr	x10,[x9]			// g->i
 	ldr	x8,[x9,8]			// g->p
diff --git a/libc/proc/vfork.S b/libc/proc/vfork.S
index d43faf4cf..482d3b23a 100644
--- a/libc/proc/vfork.S
+++ b/libc/proc/vfork.S
@@ -121,7 +121,7 @@ vfork:
 //	} else {
 //	  __get_tls()->tib_flags &= ~TIB_FLAG_VFORKED;
 //	}
-	sub	x1,x28,#192		// sizeof(CosmoTib)
+	sub	x1,x28,#512		// sizeof(CosmoTib)
 	ldr	x2,[x1,64]
 	cbnz	x0,2f
 	orr	x2,x2,#TIB_FLAG_VFORKED
diff --git a/libc/thread/pthread_exit.c b/libc/thread/pthread_exit.c
index 6f6c9ad1a..ef40846d1 100644
--- a/libc/thread/pthread_exit.c
+++ b/libc/thread/pthread_exit.c
@@ -29,6 +29,7 @@
 #include "libc/mem/mem.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
+#include "libc/str/str.h"
 #include "libc/thread/posixthread.internal.h"
 #include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
@@ -130,6 +131,23 @@ wontreturn void pthread_exit(void *rc) {
     }
   }
 
+#ifndef MODE_DBG
+  // free tls freelist
+  //
+  //   1. set lengths to -1 so free() thinks it's full
+  //   2. free globally by giving mallocs back to free
+  //
+  short freelen[32];
+  static_assert(sizeof(freelen) == sizeof(tib->tib_freelen), "");
+  memcpy(freelen, tib->tib_freelen, sizeof(freelen));
+  memset(tib->tib_freelen, -1, sizeof(freelen));
+  for (int i = 0; i < 32; ++i) {
+    if (freelen[i] > 0) {
+      free(tib->tib_freemem[i]);
+    }
+  }
+#endif
+
   // transition the thread to a terminated state
   status = atomic_load_explicit(&pt->pt_status, memory_order_acquire);
   do {
diff --git a/libc/thread/tls.h b/libc/thread/tls.h
index 87c0b0bc5..8b233a4c5 100644
--- a/libc/thread/tls.h
+++ b/libc/thread/tls.h
@@ -15,6 +15,7 @@ struct CosmoFtrace {   /* 16 */
   int64_t ft_lastaddr; /*  8 */
 };
 
+/* cosmopolitan thread information block (512 bytes) */
 /* NOTE: update aarch64 libc/errno.h if sizeof changes */
 /* NOTE: update aarch64 libc/proc/vfork.S if sizeof changes */
 /* NOTE: update aarch64 libc/nexgen32e/gc.S if sizeof changes */
@@ -38,7 +39,8 @@ struct CosmoTib {
   uint32_t tib_sigstack_flags;
   void **tib_keys;
   void *tib_nsync;
-  void *tib_todo[7];
+  unsigned short tib_freelen[32];
+  void *tib_freemem[32];
 } __attribute__((__aligned__(64)));
 
 extern int __threaded;
diff --git a/test/libc/mem/malloc_test.c b/test/libc/mem/malloc_test.c
index 1f94765ab..ab5d1e7dc 100644
--- a/test/libc/mem/malloc_test.c
+++ b/test/libc/mem/malloc_test.c
@@ -26,7 +26,6 @@
 #include "libc/intrin/safemacros.internal.h"
 #include "libc/macros.internal.h"
 #include "libc/mem/gc.h"
-#include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/memtrack.internal.h"
diff --git a/test/libc/mem/thread_test.cc b/test/libc/mem/thread_test.cc
new file mode 100644
index 000000000..9cc845985
--- /dev/null
+++ b/test/libc/mem/thread_test.cc
@@ -0,0 +1,79 @@
+/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8-*-│
+│ vi: set et ft=c++ ts=2 sts=2 sw=2 fenc=utf-8                             :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/thread/thread.h"
+#include "libc/assert.h"
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/timespec.h"
+#include "libc/fmt/itoa.h"
+#include "libc/macros.internal.h"
+#include "libc/runtime/runtime.h"
+#include "libc/stdio/rand.h"
+#include "libc/stdio/stdio.h"
+#include "libc/str/str.h"
+
+//
+// BEFORE ADDING TLS FREELIST
+//
+//     malloc multithreading torture test
+//     with 192 threads and 10000 iterations
+//     consumed 0.084721 wall and 0.141747 cpu seconds
+//
+// AFTER ADDING TLS FREELIST
+//
+//     malloc multithreading torture test
+//     with 192 threads and 10000 iterations
+//     consumed 0.035193 wall and 4.34012 cpu seconds
+//
+
+#define ITERATIONS 10000
+
+void *Worker(void *arg) {
+  // Toggle random slots between allocated and freed for ITERATIONS rounds.
+  char *thing[32] = {};
+  for (int i = 0; i < ITERATIONS; ++i) {
+    int r = rand();
+    int j = r % ARRAYLEN(thing);
+    char *old = thing[j];
+    thing[j] = old ? nullptr : new char[12 + ((r >> 8) % 32)];
+    delete[] old;  // no-op when the slot was empty
+  }
+  for (char *p : thing)
+    delete[] p;  // release whatever is still held so nothing leaks
+  return 0;
+}
+
+int main(int argc, char *argv[]) {
+  // Spawn __get_cpu_count() Workers, join them, then report wall/CPU time.
+  const int nthreads = __get_cpu_count();
+  pthread_t *threads = new pthread_t[nthreads];
+  fprintf(stderr,
+          "\n"
+          "malloc multithreading torture test\n"
+          "with %d threads and %d iterations\n",
+          nthreads, ITERATIONS);
+  struct timespec started = timespec_real();
+  for (int k = 0; k < nthreads; ++k)
+    unassert(!pthread_create(&threads[k], 0, Worker, 0));
+  for (int k = 0; k < nthreads; ++k)
+    unassert(!pthread_join(threads[k], 0));
+  struct timespec elapsed = timespec_sub(timespec_real(), started);
+  fprintf(stderr, "consumed %g wall and %g cpu seconds\n",
+          timespec_tomicros(elapsed) * 1e-6, (double)clock() / CLOCKS_PER_SEC);
+  delete[] threads;
+}
diff --git a/third_party/dlmalloc/README.cosmo b/third_party/dlmalloc/README.cosmo
index 35e1de921..6948927a6 100644
--- a/third_party/dlmalloc/README.cosmo
+++ b/third_party/dlmalloc/README.cosmo
@@ -9,6 +9,7 @@ LICENSE
 
 LOCAL CHANGES
 
+  - Use thread-local freelist from cosmo tib
   - Use faster two power roundup for memalign()
   - Poison maps to integrate with Address Sanitizer
   - Introduce __oom_hook() by using _mapanon() vs. mmap()
diff --git a/third_party/dlmalloc/dlmalloc.c b/third_party/dlmalloc/dlmalloc.c
index d13bff0d9..fb7eac2a1 100644
--- a/third_party/dlmalloc/dlmalloc.c
+++ b/third_party/dlmalloc/dlmalloc.c
@@ -23,6 +23,7 @@
 #include "libc/thread/thread.h"
 #include "libc/thread/tls.h"
 #include "third_party/dlmalloc/vespene.internal.h"
+#include "libc/thread/tls.h"
 #include "third_party/nsync/mu.h"
 
 #define FOOTERS 0
@@ -584,7 +585,30 @@ static void* tmalloc_small(mstate m, size_t nb) {
 
 #if !ONLY_MSPACES
 
+#define FREEBIE_COUNT 32
+#define FREEBIE_MAXSIZE 2048
+
 void* dlmalloc(size_t bytes) {
+
+#if FREEBIE_COUNT && !defined(MODE_DBG)
+  /* Allocate from thread-local freelist. */
+  if (__threaded && bytes && bytes <= FREEBIE_MAXSIZE) {
+    unsigned need = bytes;
+    unsigned best_index = FREEBIE_COUNT;
+    unsigned best_delta = FREEBIE_MAXSIZE + 1;
+    struct CosmoTib *tib = __get_tls();
+    for (int i = 0; i < FREEBIE_COUNT; ++i) {
+      unsigned d = tib->tib_freelen[i] - need;
+      best_index = d < best_delta ? i : best_index;
+      best_delta = d < best_delta ? d : best_delta;
+    }
+    if (best_index < FREEBIE_COUNT) {
+      tib->tib_freelen[best_index] = 0;
+      return tib->tib_freemem[best_index];
+    }
+  }
+#endif
+
   /*
      Basic algorithm:
      If a small request (< 256 bytes minus per-chunk overhead):
@@ -733,7 +757,6 @@ void dlfree(void* mem) {
      free chunks, if they exist, and then place in a bin.  Intermixed
      with special cases for top, dv, mmapped chunks, and usage errors.
   */
-
   if (mem != 0) {
     mchunkptr p  = mem2chunk(mem);
 #if FOOTERS
@@ -745,6 +768,28 @@ void dlfree(void* mem) {
 #else /* FOOTERS */
 #define fm gm
 #endif /* FOOTERS */
+
+#if FREEBIE_COUNT && !defined(MODE_DBG)
+    /* Free small allocations locally. */
+    if (__threaded) {
+      struct CosmoTib *tib = __get_tls();
+      for (int i = 0; i < FREEBIE_COUNT; ++i) {
+        if (!tib->tib_freelen[i]) {
+          if (is_inuse(p)) {
+            size_t len = chunksize(p) - overhead_for(p);
+            if (len && len < FREEBIE_MAXSIZE) {
+              tib->tib_freelen[i] = len;
+              tib->tib_freemem[i] = mem;
+              return;
+            }
+          }
+          break;
+        }
+      }
+    }
+#endif
+
+    /* Otherwise free memory globally. */
     if (!PREACTION(fm)) {
       check_inuse_chunk(fm, p);
       if (RTCHECK(ok_address(fm, p) && ok_inuse(p))) {