Make malloc scalable on all platforms

It turns out sched_getcpu() didn't work on many platforms. So the system
call now has tests and is well documented. We now employ new workarounds
on platforms where it isn't supported in our malloc() implementation. It
was previously the case that malloc() was only scalable on Linux/Windows
for x86-64. Now the other platforms are scalable too.
This commit is contained in:
Justine Tunney 2024-08-15 21:32:30 -07:00
parent 3fd275f59f
commit 0a79c6961f
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
9 changed files with 459 additions and 99 deletions

View file

@@ -21,12 +21,9 @@
#include "libc/intrin/strace.h"
#include "libc/intrin/weaken.h"
#include "libc/macros.h"
#include "libc/nexgen32e/rdtscp.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/runtime/runtime.h"
#include "libc/thread/thread.h"
#include "libc/runtime/runtime.h"
#include "libc/intrin/weaken.h"
#include "libc/thread/threads.h"
#include "third_party/dlmalloc/dlmalloc.h"
#if !FOOTERS || !MSPACES
@@ -34,6 +31,7 @@
#endif
static struct magicu magiu;
static unsigned g_cpucount;
static unsigned g_heapslen;
static mstate g_heaps[128];
@@ -90,18 +88,29 @@ void dlmalloc_inspect_all(void handler(void *start, void *end,
}
}
forceinline mstate get_arena(void) {
unsigned cpu;
#ifdef __x86_64__
unsigned tsc_aux;
rdtscp(&tsc_aux);
cpu = TSC_AUX_CORE(tsc_aux);
#else
long tpidr_el0;
asm("mrs\t%0,tpidr_el0" : "=r"(tpidr_el0));
cpu = tpidr_el0 & 255;
#endif
return g_heaps[__magicu_div(cpu, magiu) % g_heapslen];
// we make malloc() scalable basically by
//
//     return g_heaps[sched_getcpu() / 2];
//
// except we cache the syscall result using thread-local storage. on
// some platforms, it's not possible to use sched_getcpu() so we use
// arbitrary assignments to help scalability, but may not be optimal
static mstate get_arena(void) {
  static atomic_uint assign;         // round-robin fallback counter
  static thread_local unsigned i;    // cached heap index for this thread
  static thread_local unsigned n;    // calls since index was last refreshed
  if (n == 50)
    n = 0;
  if (!n) {
    // use a signed temporary so the failure check isn't an unsigned
    // vs. -1 comparison (sched_getcpu() returns int; storing it in an
    // unsigned first relies on implicit conversion and -Wsign-compare
    // fires on the comparison)
    int cpu = sched_getcpu();
    if (cpu == -1) {
      // kernel can't tell us which cpu we're on, so spread threads
      // across heaps in arrival order instead
      cpu = atomic_fetch_add_explicit(&assign, 1, memory_order_relaxed) %
            g_cpucount;
    }
    i = __magicu_div(cpu, magiu) % g_heapslen;
  }
  ++n;
  return g_heaps[i];
}
static void *dlmalloc_single(size_t n) {
@@ -174,19 +183,18 @@ static void threaded_dlmalloc(void) {
if (!_weaken(pthread_create))
return use_single_heap(false);
if (!IsAarch64() && !X86_HAVE(RDTSCP))
return use_single_heap(true);
// determine how many independent heaps we should install
// by default we do an approximation of one heap per core
// this code makes the c++ stl go 164x faster on my ryzen
cpus = __get_cpu_count();
if (cpus == -1)
g_cpucount = cpus = __get_cpu_count();
if (cpus == -1) {
heaps = 1;
else if ((var = getenv("COSMOPOLITAN_HEAP_COUNT")))
g_cpucount = 1;
} else if ((var = getenv("COSMOPOLITAN_HEAP_COUNT"))) {
heaps = dlmalloc_atoi(var);
else
} else {
heaps = cpus >> 1;
}
if (heaps <= 1)
return use_single_heap(true);
if (heaps > ARRAYLEN(g_heaps))