Make dlmalloc 2.4x faster for multithreading

This change adds a TLS freelist for small dynamic memory allocations.
Cosmopolitan's TIB is now 512 bytes in size. Single-threaded malloc()
performance isn't impacted by this until pthread_create() is called.
Single-threaded programs may also want to consider using:

    #include "libc/mem/tinymalloc.inc"

Doing so will shave 30k off the executable size and sometimes go faster.
This commit is contained in:
Justine Tunney 2024-05-28 11:13:12 -07:00
parent deaef81463
commit 07cef612c3
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
9 changed files with 150 additions and 6 deletions

View file

@ -29,7 +29,7 @@ COSMOPOLITAN_C_START_
#define errno \
(*__extension__({ \
errno_t *__ep; \
__asm__("sub\t%0,x28,#192-0x3c" : "=r"(__ep)); \
__asm__("sub\t%0,x28,#512-0x3c" : "=r"(__ep)); \
__ep; \
}))
#else

View file

@ -66,7 +66,7 @@ __gc: .ftrace2
// if this code fails
// check if CosmoTib's size changed
sub x8,x28,#192 // __get_tls()
sub x8,x28,#512 // __get_tls()
ldr x9,[x8,0x18] // tib::garbages
ldr x10,[x9] // g->i
ldr x8,[x9,8] // g->p

View file

@ -121,7 +121,7 @@ vfork:
// } else {
// __get_tls()->tib_flags &= ~TIB_FLAG_VFORKED;
// }
sub x1,x28,#192 // sizeof(CosmoTib)
sub x1,x28,#512 // sizeof(CosmoTib)
ldr x2,[x1,64]
cbnz x0,2f
orr x2,x2,#TIB_FLAG_VFORKED

View file

@ -29,6 +29,7 @@
#include "libc/mem/mem.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/runtime.h"
#include "libc/str/str.h"
#include "libc/thread/posixthread.internal.h"
#include "libc/thread/thread.h"
#include "libc/thread/tls.h"
@ -130,6 +131,23 @@ wontreturn void pthread_exit(void *rc) {
}
}
#ifndef MODE_DBG
// free tls freelist
//
// 1. set lengths to -1 so free() thinks it's full
// 2. free globally by giving mallocs back to free
//
short freelen[32];
static_assert(sizeof(freelen) == sizeof(tib->tib_freelen), "");
memcpy(freelen, tib->tib_freelen, sizeof(freelen));
memset(tib->tib_freelen, -1, sizeof(freelen));
for (int i = 0; i < 32; ++i) {
if (freelen[i] > 0) {
free(tib->tib_freemem[i]);
}
}
#endif
// transition the thread to a terminated state
status = atomic_load_explicit(&pt->pt_status, memory_order_acquire);
do {

View file

@ -15,6 +15,7 @@ struct CosmoFtrace { /* 16 */
int64_t ft_lastaddr; /* 8 */
};
/* cosmopolitan thread information block (512 bytes) */
/* NOTE: update aarch64 libc/errno.h if sizeof changes */
/* NOTE: update aarch64 libc/proc/vfork.S if sizeof changes */
/* NOTE: update aarch64 libc/nexgen32e/gc.S if sizeof changes */
@ -38,7 +39,8 @@ struct CosmoTib {
uint32_t tib_sigstack_flags;
void **tib_keys;
void *tib_nsync;
void *tib_todo[7];
unsigned short tib_freelen[32];
void *tib_freemem[32];
} __attribute__((__aligned__(64)));
extern int __threaded;