From c8e10eef30a421a22b24a5f18668e0ab1608aa89 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Mon, 23 Dec 2024 20:14:01 -0800 Subject: [PATCH] Make bulk_free() go faster --- .gitignore | 1 + libc/cosmo.h | 4 ++-- libc/intrin/stack.c | 15 ++++++++------- libc/thread/mapstack.c | 4 ++-- libc/thread/posixthread.internal.h | 4 +--- libc/thread/pthread_atfork.c | 2 ++ libc/thread/pthread_create.c | 20 +++++++++++--------- libc/thread/pthread_exit.c | 2 +- libc/thread/pthread_timedjoin_np.c | 2 +- libc/thread/thread.h | 4 ++-- test/libc/mem/malloc_test.c | 3 +-- third_party/dlmalloc/dlmalloc.c | 10 +++++----- third_party/dlmalloc/threaded.inc | 19 +++++++++++++++++-- 13 files changed, 54 insertions(+), 36 deletions(-) diff --git a/.gitignore b/.gitignore index 4c767cd51..0c6b21f03 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ __pycache__ /tool/emacs/*.elc /perf.data /perf.data.old +/qemu*core diff --git a/libc/cosmo.h b/libc/cosmo.h index e2691587a..d53c3045f 100644 --- a/libc/cosmo.h +++ b/libc/cosmo.h @@ -25,8 +25,8 @@ int cosmo_futex_wake(_COSMO_ATOMIC(int) *, int, char); int cosmo_futex_wait(_COSMO_ATOMIC(int) *, int, char, int, const struct timespec *); -errno_t cosmo_stack_alloc(size_t *, size_t *, void **) libcesque; -errno_t cosmo_stack_free(void *, size_t, size_t) libcesque; +errno_t cosmo_stack_alloc(unsigned *, unsigned *, void **) libcesque; +errno_t cosmo_stack_free(void *, unsigned, unsigned) libcesque; void cosmo_stack_clear(void) libcesque; void cosmo_stack_setmaxstacks(int) libcesque; int cosmo_stack_getmaxstacks(void) libcesque; diff --git a/libc/intrin/stack.c b/libc/intrin/stack.c index e153b6ce8..c77e9a8d0 100644 --- a/libc/intrin/stack.c +++ b/libc/intrin/stack.c @@ -42,8 +42,8 @@ struct CosmoStack { struct Dll elem; void *stackaddr; - size_t stacksize; - size_t guardsize; + unsigned stacksize; + unsigned guardsize; }; struct CosmoStacks { @@ -215,13 +215,13 @@ void cosmo_stack_setmaxstacks(int maxstacks) { * This function returns 0 on success, or an errno on error. See the * documentation of mmap() for a list possible errors that may occur. */ -errno_t cosmo_stack_alloc(size_t *inout_stacksize, // - size_t *inout_guardsize, // +errno_t cosmo_stack_alloc(unsigned *inout_stacksize, // + unsigned *inout_guardsize, // void **out_addr) { // validate arguments - size_t stacksize = *inout_stacksize; - size_t guardsize = *inout_guardsize; + unsigned stacksize = *inout_stacksize; + unsigned guardsize = *inout_guardsize; stacksize = (stacksize + __gransize - 1) & -__gransize; guardsize = (guardsize + __pagesize - 1) & -__pagesize; if (guardsize + __pagesize > stacksize) @@ -283,7 +283,8 @@ static void cosmo_stack_setup(void) { * variable is never clobbered. You can only dependably count on this to * return an error on failure when you say `cosmo_stack_setmaxstacks(0)` */ -errno_t cosmo_stack_free(void *stackaddr, size_t stacksize, size_t guardsize) { +errno_t cosmo_stack_free(void *stackaddr, unsigned stacksize, + unsigned guardsize) { stacksize = (stacksize + __gransize - 1) & -__gransize; guardsize = (guardsize + __pagesize - 1) & -__pagesize; if (guardsize + __pagesize > stacksize) diff --git a/libc/thread/mapstack.c b/libc/thread/mapstack.c index 470ab58a6..28a3fd56e 100644 --- a/libc/thread/mapstack.c +++ b/libc/thread/mapstack.c @@ -35,8 +35,8 @@ */ void *NewCosmoStack(void) { void *stackaddr; - size_t stacksize = GetStackSize(); - size_t guardsize = GetGuardSize(); + unsigned stacksize = GetStackSize(); + unsigned guardsize = GetGuardSize(); errno_t err = cosmo_stack_alloc(&stacksize, &guardsize, &stackaddr); if (!err) return stackaddr; diff --git a/libc/thread/posixthread.internal.h b/libc/thread/posixthread.internal.h index 8fa216805..fe94dc066 100644 --- a/libc/thread/posixthread.internal.h +++ b/libc/thread/posixthread.internal.h @@ -78,8 +78,7 @@ struct PosixThread { atomic_int ptid; // transitions 0 → tid atomic_int pt_refs; // prevents decimation void *(*pt_start)(void *); // creation callback - void *pt_arg; // start's parameter - void *pt_rc; // start's return value + void *pt_val; // start param / return val char *pt_tls; // bottom of tls allocation struct CosmoTib *tib; // middle of tls allocation struct Dll list; // list of threads @@ -105,7 +104,6 @@ int _pthread_tid(struct PosixThread *) libcesque; intptr_t _pthread_syshand(struct PosixThread *) libcesque; long _pthread_cancel_ack(void) libcesque; void _pthread_decimate(void) libcesque; -void _pthread_free(struct PosixThread *) libcesque; void _pthread_lock(void) libcesque; void _pthread_onfork_child(void) libcesque; void _pthread_onfork_parent(void) libcesque; diff --git a/libc/thread/pthread_atfork.c b/libc/thread/pthread_atfork.c index 5ef7a92c1..c7e32ed2c 100644 --- a/libc/thread/pthread_atfork.c +++ b/libc/thread/pthread_atfork.c @@ -63,11 +63,13 @@ static void _pthread_onfork(int i, const char *op) { } void _pthread_onfork_prepare(void) { + pthread_mutex_lock(&_atforks.lock); _pthread_onfork(0, "prepare"); } void _pthread_onfork_parent(void) { _pthread_onfork(1, "parent"); + pthread_mutex_unlock(&_atforks.lock); } void _pthread_onfork_child(void) { diff --git a/libc/thread/pthread_create.c b/libc/thread/pthread_create.c index ba5771a9e..351a18c8b 100644 --- a/libc/thread/pthread_create.c +++ b/libc/thread/pthread_create.c @@ -67,7 +67,7 @@ __static_yoink("_pthread_onfork_prepare"); __static_yoink("_pthread_onfork_parent"); __static_yoink("_pthread_onfork_child"); -void _pthread_free(struct PosixThread *pt) { +static void _pthread_free(struct PosixThread *pt) { // thread must be removed from _pthread_list before calling unassert(dll_is_alone(&pt->list) && &pt->list != _pthread_list); @@ -93,10 +93,13 @@ void _pthread_free(struct PosixThread *pt) { } // free heap memory associated with thread - if (pt->pt_flags & PT_OWNSIGALTSTACK) - free(pt->pt_attr.__sigaltstackaddr); - free(pt->pt_tls); - free(pt); + bulk_free( + (void *[]){ + pt->pt_flags & PT_OWNSIGALTSTACK ? pt->pt_attr.__sigaltstackaddr : 0, + pt->pt_tls, + pt, + }, + 3); } void _pthread_decimate(void) { @@ -137,7 +140,6 @@ void _pthread_decimate(void) { } static int PosixThread(void *arg, int tid) { - void *rc; struct PosixThread *pt = arg; // setup scheduling @@ -167,11 +169,11 @@ static int PosixThread(void *arg, int tid) { } else { sys_sigprocmask(SIG_SETMASK, &pt->pt_attr.__sigmask, 0); } - rc = pt->pt_start(pt->pt_arg); + void *ret = pt->pt_start(pt->pt_val); // ensure pthread_cleanup_pop(), and pthread_exit() popped cleanup unassert(!pt->pt_cleanup); // calling pthread_exit() will either jump back here, or call exit - pthread_exit(rc); + pthread_exit(ret); } // avoid signal handler being triggered after we trash our own stack @@ -196,7 +198,7 @@ static errno_t pthread_create_impl(pthread_t *thread, dll_init(&pt->list); pt->pt_locale = &__global_locale; pt->pt_start = start_routine; - pt->pt_arg = arg; + pt->pt_val = arg; // create thread local storage memory if (!(pt->pt_tls = _mktls(&pt->tib))) { diff --git a/libc/thread/pthread_exit.c b/libc/thread/pthread_exit.c index c50b867da..6c8d605bc 100644 --- a/libc/thread/pthread_exit.c +++ b/libc/thread/pthread_exit.c @@ -88,7 +88,7 @@ wontreturn void pthread_exit(void *rc) { // set state pt->pt_flags |= PT_NOCANCEL | PT_EXITING; - pt->pt_rc = rc; + pt->pt_val = rc; // free resources __cxa_thread_finalize(); diff --git a/libc/thread/pthread_timedjoin_np.c b/libc/thread/pthread_timedjoin_np.c index 9022a9196..142ae4734 100644 --- a/libc/thread/pthread_timedjoin_np.c +++ b/libc/thread/pthread_timedjoin_np.c @@ -139,7 +139,7 @@ errno_t pthread_timedjoin_np(pthread_t thread, void **value_ptr, memory_order_release); _pthread_zombify(pt); if (value_ptr) - *value_ptr = pt->pt_rc; + *value_ptr = pt->pt_val; } _pthread_unref(pt); diff --git a/libc/thread/thread.h b/libc/thread/thread.h index 4b469a209..e2827b7d4 100644 --- a/libc/thread/thread.h +++ b/libc/thread/thread.h @@ -130,8 +130,8 @@ typedef struct pthread_attr_s { int __contentionscope; int __sigaltstacksize; uint64_t __sigmask; - size_t __guardsize; - size_t __stacksize; + unsigned __guardsize; + unsigned __stacksize; void *__stackaddr; void *__sigaltstackaddr; } pthread_attr_t; diff --git a/test/libc/mem/malloc_test.c b/test/libc/mem/malloc_test.c index 5e69b98ca..b1b7d2609 100644 --- a/test/libc/mem/malloc_test.c +++ b/test/libc/mem/malloc_test.c @@ -22,7 +22,6 @@ #include "libc/dce.h" #include "libc/errno.h" #include "libc/intrin/cxaatexit.h" -#include "libc/intrin/kprintf.h" #include "libc/intrin/safemacros.h" #include "libc/macros.h" #include "libc/mem/gc.h" @@ -162,7 +161,7 @@ void *bulk[1024]; void BulkFreeBenchSetup(void) { size_t i; for (i = 0; i < ARRAYLEN(bulk); ++i) { - bulk[i] = malloc(rand() % 64); + bulk[i] = rand() % 64 ? malloc(rand() % 64) : 0; } } diff --git a/third_party/dlmalloc/dlmalloc.c b/third_party/dlmalloc/dlmalloc.c index 0adc13f4f..b20e28cd9 100644 --- a/third_party/dlmalloc/dlmalloc.c +++ b/third_party/dlmalloc/dlmalloc.c @@ -62,11 +62,6 @@ #include "locks.inc" #include "chunks.inc" #include "headfoot.inc" - -#if ONLY_MSPACES -#include "threaded.inc" -#endif - #include "global.inc" #include "system.inc" #include "hooks.inc" @@ -74,6 +69,11 @@ #include "indexing.inc" #include "binmaps.inc" #include "runtimechecks.inc" + +#if ONLY_MSPACES +#include "threaded.inc" +#endif + #include "init.inc" #include "debuglib.inc" #include "statistics.inc" diff --git a/third_party/dlmalloc/threaded.inc b/third_party/dlmalloc/threaded.inc index e8768dbc3..3dbfb5b35 100644 --- a/third_party/dlmalloc/threaded.inc +++ b/third_party/dlmalloc/threaded.inc @@ -61,8 +61,23 @@ int dlmalloc_trim(size_t pad) { } size_t dlbulk_free(void *array[], size_t nelem) { - for (size_t i = 0; i < nelem; ++i) - mspace_free(0, array[i]); + size_t j = 0; + mstate msp = (mstate)-1; + for (size_t i = 0; i < nelem; ++i) { + mstate next; + if (array[i]) { + next = get_mstate_for(mem2chunk(array[i])); + if (next != msp) { + if (j) + mspace_bulk_free(msp, array, j); + msp = next; + j = 0; + } + array[j++] = array[i]; + } + } + if (j) + mspace_bulk_free(msp, array, j); return 0; }