Make threads faster and more reliable

This change doubles the performance of thread spawning. That's thanks to
our new stack manager, which allows us to avoid zeroing stacks. It gives
us 15µs spawns rather than 30µs spawns on Linux. Also, pthread_exit() is
faster now, since it doesn't need to acquire the pthread GIL. On NetBSD,
that helps us avoid allocating too many semaphores. Even if that happens,
we're now able to survive running out of semaphores, and even running out
of memory, when allocating *NSYNC waiter objects. I found a lot more rare
bugs in the POSIX threads runtime that could cause things to crash if
you've got dozens of threads all spawning and joining dozens of threads.
I want cosmo to be world-class and production-worthy for 2025, so happy
holidays, all!
This commit is contained in:
Justine Tunney 2024-12-18 04:59:02 -08:00
parent 906bd06a5a
commit 624573207e
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
51 changed files with 1006 additions and 321 deletions

View file

@ -62,7 +62,7 @@
#include "libc/sysv/consts/prot.h"
#include "libc/sysv/consts/sig.h"
#include "libc/sysv/errfuns.h"
#include "libc/thread/itimer.internal.h"
#include "libc/thread/itimer.h"
#include "libc/thread/posixthread.internal.h"
#include "libc/thread/tls.h"
#ifdef __x86_64__
@ -189,7 +189,7 @@ static textwindows void *Malloc(size_t size) {
}
textwindows void WinMainForked(void) {
jmp_buf jb;
intptr_t jb[5];
int64_t reader;
int64_t savetsc;
uint32_t varlen;
@ -305,14 +305,14 @@ textwindows void WinMainForked(void) {
#endif
// jump back into function below
longjmp(jb, 1);
__builtin_longjmp(jb, 1);
}
textwindows int sys_fork_nt(uint32_t dwCreationFlags) {
char ok;
jmp_buf jb;
char **args;
int rc = -1;
intptr_t jb[5];
struct Proc *proc;
struct CosmoTib *tib;
char16_t pipename[64];
@ -325,7 +325,7 @@ textwindows int sys_fork_nt(uint32_t dwCreationFlags) {
return -1;
ftrace_enabled(-1);
strace_enabled(-1);
if (!setjmp(jb)) {
if (!__builtin_setjmp(jb)) {
reader = CreateNamedPipe(__create_pipe_name(pipename), kNtPipeAccessInbound,
kNtPipeTypeByte | kNtPipeReadmodeByte, 1, PIPE_BUF,
PIPE_BUF, 0, &kNtIsInheritable);
@ -467,12 +467,7 @@ textwindows int sys_fork_nt(uint32_t dwCreationFlags) {
if (ftrace_stackdigs)
_weaken(__hook)(_weaken(ftrace_hook), _weaken(GetSymbolTable)());
// reset core runtime services
__proc_wipe();
WipeKeystrokes();
if (_weaken(__sig_init))
_weaken(__sig_init)();
if (_weaken(__itimer_wipe))
_weaken(__itimer_wipe)();
// notify pthread join
atomic_store_explicit(&_pthread_static.ptid, GetCurrentThreadId(),
memory_order_release);

View file

@ -17,6 +17,7 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/calls/calls.h"
#include "libc/calls/sig.internal.h"
#include "libc/calls/state.internal.h"
#include "libc/calls/struct/sigset.internal.h"
#include "libc/calls/struct/timespec.h"
@ -27,6 +28,7 @@
#include "libc/intrin/cxaatexit.h"
#include "libc/intrin/dll.h"
#include "libc/intrin/maps.h"
#include "libc/intrin/stack.h"
#include "libc/intrin/strace.h"
#include "libc/intrin/weaken.h"
#include "libc/nt/files.h"
@ -39,6 +41,7 @@
#include "libc/runtime/syslib.internal.h"
#include "libc/stdio/internal.h"
#include "libc/str/str.h"
#include "libc/thread/itimer.h"
#include "libc/thread/posixthread.internal.h"
#include "libc/thread/thread.h"
#include "third_party/dlmalloc/dlmalloc.h"
@ -104,10 +107,6 @@ static void fork_prepare(void) {
pthread_mutex_lock(&supreme_lock);
if (_weaken(_pthread_onfork_prepare))
_weaken(_pthread_onfork_prepare)();
if (IsWindows()) {
pthread_mutex_lock(&__sig_worker_lock);
__proc_lock();
}
fork_prepare_stdio();
__localtime_lock();
__cxa_lock();
@ -117,12 +116,16 @@ static void fork_prepare(void) {
dlmalloc_pre_fork();
__fds_lock();
pthread_mutex_lock(&__rand64_lock_obj);
if (_weaken(cosmo_stack_lock))
_weaken(cosmo_stack_lock)();
__maps_lock();
LOCKTRACE("READY TO LOCK AND ROLL");
}
static void fork_parent(void) {
__maps_unlock();
if (_weaken(cosmo_stack_unlock))
_weaken(cosmo_stack_unlock)();
pthread_mutex_unlock(&__rand64_lock_obj);
__fds_unlock();
dlmalloc_post_fork_parent();
@ -132,10 +135,6 @@ static void fork_parent(void) {
__cxa_unlock();
__localtime_unlock();
fork_parent_stdio();
if (IsWindows()) {
__proc_unlock();
pthread_mutex_unlock(&__sig_worker_lock);
}
if (_weaken(_pthread_onfork_parent))
_weaken(_pthread_onfork_parent)();
pthread_mutex_unlock(&supreme_lock);
@ -143,6 +142,8 @@ static void fork_parent(void) {
static void fork_child(void) {
nsync_mu_semaphore_sem_fork_child();
if (_weaken(cosmo_stack_wipe))
_weaken(cosmo_stack_wipe)();
pthread_mutex_wipe_np(&__rand64_lock_obj);
pthread_mutex_wipe_np(&__fds_lock_obj);
dlmalloc_post_fork_child();
@ -153,8 +154,13 @@ static void fork_child(void) {
pthread_mutex_wipe_np(&__cxa_lock_obj);
pthread_mutex_wipe_np(&__localtime_lock_obj);
if (IsWindows()) {
__proc_wipe();
// we don't bother locking the proc/itimer/sig locks above since
// their state is reset in the forked child. nothing to protect.
__proc_wipe_and_reset();
__itimer_wipe_and_reset();
pthread_mutex_wipe_np(&__sig_worker_lock);
if (_weaken(__sig_init))
_weaken(__sig_init)();
}
if (_weaken(_pthread_onfork_child))
_weaken(_pthread_onfork_child)();
@ -211,6 +217,7 @@ int _fork(uint32_t dwCreationFlags) {
memory_order_relaxed);
}
dll_make_first(&_pthread_list, &pt->list);
atomic_store_explicit(&_pthread_count, 1, memory_order_relaxed);
// get new system thread handle
intptr_t syshand = 0;

View file

@ -268,7 +268,8 @@ textwindows void __proc_unlock(void) {
/**
* Resets process tracker from forked child.
*/
textwindows void __proc_wipe(void) {
textwindows void __proc_wipe_and_reset(void) {
// TODO(jart): Should we preserve this state in forked children?
pthread_mutex_t lock = __proc.lock;
bzero(&__proc, sizeof(__proc));
__proc.lock = lock;

View file

@ -41,7 +41,6 @@ struct Procs {
extern struct Procs __proc;
void __proc_wipe(void) libcesque;
void __proc_lock(void) libcesque;
void __proc_unlock(void) libcesque;
int64_t __proc_handle(int) libcesque;
@ -49,6 +48,7 @@ int64_t __proc_search(int) libcesque;
struct Proc *__proc_new(void) libcesque;
void __proc_add(struct Proc *) libcesque;
void __proc_free(struct Proc *) libcesque;
void __proc_wipe_and_reset(void) libcesque;
int __proc_harvest(struct Proc *, bool) libcesque;
int sys_wait4_nt(int, int *, int, struct rusage *) libcesque;