Simplify TLS and reduce startup latency

This change simplifies the thread-local storage support code. On Windows
and Mac OS X the startup latency of __enable_tls() has been reduced from
30ms to 1ms. On Windows, TLS memory accesses will now go much faster due
to better self-modifying code that prevents a function call and acquires
our thread information block pointer in a single instruction.
This commit is contained in:
Justine Tunney 2022-07-18 03:33:32 -07:00
parent 38c3fa63fe
commit b1d9d11be1
15 changed files with 136 additions and 312 deletions

View file

@ -17,6 +17,7 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/calls/calls.h"
#include "libc/calls/struct/timespec.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/intrin/kprintf.h"
@ -31,6 +32,8 @@
#include "libc/runtime/runtime.h"
#include "libc/runtime/stack.h"
#include "libc/runtime/symbols.internal.h"
#include "libc/stdio/stdio.h"
#include "libc/sysv/consts/clock.h"
#include "libc/sysv/consts/clone.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/o.h"
@ -165,6 +168,6 @@ BENCH(clone, bench) {
char *volatile tp;
errno_t *volatile ep;
EZBENCH2("__errno_location", donothing, (ep = __errno_location()));
EZBENCH2("__get_tls_inline", donothing, (tp = __get_tls_inline()));
EZBENCH2("__get_tls_privileged", donothing, (tp = __get_tls_privileged()));
EZBENCH2("__get_tls", donothing, (tp = __get_tls()));
}

View file

@ -20,11 +20,13 @@
#include "libc/calls/struct/sched_param.h"
#include "libc/dce.h"
#include "libc/fmt/fmt.h"
#include "libc/intrin/kprintf.h"
#include "libc/intrin/spinlock.h"
#include "libc/intrin/wait0.internal.h"
#include "libc/math.h"
#include "libc/mem/mem.h"
#include "libc/runtime/gc.internal.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/stack.h"
#include "libc/stdio/stdio.h"
#include "libc/sysv/consts/clone.h"