Simplify TLS and reduce startup latency

This change simplifies the thread-local storage support code. On Windows
and Mac OS X the startup latency of __enable_tls() has been reduced from
30ms to 1ms. On Windows, TLS memory accesses are now also much faster,
because the improved self-modifying code avoids a function call and
obtains our thread information block pointer in a single instruction.
This commit is contained in:
Justine Tunney 2022-07-18 03:33:32 -07:00
parent 38c3fa63fe
commit b1d9d11be1
15 changed files with 136 additions and 312 deletions

View file

@ -5,10 +5,27 @@
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
char *__get_tls(void) libcesque nosideeffect;
#if defined(__GNUC__) && defined(__x86_64__) && !defined(__STRICT_ANSI__)
/**
* Returns location of thread information block.
*
* The pointer is fetched with a single `mov` instruction that loads
* the pointer-sized value stored at offset zero of the %fs segment.
*
* This can't be used in privileged functions.
* NOTE(review): presumably because __enable_tls() rewrites this
* instruction at startup and privileged code can't be self-modified;
* confirm against the __enable_tls() implementation.
*
* @return pointer that was read from %fs:0
*/
static noasan inline char *__get_tls(void) {
char *tib;
/* Any general-purpose register may receive the result ("=r"). The
   "memory" clobber tells the compiler the asm may access memory, so
   it won't cache memory values in registers across this statement. */
asm("mov\t%%fs:0,%0" : "=r"(tib) : /* no inputs */ : "memory");
return tib;
}
#endif /* GNU x86-64 */
#if defined(__GNUC__) && defined(__x86_64__) && !defined(__STRICT_ANSI__)
static noasan inline char *__get_tls_inline(void) {
/**
* Returns location of thread information block.
*
* This should be favored over __get_tls() for .privileged code that
* can't be self-modified by __enable_tls().
*/
static noasan inline char *__get_tls_privileged(void) {
char *tib, *lin = (char *)0x30;
if (IsLinux() || IsFreebsd() || IsNetbsd() || IsOpenbsd()) {
asm("mov\t%%fs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory");