Simplify TLS and reduce startup latency

This change simplifies the thread-local storage support code. On Windows
and Mac OS X the startup latency of __enable_tls() has been reduced from
30ms to 1ms. On Windows, TLS memory accesses are now much faster thanks
to better self-modifying code that avoids a function call and acquires
our thread information block pointer in a single instruction.
This commit is contained in:
Justine Tunney 2022-07-18 03:33:32 -07:00
parent 38c3fa63fe
commit b1d9d11be1
15 changed files with 136 additions and 312 deletions

View file

@ -67,7 +67,7 @@ int chdir(const char *);
int chmod(const char *, uint32_t);
int chown(const char *, uint32_t, uint32_t);
int chroot(const char *);
int clone(void *, void *, size_t, int, void *, int *, void *, size_t, int *);
int clone(void *, void *, size_t, int, void *, int *, void *, int *);
int close(int);
int creat(const char *, uint32_t);
int dup(int);

View file

@ -50,10 +50,10 @@
* @asyncsignalsafe
* @threadsafe
*/
privileged int gettid(void) {
int gettid(void) {
int tid;
if (__tls_enabled) {
tid = *(int *)(__get_tls_inline() + 0x38);
tid = *(int *)(__get_tls() + 0x38);
if (tid > 0) return tid;
}
return sys_gettid();

View file

@ -314,7 +314,7 @@ privileged static size_t kformat(char *b, size_t n, const char *fmt,
if (!__tls_enabled) {
x = __pid;
} else {
x = *(int *)(__get_tls_inline() + 0x38);
x = *(int *)(__get_tls_privileged() + 0x38);
}
} else {
x = 666;
@ -395,8 +395,7 @@ privileged static size_t kformat(char *b, size_t n, const char *fmt,
i = 0;
m = (1 << base) - 1;
if (hash && x) sign = hash;
do
z[i++ & 127] = abet[x & m];
do z[i++ & 127] = abet[x & m];
while ((x >>= base) || (pdot && i < prec));
goto EmitNumber;

View file

@ -5,10 +5,27 @@
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
char *__get_tls(void) libcesque nosideeffect;
#if defined(__GNUC__) && defined(__x86_64__) && !defined(__STRICT_ANSI__)
/**
* Returns location of thread information block.
*
* This can't be used in privileged functions.
*/
static noasan inline char *__get_tls(void) {
char *tib;
asm("mov\t%%fs:0,%0" : "=r"(tib) : /* no inputs */ : "memory");
return tib;
}
#endif /* GNU x86-64 */
#if defined(__GNUC__) && defined(__x86_64__) && !defined(__STRICT_ANSI__)
static noasan inline char *__get_tls_inline(void) {
/**
* Returns location of thread information block.
*
* This should be favored over __get_tls() for .privileged code that
* can't be self-modified by __enable_tls().
*/
static noasan inline char *__get_tls_privileged(void) {
char *tib, *lin = (char *)0x30;
if (IsLinux() || IsFreebsd() || IsNetbsd() || IsOpenbsd()) {
asm("mov\t%%fs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory");

View file

@ -113,8 +113,7 @@ WinThreadEntry(int rdi, // rcx
static textwindows int CloneWindows(int (*func)(void *, int), char *stk,
size_t stksz, int flags, void *arg,
void *tls, size_t tlssz, int *ptid,
int *ctid) {
void *tls, int *ptid, int *ctid) {
int64_t h;
struct CloneArgs *wt;
wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) -
@ -193,7 +192,7 @@ XnuThreadMain(void *pthread, // rdi
}
static int CloneXnu(int (*fn)(void *), char *stk, size_t stksz, int flags,
void *arg, void *tls, size_t tlssz, int *ptid, int *ctid) {
void *arg, void *tls, int *ptid, int *ctid) {
int rc;
bool failed;
static bool once;
@ -244,8 +243,7 @@ static wontreturn void FreebsdThreadMain(void *p) {
}
static int CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz,
int *ptid, int *ctid) {
int flags, void *arg, void *tls, int *ptid, int *ctid) {
int ax;
bool failed;
int64_t tid;
@ -265,7 +263,7 @@ static int CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
.stack_base = stk,
.stack_size = (((intptr_t)wt - (intptr_t)stk) & -16) - 8,
.tls_base = flags & CLONE_SETTLS ? tls : 0,
.tls_size = flags & CLONE_SETTLS ? tlssz : 0,
.tls_size = 64,
.child_tid = &wt->tid64,
.parent_tid = &tid,
};
@ -319,8 +317,7 @@ noasan static wontreturn void OpenbsdThreadMain(void *p) {
}
static int CloneOpenbsd(int (*func)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz,
int *ptid, int *ctid) {
int flags, void *arg, void *tls, int *ptid, int *ctid) {
int tid;
intptr_t sp;
struct __tfork *tf;
@ -373,8 +370,7 @@ static wontreturn void NetbsdThreadMain(void *arg, // rdi
}
static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz, int *ptid,
int *ctid) {
int flags, void *arg, void *tls, int *ptid, int *ctid) {
// NetBSD has its own clone() and it works, but it's technically a
// second-class API, intended to help Linux folks migrate to this.
bool failed;
@ -465,8 +461,7 @@ int sys_clone_linux(int flags, // rdi
void *arg); // 8(rsp)
static int CloneLinux(int (*func)(void *arg, int tid), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz, int *ptid,
int *ctid) {
int flags, void *arg, void *tls, int *ptid, int *ctid) {
long sp;
sp = (intptr_t)(stk + stksz);
if (~flags & CLONE_CHILD_SETTID) {
@ -589,14 +584,13 @@ static int CloneLinux(int (*func)(void *arg, int tid), char *stk, size_t stksz,
* @param arg is passed as an argument to `func` in the child thread
* @param tls may be used to set the thread local storage segment;
* this parameter is ignored if `CLONE_SETTLS` is not set
* @param tlssz is the size of tls in bytes which must be at least 64
* @param ctid lets the child receive its thread id without having to
* call gettid() and is ignored if `CLONE_CHILD_SETTID` isn't set
* @return tid of child on success, or -1 w/ errno
* @threadsafe
*/
int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
void *tls, size_t tlssz, int *ctid) {
void *tls, int *ctid) {
int rc;
struct CloneArgs *wt;
@ -606,13 +600,12 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
if (!func) {
rc = einval();
} else if (!IsTiny() &&
(((flags & CLONE_VM) && (stksz < PAGESIZE || (stksz & 15))) ||
((flags & CLONE_SETTLS) && (tlssz < 64 || (tlssz & 7))))) {
((flags & CLONE_VM) && (stksz < PAGESIZE || (stksz & 15)))) {
rc = einval();
} else if (IsAsan() &&
((stksz > PAGESIZE &&
!__asan_is_valid((char *)stk + PAGESIZE, stksz - PAGESIZE)) ||
((flags & CLONE_SETTLS) && !__asan_is_valid(tls, tlssz)) ||
((flags & CLONE_SETTLS) && !__asan_is_valid(tls, 64)) ||
((flags & CLONE_SETTLS) && !__asan_is_valid(tls, sizeof(long))) ||
((flags & CLONE_PARENT_SETTID) &&
!__asan_is_valid(ptid, sizeof(*ptid))) ||
@ -620,7 +613,7 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
!__asan_is_valid(ctid, sizeof(*ctid))))) {
rc = efault();
} else if (IsLinux()) {
rc = CloneLinux(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
rc = CloneLinux(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (!IsTiny() &&
(flags & ~(CLONE_SETTLS | CLONE_PARENT_SETTID |
CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)) !=
@ -629,15 +622,15 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
STRACE("clone flag unsupported on this platform");
rc = einval();
} else if (IsXnu()) {
rc = CloneXnu(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
rc = CloneXnu(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (IsFreebsd()) {
rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (IsNetbsd()) {
rc = CloneNetbsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
rc = CloneNetbsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (IsOpenbsd()) {
rc = CloneOpenbsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
rc = CloneOpenbsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (IsWindows()) {
rc = CloneWindows(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
rc = CloneWindows(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else {
rc = enosys();
}
@ -647,8 +640,8 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
*ptid = rc;
}
STRACE("clone(%t, %p, %'zu, %#x, %p, %p, %p, %'zu, %p) → %d% m", func, stk,
stksz, flags, arg, ptid, tls, tlssz, ctid, rc);
STRACE("clone(%t, %p, %'zu, %#x, %p, %p, %p, %p) → %d% m", func, stk, stksz,
flags, arg, ptid, tls, ctid, rc);
return rc;
}

View file

@ -16,6 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/bits/bits.h"
#include "libc/calls/calls.h"
#include "libc/calls/strace.internal.h"
#include "libc/calls/syscall-sysv.internal.h"
@ -45,11 +46,16 @@
#define _TLDZ ((intptr_t)_tdata_size)
#define _TIBZ sizeof(struct cthread_descriptor_t)
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
__msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;
extern unsigned char __tls_mov_nt_rax[];
extern unsigned char __tls_add_nt_rax[];
/**
* Enables thread local storage.
*/
privileged void __enable_tls(void) {
if (__tls_enabled) return;
STRACE("__enable_tls()");
@ -111,98 +117,93 @@ privileged void __enable_tls(void) {
: "rcx", "r11", "memory");
}
/*
* We need to rewrite SysV _Thread_local code. You MUST use the
* -mno-tls-direct-seg-refs flag which generates code like this
*
* 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
* 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R
*
* Which on Mac we can replace with this:
*
* 65 48 8b 0R4 25 30 00 00 00 mov %gs:0x30,%R
*
* Whereas on Windows we'll replace it with this:
*
* 0f 1f 40 00 fatnop4
* e8 xx xx xx xx call __tls_mov_nt_%R
*
* Since we have no idea where the TLS instructions exist in the
* binary, we need to disassemble the whole program image. This'll
* potentially take a few milliseconds for some larger programs.
*
* We check `_tls_content` which is generated by the linker script
* since it lets us determine ahead of time if _Thread_local vars
* have actually been linked into this program.
*
* TODO(jart): compute probability this is just overkill
*/
// We need to rewrite SysV _Thread_local code. You MUST use the
// -mno-tls-direct-seg-refs flag which generates code like this
//
// 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
// 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R
//
// Which on Mac we can replace with this:
//
// 65 48 8b 0R4 25 30 00 00 00 mov %gs:0x30,%R
//
// Whereas on Windows we'll replace it with this, where the
// displacement is 0x1480 + __tls_index * 8:
//
// 65 48 8b 0R4 25 xx xx xx xx mov %gs:disp32,%R
//
// Since we have no idea where the TLS instructions exist in the
// binary, we need to disassemble the whole program image. This'll
// potentially take a few milliseconds for some larger programs.
//
// We check `_tls_content` which is generated by the linker script
// since it lets us determine ahead of time if _Thread_local vars
// have actually been linked into this program.
if ((intptr_t)_tls_content && (IsWindows() || IsXnu())) {
int n, reg, dis;
int n;
uint64_t w;
unsigned m, dis;
unsigned char *p;
const unsigned char *impl;
struct XedDecodedInst xedd;
__morph_begin();
// The most expensive part of this process is we need to compute the
// byte length of each instruction in our program. We'll use Intel's
// disassembler for this purpose.
for (p = _ereal; p < __privileged_start; p += n) {
xed_decoded_inst_zero_set_mode(&xedd, XED_MACHINE_MODE_LONG_64);
if (!xed_instruction_length_decode(&xedd, p, 15)) {
if (IsXnu()) {
// Apple is quite straightforward to patch. We basically
// just change the segment register, and the linear slot
// address 0x30 was promised to us, according to Go team
// https://github.com/golang/go/issues/23617
dis = 0x30;
} else {
// MSVC __declspec(thread) generates binary code for this
// %gs:0x1480 abi. So long as TlsAlloc() isn't called >64
// times we should be good.
dis = 0x1480 + __tls_index * 8;
}
// We now know p[0] is most likely the first byte of an x86 op.
// Let's check and see if it's the GCC linear TIB address load.
// We hope and pray GCC won't generate TLS stores to %r8..%r15.
if (xedd.length == 9 && //
0144 == p[0] && // fs
0110 == p[1] && // rex.w (64-bit operand size)
(0213 == p[2] || // mov reg/mem → reg (word-sized)
0003 == p[2]) && // add reg/mem → reg (word-sized)
0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
0000 == p[5] && // displacement (von Neumann endian)
0000 == p[6] && // displacement
0000 == p[7] && // displacement
0000 == p[8]) { // displacement
// iterate over modifiable code looking for 9 byte instruction
// this would take 30 ms using xed to enable tls on python.com
for (p = _ereal; p + 9 <= __privileged_start; p += n) {
// Apple is quite straightforward to patch. We basically
// just change the segment register, and the linear slot
if (IsXnu()) {
p[0] = 0145; // this changes fs segment to gs segment
p[5] = 0x30; // tib slot index for tib linear address
}
// Windows is kind of complicated. We need to replace the
// segment mov instruction with a function call, that (a)
// won't clobber registers, and (b) has a return register
// that's the same as the mov destination. When setting
// function displacement, &CALL+5+DISP must equal &FUNC.
else {
if (p[2] == 3) {
impl = __tls_add_nt_rax;
} else {
impl = __tls_mov_nt_rax;
}
reg = (p[3] & 070) >> 3;
dis = (impl + reg * 18) - (p + 9);
p[0] = 0017; // map1
p[1] = 0037; // nopl (only if reg=0)
p[2] = 0100; // mod/rm (%rax)+disp8
p[3] = 0000; // displacement
p[4] = 0350; // call
p[5] = (dis & 0x000000ff) >> 000; // displacement
p[6] = (dis & 0x0000ff00) >> 010; // displacement
p[7] = (dis & 0x00ff0000) >> 020; // displacement
p[8] = (dis & 0xff000000) >> 030; // displacement
}
// use sse to zoom zoom to fs register prefixes
// that way it'll take 1 ms to morph python.com
while (p + 9 + 16 <= __privileged_start) {
if ((m = __builtin_ia32_pmovmskb128(
*(xmm_t *)p == (xmm_t){0144, 0144, 0144, 0144, 0144, 0144,
0144, 0144, 0144, 0144, 0144, 0144,
0144, 0144, 0144, 0144}))) {
m = __builtin_ctzll(m);
p += m;
break;
} else {
p += 16;
}
}
// Move to the next instruction.
n = xedd.length;
// we're checking for the following expression:
// 0144 == p[0] && // fs
// 0110 == p[1] && // rex.w (64-bit operand size)
// (0213 == p[2] || // mov reg/mem → reg (word-sized)
// 0003 == p[2]) && // add reg/mem → reg (word-sized)
// 0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
// 0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
// 0000 == p[5] && // displacement (von Neumann endian)
// 0000 == p[6] && // displacement
// 0000 == p[7] && // displacement
// 0000 == p[8] // displacement
w = READ64LE(p) & READ64LE("\377\377\377\307\377\377\377\377");
if ((w == READ64LE("\144\110\213\004\045\000\000\000") ||
w == READ64LE("\144\110\003\004\045\000\000\000")) &&
!p[8]) {
// now change the code
p[0] = 0145; // this changes fs segment to gs segment
p[5] = (dis & 0x000000ff) >> 000; // displacement
p[6] = (dis & 0x0000ff00) >> 010; // displacement
p[7] = (dis & 0x00ff0000) >> 020; // displacement
p[8] = (dis & 0xff000000) >> 030; // displacement
// advance to the next instruction
n = 9;
} else {
// If Xed failed to decode the instruction, then we'll just plow
// through memory one byte at a time until Xed's morale improves
n = 1;
}
}

View file

@ -72,7 +72,7 @@ privileged void ftracer(void) {
long stackuse;
struct FtraceTls *ft;
struct StackFrame *sf;
ft = (struct FtraceTls *)(__get_tls_inline() + 0x08);
ft = (struct FtraceTls *)(__get_tls_privileged() + 0x08);
if (_cmpxchg(&ft->once, false, true)) {
ft->lastaddr = -1;
ft->skew = GetNestingLevelImpl(__builtin_frame_address(0));

View file

@ -28,5 +28,5 @@
*/
privileged nocallersavedregisters errno_t *(__errno_location)(void) {
if (!__tls_enabled) return &__errno;
return (errno_t *)(__get_tls_inline() + 0x3c);
return (errno_t *)(__get_tls_privileged() + 0x3c);
}

View file

@ -1,33 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2022 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/nexgen32e/gettls.h"
#include "libc/nexgen32e/threaded.h"
/**
* Returns address of thread information block.
*
* This function must not be called until TLS is initialized.
*
* @see __get_tls_inline()
* @see __install_tls()
* @see _spawn()
*/
optimizespeed char *__get_tls(void) {
return __get_tls_inline();
}

View file

@ -39,8 +39,6 @@ LIBC_SYSV_A_FILES := \
libc/sysv/systemfive.S \
libc/sysv/errno_location.greg.c \
libc/sysv/errno.c \
libc/sysv/gettls.greg.c \
libc/sysv/tlspolyfill.S \
libc/sysv/errfun.S \
libc/sysv/strace.greg.c \
libc/sysv/describeos.greg.c \

View file

@ -1,156 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2022 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Code morphing TLS polyfills for The New Technology.
//
// @note msvc generates this code so it's stable
// @note func ordering follows x86 reg encoding
// @note each function is exactly 18 bytes
// @see __enable_tls()
__tls_mov_nt_rax:
push %rcx
mov __tls_index(%rip),%ecx
mov %gs:0x1480(,%rcx,8),%rax
pop %rcx
ret
.endfn __tls_mov_nt_rax,globl,hidden
__tls_mov_nt_rcx:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rcx
pop %rax
ret
.endfn __tls_mov_nt_rcx
__tls_mov_nt_rdx:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rdx
pop %rax
ret
.endfn __tls_mov_nt_rdx
__tls_mov_nt_rbx:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rbx
pop %rax
ret
.endfn __tls_mov_nt_rbx
__tls_mov_nt_rsp:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rsp
pop %rax
ret
.endfn __tls_mov_nt_rsp
__tls_mov_nt_rbp:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rbp
pop %rax
ret
.endfn __tls_mov_nt_rbp
__tls_mov_nt_rsi:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rsi
pop %rax
ret
.endfn __tls_mov_nt_rsi
__tls_mov_nt_rdi:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rdi
pop %rax
ret
.endfn __tls_mov_nt_rdi
////////////////////////////////////////////////////////////////////////////////
__tls_add_nt_rax:
push %rcx
mov __tls_index(%rip),%ecx
add %gs:0x1480(,%rcx,8),%rax
pop %rcx
ret
.endfn __tls_add_nt_rax,globl,hidden
__tls_add_nt_rcx:
push %rax
mov __tls_index(%rip),%eax
add %gs:0x1480(,%rax,8),%rcx
pop %rax
ret
.endfn __tls_add_nt_rcx
__tls_add_nt_rdx:
push %rax
mov __tls_index(%rip),%eax
add %gs:0x1480(,%rax,8),%rdx
pop %rax
ret
.endfn __tls_add_nt_rdx
__tls_add_nt_rbx:
push %rax
mov __tls_index(%rip),%eax
add %gs:0x1480(,%rax,8),%rbx
pop %rax
ret
.endfn __tls_add_nt_rbx
__tls_add_nt_rsp:
push %rax
mov __tls_index(%rip),%eax
add %gs:0x1480(,%rax,8),%rsp
pop %rax
ret
.endfn __tls_add_nt_rsp
__tls_add_nt_rbp:
push %rax
mov __tls_index(%rip),%eax
add %gs:0x1480(,%rax,8),%rbp
pop %rax
ret
.endfn __tls_add_nt_rbp
__tls_add_nt_rsi:
push %rax
mov __tls_index(%rip),%eax
add %gs:0x1480(,%rax,8),%rsi
pop %rax
ret
.endfn __tls_add_nt_rsi
__tls_add_nt_rdi:
push %rax
mov __tls_index(%rip),%eax
add %gs:0x1480(,%rax,8),%rdi
pop %rax
ret
.endfn __tls_add_nt_rdi

View file

@ -26,5 +26,5 @@ STATIC_YOINK("_main_thread_ctor");
* Returns thread descriptor of the current thread.
*/
cthread_t(cthread_self)(void) {
return (cthread_t)__get_tls_inline();
return (cthread_t)__get_tls();
}

View file

@ -107,7 +107,7 @@ int _spawn(int fun(void *, int), void *arg, struct spawn *opt_out_thread) {
CLONE_VM | CLONE_THREAD | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
CLONE_CHILD_CLEARTID,
arg, &th->ptid, th->tib, _TIBZ, th->ctid) == -1) {
arg, &th->ptid, th->tib, th->ctid) == -1) {
_freestack(th->stk);
free(th->tls);
return -1;

View file

@ -17,6 +17,7 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/calls/calls.h"
#include "libc/calls/struct/timespec.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/intrin/kprintf.h"
@ -31,6 +32,8 @@
#include "libc/runtime/runtime.h"
#include "libc/runtime/stack.h"
#include "libc/runtime/symbols.internal.h"
#include "libc/stdio/stdio.h"
#include "libc/sysv/consts/clock.h"
#include "libc/sysv/consts/clone.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/o.h"
@ -165,6 +168,6 @@ BENCH(clone, bench) {
char *volatile tp;
errno_t *volatile ep;
EZBENCH2("__errno_location", donothing, (ep = __errno_location()));
EZBENCH2("__get_tls_inline", donothing, (tp = __get_tls_inline()));
EZBENCH2("__get_tls_privileged", donothing, (tp = __get_tls_privileged()));
EZBENCH2("__get_tls", donothing, (tp = __get_tls()));
}

View file

@ -20,11 +20,13 @@
#include "libc/calls/struct/sched_param.h"
#include "libc/dce.h"
#include "libc/fmt/fmt.h"
#include "libc/intrin/kprintf.h"
#include "libc/intrin/spinlock.h"
#include "libc/intrin/wait0.internal.h"
#include "libc/math.h"
#include "libc/mem/mem.h"
#include "libc/runtime/gc.internal.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/stack.h"
#include "libc/stdio/stdio.h"
#include "libc/sysv/consts/clone.h"