Simplify TLS and reduce startup latency

This change simplifies the thread-local storage support code. On Windows
and Mac OS X the startup latency of __enable_tls() has been reduced from
30ms to 1ms. On Windows, TLS memory accesses will now go much faster due
to better self-modifying code that avoids a function call and acquires
our thread information block pointer in a single instruction.
This commit is contained in:
Justine Tunney 2022-07-18 03:33:32 -07:00
parent 38c3fa63fe
commit b1d9d11be1
15 changed files with 136 additions and 312 deletions

View file

@ -113,8 +113,7 @@ WinThreadEntry(int rdi, // rcx
static textwindows int CloneWindows(int (*func)(void *, int), char *stk,
size_t stksz, int flags, void *arg,
void *tls, size_t tlssz, int *ptid,
int *ctid) {
void *tls, int *ptid, int *ctid) {
int64_t h;
struct CloneArgs *wt;
wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) -
@ -193,7 +192,7 @@ XnuThreadMain(void *pthread, // rdi
}
static int CloneXnu(int (*fn)(void *), char *stk, size_t stksz, int flags,
void *arg, void *tls, size_t tlssz, int *ptid, int *ctid) {
void *arg, void *tls, int *ptid, int *ctid) {
int rc;
bool failed;
static bool once;
@ -244,8 +243,7 @@ static wontreturn void FreebsdThreadMain(void *p) {
}
static int CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz,
int *ptid, int *ctid) {
int flags, void *arg, void *tls, int *ptid, int *ctid) {
int ax;
bool failed;
int64_t tid;
@ -265,7 +263,7 @@ static int CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
.stack_base = stk,
.stack_size = (((intptr_t)wt - (intptr_t)stk) & -16) - 8,
.tls_base = flags & CLONE_SETTLS ? tls : 0,
.tls_size = flags & CLONE_SETTLS ? tlssz : 0,
.tls_size = 64,
.child_tid = &wt->tid64,
.parent_tid = &tid,
};
@ -319,8 +317,7 @@ noasan static wontreturn void OpenbsdThreadMain(void *p) {
}
static int CloneOpenbsd(int (*func)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz,
int *ptid, int *ctid) {
int flags, void *arg, void *tls, int *ptid, int *ctid) {
int tid;
intptr_t sp;
struct __tfork *tf;
@ -373,8 +370,7 @@ static wontreturn void NetbsdThreadMain(void *arg, // rdi
}
static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz, int *ptid,
int *ctid) {
int flags, void *arg, void *tls, int *ptid, int *ctid) {
// NetBSD has its own clone() and it works, but it's technically a
// second-class API, intended to help Linux folks migrate to this.
bool failed;
@ -465,8 +461,7 @@ int sys_clone_linux(int flags, // rdi
void *arg); // 8(rsp)
static int CloneLinux(int (*func)(void *arg, int tid), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz, int *ptid,
int *ctid) {
int flags, void *arg, void *tls, int *ptid, int *ctid) {
long sp;
sp = (intptr_t)(stk + stksz);
if (~flags & CLONE_CHILD_SETTID) {
@ -589,14 +584,13 @@ static int CloneLinux(int (*func)(void *arg, int tid), char *stk, size_t stksz,
* @param arg is passed as an argument to `func` in the child thread
* @param tls may be used to set the thread local storage segment;
* this parameter is ignored if `CLONE_SETTLS` is not set
* @param tlssz is the size of tls in bytes which must be at least 64
* @param ctid lets the child receive its thread id without having to
* call gettid() and is ignored if `CLONE_CHILD_SETTID` isn't set
* @return tid of child on success, or -1 w/ errno
* @threadsafe
*/
int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
void *tls, size_t tlssz, int *ctid) {
void *tls, int *ctid) {
int rc;
struct CloneArgs *wt;
@ -606,13 +600,12 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
if (!func) {
rc = einval();
} else if (!IsTiny() &&
(((flags & CLONE_VM) && (stksz < PAGESIZE || (stksz & 15))) ||
((flags & CLONE_SETTLS) && (tlssz < 64 || (tlssz & 7))))) {
((flags & CLONE_VM) && (stksz < PAGESIZE || (stksz & 15)))) {
rc = einval();
} else if (IsAsan() &&
((stksz > PAGESIZE &&
!__asan_is_valid((char *)stk + PAGESIZE, stksz - PAGESIZE)) ||
((flags & CLONE_SETTLS) && !__asan_is_valid(tls, tlssz)) ||
((flags & CLONE_SETTLS) && !__asan_is_valid(tls, 64)) ||
((flags & CLONE_SETTLS) && !__asan_is_valid(tls, sizeof(long))) ||
((flags & CLONE_PARENT_SETTID) &&
!__asan_is_valid(ptid, sizeof(*ptid))) ||
@ -620,7 +613,7 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
!__asan_is_valid(ctid, sizeof(*ctid))))) {
rc = efault();
} else if (IsLinux()) {
rc = CloneLinux(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
rc = CloneLinux(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (!IsTiny() &&
(flags & ~(CLONE_SETTLS | CLONE_PARENT_SETTID |
CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)) !=
@ -629,15 +622,15 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
STRACE("clone flag unsupported on this platform");
rc = einval();
} else if (IsXnu()) {
rc = CloneXnu(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
rc = CloneXnu(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (IsFreebsd()) {
rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (IsNetbsd()) {
rc = CloneNetbsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
rc = CloneNetbsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (IsOpenbsd()) {
rc = CloneOpenbsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
rc = CloneOpenbsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (IsWindows()) {
rc = CloneWindows(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
rc = CloneWindows(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else {
rc = enosys();
}
@ -647,8 +640,8 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
*ptid = rc;
}
STRACE("clone(%t, %p, %'zu, %#x, %p, %p, %p, %'zu, %p) → %d% m", func, stk,
stksz, flags, arg, ptid, tls, tlssz, ctid, rc);
STRACE("clone(%t, %p, %'zu, %#x, %p, %p, %p, %p) → %d% m", func, stk, stksz,
flags, arg, ptid, tls, ctid, rc);
return rc;
}

View file

@ -16,6 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/bits/bits.h"
#include "libc/calls/calls.h"
#include "libc/calls/strace.internal.h"
#include "libc/calls/syscall-sysv.internal.h"
@ -45,11 +46,16 @@
#define _TLDZ ((intptr_t)_tdata_size)
#define _TIBZ sizeof(struct cthread_descriptor_t)
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
__msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;
extern unsigned char __tls_mov_nt_rax[];
extern unsigned char __tls_add_nt_rax[];
/**
* Enables thread local storage.
*/
privileged void __enable_tls(void) {
if (__tls_enabled) return;
STRACE("__enable_tls()");
@ -111,98 +117,93 @@ privileged void __enable_tls(void) {
: "rcx", "r11", "memory");
}
/*
* We need to rewrite SysV _Thread_local code. You MUST use the
* -mno-tls-direct-seg-refs flag which generates code like this
*
* 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
* 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R
*
* Which on Mac we can replace with this:
*
* 65 48 8b 0R4 25 30 00 00 00 mov %gs:0x30,%R
*
* Whereas on Windows we'll replace it with this:
*
* 0f 1f 40 00 fatnop4
* e8 xx xx xx xx call __tls_mov_nt_%R
*
* Since we have no idea where the TLS instructions exist in the
* binary, we need to disassemble the whole program image. This'll
* potentially take a few milliseconds for some larger programs.
*
* We check `_tls_content` which is generated by the linker script
* since it lets us determine ahead of time if _Thread_local vars
* have actually been linked into this program.
*
* TODO(jart): compute probability this is just overkill
*/
// We need to rewrite SysV _Thread_local code. You MUST use the
// -mno-tls-direct-seg-refs flag which generates code like this
//
// 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
// 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R
//
// Which on Mac we can replace with this:
//
// 65 48 8b 0R4 25 30 00 00 00 mov %gs:0x30,%R
//
// Whereas on Windows we'll replace it with this, where `i` is
// the slot index stored in `__tls_index`:
//
// 65 48 8b 0R4 25 xx xx xx xx mov %gs:0x1480+i*8,%R
//
// Since we have no idea where the TLS instructions exist in the
// binary, we need to disassemble the whole program image. This'll
// potentially take a few milliseconds for some larger programs.
//
// We check `_tls_content` which is generated by the linker script
// since it lets us determine ahead of time if _Thread_local vars
// have actually been linked into this program.
if ((intptr_t)_tls_content && (IsWindows() || IsXnu())) {
int n, reg, dis;
int n;
uint64_t w;
unsigned m, dis;
unsigned char *p;
const unsigned char *impl;
struct XedDecodedInst xedd;
__morph_begin();
// The most expensive part of this process is we need to compute the
// byte length of each instruction in our program. We'll use Intel's
// disassembler for this purpose.
for (p = _ereal; p < __privileged_start; p += n) {
xed_decoded_inst_zero_set_mode(&xedd, XED_MACHINE_MODE_LONG_64);
if (!xed_instruction_length_decode(&xedd, p, 15)) {
if (IsXnu()) {
// Apple is quite straightforward to patch. We basically
// just change the segment register, and the linear slot
// address 0x30 was promised to us, according to Go team
// https://github.com/golang/go/issues/23617
dis = 0x30;
} else {
// MSVC __declspec(thread) generates binary code for this
// %gs:0x1480 abi. So long as TlsAlloc() isn't called >64
// times we should be good.
dis = 0x1480 + __tls_index * 8;
}
// We now know p[0] is most likely the first byte of an x86 op.
// Let's check and see if it's the GCC linear TIB address load.
// We hope and pray GCC won't generate TLS stores to %r8..%r15.
if (xedd.length == 9 && //
0144 == p[0] && // fs
0110 == p[1] && // rex.w (64-bit operand size)
(0213 == p[2] || // mov reg/mem → reg (word-sized)
0003 == p[2]) && // add reg/mem → reg (word-sized)
0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
0000 == p[5] && // displacement (von Neumann endian)
0000 == p[6] && // displacement
0000 == p[7] && // displacement
0000 == p[8]) { // displacement
// iterate over modifiable code looking for 9 byte instruction
// this would take 30 ms using xed to enable tls on python.com
for (p = _ereal; p + 9 <= __privileged_start; p += n) {
// Apple is quite straightforward to patch. We basically
// just change the segment register, and the linear slot
if (IsXnu()) {
p[0] = 0145; // this changes fs segment to gs segment
p[5] = 0x30; // tib slot index for tib linear address
}
// Windows is kind of complicated. We need to replace the
// segment mov instruction with a function call, that (a)
// won't clobber registers, and (b) has a return register
// that's the same as the mov destination. When setting
// function displacement, &CALL+5+DISP must equal &FUNC.
else {
if (p[2] == 3) {
impl = __tls_add_nt_rax;
} else {
impl = __tls_mov_nt_rax;
}
reg = (p[3] & 070) >> 3;
dis = (impl + reg * 18) - (p + 9);
p[0] = 0017; // map1
p[1] = 0037; // nopl (only if reg=0)
p[2] = 0100; // mod/rm (%rax)+disp8
p[3] = 0000; // displacement
p[4] = 0350; // call
p[5] = (dis & 0x000000ff) >> 000; // displacement
p[6] = (dis & 0x0000ff00) >> 010; // displacement
p[7] = (dis & 0x00ff0000) >> 020; // displacement
p[8] = (dis & 0xff000000) >> 030; // displacement
}
// use sse to zoom zoom to fs register prefixes
// that way it'll take 1 ms to morph python.com
while (p + 9 + 16 <= __privileged_start) {
if ((m = __builtin_ia32_pmovmskb128(
*(xmm_t *)p == (xmm_t){0144, 0144, 0144, 0144, 0144, 0144,
0144, 0144, 0144, 0144, 0144, 0144,
0144, 0144, 0144, 0144}))) {
m = __builtin_ctzll(m);
p += m;
break;
} else {
p += 16;
}
}
// Move to the next instruction.
n = xedd.length;
// we're checking for the following expression:
// 0144 == p[0] && // fs
// 0110 == p[1] && // rex.w (64-bit operand size)
// (0213 == p[2] || // mov reg/mem → reg (word-sized)
// 0003 == p[2]) && // add reg/mem → reg (word-sized)
// 0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
// 0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
// 0000 == p[5] && // displacement (von Neumann endian)
// 0000 == p[6] && // displacement
// 0000 == p[7] && // displacement
// 0000 == p[8] // displacement
w = READ64LE(p) & READ64LE("\377\377\377\307\377\377\377\377");
if ((w == READ64LE("\144\110\213\004\045\000\000\000") ||
w == READ64LE("\144\110\003\004\045\000\000\000")) &&
!p[8]) {
// now change the code
p[0] = 0145; // this changes fs segment to gs segment
p[5] = (dis & 0x000000ff) >> 000; // displacement
p[6] = (dis & 0x0000ff00) >> 010; // displacement
p[7] = (dis & 0x00ff0000) >> 020; // displacement
p[8] = (dis & 0xff000000) >> 030; // displacement
// advance to the next instruction
n = 9;
} else {
// If Xed failed to decode the instruction, then we'll just plow
// through memory one byte at a time until Xed's morale improves
n = 1;
}
}

View file

@ -72,7 +72,7 @@ privileged void ftracer(void) {
long stackuse;
struct FtraceTls *ft;
struct StackFrame *sf;
ft = (struct FtraceTls *)(__get_tls_inline() + 0x08);
ft = (struct FtraceTls *)(__get_tls_privileged() + 0x08);
if (_cmpxchg(&ft->once, false, true)) {
ft->lastaddr = -1;
ft->skew = GetNestingLevelImpl(__builtin_frame_address(0));