Simplify TLS and reduce startup latency

This change simplifies the thread-local storage support code. On Windows
and Mac OS X the startup latency of __enable_tls() has been reduced from
30ms to 1ms. On Windows, TLS memory accesses are now much faster thanks
to improved self-modifying code, which avoids a function call and loads
the thread information block pointer in a single instruction.
This commit is contained in:
Justine Tunney 2022-07-18 03:33:32 -07:00
parent 38c3fa63fe
commit b1d9d11be1
15 changed files with 136 additions and 312 deletions

View file

@ -67,7 +67,7 @@ int chdir(const char *);
int chmod(const char *, uint32_t); int chmod(const char *, uint32_t);
int chown(const char *, uint32_t, uint32_t); int chown(const char *, uint32_t, uint32_t);
int chroot(const char *); int chroot(const char *);
int clone(void *, void *, size_t, int, void *, int *, void *, size_t, int *); int clone(void *, void *, size_t, int, void *, int *, void *, int *);
int close(int); int close(int);
int creat(const char *, uint32_t); int creat(const char *, uint32_t);
int dup(int); int dup(int);

View file

@ -50,10 +50,10 @@
* @asyncsignalsafe * @asyncsignalsafe
* @threadsafe * @threadsafe
*/ */
privileged int gettid(void) { int gettid(void) {
int tid; int tid;
if (__tls_enabled) { if (__tls_enabled) {
tid = *(int *)(__get_tls_inline() + 0x38); tid = *(int *)(__get_tls() + 0x38);
if (tid > 0) return tid; if (tid > 0) return tid;
} }
return sys_gettid(); return sys_gettid();

View file

@ -314,7 +314,7 @@ privileged static size_t kformat(char *b, size_t n, const char *fmt,
if (!__tls_enabled) { if (!__tls_enabled) {
x = __pid; x = __pid;
} else { } else {
x = *(int *)(__get_tls_inline() + 0x38); x = *(int *)(__get_tls_privileged() + 0x38);
} }
} else { } else {
x = 666; x = 666;
@ -395,8 +395,7 @@ privileged static size_t kformat(char *b, size_t n, const char *fmt,
i = 0; i = 0;
m = (1 << base) - 1; m = (1 << base) - 1;
if (hash && x) sign = hash; if (hash && x) sign = hash;
do do z[i++ & 127] = abet[x & m];
z[i++ & 127] = abet[x & m];
while ((x >>= base) || (pdot && i < prec)); while ((x >>= base) || (pdot && i < prec));
goto EmitNumber; goto EmitNumber;

View file

@ -5,10 +5,27 @@
#if !(__ASSEMBLER__ + __LINKER__ + 0) #if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_ COSMOPOLITAN_C_START_
char *__get_tls(void) libcesque nosideeffect; #if defined(__GNUC__) && defined(__x86_64__) && !defined(__STRICT_ANSI__)
/**
* Returns location of thread information block.
*
* This can't be used in privileged functions.
*/
static noasan inline char *__get_tls(void) {
char *tib;
asm("mov\t%%fs:0,%0" : "=r"(tib) : /* no inputs */ : "memory");
return tib;
}
#endif /* GNU x86-64 */
#if defined(__GNUC__) && defined(__x86_64__) && !defined(__STRICT_ANSI__) #if defined(__GNUC__) && defined(__x86_64__) && !defined(__STRICT_ANSI__)
static noasan inline char *__get_tls_inline(void) { /**
* Returns location of thread information block.
*
* This should be favored over __get_tls() for .privileged code that
* can't be self-modified by __enable_tls().
*/
static noasan inline char *__get_tls_privileged(void) {
char *tib, *lin = (char *)0x30; char *tib, *lin = (char *)0x30;
if (IsLinux() || IsFreebsd() || IsNetbsd() || IsOpenbsd()) { if (IsLinux() || IsFreebsd() || IsNetbsd() || IsOpenbsd()) {
asm("mov\t%%fs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory"); asm("mov\t%%fs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory");

View file

@ -113,8 +113,7 @@ WinThreadEntry(int rdi, // rcx
static textwindows int CloneWindows(int (*func)(void *, int), char *stk, static textwindows int CloneWindows(int (*func)(void *, int), char *stk,
size_t stksz, int flags, void *arg, size_t stksz, int flags, void *arg,
void *tls, size_t tlssz, int *ptid, void *tls, int *ptid, int *ctid) {
int *ctid) {
int64_t h; int64_t h;
struct CloneArgs *wt; struct CloneArgs *wt;
wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) - wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) -
@ -193,7 +192,7 @@ XnuThreadMain(void *pthread, // rdi
} }
static int CloneXnu(int (*fn)(void *), char *stk, size_t stksz, int flags, static int CloneXnu(int (*fn)(void *), char *stk, size_t stksz, int flags,
void *arg, void *tls, size_t tlssz, int *ptid, int *ctid) { void *arg, void *tls, int *ptid, int *ctid) {
int rc; int rc;
bool failed; bool failed;
static bool once; static bool once;
@ -244,8 +243,7 @@ static wontreturn void FreebsdThreadMain(void *p) {
} }
static int CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz, static int CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz, int flags, void *arg, void *tls, int *ptid, int *ctid) {
int *ptid, int *ctid) {
int ax; int ax;
bool failed; bool failed;
int64_t tid; int64_t tid;
@ -265,7 +263,7 @@ static int CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
.stack_base = stk, .stack_base = stk,
.stack_size = (((intptr_t)wt - (intptr_t)stk) & -16) - 8, .stack_size = (((intptr_t)wt - (intptr_t)stk) & -16) - 8,
.tls_base = flags & CLONE_SETTLS ? tls : 0, .tls_base = flags & CLONE_SETTLS ? tls : 0,
.tls_size = flags & CLONE_SETTLS ? tlssz : 0, .tls_size = 64,
.child_tid = &wt->tid64, .child_tid = &wt->tid64,
.parent_tid = &tid, .parent_tid = &tid,
}; };
@ -319,8 +317,7 @@ noasan static wontreturn void OpenbsdThreadMain(void *p) {
} }
static int CloneOpenbsd(int (*func)(void *, int), char *stk, size_t stksz, static int CloneOpenbsd(int (*func)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz, int flags, void *arg, void *tls, int *ptid, int *ctid) {
int *ptid, int *ctid) {
int tid; int tid;
intptr_t sp; intptr_t sp;
struct __tfork *tf; struct __tfork *tf;
@ -373,8 +370,7 @@ static wontreturn void NetbsdThreadMain(void *arg, // rdi
} }
static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz, static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz, int *ptid, int flags, void *arg, void *tls, int *ptid, int *ctid) {
int *ctid) {
// NetBSD has its own clone() and it works, but it's technically a // NetBSD has its own clone() and it works, but it's technically a
// second-class API, intended to help Linux folks migrate to this. // second-class API, intended to help Linux folks migrate to this.
bool failed; bool failed;
@ -465,8 +461,7 @@ int sys_clone_linux(int flags, // rdi
void *arg); // 8(rsp) void *arg); // 8(rsp)
static int CloneLinux(int (*func)(void *arg, int tid), char *stk, size_t stksz, static int CloneLinux(int (*func)(void *arg, int tid), char *stk, size_t stksz,
int flags, void *arg, void *tls, size_t tlssz, int *ptid, int flags, void *arg, void *tls, int *ptid, int *ctid) {
int *ctid) {
long sp; long sp;
sp = (intptr_t)(stk + stksz); sp = (intptr_t)(stk + stksz);
if (~flags & CLONE_CHILD_SETTID) { if (~flags & CLONE_CHILD_SETTID) {
@ -589,14 +584,13 @@ static int CloneLinux(int (*func)(void *arg, int tid), char *stk, size_t stksz,
* @param arg is passed as an argument to `func` in the child thread * @param arg is passed as an argument to `func` in the child thread
* @param tls may be used to set the thread local storage segment; * @param tls may be used to set the thread local storage segment;
* this parameter is ignored if `CLONE_SETTLS` is not set * this parameter is ignored if `CLONE_SETTLS` is not set
* @param tlssz is the size of tls in bytes which must be at least 64
* @param ctid lets the child receive its thread id without having to * @param ctid lets the child receive its thread id without having to
* call gettid() and is ignored if `CLONE_CHILD_SETTID` isn't set * call gettid() and is ignored if `CLONE_CHILD_SETTID` isn't set
* @return tid of child on success, or -1 w/ errno * @return tid of child on success, or -1 w/ errno
* @threadsafe * @threadsafe
*/ */
int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid, int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
void *tls, size_t tlssz, int *ctid) { void *tls, int *ctid) {
int rc; int rc;
struct CloneArgs *wt; struct CloneArgs *wt;
@ -606,13 +600,12 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
if (!func) { if (!func) {
rc = einval(); rc = einval();
} else if (!IsTiny() && } else if (!IsTiny() &&
(((flags & CLONE_VM) && (stksz < PAGESIZE || (stksz & 15))) || ((flags & CLONE_VM) && (stksz < PAGESIZE || (stksz & 15)))) {
((flags & CLONE_SETTLS) && (tlssz < 64 || (tlssz & 7))))) {
rc = einval(); rc = einval();
} else if (IsAsan() && } else if (IsAsan() &&
((stksz > PAGESIZE && ((stksz > PAGESIZE &&
!__asan_is_valid((char *)stk + PAGESIZE, stksz - PAGESIZE)) || !__asan_is_valid((char *)stk + PAGESIZE, stksz - PAGESIZE)) ||
((flags & CLONE_SETTLS) && !__asan_is_valid(tls, tlssz)) || ((flags & CLONE_SETTLS) && !__asan_is_valid(tls, 64)) ||
((flags & CLONE_SETTLS) && !__asan_is_valid(tls, sizeof(long))) || ((flags & CLONE_SETTLS) && !__asan_is_valid(tls, sizeof(long))) ||
((flags & CLONE_PARENT_SETTID) && ((flags & CLONE_PARENT_SETTID) &&
!__asan_is_valid(ptid, sizeof(*ptid))) || !__asan_is_valid(ptid, sizeof(*ptid))) ||
@ -620,7 +613,7 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
!__asan_is_valid(ctid, sizeof(*ctid))))) { !__asan_is_valid(ctid, sizeof(*ctid))))) {
rc = efault(); rc = efault();
} else if (IsLinux()) { } else if (IsLinux()) {
rc = CloneLinux(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid); rc = CloneLinux(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (!IsTiny() && } else if (!IsTiny() &&
(flags & ~(CLONE_SETTLS | CLONE_PARENT_SETTID | (flags & ~(CLONE_SETTLS | CLONE_PARENT_SETTID |
CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)) != CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)) !=
@ -629,15 +622,15 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
STRACE("clone flag unsupported on this platform"); STRACE("clone flag unsupported on this platform");
rc = einval(); rc = einval();
} else if (IsXnu()) { } else if (IsXnu()) {
rc = CloneXnu(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid); rc = CloneXnu(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (IsFreebsd()) { } else if (IsFreebsd()) {
rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid); rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (IsNetbsd()) { } else if (IsNetbsd()) {
rc = CloneNetbsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid); rc = CloneNetbsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (IsOpenbsd()) { } else if (IsOpenbsd()) {
rc = CloneOpenbsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid); rc = CloneOpenbsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else if (IsWindows()) { } else if (IsWindows()) {
rc = CloneWindows(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid); rc = CloneWindows(func, stk, stksz, flags, arg, tls, ptid, ctid);
} else { } else {
rc = enosys(); rc = enosys();
} }
@ -647,8 +640,8 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
*ptid = rc; *ptid = rc;
} }
STRACE("clone(%t, %p, %'zu, %#x, %p, %p, %p, %'zu, %p) → %d% m", func, stk, STRACE("clone(%t, %p, %'zu, %#x, %p, %p, %p, %p) → %d% m", func, stk, stksz,
stksz, flags, arg, ptid, tls, tlssz, ctid, rc); flags, arg, ptid, tls, ctid, rc);
return rc; return rc;
} }

View file

@ -16,6 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE. PERFORMANCE OF THIS SOFTWARE.
*/ */
#include "libc/bits/bits.h"
#include "libc/calls/calls.h" #include "libc/calls/calls.h"
#include "libc/calls/strace.internal.h" #include "libc/calls/strace.internal.h"
#include "libc/calls/syscall-sysv.internal.h" #include "libc/calls/syscall-sysv.internal.h"
@ -45,11 +46,16 @@
#define _TLDZ ((intptr_t)_tdata_size) #define _TLDZ ((intptr_t)_tdata_size)
#define _TIBZ sizeof(struct cthread_descriptor_t) #define _TIBZ sizeof(struct cthread_descriptor_t)
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
__msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc; __msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;
extern unsigned char __tls_mov_nt_rax[]; extern unsigned char __tls_mov_nt_rax[];
extern unsigned char __tls_add_nt_rax[]; extern unsigned char __tls_add_nt_rax[];
/**
* Enables thread local storage.
*/
privileged void __enable_tls(void) { privileged void __enable_tls(void) {
if (__tls_enabled) return; if (__tls_enabled) return;
STRACE("__enable_tls()"); STRACE("__enable_tls()");
@ -111,98 +117,93 @@ privileged void __enable_tls(void) {
: "rcx", "r11", "memory"); : "rcx", "r11", "memory");
} }
/* // We need to rewrite SysV _Thread_local code. You MUST use the
* We need to rewrite SysV _Thread_local code. You MUST use the // -mno-tls-direct-seg-refs flag which generates code like this
* -mno-tls-direct-seg-refs flag which generates code like this //
* // 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
* 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R // 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R
* 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R //
* // Which on Mac we can replace with this:
* Which on Mac we can replace with this: //
* // 65 48 8b 0R4 25 30 00 00 00 mov %gs:0x30,%R
* 65 48 8b 0R4 25 30 00 00 00 mov %gs:0x30,%R //
* // Whereas on Windows we'll replace it with this:
* Whereas on Windows we'll replace it with this: //
* // 0f 1f 40 00 fatnop4
* 0f 1f 40 00 fatnop4 // e8 xx xx xx xx call __tls_mov_nt_%R
* e8 xx xx xx xx call __tls_mov_nt_%R //
* // Since we have no idea where the TLS instructions exist in the
* Since we have no idea where the TLS instructions exist in the // binary, we need to disassemble the whole program image. This'll
* binary, we need to disassemble the whole program image. This'll // potentially take a few milliseconds for some larger programs.
* potentially take a few milliseconds for some larger programs. //
* // We check `_tls_content` which is generated by the linker script
* We check `_tls_content` which is generated by the linker script // since it lets us determine ahead of time if _Thread_local vars
* since it lets us determine ahead of time if _Thread_local vars // have actually been linked into this program.
* have actually been linked into this program.
*
* TODO(jart): compute probability this is just overkill
*/
if ((intptr_t)_tls_content && (IsWindows() || IsXnu())) { if ((intptr_t)_tls_content && (IsWindows() || IsXnu())) {
int n, reg, dis; int n;
uint64_t w;
unsigned m, dis;
unsigned char *p; unsigned char *p;
const unsigned char *impl;
struct XedDecodedInst xedd;
__morph_begin(); __morph_begin();
// The most expensive part of this process is we need to compute the if (IsXnu()) {
// byte length of each instruction in our program. We'll use Intel's // Apple is quite straightforward to patch. We basically
// disassembler for this purpose. // just change the segment register, and the linear slot
for (p = _ereal; p < __privileged_start; p += n) { // address 0x30 was promised to us, according to Go team
xed_decoded_inst_zero_set_mode(&xedd, XED_MACHINE_MODE_LONG_64); // https://github.com/golang/go/issues/23617
if (!xed_instruction_length_decode(&xedd, p, 15)) { dis = 0x30;
} else {
// MSVC __declspec(thread) generates binary code for this
// %gs:0x1480 abi. So long as TlsAlloc() isn't called >64
// times we should be good.
dis = 0x1480 + __tls_index * 8;
}
// We now know p[0] is most likely the first byte of an x86 op. // iterate over modifiable code looking for 9 byte instruction
// Let's check and see if it's the GCC linear TIB address load. // this would take 30 ms using xed to enable tls on python.com
// We hope and pray GCC won't generate TLS stores to %r8..%r15. for (p = _ereal; p + 9 <= __privileged_start; p += n) {
if (xedd.length == 9 && //
0144 == p[0] && // fs
0110 == p[1] && // rex.w (64-bit operand size)
(0213 == p[2] || // mov reg/mem → reg (word-sized)
0003 == p[2]) && // add reg/mem → reg (word-sized)
0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
0000 == p[5] && // displacement (von Neumann endian)
0000 == p[6] && // displacement
0000 == p[7] && // displacement
0000 == p[8]) { // displacement
// Apple is quite straightforward to patch. We basically // use sse to zoom zoom to fs register prefixes
// just change the segment register, and the linear slot // that way it'll take 1 ms to morph python.com
if (IsXnu()) { while (p + 9 + 16 <= __privileged_start) {
p[0] = 0145; // this changes gs segment to fs segment if ((m = __builtin_ia32_pmovmskb128(
p[5] = 0x30; // tib slot index for tib linear address *(xmm_t *)p == (xmm_t){0144, 0144, 0144, 0144, 0144, 0144,
} 0144, 0144, 0144, 0144, 0144, 0144,
0144, 0144, 0144, 0144}))) {
// Windows is kind of complicated. We need to replace the m = __builtin_ctzll(m);
// segment mov instruction with a function call, that (a) p += m;
// won't clobber registers, and (b) has a return register break;
// that's the same as the mov destination. When setting } else {
// function displacement, &CALL+5+DISP must equal &FUNC. p += 16;
else {
if (p[2] == 3) {
impl = __tls_add_nt_rax;
} else {
impl = __tls_mov_nt_rax;
}
reg = (p[3] & 070) >> 3;
dis = (impl + reg * 18) - (p + 9);
p[0] = 0017; // map1
p[1] = 0037; // nopl (only if reg=0)
p[2] = 0100; // mod/rm (%rax)+disp8
p[3] = 0000; // displacement
p[4] = 0350; // call
p[5] = (dis & 0x000000ff) >> 000; // displacement
p[6] = (dis & 0x0000ff00) >> 010; // displacement
p[7] = (dis & 0x00ff0000) >> 020; // displacement
p[8] = (dis & 0xff000000) >> 030; // displacement
}
} }
}
// Move to the next instruction. // we're checking for the following expression:
n = xedd.length; // 0144 == p[0] && // fs
// 0110 == p[1] && // rex.w (64-bit operand size)
// (0213 == p[2] || // mov reg/mem → reg (word-sized)
// 0003 == p[2]) && // add reg/mem → reg (word-sized)
// 0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
// 0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
// 0000 == p[5] && // displacement (von Neumann endian)
// 0000 == p[6] && // displacement
// 0000 == p[7] && // displacement
// 0000 == p[8] // displacement
w = READ64LE(p) & READ64LE("\377\377\377\307\377\377\377\377");
if ((w == READ64LE("\144\110\213\004\045\000\000\000") ||
w == READ64LE("\144\110\003\004\045\000\000\000")) &&
!p[8]) {
// now change the code
p[0] = 0145; // this changes fs segment to gs segment
p[5] = (dis & 0x000000ff) >> 000; // displacement
p[6] = (dis & 0x0000ff00) >> 010; // displacement
p[7] = (dis & 0x00ff0000) >> 020; // displacement
p[8] = (dis & 0xff000000) >> 030; // displacement
// advance to the next instruction
n = 9;
} else { } else {
// If Xed failed to decode the instruction, then we'll just plow
// through memory one byte at a time until Xed's morale improves
n = 1; n = 1;
} }
} }

View file

@ -72,7 +72,7 @@ privileged void ftracer(void) {
long stackuse; long stackuse;
struct FtraceTls *ft; struct FtraceTls *ft;
struct StackFrame *sf; struct StackFrame *sf;
ft = (struct FtraceTls *)(__get_tls_inline() + 0x08); ft = (struct FtraceTls *)(__get_tls_privileged() + 0x08);
if (_cmpxchg(&ft->once, false, true)) { if (_cmpxchg(&ft->once, false, true)) {
ft->lastaddr = -1; ft->lastaddr = -1;
ft->skew = GetNestingLevelImpl(__builtin_frame_address(0)); ft->skew = GetNestingLevelImpl(__builtin_frame_address(0));

View file

@ -28,5 +28,5 @@
*/ */
privileged nocallersavedregisters errno_t *(__errno_location)(void) { privileged nocallersavedregisters errno_t *(__errno_location)(void) {
if (!__tls_enabled) return &__errno; if (!__tls_enabled) return &__errno;
return (errno_t *)(__get_tls_inline() + 0x3c); return (errno_t *)(__get_tls_privileged() + 0x3c);
} }

View file

@ -1,33 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2022 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/nexgen32e/gettls.h"
#include "libc/nexgen32e/threaded.h"
/**
* Returns address of thread information block.
*
* This is the out-of-line counterpart of __get_tls_inline(): the body
* does nothing except forward to that helper and return its result.
*
* This function must not be called until TLS is initialized.
*
* @return whatever pointer __get_tls_inline() yields
* @see __get_tls_inline()
* @see __install_tls()
* @see _spawn()
*/
optimizespeed char *__get_tls(void) {
return __get_tls_inline();
}

View file

@ -39,8 +39,6 @@ LIBC_SYSV_A_FILES := \
libc/sysv/systemfive.S \ libc/sysv/systemfive.S \
libc/sysv/errno_location.greg.c \ libc/sysv/errno_location.greg.c \
libc/sysv/errno.c \ libc/sysv/errno.c \
libc/sysv/gettls.greg.c \
libc/sysv/tlspolyfill.S \
libc/sysv/errfun.S \ libc/sysv/errfun.S \
libc/sysv/strace.greg.c \ libc/sysv/strace.greg.c \
libc/sysv/describeos.greg.c \ libc/sysv/describeos.greg.c \

View file

@ -1,156 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2022 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Code morphing TLS polyfills for The New Technology.
//
// Each routine loads the 8-byte slot at `%gs:0x1480 + __tls_index*8`
// into the register named by its suffix. One scratch register holds
// the index; it is pushed before and popped after the load so only
// the destination register changes.
//
// The patcher in __enable_tls() addresses a routine as
// `__tls_mov_nt_rax + reg * 18`, so the order and size of these
// functions must not change.
//
// @note msvc generates this code so it's stable
// @note func ordering follows x86 reg encoding
// @note each function is exactly 18 bytes
// @see __enable_tls()
__tls_mov_nt_rax:
// %rax is the destination, so %rcx is used as the scratch register
push %rcx
mov __tls_index(%rip),%ecx
mov %gs:0x1480(,%rcx,8),%rax
pop %rcx
ret
.endfn __tls_mov_nt_rax,globl,hidden
__tls_mov_nt_rcx:
// all remaining variants use %rax as the scratch register
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rcx
pop %rax
ret
.endfn __tls_mov_nt_rcx
__tls_mov_nt_rdx:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rdx
pop %rax
ret
.endfn __tls_mov_nt_rdx
__tls_mov_nt_rbx:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rbx
pop %rax
ret
.endfn __tls_mov_nt_rbx
__tls_mov_nt_rsp:
// NOTE(review): the pop/ret below run after %rsp has been overwritten;
// presumably this entry exists only to keep the reg*18 stride — confirm
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rsp
pop %rax
ret
.endfn __tls_mov_nt_rsp
__tls_mov_nt_rbp:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rbp
pop %rax
ret
.endfn __tls_mov_nt_rbp
__tls_mov_nt_rsi:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rsi
pop %rax
ret
.endfn __tls_mov_nt_rsi
__tls_mov_nt_rdi:
push %rax
mov __tls_index(%rip),%eax
mov %gs:0x1480(,%rax,8),%rdi
pop %rax
ret
.endfn __tls_mov_nt_rdi
////////////////////////////////////////////////////////////////////////////////
// Add variants: each routine adds the 8-byte slot at
// `%gs:0x1480 + __tls_index*8` to the register named by its suffix,
// rather than overwriting it. The scratch register holding the index
// is pushed before and popped after, so only the destination changes.
// The patcher in __enable_tls() addresses a routine as
// `__tls_add_nt_rax + reg * 18`; keep the order and size unchanged.
__tls_add_nt_rax:
// %rax is the destination, so %rcx is used as the scratch register
push %rcx
mov __tls_index(%rip),%ecx
add %gs:0x1480(,%rcx,8),%rax
pop %rcx
ret
.endfn __tls_add_nt_rax,globl,hidden
__tls_add_nt_rcx:
// all remaining variants use %rax as the scratch register
push %rax
mov __tls_index(%rip),%eax
add %gs:0x1480(,%rax,8),%rcx
pop %rax
ret
.endfn __tls_add_nt_rcx
__tls_add_nt_rdx:
push %rax
mov __tls_index(%rip),%eax
add %gs:0x1480(,%rax,8),%rdx
pop %rax
ret
.endfn __tls_add_nt_rdx
__tls_add_nt_rbx:
push %rax
mov __tls_index(%rip),%eax
add %gs:0x1480(,%rax,8),%rbx
pop %rax
ret
.endfn __tls_add_nt_rbx
__tls_add_nt_rsp:
// NOTE(review): the pop/ret below run after %rsp has been modified;
// presumably this entry exists only to keep the reg*18 stride — confirm
push %rax
mov __tls_index(%rip),%eax
add %gs:0x1480(,%rax,8),%rsp
pop %rax
ret
.endfn __tls_add_nt_rsp
__tls_add_nt_rbp:
push %rax
mov __tls_index(%rip),%eax
add %gs:0x1480(,%rax,8),%rbp
pop %rax
ret
.endfn __tls_add_nt_rbp
__tls_add_nt_rsi:
push %rax
mov __tls_index(%rip),%eax
add %gs:0x1480(,%rax,8),%rsi
pop %rax
ret
.endfn __tls_add_nt_rsi
__tls_add_nt_rdi:
push %rax
mov __tls_index(%rip),%eax
add %gs:0x1480(,%rax,8),%rdi
pop %rax
ret
.endfn __tls_add_nt_rdi

View file

@ -26,5 +26,5 @@ STATIC_YOINK("_main_thread_ctor");
* Returns thread descriptor of the current thread. * Returns thread descriptor of the current thread.
*/ */
cthread_t(cthread_self)(void) { cthread_t(cthread_self)(void) {
return (cthread_t)__get_tls_inline(); return (cthread_t)__get_tls();
} }

View file

@ -107,7 +107,7 @@ int _spawn(int fun(void *, int), void *arg, struct spawn *opt_out_thread) {
CLONE_VM | CLONE_THREAD | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_VM | CLONE_THREAD | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID | CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
CLONE_CHILD_CLEARTID, CLONE_CHILD_CLEARTID,
arg, &th->ptid, th->tib, _TIBZ, th->ctid) == -1) { arg, &th->ptid, th->tib, th->ctid) == -1) {
_freestack(th->stk); _freestack(th->stk);
free(th->tls); free(th->tls);
return -1; return -1;

View file

@ -17,6 +17,7 @@
PERFORMANCE OF THIS SOFTWARE. PERFORMANCE OF THIS SOFTWARE.
*/ */
#include "libc/calls/calls.h" #include "libc/calls/calls.h"
#include "libc/calls/struct/timespec.h"
#include "libc/dce.h" #include "libc/dce.h"
#include "libc/errno.h" #include "libc/errno.h"
#include "libc/intrin/kprintf.h" #include "libc/intrin/kprintf.h"
@ -31,6 +32,8 @@
#include "libc/runtime/runtime.h" #include "libc/runtime/runtime.h"
#include "libc/runtime/stack.h" #include "libc/runtime/stack.h"
#include "libc/runtime/symbols.internal.h" #include "libc/runtime/symbols.internal.h"
#include "libc/stdio/stdio.h"
#include "libc/sysv/consts/clock.h"
#include "libc/sysv/consts/clone.h" #include "libc/sysv/consts/clone.h"
#include "libc/sysv/consts/map.h" #include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/o.h" #include "libc/sysv/consts/o.h"
@ -165,6 +168,6 @@ BENCH(clone, bench) {
char *volatile tp; char *volatile tp;
errno_t *volatile ep; errno_t *volatile ep;
EZBENCH2("__errno_location", donothing, (ep = __errno_location())); EZBENCH2("__errno_location", donothing, (ep = __errno_location()));
EZBENCH2("__get_tls_inline", donothing, (tp = __get_tls_inline())); EZBENCH2("__get_tls_privileged", donothing, (tp = __get_tls_privileged()));
EZBENCH2("__get_tls", donothing, (tp = __get_tls())); EZBENCH2("__get_tls", donothing, (tp = __get_tls()));
} }

View file

@ -20,11 +20,13 @@
#include "libc/calls/struct/sched_param.h" #include "libc/calls/struct/sched_param.h"
#include "libc/dce.h" #include "libc/dce.h"
#include "libc/fmt/fmt.h" #include "libc/fmt/fmt.h"
#include "libc/intrin/kprintf.h"
#include "libc/intrin/spinlock.h" #include "libc/intrin/spinlock.h"
#include "libc/intrin/wait0.internal.h" #include "libc/intrin/wait0.internal.h"
#include "libc/math.h" #include "libc/math.h"
#include "libc/mem/mem.h" #include "libc/mem/mem.h"
#include "libc/runtime/gc.internal.h" #include "libc/runtime/gc.internal.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/stack.h" #include "libc/runtime/stack.h"
#include "libc/stdio/stdio.h" #include "libc/stdio/stdio.h"
#include "libc/sysv/consts/clone.h" #include "libc/sysv/consts/clone.h"