mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-03-03 07:29:23 +00:00
Simplify TLS and reduce startup latency
This change simplifies the thread-local storage support code. On Windows and Mac OS X, the startup latency of __enable_tls() has been reduced from 30ms to 1ms. On Windows, TLS memory accesses are now much faster thanks to improved self-modifying code that avoids a function call and loads our thread information block pointer in a single instruction.
This commit is contained in:
parent
38c3fa63fe
commit
b1d9d11be1
15 changed files with 136 additions and 312 deletions
|
@ -67,7 +67,7 @@ int chdir(const char *);
|
||||||
int chmod(const char *, uint32_t);
|
int chmod(const char *, uint32_t);
|
||||||
int chown(const char *, uint32_t, uint32_t);
|
int chown(const char *, uint32_t, uint32_t);
|
||||||
int chroot(const char *);
|
int chroot(const char *);
|
||||||
int clone(void *, void *, size_t, int, void *, int *, void *, size_t, int *);
|
int clone(void *, void *, size_t, int, void *, int *, void *, int *);
|
||||||
int close(int);
|
int close(int);
|
||||||
int creat(const char *, uint32_t);
|
int creat(const char *, uint32_t);
|
||||||
int dup(int);
|
int dup(int);
|
||||||
|
|
|
@ -50,10 +50,10 @@
|
||||||
* @asyncsignalsafe
|
* @asyncsignalsafe
|
||||||
* @threadsafe
|
* @threadsafe
|
||||||
*/
|
*/
|
||||||
privileged int gettid(void) {
|
int gettid(void) {
|
||||||
int tid;
|
int tid;
|
||||||
if (__tls_enabled) {
|
if (__tls_enabled) {
|
||||||
tid = *(int *)(__get_tls_inline() + 0x38);
|
tid = *(int *)(__get_tls() + 0x38);
|
||||||
if (tid > 0) return tid;
|
if (tid > 0) return tid;
|
||||||
}
|
}
|
||||||
return sys_gettid();
|
return sys_gettid();
|
||||||
|
|
|
@ -314,7 +314,7 @@ privileged static size_t kformat(char *b, size_t n, const char *fmt,
|
||||||
if (!__tls_enabled) {
|
if (!__tls_enabled) {
|
||||||
x = __pid;
|
x = __pid;
|
||||||
} else {
|
} else {
|
||||||
x = *(int *)(__get_tls_inline() + 0x38);
|
x = *(int *)(__get_tls_privileged() + 0x38);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
x = 666;
|
x = 666;
|
||||||
|
@ -395,8 +395,7 @@ privileged static size_t kformat(char *b, size_t n, const char *fmt,
|
||||||
i = 0;
|
i = 0;
|
||||||
m = (1 << base) - 1;
|
m = (1 << base) - 1;
|
||||||
if (hash && x) sign = hash;
|
if (hash && x) sign = hash;
|
||||||
do
|
do z[i++ & 127] = abet[x & m];
|
||||||
z[i++ & 127] = abet[x & m];
|
|
||||||
while ((x >>= base) || (pdot && i < prec));
|
while ((x >>= base) || (pdot && i < prec));
|
||||||
goto EmitNumber;
|
goto EmitNumber;
|
||||||
|
|
||||||
|
|
|
@ -5,10 +5,27 @@
|
||||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||||
COSMOPOLITAN_C_START_
|
COSMOPOLITAN_C_START_
|
||||||
|
|
||||||
char *__get_tls(void) libcesque nosideeffect;
|
#if defined(__GNUC__) && defined(__x86_64__) && !defined(__STRICT_ANSI__)
|
||||||
|
/**
|
||||||
|
* Returns location of thread information block.
|
||||||
|
*
|
||||||
|
* This can't be used in privileged functions.
|
||||||
|
*/
|
||||||
|
static noasan inline char *__get_tls(void) {
|
||||||
|
char *tib;
|
||||||
|
asm("mov\t%%fs:0,%0" : "=r"(tib) : /* no inputs */ : "memory");
|
||||||
|
return tib;
|
||||||
|
}
|
||||||
|
#endif /* GNU x86-64 */
|
||||||
|
|
||||||
#if defined(__GNUC__) && defined(__x86_64__) && !defined(__STRICT_ANSI__)
|
#if defined(__GNUC__) && defined(__x86_64__) && !defined(__STRICT_ANSI__)
|
||||||
static noasan inline char *__get_tls_inline(void) {
|
/**
|
||||||
|
* Returns location of thread information block.
|
||||||
|
*
|
||||||
|
* This should be favored over __get_tls() for .privileged code that
|
||||||
|
* can't be self-modified by __enable_tls().
|
||||||
|
*/
|
||||||
|
static noasan inline char *__get_tls_privileged(void) {
|
||||||
char *tib, *lin = (char *)0x30;
|
char *tib, *lin = (char *)0x30;
|
||||||
if (IsLinux() || IsFreebsd() || IsNetbsd() || IsOpenbsd()) {
|
if (IsLinux() || IsFreebsd() || IsNetbsd() || IsOpenbsd()) {
|
||||||
asm("mov\t%%fs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory");
|
asm("mov\t%%fs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory");
|
||||||
|
|
|
@ -113,8 +113,7 @@ WinThreadEntry(int rdi, // rcx
|
||||||
|
|
||||||
static textwindows int CloneWindows(int (*func)(void *, int), char *stk,
|
static textwindows int CloneWindows(int (*func)(void *, int), char *stk,
|
||||||
size_t stksz, int flags, void *arg,
|
size_t stksz, int flags, void *arg,
|
||||||
void *tls, size_t tlssz, int *ptid,
|
void *tls, int *ptid, int *ctid) {
|
||||||
int *ctid) {
|
|
||||||
int64_t h;
|
int64_t h;
|
||||||
struct CloneArgs *wt;
|
struct CloneArgs *wt;
|
||||||
wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) -
|
wt = (struct CloneArgs *)(((intptr_t)(stk + stksz) -
|
||||||
|
@ -193,7 +192,7 @@ XnuThreadMain(void *pthread, // rdi
|
||||||
}
|
}
|
||||||
|
|
||||||
static int CloneXnu(int (*fn)(void *), char *stk, size_t stksz, int flags,
|
static int CloneXnu(int (*fn)(void *), char *stk, size_t stksz, int flags,
|
||||||
void *arg, void *tls, size_t tlssz, int *ptid, int *ctid) {
|
void *arg, void *tls, int *ptid, int *ctid) {
|
||||||
int rc;
|
int rc;
|
||||||
bool failed;
|
bool failed;
|
||||||
static bool once;
|
static bool once;
|
||||||
|
@ -244,8 +243,7 @@ static wontreturn void FreebsdThreadMain(void *p) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static int CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
|
static int CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
|
||||||
int flags, void *arg, void *tls, size_t tlssz,
|
int flags, void *arg, void *tls, int *ptid, int *ctid) {
|
||||||
int *ptid, int *ctid) {
|
|
||||||
int ax;
|
int ax;
|
||||||
bool failed;
|
bool failed;
|
||||||
int64_t tid;
|
int64_t tid;
|
||||||
|
@ -265,7 +263,7 @@ static int CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
|
||||||
.stack_base = stk,
|
.stack_base = stk,
|
||||||
.stack_size = (((intptr_t)wt - (intptr_t)stk) & -16) - 8,
|
.stack_size = (((intptr_t)wt - (intptr_t)stk) & -16) - 8,
|
||||||
.tls_base = flags & CLONE_SETTLS ? tls : 0,
|
.tls_base = flags & CLONE_SETTLS ? tls : 0,
|
||||||
.tls_size = flags & CLONE_SETTLS ? tlssz : 0,
|
.tls_size = 64,
|
||||||
.child_tid = &wt->tid64,
|
.child_tid = &wt->tid64,
|
||||||
.parent_tid = &tid,
|
.parent_tid = &tid,
|
||||||
};
|
};
|
||||||
|
@ -319,8 +317,7 @@ noasan static wontreturn void OpenbsdThreadMain(void *p) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static int CloneOpenbsd(int (*func)(void *, int), char *stk, size_t stksz,
|
static int CloneOpenbsd(int (*func)(void *, int), char *stk, size_t stksz,
|
||||||
int flags, void *arg, void *tls, size_t tlssz,
|
int flags, void *arg, void *tls, int *ptid, int *ctid) {
|
||||||
int *ptid, int *ctid) {
|
|
||||||
int tid;
|
int tid;
|
||||||
intptr_t sp;
|
intptr_t sp;
|
||||||
struct __tfork *tf;
|
struct __tfork *tf;
|
||||||
|
@ -373,8 +370,7 @@ static wontreturn void NetbsdThreadMain(void *arg, // rdi
|
||||||
}
|
}
|
||||||
|
|
||||||
static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz,
|
static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz,
|
||||||
int flags, void *arg, void *tls, size_t tlssz, int *ptid,
|
int flags, void *arg, void *tls, int *ptid, int *ctid) {
|
||||||
int *ctid) {
|
|
||||||
// NetBSD has its own clone() and it works, but it's technically a
|
// NetBSD has its own clone() and it works, but it's technically a
|
||||||
// second-class API, intended to help Linux folks migrate to this.
|
// second-class API, intended to help Linux folks migrate to this.
|
||||||
bool failed;
|
bool failed;
|
||||||
|
@ -465,8 +461,7 @@ int sys_clone_linux(int flags, // rdi
|
||||||
void *arg); // 8(rsp)
|
void *arg); // 8(rsp)
|
||||||
|
|
||||||
static int CloneLinux(int (*func)(void *arg, int tid), char *stk, size_t stksz,
|
static int CloneLinux(int (*func)(void *arg, int tid), char *stk, size_t stksz,
|
||||||
int flags, void *arg, void *tls, size_t tlssz, int *ptid,
|
int flags, void *arg, void *tls, int *ptid, int *ctid) {
|
||||||
int *ctid) {
|
|
||||||
long sp;
|
long sp;
|
||||||
sp = (intptr_t)(stk + stksz);
|
sp = (intptr_t)(stk + stksz);
|
||||||
if (~flags & CLONE_CHILD_SETTID) {
|
if (~flags & CLONE_CHILD_SETTID) {
|
||||||
|
@ -589,14 +584,13 @@ static int CloneLinux(int (*func)(void *arg, int tid), char *stk, size_t stksz,
|
||||||
* @param arg is passed as an argument to `func` in the child thread
|
* @param arg is passed as an argument to `func` in the child thread
|
||||||
* @param tls may be used to set the thread local storage segment;
|
* @param tls may be used to set the thread local storage segment;
|
||||||
* this parameter is ignored if `CLONE_SETTLS` is not set
|
* this parameter is ignored if `CLONE_SETTLS` is not set
|
||||||
* @param tlssz is the size of tls in bytes which must be at least 64
|
|
||||||
* @param ctid lets the child receive its thread id without having to
|
* @param ctid lets the child receive its thread id without having to
|
||||||
* call gettid() and is ignored if `CLONE_CHILD_SETTID` isn't set
|
* call gettid() and is ignored if `CLONE_CHILD_SETTID` isn't set
|
||||||
* @return tid of child on success, or -1 w/ errno
|
* @return tid of child on success, or -1 w/ errno
|
||||||
* @threadsafe
|
* @threadsafe
|
||||||
*/
|
*/
|
||||||
int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
|
int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
|
||||||
void *tls, size_t tlssz, int *ctid) {
|
void *tls, int *ctid) {
|
||||||
int rc;
|
int rc;
|
||||||
struct CloneArgs *wt;
|
struct CloneArgs *wt;
|
||||||
|
|
||||||
|
@ -606,13 +600,12 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
|
||||||
if (!func) {
|
if (!func) {
|
||||||
rc = einval();
|
rc = einval();
|
||||||
} else if (!IsTiny() &&
|
} else if (!IsTiny() &&
|
||||||
(((flags & CLONE_VM) && (stksz < PAGESIZE || (stksz & 15))) ||
|
((flags & CLONE_VM) && (stksz < PAGESIZE || (stksz & 15)))) {
|
||||||
((flags & CLONE_SETTLS) && (tlssz < 64 || (tlssz & 7))))) {
|
|
||||||
rc = einval();
|
rc = einval();
|
||||||
} else if (IsAsan() &&
|
} else if (IsAsan() &&
|
||||||
((stksz > PAGESIZE &&
|
((stksz > PAGESIZE &&
|
||||||
!__asan_is_valid((char *)stk + PAGESIZE, stksz - PAGESIZE)) ||
|
!__asan_is_valid((char *)stk + PAGESIZE, stksz - PAGESIZE)) ||
|
||||||
((flags & CLONE_SETTLS) && !__asan_is_valid(tls, tlssz)) ||
|
((flags & CLONE_SETTLS) && !__asan_is_valid(tls, 64)) ||
|
||||||
((flags & CLONE_SETTLS) && !__asan_is_valid(tls, sizeof(long))) ||
|
((flags & CLONE_SETTLS) && !__asan_is_valid(tls, sizeof(long))) ||
|
||||||
((flags & CLONE_PARENT_SETTID) &&
|
((flags & CLONE_PARENT_SETTID) &&
|
||||||
!__asan_is_valid(ptid, sizeof(*ptid))) ||
|
!__asan_is_valid(ptid, sizeof(*ptid))) ||
|
||||||
|
@ -620,7 +613,7 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
|
||||||
!__asan_is_valid(ctid, sizeof(*ctid))))) {
|
!__asan_is_valid(ctid, sizeof(*ctid))))) {
|
||||||
rc = efault();
|
rc = efault();
|
||||||
} else if (IsLinux()) {
|
} else if (IsLinux()) {
|
||||||
rc = CloneLinux(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
|
rc = CloneLinux(func, stk, stksz, flags, arg, tls, ptid, ctid);
|
||||||
} else if (!IsTiny() &&
|
} else if (!IsTiny() &&
|
||||||
(flags & ~(CLONE_SETTLS | CLONE_PARENT_SETTID |
|
(flags & ~(CLONE_SETTLS | CLONE_PARENT_SETTID |
|
||||||
CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)) !=
|
CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)) !=
|
||||||
|
@ -629,15 +622,15 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
|
||||||
STRACE("clone flag unsupported on this platform");
|
STRACE("clone flag unsupported on this platform");
|
||||||
rc = einval();
|
rc = einval();
|
||||||
} else if (IsXnu()) {
|
} else if (IsXnu()) {
|
||||||
rc = CloneXnu(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
|
rc = CloneXnu(func, stk, stksz, flags, arg, tls, ptid, ctid);
|
||||||
} else if (IsFreebsd()) {
|
} else if (IsFreebsd()) {
|
||||||
rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
|
rc = CloneFreebsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
|
||||||
} else if (IsNetbsd()) {
|
} else if (IsNetbsd()) {
|
||||||
rc = CloneNetbsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
|
rc = CloneNetbsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
|
||||||
} else if (IsOpenbsd()) {
|
} else if (IsOpenbsd()) {
|
||||||
rc = CloneOpenbsd(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
|
rc = CloneOpenbsd(func, stk, stksz, flags, arg, tls, ptid, ctid);
|
||||||
} else if (IsWindows()) {
|
} else if (IsWindows()) {
|
||||||
rc = CloneWindows(func, stk, stksz, flags, arg, tls, tlssz, ptid, ctid);
|
rc = CloneWindows(func, stk, stksz, flags, arg, tls, ptid, ctid);
|
||||||
} else {
|
} else {
|
||||||
rc = enosys();
|
rc = enosys();
|
||||||
}
|
}
|
||||||
|
@ -647,8 +640,8 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
|
||||||
*ptid = rc;
|
*ptid = rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
STRACE("clone(%t, %p, %'zu, %#x, %p, %p, %p, %'zu, %p) → %d% m", func, stk,
|
STRACE("clone(%t, %p, %'zu, %#x, %p, %p, %p, %p) → %d% m", func, stk, stksz,
|
||||||
stksz, flags, arg, ptid, tls, tlssz, ctid, rc);
|
flags, arg, ptid, tls, ctid, rc);
|
||||||
|
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||||
|
#include "libc/bits/bits.h"
|
||||||
#include "libc/calls/calls.h"
|
#include "libc/calls/calls.h"
|
||||||
#include "libc/calls/strace.internal.h"
|
#include "libc/calls/strace.internal.h"
|
||||||
#include "libc/calls/syscall-sysv.internal.h"
|
#include "libc/calls/syscall-sysv.internal.h"
|
||||||
|
@ -45,11 +46,16 @@
|
||||||
#define _TLDZ ((intptr_t)_tdata_size)
|
#define _TLDZ ((intptr_t)_tdata_size)
|
||||||
#define _TIBZ sizeof(struct cthread_descriptor_t)
|
#define _TIBZ sizeof(struct cthread_descriptor_t)
|
||||||
|
|
||||||
|
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
||||||
|
|
||||||
__msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;
|
__msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;
|
||||||
|
|
||||||
extern unsigned char __tls_mov_nt_rax[];
|
extern unsigned char __tls_mov_nt_rax[];
|
||||||
extern unsigned char __tls_add_nt_rax[];
|
extern unsigned char __tls_add_nt_rax[];
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Enables thread local storage.
|
||||||
|
*/
|
||||||
privileged void __enable_tls(void) {
|
privileged void __enable_tls(void) {
|
||||||
if (__tls_enabled) return;
|
if (__tls_enabled) return;
|
||||||
STRACE("__enable_tls()");
|
STRACE("__enable_tls()");
|
||||||
|
@ -111,98 +117,93 @@ privileged void __enable_tls(void) {
|
||||||
: "rcx", "r11", "memory");
|
: "rcx", "r11", "memory");
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
// We need to rewrite SysV _Thread_local code. You MUST use the
|
||||||
* We need to rewrite SysV _Thread_local code. You MUST use the
|
// -mno-tls-direct-seg-refs flag which generates code like this
|
||||||
* -mno-tls-direct-seg-refs flag which generates code like this
|
//
|
||||||
*
|
// 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
|
||||||
* 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
|
// 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R
|
||||||
* 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R
|
//
|
||||||
*
|
// Which on Mac we can replace with this:
|
||||||
* Which on Mac we can replace with this:
|
//
|
||||||
*
|
// 65 48 8b 0R4 25 30 00 00 00 mov %gs:0x30,%R
|
||||||
* 65 48 8b 0R4 25 30 00 00 00 mov %gs:0x30,%R
|
//
|
||||||
*
|
// Whereas on Windows we'll replace it with this:
|
||||||
* Whereas on Windows we'll replace it with this:
|
//
|
||||||
*
|
// 0f 1f 40 00 fatnop4
|
||||||
* 0f 1f 40 00 fatnop4
|
// e8 xx xx xx xx call __tls_mov_nt_%R
|
||||||
* e8 xx xx xx xx call __tls_mov_nt_%R
|
//
|
||||||
*
|
// Since we have no idea where the TLS instructions exist in the
|
||||||
* Since we have no idea where the TLS instructions exist in the
|
// binary, we need to disassemble the whole program image. This'll
|
||||||
* binary, we need to disassemble the whole program image. This'll
|
// potentially take a few milliseconds for some larger programs.
|
||||||
* potentially take a few milliseconds for some larger programs.
|
//
|
||||||
*
|
// We check `_tls_content` which is generated by the linker script
|
||||||
* We check `_tls_content` which is generated by the linker script
|
// since it lets us determine ahead of time if _Thread_local vars
|
||||||
* since it lets us determine ahead of time if _Thread_local vars
|
// have actually been linked into this program.
|
||||||
* have actually been linked into this program.
|
|
||||||
*
|
|
||||||
* TODO(jart): compute probability this is just overkill
|
|
||||||
*/
|
|
||||||
if ((intptr_t)_tls_content && (IsWindows() || IsXnu())) {
|
if ((intptr_t)_tls_content && (IsWindows() || IsXnu())) {
|
||||||
int n, reg, dis;
|
int n;
|
||||||
|
uint64_t w;
|
||||||
|
unsigned m, dis;
|
||||||
unsigned char *p;
|
unsigned char *p;
|
||||||
const unsigned char *impl;
|
|
||||||
struct XedDecodedInst xedd;
|
|
||||||
__morph_begin();
|
__morph_begin();
|
||||||
|
|
||||||
// The most expensive part of this process is we need to compute the
|
if (IsXnu()) {
|
||||||
// byte length of each instruction in our program. We'll use Intel's
|
// Apple is quite straightforward to patch. We basically
|
||||||
// disassembler for this purpose.
|
// just change the segment register, and the linear slot
|
||||||
for (p = _ereal; p < __privileged_start; p += n) {
|
// address 0x30 was promised to us, according to Go team
|
||||||
xed_decoded_inst_zero_set_mode(&xedd, XED_MACHINE_MODE_LONG_64);
|
// https://github.com/golang/go/issues/23617
|
||||||
if (!xed_instruction_length_decode(&xedd, p, 15)) {
|
dis = 0x30;
|
||||||
|
} else {
|
||||||
|
// MSVC __declspec(thread) generates binary code for this
|
||||||
|
// %gs:0x1480 abi. So long as TlsAlloc() isn't called >64
|
||||||
|
// times we should be good.
|
||||||
|
dis = 0x1480 + __tls_index * 8;
|
||||||
|
}
|
||||||
|
|
||||||
// We now know p[0] is most likely the first byte of an x86 op.
|
// iterate over modifiable code looking for 9 byte instruction
|
||||||
// Let's check and see if it's the GCC linear TIB address load.
|
// this would take 30 ms using xed to enable tls on python.com
|
||||||
// We hope and pray GCC won't generate TLS stores to %r8..%r15.
|
for (p = _ereal; p + 9 <= __privileged_start; p += n) {
|
||||||
if (xedd.length == 9 && //
|
|
||||||
0144 == p[0] && // fs
|
|
||||||
0110 == p[1] && // rex.w (64-bit operand size)
|
|
||||||
(0213 == p[2] || // mov reg/mem → reg (word-sized)
|
|
||||||
0003 == p[2]) && // add reg/mem → reg (word-sized)
|
|
||||||
0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
|
|
||||||
0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
|
|
||||||
0000 == p[5] && // displacement (von Neumann endian)
|
|
||||||
0000 == p[6] && // displacement
|
|
||||||
0000 == p[7] && // displacement
|
|
||||||
0000 == p[8]) { // displacement
|
|
||||||
|
|
||||||
// Apple is quite straightforward to patch. We basically
|
// use sse to zoom zoom to fs register prefixes
|
||||||
// just change the segment register, and the linear slot
|
// that way it'll take 1 ms to morph python.com
|
||||||
if (IsXnu()) {
|
while (p + 9 + 16 <= __privileged_start) {
|
||||||
p[0] = 0145; // this changes fs segment to gs segment
|
if ((m = __builtin_ia32_pmovmskb128(
|
||||||
p[5] = 0x30; // tib slot index for tib linear address
|
*(xmm_t *)p == (xmm_t){0144, 0144, 0144, 0144, 0144, 0144,
|
||||||
}
|
0144, 0144, 0144, 0144, 0144, 0144,
|
||||||
|
0144, 0144, 0144, 0144}))) {
|
||||||
// Windows is kind of complicated. We need to replace the
|
m = __builtin_ctzll(m);
|
||||||
// segment mov instruction with a function call, that (a)
|
p += m;
|
||||||
// won't clobber registers, and (b) has a return register
|
break;
|
||||||
// that's the same as the mov destination. When setting
|
} else {
|
||||||
// function displacement, &CALL+5+DISP must equal &FUNC.
|
p += 16;
|
||||||
else {
|
|
||||||
if (p[2] == 3) {
|
|
||||||
impl = __tls_add_nt_rax;
|
|
||||||
} else {
|
|
||||||
impl = __tls_mov_nt_rax;
|
|
||||||
}
|
|
||||||
reg = (p[3] & 070) >> 3;
|
|
||||||
dis = (impl + reg * 18) - (p + 9);
|
|
||||||
p[0] = 0017; // map1
|
|
||||||
p[1] = 0037; // nopl (only if reg=0)
|
|
||||||
p[2] = 0100; // mod/rm (%rax)+disp8
|
|
||||||
p[3] = 0000; // displacement
|
|
||||||
p[4] = 0350; // call
|
|
||||||
p[5] = (dis & 0x000000ff) >> 000; // displacement
|
|
||||||
p[6] = (dis & 0x0000ff00) >> 010; // displacement
|
|
||||||
p[7] = (dis & 0x00ff0000) >> 020; // displacement
|
|
||||||
p[8] = (dis & 0xff000000) >> 030; // displacement
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Move to the next instruction.
|
// we're checking for the following expression:
|
||||||
n = xedd.length;
|
// 0144 == p[0] && // fs
|
||||||
|
// 0110 == p[1] && // rex.w (64-bit operand size)
|
||||||
|
// (0213 == p[2] || // mov reg/mem → reg (word-sized)
|
||||||
|
// 0003 == p[2]) && // add reg/mem → reg (word-sized)
|
||||||
|
// 0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
|
||||||
|
// 0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
|
||||||
|
// 0000 == p[5] && // displacement (von Neumann endian)
|
||||||
|
// 0000 == p[6] && // displacement
|
||||||
|
// 0000 == p[7] && // displacement
|
||||||
|
// 0000 == p[8] // displacement
|
||||||
|
w = READ64LE(p) & READ64LE("\377\377\377\307\377\377\377\377");
|
||||||
|
if ((w == READ64LE("\144\110\213\004\045\000\000\000") ||
|
||||||
|
w == READ64LE("\144\110\003\004\045\000\000\000")) &&
|
||||||
|
!p[8]) {
|
||||||
|
|
||||||
|
// now change the code
|
||||||
|
p[0] = 0145; // this changes fs segment to gs segment
|
||||||
|
p[5] = (dis & 0x000000ff) >> 000; // displacement
|
||||||
|
p[6] = (dis & 0x0000ff00) >> 010; // displacement
|
||||||
|
p[7] = (dis & 0x00ff0000) >> 020; // displacement
|
||||||
|
p[8] = (dis & 0xff000000) >> 030; // displacement
|
||||||
|
|
||||||
|
// advance to the next instruction
|
||||||
|
n = 9;
|
||||||
} else {
|
} else {
|
||||||
// If Xed failed to decode the instruction, then we'll just plow
|
|
||||||
// through memory one byte at a time until Xed's morale improves
|
|
||||||
n = 1;
|
n = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -72,7 +72,7 @@ privileged void ftracer(void) {
|
||||||
long stackuse;
|
long stackuse;
|
||||||
struct FtraceTls *ft;
|
struct FtraceTls *ft;
|
||||||
struct StackFrame *sf;
|
struct StackFrame *sf;
|
||||||
ft = (struct FtraceTls *)(__get_tls_inline() + 0x08);
|
ft = (struct FtraceTls *)(__get_tls_privileged() + 0x08);
|
||||||
if (_cmpxchg(&ft->once, false, true)) {
|
if (_cmpxchg(&ft->once, false, true)) {
|
||||||
ft->lastaddr = -1;
|
ft->lastaddr = -1;
|
||||||
ft->skew = GetNestingLevelImpl(__builtin_frame_address(0));
|
ft->skew = GetNestingLevelImpl(__builtin_frame_address(0));
|
||||||
|
|
|
@ -28,5 +28,5 @@
|
||||||
*/
|
*/
|
||||||
privileged nocallersavedregisters errno_t *(__errno_location)(void) {
|
privileged nocallersavedregisters errno_t *(__errno_location)(void) {
|
||||||
if (!__tls_enabled) return &__errno;
|
if (!__tls_enabled) return &__errno;
|
||||||
return (errno_t *)(__get_tls_inline() + 0x3c);
|
return (errno_t *)(__get_tls_privileged() + 0x3c);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,33 +0,0 @@
|
||||||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
|
||||||
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
|
|
||||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
|
||||||
│ Copyright 2022 Justine Alexandra Roberts Tunney │
|
|
||||||
│ │
|
|
||||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
|
||||||
│ any purpose with or without fee is hereby granted, provided that the │
|
|
||||||
│ above copyright notice and this permission notice appear in all copies. │
|
|
||||||
│ │
|
|
||||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
|
||||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
|
||||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
|
||||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
|
||||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
|
||||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
|
||||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
|
||||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
|
||||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
|
||||||
#include "libc/nexgen32e/gettls.h"
|
|
||||||
#include "libc/nexgen32e/threaded.h"
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns address of thread information block.
|
|
||||||
*
|
|
||||||
* This function must not be called until TLS is initialized.
|
|
||||||
*
|
|
||||||
* @see __get_tls_inline()
|
|
||||||
* @see __install_tls()
|
|
||||||
* @see _spawn()
|
|
||||||
*/
|
|
||||||
optimizespeed char *__get_tls(void) {
|
|
||||||
return __get_tls_inline();
|
|
||||||
}
|
|
|
@ -39,8 +39,6 @@ LIBC_SYSV_A_FILES := \
|
||||||
libc/sysv/systemfive.S \
|
libc/sysv/systemfive.S \
|
||||||
libc/sysv/errno_location.greg.c \
|
libc/sysv/errno_location.greg.c \
|
||||||
libc/sysv/errno.c \
|
libc/sysv/errno.c \
|
||||||
libc/sysv/gettls.greg.c \
|
|
||||||
libc/sysv/tlspolyfill.S \
|
|
||||||
libc/sysv/errfun.S \
|
libc/sysv/errfun.S \
|
||||||
libc/sysv/strace.greg.c \
|
libc/sysv/strace.greg.c \
|
||||||
libc/sysv/describeos.greg.c \
|
libc/sysv/describeos.greg.c \
|
||||||
|
|
|
@ -1,156 +0,0 @@
|
||||||
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
|
||||||
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
|
||||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
|
||||||
│ Copyright 2022 Justine Alexandra Roberts Tunney │
|
|
||||||
│ │
|
|
||||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
|
||||||
│ any purpose with or without fee is hereby granted, provided that the │
|
|
||||||
│ above copyright notice and this permission notice appear in all copies. │
|
|
||||||
│ │
|
|
||||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
|
||||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
|
||||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
|
||||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
|
||||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
|
||||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
|
||||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
|
||||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
|
||||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
|
||||||
#include "libc/macros.internal.h"
|
|
||||||
|
|
||||||
// Code morphing TLS polyfills for The New Technology.
|
|
||||||
//
|
|
||||||
// @note msvc generates this code so it's stable
|
|
||||||
// @note func ordering follows x86 reg encoding
|
|
||||||
// @note each function is exactly 18 bytes
|
|
||||||
// @see __enable_threads()
|
|
||||||
|
|
||||||
__tls_mov_nt_rax:
|
|
||||||
push %rcx
|
|
||||||
mov __tls_index(%rip),%ecx
|
|
||||||
mov %gs:0x1480(,%rcx,8),%rax
|
|
||||||
pop %rcx
|
|
||||||
ret
|
|
||||||
.endfn __tls_mov_nt_rax,globl,hidden
|
|
||||||
|
|
||||||
__tls_mov_nt_rcx:
|
|
||||||
push %rax
|
|
||||||
mov __tls_index(%rip),%eax
|
|
||||||
mov %gs:0x1480(,%rax,8),%rcx
|
|
||||||
pop %rax
|
|
||||||
ret
|
|
||||||
.endfn __tls_mov_nt_rcx
|
|
||||||
|
|
||||||
__tls_mov_nt_rdx:
|
|
||||||
push %rax
|
|
||||||
mov __tls_index(%rip),%eax
|
|
||||||
mov %gs:0x1480(,%rax,8),%rdx
|
|
||||||
pop %rax
|
|
||||||
ret
|
|
||||||
.endfn __tls_mov_nt_rdx
|
|
||||||
|
|
||||||
__tls_mov_nt_rbx:
|
|
||||||
push %rax
|
|
||||||
mov __tls_index(%rip),%eax
|
|
||||||
mov %gs:0x1480(,%rax,8),%rbx
|
|
||||||
pop %rax
|
|
||||||
ret
|
|
||||||
.endfn __tls_mov_nt_rbx
|
|
||||||
|
|
||||||
__tls_mov_nt_rsp:
|
|
||||||
push %rax
|
|
||||||
mov __tls_index(%rip),%eax
|
|
||||||
mov %gs:0x1480(,%rax,8),%rsp
|
|
||||||
pop %rax
|
|
||||||
ret
|
|
||||||
.endfn __tls_mov_nt_rsp
|
|
||||||
|
|
||||||
__tls_mov_nt_rbp:
|
|
||||||
push %rax
|
|
||||||
mov __tls_index(%rip),%eax
|
|
||||||
mov %gs:0x1480(,%rax,8),%rbp
|
|
||||||
pop %rax
|
|
||||||
ret
|
|
||||||
.endfn __tls_mov_nt_rbp
|
|
||||||
|
|
||||||
__tls_mov_nt_rsi:
|
|
||||||
push %rax
|
|
||||||
mov __tls_index(%rip),%eax
|
|
||||||
mov %gs:0x1480(,%rax,8),%rsi
|
|
||||||
pop %rax
|
|
||||||
ret
|
|
||||||
.endfn __tls_mov_nt_rsi
|
|
||||||
|
|
||||||
__tls_mov_nt_rdi:
|
|
||||||
push %rax
|
|
||||||
mov __tls_index(%rip),%eax
|
|
||||||
mov %gs:0x1480(,%rax,8),%rdi
|
|
||||||
pop %rax
|
|
||||||
ret
|
|
||||||
.endfn __tls_mov_nt_rdi
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
__tls_add_nt_rax:
|
|
||||||
push %rcx
|
|
||||||
mov __tls_index(%rip),%ecx
|
|
||||||
add %gs:0x1480(,%rcx,8),%rax
|
|
||||||
pop %rcx
|
|
||||||
ret
|
|
||||||
.endfn __tls_add_nt_rax,globl,hidden
|
|
||||||
|
|
||||||
__tls_add_nt_rcx:
|
|
||||||
push %rax
|
|
||||||
mov __tls_index(%rip),%eax
|
|
||||||
add %gs:0x1480(,%rax,8),%rcx
|
|
||||||
pop %rax
|
|
||||||
ret
|
|
||||||
.endfn __tls_add_nt_rcx
|
|
||||||
|
|
||||||
__tls_add_nt_rdx:
|
|
||||||
push %rax
|
|
||||||
mov __tls_index(%rip),%eax
|
|
||||||
add %gs:0x1480(,%rax,8),%rdx
|
|
||||||
pop %rax
|
|
||||||
ret
|
|
||||||
.endfn __tls_add_nt_rdx
|
|
||||||
|
|
||||||
__tls_add_nt_rbx:
|
|
||||||
push %rax
|
|
||||||
mov __tls_index(%rip),%eax
|
|
||||||
add %gs:0x1480(,%rax,8),%rbx
|
|
||||||
pop %rax
|
|
||||||
ret
|
|
||||||
.endfn __tls_add_nt_rbx
|
|
||||||
|
|
||||||
__tls_add_nt_rsp:
|
|
||||||
push %rax
|
|
||||||
mov __tls_index(%rip),%eax
|
|
||||||
add %gs:0x1480(,%rax,8),%rsp
|
|
||||||
pop %rax
|
|
||||||
ret
|
|
||||||
.endfn __tls_add_nt_rsp
|
|
||||||
|
|
||||||
__tls_add_nt_rbp:
|
|
||||||
push %rax
|
|
||||||
mov __tls_index(%rip),%eax
|
|
||||||
add %gs:0x1480(,%rax,8),%rbp
|
|
||||||
pop %rax
|
|
||||||
ret
|
|
||||||
.endfn __tls_add_nt_rbp
|
|
||||||
|
|
||||||
__tls_add_nt_rsi:
|
|
||||||
push %rax
|
|
||||||
mov __tls_index(%rip),%eax
|
|
||||||
add %gs:0x1480(,%rax,8),%rsi
|
|
||||||
pop %rax
|
|
||||||
ret
|
|
||||||
.endfn __tls_add_nt_rsi
|
|
||||||
|
|
||||||
__tls_add_nt_rdi:
|
|
||||||
push %rax
|
|
||||||
mov __tls_index(%rip),%eax
|
|
||||||
add %gs:0x1480(,%rax,8),%rdi
|
|
||||||
pop %rax
|
|
||||||
ret
|
|
||||||
.endfn __tls_add_nt_rdi
|
|
|
@ -26,5 +26,5 @@ STATIC_YOINK("_main_thread_ctor");
|
||||||
* Returns thread descriptor of the current thread.
|
* Returns thread descriptor of the current thread.
|
||||||
*/
|
*/
|
||||||
cthread_t(cthread_self)(void) {
|
cthread_t(cthread_self)(void) {
|
||||||
return (cthread_t)__get_tls_inline();
|
return (cthread_t)__get_tls();
|
||||||
}
|
}
|
||||||
|
|
|
@ -107,7 +107,7 @@ int _spawn(int fun(void *, int), void *arg, struct spawn *opt_out_thread) {
|
||||||
CLONE_VM | CLONE_THREAD | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
|
CLONE_VM | CLONE_THREAD | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
|
||||||
CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
|
CLONE_SETTLS | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID |
|
||||||
CLONE_CHILD_CLEARTID,
|
CLONE_CHILD_CLEARTID,
|
||||||
arg, &th->ptid, th->tib, _TIBZ, th->ctid) == -1) {
|
arg, &th->ptid, th->tib, th->ctid) == -1) {
|
||||||
_freestack(th->stk);
|
_freestack(th->stk);
|
||||||
free(th->tls);
|
free(th->tls);
|
||||||
return -1;
|
return -1;
|
||||||
|
|
|
@ -17,6 +17,7 @@
|
||||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||||
#include "libc/calls/calls.h"
|
#include "libc/calls/calls.h"
|
||||||
|
#include "libc/calls/struct/timespec.h"
|
||||||
#include "libc/dce.h"
|
#include "libc/dce.h"
|
||||||
#include "libc/errno.h"
|
#include "libc/errno.h"
|
||||||
#include "libc/intrin/kprintf.h"
|
#include "libc/intrin/kprintf.h"
|
||||||
|
@ -31,6 +32,8 @@
|
||||||
#include "libc/runtime/runtime.h"
|
#include "libc/runtime/runtime.h"
|
||||||
#include "libc/runtime/stack.h"
|
#include "libc/runtime/stack.h"
|
||||||
#include "libc/runtime/symbols.internal.h"
|
#include "libc/runtime/symbols.internal.h"
|
||||||
|
#include "libc/stdio/stdio.h"
|
||||||
|
#include "libc/sysv/consts/clock.h"
|
||||||
#include "libc/sysv/consts/clone.h"
|
#include "libc/sysv/consts/clone.h"
|
||||||
#include "libc/sysv/consts/map.h"
|
#include "libc/sysv/consts/map.h"
|
||||||
#include "libc/sysv/consts/o.h"
|
#include "libc/sysv/consts/o.h"
|
||||||
|
@ -165,6 +168,6 @@ BENCH(clone, bench) {
|
||||||
char *volatile tp;
|
char *volatile tp;
|
||||||
errno_t *volatile ep;
|
errno_t *volatile ep;
|
||||||
EZBENCH2("__errno_location", donothing, (ep = __errno_location()));
|
EZBENCH2("__errno_location", donothing, (ep = __errno_location()));
|
||||||
EZBENCH2("__get_tls_inline", donothing, (tp = __get_tls_inline()));
|
EZBENCH2("__get_tls_privileged", donothing, (tp = __get_tls_privileged()));
|
||||||
EZBENCH2("__get_tls", donothing, (tp = __get_tls()));
|
EZBENCH2("__get_tls", donothing, (tp = __get_tls()));
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,11 +20,13 @@
|
||||||
#include "libc/calls/struct/sched_param.h"
|
#include "libc/calls/struct/sched_param.h"
|
||||||
#include "libc/dce.h"
|
#include "libc/dce.h"
|
||||||
#include "libc/fmt/fmt.h"
|
#include "libc/fmt/fmt.h"
|
||||||
|
#include "libc/intrin/kprintf.h"
|
||||||
#include "libc/intrin/spinlock.h"
|
#include "libc/intrin/spinlock.h"
|
||||||
#include "libc/intrin/wait0.internal.h"
|
#include "libc/intrin/wait0.internal.h"
|
||||||
#include "libc/math.h"
|
#include "libc/math.h"
|
||||||
#include "libc/mem/mem.h"
|
#include "libc/mem/mem.h"
|
||||||
#include "libc/runtime/gc.internal.h"
|
#include "libc/runtime/gc.internal.h"
|
||||||
|
#include "libc/runtime/internal.h"
|
||||||
#include "libc/runtime/stack.h"
|
#include "libc/runtime/stack.h"
|
||||||
#include "libc/stdio/stdio.h"
|
#include "libc/stdio/stdio.h"
|
||||||
#include "libc/sysv/consts/clone.h"
|
#include "libc/sysv/consts/clone.h"
|
||||||
|
|
Loading…
Add table
Reference in a new issue