mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-06-27 06:48:31 +00:00
Simplify TLS and reduce startup latency
This change simplifies the thread-local storage support code. On Windows and Mac OS X the startup latency of __enable_tls() has been reduced from 30ms to 1ms. On Windows, TLS memory accesses will now go much faster due to better self-modifying code that prevents a function call and acquires our thread information block pointer in a single instruction.
This commit is contained in:
parent
38c3fa63fe
commit
b1d9d11be1
15 changed files with 136 additions and 312 deletions
|
@ -16,6 +16,7 @@
|
|||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/bits/bits.h"
|
||||
#include "libc/calls/calls.h"
|
||||
#include "libc/calls/strace.internal.h"
|
||||
#include "libc/calls/syscall-sysv.internal.h"
|
||||
|
@ -45,11 +46,16 @@
|
|||
#define _TLDZ ((intptr_t)_tdata_size)
|
||||
#define _TIBZ sizeof(struct cthread_descriptor_t)
|
||||
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
|
||||
__msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;
|
||||
|
||||
extern unsigned char __tls_mov_nt_rax[];
|
||||
extern unsigned char __tls_add_nt_rax[];
|
||||
|
||||
/**
|
||||
* Enables thread local storage.
|
||||
*/
|
||||
privileged void __enable_tls(void) {
|
||||
if (__tls_enabled) return;
|
||||
STRACE("__enable_tls()");
|
||||
|
@ -111,98 +117,93 @@ privileged void __enable_tls(void) {
|
|||
: "rcx", "r11", "memory");
|
||||
}
|
||||
|
||||
/*
|
||||
* We need to rewrite SysV _Thread_local code. You MUST use the
|
||||
* -mno-tls-direct-seg-refs flag which generates code like this
|
||||
*
|
||||
* 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
|
||||
* 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R
|
||||
*
|
||||
* Which on Mac we can replace with this:
|
||||
*
|
||||
* 65 48 8b 0R4 25 30 00 00 00 mov %gs:0x30,%R
|
||||
*
|
||||
* Whereas on Windows we'll replace it with this:
|
||||
*
|
||||
* 0f 1f 40 00 fatnop4
|
||||
* e8 xx xx xx xx call __tls_mov_nt_%R
|
||||
*
|
||||
* Since we have no idea where the TLS instructions exist in the
|
||||
* binary, we need to disassemble the whole program image. This'll
|
||||
* potentially take a few milliseconds for some larger programs.
|
||||
*
|
||||
* We check `_tls_content` which is generated by the linker script
|
||||
* since it lets us determine ahead of time if _Thread_local vars
|
||||
* have actually been linked into this program.
|
||||
*
|
||||
* TODO(jart): compute probability this is just overkill
|
||||
*/
|
||||
// We need to rewrite SysV _Thread_local code. You MUST use the
|
||||
// -mno-tls-direct-seg-refs flag which generates code like this
|
||||
//
|
||||
// 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
|
||||
// 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R
|
||||
//
|
||||
// Which on Mac we can replace with this:
|
||||
//
|
||||
// 65 48 8b 0R4 25 30 00 00 00 mov %gs:0x30,%R
|
||||
//
|
||||
// Whereas on Windows we'll replace it with this:
|
||||
//
|
||||
// 0f 1f 40 00 fatnop4
|
||||
// e8 xx xx xx xx call __tls_mov_nt_%R
|
||||
//
|
||||
// Since we have no idea where the TLS instructions exist in the
|
||||
// binary, we need to disassemble the whole program image. This'll
|
||||
// potentially take a few milliseconds for some larger programs.
|
||||
//
|
||||
// We check `_tls_content` which is generated by the linker script
|
||||
// since it lets us determine ahead of time if _Thread_local vars
|
||||
// have actually been linked into this program.
|
||||
if ((intptr_t)_tls_content && (IsWindows() || IsXnu())) {
|
||||
int n, reg, dis;
|
||||
int n;
|
||||
uint64_t w;
|
||||
unsigned m, dis;
|
||||
unsigned char *p;
|
||||
const unsigned char *impl;
|
||||
struct XedDecodedInst xedd;
|
||||
__morph_begin();
|
||||
|
||||
// The most expensive part of this process is we need to compute the
|
||||
// byte length of each instruction in our program. We'll use Intel's
|
||||
// disassembler for this purpose.
|
||||
for (p = _ereal; p < __privileged_start; p += n) {
|
||||
xed_decoded_inst_zero_set_mode(&xedd, XED_MACHINE_MODE_LONG_64);
|
||||
if (!xed_instruction_length_decode(&xedd, p, 15)) {
|
||||
if (IsXnu()) {
|
||||
// Apple is quite straightforward to patch. We basically
|
||||
// just change the segment register, and the linear slot
|
||||
// address 0x30 was promised to us, according to Go team
|
||||
// https://github.com/golang/go/issues/23617
|
||||
dis = 0x30;
|
||||
} else {
|
||||
// MSVC __declspec(thread) generates binary code for this
|
||||
// %gs:0x1480 abi. So long as TlsAlloc() isn't called >64
|
||||
// times we should be good.
|
||||
dis = 0x1480 + __tls_index * 8;
|
||||
}
|
||||
|
||||
// We now know p[0] is most likely the first byte of an x86 op.
|
||||
// Let's check and see if it's the GCC linear TIB address load.
|
||||
// We hope and pray GCC won't generate TLS stores to %r8..%r15.
|
||||
if (xedd.length == 9 && //
|
||||
0144 == p[0] && // fs
|
||||
0110 == p[1] && // rex.w (64-bit operand size)
|
||||
(0213 == p[2] || // mov reg/mem → reg (word-sized)
|
||||
0003 == p[2]) && // add reg/mem → reg (word-sized)
|
||||
0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
|
||||
0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
|
||||
0000 == p[5] && // displacement (von Neumann endian)
|
||||
0000 == p[6] && // displacement
|
||||
0000 == p[7] && // displacement
|
||||
0000 == p[8]) { // displacement
|
||||
// iterate over modifiable code looking for 9 byte instruction
|
||||
// this would take 30 ms using xed to enable tls on python.com
|
||||
for (p = _ereal; p + 9 <= __privileged_start; p += n) {
|
||||
|
||||
// Apple is quite straightforward to patch. We basically
|
||||
// just change the segment register, and the linear slot
|
||||
if (IsXnu()) {
|
||||
p[0] = 0145; // this changes fs segment to gs segment
|
||||
p[5] = 0x30; // tib slot index for tib linear address
|
||||
}
|
||||
|
||||
// Windows is kind of complicated. We need to replace the
|
||||
// segment mov instruction with a function call, that (a)
|
||||
// won't clobber registers, and (b) has a return register
|
||||
// that's the same as the mov destination. When setting
|
||||
// function displacement, &CALL+5+DISP must equal &FUNC.
|
||||
else {
|
||||
if (p[2] == 3) {
|
||||
impl = __tls_add_nt_rax;
|
||||
} else {
|
||||
impl = __tls_mov_nt_rax;
|
||||
}
|
||||
reg = (p[3] & 070) >> 3;
|
||||
dis = (impl + reg * 18) - (p + 9);
|
||||
p[0] = 0017; // map1
|
||||
p[1] = 0037; // nopl (only if reg=0)
|
||||
p[2] = 0100; // mod/rm (%rax)+disp8
|
||||
p[3] = 0000; // displacement
|
||||
p[4] = 0350; // call
|
||||
p[5] = (dis & 0x000000ff) >> 000; // displacement
|
||||
p[6] = (dis & 0x0000ff00) >> 010; // displacement
|
||||
p[7] = (dis & 0x00ff0000) >> 020; // displacement
|
||||
p[8] = (dis & 0xff000000) >> 030; // displacement
|
||||
}
|
||||
// use sse to zoom zoom to fs register prefixes
|
||||
// that way it'll take 1 ms to morph python.com
|
||||
while (p + 9 + 16 <= __privileged_start) {
|
||||
if ((m = __builtin_ia32_pmovmskb128(
|
||||
*(xmm_t *)p == (xmm_t){0144, 0144, 0144, 0144, 0144, 0144,
|
||||
0144, 0144, 0144, 0144, 0144, 0144,
|
||||
0144, 0144, 0144, 0144}))) {
|
||||
m = __builtin_ctzll(m);
|
||||
p += m;
|
||||
break;
|
||||
} else {
|
||||
p += 16;
|
||||
}
|
||||
}
|
||||
|
||||
// Move to the next instruction.
|
||||
n = xedd.length;
|
||||
// we're checking for the following expression:
|
||||
// 0144 == p[0] && // fs
|
||||
// 0110 == p[1] && // rex.w (64-bit operand size)
|
||||
// (0213 == p[2] || // mov reg/mem → reg (word-sized)
|
||||
// 0003 == p[2]) && // add reg/mem → reg (word-sized)
|
||||
// 0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
|
||||
// 0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
|
||||
// 0000 == p[5] && // displacement (von Neumann endian)
|
||||
// 0000 == p[6] && // displacement
|
||||
// 0000 == p[7] && // displacement
|
||||
// 0000 == p[8] // displacement
|
||||
w = READ64LE(p) & READ64LE("\377\377\377\307\377\377\377\377");
|
||||
if ((w == READ64LE("\144\110\213\004\045\000\000\000") ||
|
||||
w == READ64LE("\144\110\003\004\045\000\000\000")) &&
|
||||
!p[8]) {
|
||||
|
||||
// now change the code
|
||||
p[0] = 0145; // this changes fs segment to gs segment
|
||||
p[5] = (dis & 0x000000ff) >> 000; // displacement
|
||||
p[6] = (dis & 0x0000ff00) >> 010; // displacement
|
||||
p[7] = (dis & 0x00ff0000) >> 020; // displacement
|
||||
p[8] = (dis & 0xff000000) >> 030; // displacement
|
||||
|
||||
// advance to the next instruction
|
||||
n = 9;
|
||||
} else {
|
||||
// If Xed failed to decode the instruction, then we'll just plow
|
||||
// through memory one byte at a time until Xed's morale improves
|
||||
n = 1;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue