cosmopolitan/libc/runtime/enable_tls.c
2022-07-10 08:27:50 -07:00

216 lines
9.2 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2022 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/calls/calls.h"
#include "libc/calls/strace.internal.h"
#include "libc/calls/syscall-sysv.internal.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/threaded.h"
#include "libc/nt/thread.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/runtime.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/nrlinux.h"
#include "libc/thread/thread.h"
#include "third_party/xed/x86.h"
// System call numbers and sub-commands used by __enable_tls() below to
// point a segment base register at the thread information block (tib).
// The *_SET_GSBASE constants and __NR__lwp_setprivate are not referenced
// by the code visible in this file.
#define __NR_sysarch 0x000000a5 // freebsd+netbsd
#define AMD64_SET_GSBASE 131 // freebsd
#define AMD64_SET_FSBASE 129 // freebsd
#define X86_SET_GSBASE 16 // netbsd
#define X86_SET_FSBASE 17 // netbsd
// Used on the OpenBSD branch of __enable_tls().
#define __NR___set_tcb 0x00000149
#define __NR__lwp_setprivate 0x0000013d
// Used on the XNU branch of __enable_tls().
#define __NR_thread_fast_set_cthread_self 0x03000003
// Sizes used to lay out the TLS mapping: _TLSZ and _TLDZ are the external
// symbols _tls_size and _tdata_size cast to intptr_t (presumably defined
// by the linker script -- confirm); _TIBZ is the size of the thread
// descriptor placed at the end of the mapping.
#define _TLSZ ((intptr_t)_tls_size)
#define _TLDZ ((intptr_t)_tdata_size)
#define _TIBZ sizeof(struct cthread_descriptor_t)
// Import pointer through which TlsAlloc() is called on the Windows branch.
__msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;
// Base addresses of the helper routines that patched Windows code calls in
// place of `mov %fs:0,%R` / `add %fs:0,%R`; __enable_tls() selects the
// per-register entry at an offset of `reg * 18` bytes from these symbols.
extern unsigned char __tls_mov_nt_rax[];
extern unsigned char __tls_add_nt_rax[];
/**
 * Enables thread-local storage for the main process.
 *
 * Does nothing if `__tls_enabled` is already set. Otherwise this:
 *
 * 1. maps anonymous memory large enough for the TLS image plus the
 *    thread information block (tib), rounded up to FRAMESIZE;
 * 2. places the tib at the very end of the mapping with the TLS image
 *    immediately before it, and copies the `.tdata` initializers in;
 * 3. asks the host operating system to point a segment register (or,
 *    on Windows, a slot addressed through %gs) at the tib;
 * 4. on Windows and XNU, when `_tls_content` is nonzero, rewrites the
 *    `%fs:0` TLS loads found between `_ereal` and `__privileged_start`;
 * 5. sets `__tls_enabled`.
 *
 * NOTE(review): neither the `_mapanon()` result nor the system call
 * results are checked, so failures here are not reported.
 */
privileged void __enable_tls(void) {
if (__tls_enabled) return;
STRACE("__enable_tls()");
// allocate tls memory for main process
//
// %fs Linux/BSDs
// │
// _Thread_local │ __get_tls()
// ┌───┬──────────┬──────────┼───┐
// │pad│ .tdata │ .tbss │tib│
// └───┴──────────┴──────────┼───┘
// │
// Windows/Mac %gs
//
size_t siz;
cthread_t tib;
char *mem, *tls;
siz = ROUNDUP(_TLSZ + _TIBZ, FRAMESIZE);
// NOTE(review): mem is dereferenced below without a null check; confirm
// whether _mapanon() can return null on failure.
mem = _mapanon(siz);
// the tib occupies the last _TIBZ bytes of the mapping, and the tls
// image (_TLSZ bytes) ends exactly where the tib begins; any slack
// from rounding up is left as padding at the start of the mapping
tib = (cthread_t)(mem + siz - _TIBZ);
tls = mem + siz - _TIBZ - _TLSZ;
tib->self = tib;
tib->self2 = tib;
tib->err = __errno;
tib->tid = sys_gettid();
// only the first _TLDZ bytes (.tdata) are copied; the remainder of the
// image (.tbss) keeps whatever the fresh mapping holds, presumably zeros
memmove(tls, _tdata_start, _TLDZ);
// ask the operating system to change the x86 segment register
// (ax and dx only receive the system call's result registers; they are
// never read afterwards, so a failed call goes unnoticed)
int ax, dx;
if (IsWindows()) {
// allocate a slot index with TlsAlloc() and store the tib pointer in
// the __tls_index'th long-sized slot starting at %gs:0x1480
// (presumably the TEB's TLS slot array -- TODO confirm)
__tls_index = __imp_TlsAlloc();
asm("mov\t%1,%%gs:%0" : "=m"(*((long *)0x1480 + __tls_index)) : "r"(tib));
} else if (IsFreebsd()) {
// sysarch(AMD64_SET_FSBASE, tib)
asm volatile("syscall"
: "=a"(ax)
: "0"(__NR_sysarch), "D"(AMD64_SET_FSBASE), "S"(tib)
: "rcx", "r11", "memory", "cc");
} else if (IsNetbsd()) {
// sysarch(X86_SET_FSBASE, tib); rdx is also declared as an output
asm volatile("syscall"
: "=a"(ax), "=d"(dx)
: "0"(__NR_sysarch), "D"(X86_SET_FSBASE), "S"(tib)
: "rcx", "r11", "memory", "cc");
} else if (IsXnu()) {
// the address passed is tib minus 0x30, which pairs with the patched
// `mov %gs:0x30,%R` instructions below so that offset 0x30 lands on
// the first word of the tib (assuming the call sets the %gs base to
// its argument -- TODO confirm)
asm volatile("syscall"
: "=a"(ax)
: "0"(__NR_thread_fast_set_cthread_self),
"D"((intptr_t)tib - 0x30)
: "rcx", "r11", "memory", "cc");
} else if (IsOpenbsd()) {
// __set_tcb(tib)
asm volatile("syscall"
: "=a"(ax)
: "0"(__NR___set_tcb), "D"(tib)
: "rcx", "r11", "memory", "cc");
} else {
// every other platform is treated as Linux: arch_prctl(ARCH_SET_FS, tib)
// NOTE(review): unlike the branches above, this clobber list omits "cc"
asm volatile("syscall"
: "=a"(ax)
: "0"(__NR_linux_arch_prctl), "D"(ARCH_SET_FS), "S"(tib)
: "rcx", "r11", "memory");
}
/*
 * We need to rewrite SysV _Thread_local code. You MUST use the
 * -mno-tls-direct-seg-refs flag which generates code like this
 *
 * 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
 * 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R
 *
 * Which on Mac we can replace with this:
 *
 * 65 48 8b 0R4 25 30 00 00 00 mov %gs:0x30,%R
 *
 * Whereas on Windows we'll replace it with this:
 *
 * 0f 1f 40 00 fatnop4
 * e8 xx xx xx xx call __tls_mov_nt_%R
 *
 * (or a call into the __tls_add_nt_%R family when the original
 * instruction was the `add` form).
 *
 * Since we have no idea where the TLS instructions exist in the
 * binary, we need to disassemble the whole program image. This'll
 * potentially take a few milliseconds for some larger programs.
 *
 * We check `_tls_content` which is generated by the linker script
 * since it lets us determine ahead of time if _Thread_local vars
 * have actually been linked into this program.
 *
 * TODO(jart): compute probability this is just overkill
 */
if ((intptr_t)_tls_content && (IsWindows() || IsXnu())) {
int n, reg, dis;
unsigned char *p;
const unsigned char *impl;
struct XedDecodedInst xedd;
// the code is about to be modified in place, so the patching loop is
// bracketed by __morph_begin() / __morph_end()
__morph_begin();
// The most expensive part of this process is we need to compute the
// byte length of each instruction in our program. We'll use Intel's
// disassembler for this purpose.
for (p = _ereal; p < __privileged_start; p += n) {
xed_decoded_inst_zero_set_mode(&xedd, XED_MACHINE_MODE_LONG_64);
if (!xed_instruction_length_decode(&xedd, p, 15)) {
// We now know p[0] is most likely the first byte of an x86 op.
// Let's check and see if it's the GCC linear TIB address load.
// The rex byte must be exactly 0110 (rex.w alone), so a load whose
// destination is %r8..%r15 doesn't match and is left unpatched.
// We hope and pray GCC won't generate TLS loads into %r8..%r15.
if (xedd.length == 9 && //
0144 == p[0] && // fs
0110 == p[1] && // rex.w (64-bit operand size)
(0213 == p[2] || // mov reg/mem → reg (word-sized)
0003 == p[2]) && // add reg/mem → reg (word-sized)
0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
0000 == p[5] && // displacement (von Neumann endian)
0000 == p[6] && // displacement
0000 == p[7] && // displacement
0000 == p[8]) { // displacement
// Apple is quite straightforward to patch. We basically
// just change the segment register, and the linear slot
if (IsXnu()) {
p[0] = 0145; // this changes the fs segment prefix to gs
p[5] = 0x30; // tib slot index for tib linear address
}
// Windows is kind of complicated. We need to replace the
// segment mov instruction with a function call, that (a)
// won't clobber registers, and (b) has a return register
// that's the same as the mov destination. When setting
// function displacement, &CALL+5+DISP must equal &FUNC.
else {
// opcode 0003 is the `add` form; anything else that matched
// above is the `mov` form (0213)
if (p[2] == 3) {
impl = __tls_add_nt_rax;
} else {
impl = __tls_mov_nt_rax;
}
// the destination register number lives in bits 3..5 of mod/rm
reg = (p[3] & 070) >> 3;
// the per-register helpers are reached at 18-byte strides from
// the %rax one (presumably each helper is 18 bytes -- confirm);
// the call occupies p[4..8], so it ends at p+9, which is the
// address the rel32 displacement is measured from
dis = (impl + reg * 18) - (p + 9);
p[0] = 0017; // map1
p[1] = 0037; // nopl (only if reg=0)
p[2] = 0100; // mod/rm (%rax)+disp8
p[3] = 0000; // displacement
p[4] = 0350; // call
p[5] = (dis & 0x000000ff) >> 000; // displacement
p[6] = (dis & 0x0000ff00) >> 010; // displacement
p[7] = (dis & 0x00ff0000) >> 020; // displacement
p[8] = (dis & 0xff000000) >> 030; // displacement
}
}
// Move to the next instruction.
n = xedd.length;
} else {
// If Xed failed to decode the instruction, then we'll just plow
// through memory one byte at a time until Xed's morale improves
n = 1;
}
}
__morph_end();
}
// the tib is installed and the code has been patched where needed, so
// record that thread-local storage may now be used
__tls_enabled = true;
}