Always initialize thread local storage

We had previously not enabled TLS in MODE=tiny in order to keep the
smallest example programs (e.g. life.com) just 16kb in size. But it
was error prone doing that, so now we just always enable it because
this change uses hacks to ensure it won't increase life.com's size.

This change also fixes a bug on NetBSD, where signal handlers would
break thread local storage if SA_SIGINFO was being used. This looks
like it might be a bug in NetBSD, but it's got a simple workaround.
This commit is contained in:
Justine Tunney 2022-07-18 22:26:11 -07:00
parent 057e8f5b54
commit 69f4152f38
33 changed files with 174 additions and 123 deletions

View file

@ -199,7 +199,7 @@ int arch_prctl(int code, int64_t addr) {
case METAL:
return arch_prctl_msr(code, addr);
case FREEBSD:
/* claims support but it appears not */
// TODO(jart): this should use sysarch()
return arch_prctl_freebsd(code, addr);
case OPENBSD:
return arch_prctl_openbsd(code, addr);

View file

@ -565,21 +565,9 @@ static int CloneLinux(int (*func)(void *arg, int tid), char *stk, size_t stksz,
* either terminated or has finished using its stack memory
*
* - `CLONE_SETTLS` is needed if you intend to specify the `tls`
* argument, which provides a fast-path solution for changing the
* appropriate TLS segment register within the child thread. The
* child thread may then obtain a reference to the TIB address you
* supplied, by calling __get_tls(). Your C library holds certain
* expectations about the layout of your Thread Information Block
* (TIB), which are all documented by __initialize_tls(). That
* function can be used to initialize the first positive 64 bytes
* of your TLS allocation, which is the memory Cosmopolitan Libc
* wants for itself (and negative addresses are reserved by the
* GNU Linker). Using this flag will transition the C runtime to a
* `__tls_enabled` state automatically. If you use TLS for just
* one thread, then you must specify TLS for ALL THREADS. It's
* a good idea to do that since TLS can offer considerable (i.e.
* multiple orders of magnitude) performance improvement for
* TID-dependent C library services, e.g. recursive mutexes.
* argument, which after thread creation may be accessed using
* __get_tls(). Doing this means that `errno`, gettid(), etc.
* correctly work. Caveat emptor if you choose not to do this.
*
* @param arg is passed as an argument to `func` in the child thread
* @param tls may be used to set the thread local storage segment;
@ -594,8 +582,9 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
int rc;
struct CloneArgs *wt;
if (flags & CLONE_SETTLS) __enable_tls();
if (flags & CLONE_THREAD) __enable_threads();
if (flags & CLONE_THREAD) {
__enable_threads();
}
if (!func) {
rc = einval();

View file

@ -76,22 +76,14 @@ cosmo: push %rbp
ret
.endfn cosmo,weak
#if !IsTiny()
// Enable TLS early if _Thread_local is used
// In MODE=tiny you may need to explicitly call __enable_tls()
// Otherwise this would bloat life.com from 16kb to 32kb D:
// Enables Thread Local Storage.
.init.start 304,_init_tls
mov $_tls_content,%eax
test %eax,%eax
jz 1f
push %rdi
push %rsi
call __enable_tls
pop %rsi
pop %rdi
jz 1f
1: .init.end 304,_init_tls
#endif
.init.end 304,_init_tls
#if !IsTiny()
// Creates deterministically addressed stack we can use

View file

@ -16,12 +16,16 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/assert.h"
#include "libc/bits/bits.h"
#include "libc/bits/weaken.h"
#include "libc/calls/calls.h"
#include "libc/calls/strace.internal.h"
#include "libc/calls/syscall-sysv.internal.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/intrin/kprintf.h"
#include "libc/log/libfatal.internal.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/threaded.h"
#include "libc/nt/thread.h"
@ -52,12 +56,35 @@ __msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;
extern unsigned char __tls_mov_nt_rax[];
extern unsigned char __tls_add_nt_rax[];
_Alignas(long) static char __static_tls[5008];
/**
* Enables thread local storage.
*
* This function is always called by the core runtime to guarantee TLS
* is always available to your program. You must build your code using
* -mno-tls-direct-seg-refs if you want to use _Thread_local.
*
* You can use __get_tls() to get the linear address of your tib. When
* accessing TLS via privileged code you must use __get_tls_privileged
* because we need code morphing to support The New Technology and XNU
*
* On XNU and The New Technology, this function imposes 1ms of latency
* during startup for larger binaries like Python.
*
* If you don't want TLS and you're sure you're not using it, then you
* can disable it as follows:
*
* int main() {
* __tls_enabled = false;
* // do stuff
* }
*
* This is useful if you want to wrestle back control of %fs using the
* arch_prctl() function. However, such programs might not be portable
* and your `errno` variable also won't be thread safe anymore.
*/
privileged void __enable_tls(void) {
if (__tls_enabled) return;
STRACE("__enable_tls()");
// allocate tls memory for main process
@ -74,20 +101,42 @@ privileged void __enable_tls(void) {
size_t siz;
cthread_t tib;
char *mem, *tls;
siz = ROUNDUP(_TLSZ + _TIBZ, FRAMESIZE);
mem = _mapanon(siz);
siz = ROUNDUP(_TLSZ + _TIBZ, alignof(__static_tls));
if (siz <= sizeof(__static_tls)) {
// if tls requirement is small then use the static tls block
// which helps avoid a system call for apps with little tls
// this is crucial to keeping life.com 16 kilobytes in size!
_Static_assert(alignof(__static_tls) >= alignof(cthread_t));
mem = __static_tls;
} else {
// if this binary needs a hefty tls block then we'll bank on
// malloc() being linked, which links _mapanon(). otherwise
// if you exceed this, you need to STATIC_YOINK("_mapanon").
// please note that it's probably too early to call calloc()
assert(weaken(_mapanon));
siz = ROUNDUP(siz, FRAMESIZE);
mem = weaken(_mapanon)(siz);
assert(mem);
}
tib = (cthread_t)(mem + siz - _TIBZ);
tls = mem + siz - _TIBZ - _TLSZ;
tib->self = tib;
tib->self2 = tib;
tib->err = __errno;
tib->tid = sys_gettid();
memmove(tls, _tdata_start, _TLDZ);
if (IsLinux()) {
// gnu/systemd guarantees pid==tid for the main thread so we can
// avoid issuing a superfluous system call at program startup
tib->tid = __pid;
} else {
tib->tid = sys_gettid();
}
__repmovsb(tls, _tdata_start, _TLDZ);
// ask the operating system to change the x86 segment register
int ax, dx;
if (IsWindows()) {
__tls_index = __imp_TlsAlloc();
assert(0 <= __tls_index && __tls_index < 64);
asm("mov\t%1,%%gs:%0" : "=m"(*((long *)0x1480 + __tls_index)) : "r"(tib));
} else if (IsFreebsd()) {
asm volatile("syscall"
@ -95,9 +144,12 @@ privileged void __enable_tls(void) {
: "0"(__NR_sysarch), "D"(AMD64_SET_FSBASE), "S"(tib)
: "rcx", "r11", "memory", "cc");
} else if (IsNetbsd()) {
// netbsd has sysarch(X86_SET_FSBASE) but we can't use that because
// signal handlers will cause it to be reset due to not setting the
// _mc_tlsbase field in struct mcontext_netbsd.
asm volatile("syscall"
: "=a"(ax), "=d"(dx)
: "0"(__NR_sysarch), "D"(X86_SET_FSBASE), "S"(tib)
: "0"(__NR__lwp_setprivate), "D"(tib)
: "rcx", "r11", "memory", "cc");
} else if (IsXnu()) {
asm volatile("syscall"
@ -179,7 +231,7 @@ privileged void __enable_tls(void) {
}
// we're checking for the following expression:
// 0144 == p[0] && // fs
// 0144 == p[0] && // %fs
// 0110 == p[1] && // rex.w (64-bit operand size)
// (0213 == p[2] || // mov reg/mem → reg (word-sized)
// 0003 == p[2]) && // add reg/mem → reg (word-sized)
@ -195,7 +247,7 @@ privileged void __enable_tls(void) {
!p[8]) {
// now change the code
p[0] = 0145; // this changes gs segment to fs segment
p[0] = 0145; // change %fs to %gs
p[5] = (dis & 0x000000ff) >> 000; // displacement
p[6] = (dis & 0x0000ff00) >> 010; // displacement
p[7] = (dis & 0x00ff0000) >> 020; // displacement

View file

@ -92,7 +92,6 @@ privileged void ftracer(void) {
textstartup int ftrace_install(void) {
if (GetSymbolTable()) {
__enable_tls();
g_stackdigs = LengthInt64Thousands(GetStackSize());
return __hook(ftrace_hook, GetSymbolTable());
} else {

View file

@ -27,7 +27,7 @@ extern unsigned char _tls_size[];
extern unsigned char _tls_content[];
void _init(void) hidden;
void __enable_tls(void) hidden;
void __enable_tls(void);
void __enable_threads(void) hidden;
void __restorewintty(void) hidden;
void *__cxa_finalize(void *) hidden;

View file

@ -55,15 +55,16 @@
*
* That is performed automatically for unit test executables.
*
* @return memory map address on success, or null w/ errrno
* @return memory map address on success, or null w/ errno
*/
void *_mapanon(size_t size) {
/* asan runtime depends on this function */
void *m;
// ask for `size` bytes of private anonymous read/write memory; the
// address hint is 0 and no file descriptor backs the mapping (-1)
m = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
// NOTE(review): the lines below appear to interleave the removed and
// the added side of this diff hunk, so two variants of the failure
// handling are shown back to back -- confirm against the real file
// variant 1: on any mmap failure, call __oom_hook when weaken() yields
// a non-null pointer for it (presumably only when it is linked in),
// then return null
if (m == MAP_FAILED && weaken(__oom_hook)) {
weaken(__oom_hook)(size);
return 0;
// variant 2: hand the mapping back straight away on success
if (m != MAP_FAILED) {
return m;
}
// variant 1: otherwise return whatever mmap() produced
return m;
// variant 2: invoke __oom_hook only when errno is ENOMEM, then report
// every remaining failure to the caller as null
if (errno == ENOMEM && weaken(__oom_hook)) {
weaken(__oom_hook)(size);
}
return 0;
}

View file

@ -17,6 +17,7 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#define ShouldUseMsabiAttribute() 1
#include "libc/assert.h"
#include "libc/bits/asmflag.h"
#include "libc/calls/internal.h"
#include "libc/calls/strace.internal.h"
@ -58,10 +59,28 @@ static privileged void __morph_mprotect(void *addr, size_t size, int prot,
* @return 0 on success, or -1 w/ errno
*/
privileged void __morph_begin(void) {
int ax;
bool cf;
intptr_t dx;
sigset_t ss = {{-1, -1}};
STRACE("__morph_begin()");
if (!IsWindows()) {
sys_sigprocmask(SIG_BLOCK, &ss, &oldss);
if (!IsOpenbsd()) {
asm volatile("mov\t$8,%%r10d\n\t"
"syscall"
: "=a"(ax), "=d"(dx)
: "0"(__NR_sigprocmask), "D"(SIG_BLOCK), "S"(&ss),
"1"(&oldss)
: "rcx", "r10", "r11", "memory", "cc");
assert(!ax);
} else {
asm volatile(CFLAG_ASM("syscall")
: CFLAG_CONSTRAINT(cf), "=a"(ax), "=d"(dx)
: "1"(__NR_sigprocmask), "D"(SIG_BLOCK), "S"(-1u)
: "rcx", "r11", "memory");
oldss.__bits[0] = ax & 0xffffffff;
assert(!cf);
}
}
__morph_mprotect(_base, __privileged_addr - _base, PROT_READ | PROT_WRITE,
kNtPageWritecopy);
@ -71,10 +90,28 @@ privileged void __morph_begin(void) {
* Ends code morphing of the executable.
*/
privileged void __morph_end(void) {
// Re-protects the image range and then reinstates the signal mask
// held in `oldss` (written by __morph_begin on non-Windows systems).
int ax;
long dx;
bool cf;
// switch [_base, __privileged_addr) to read+execute; the last argument
// is presumably the equivalent Windows page protection -- TODO confirm
__morph_mprotect(_base, __privileged_addr - _base, PROT_READ | PROT_EXEC,
kNtPageExecuteRead);
if (!IsWindows()) {
// NOTE(review): this wrapper call and the raw system calls below both
// set the mask from `oldss`; the listing appears to show the removed
// and the added lines of the diff together -- confirm which one stays
sys_sigprocmask(SIG_SETMASK, &oldss, 0);
if (!IsOpenbsd()) {
// raw sigprocmask(SIG_SETMASK, &oldss, 0): %rdx is tied to 0 for the
// old-mask pointer and %r10 is loaded with 8, presumably the sigset
// size in bytes -- TODO confirm
asm volatile("mov\t$8,%%r10d\n\t"
"syscall"
: "=a"(ax), "=d"(dx)
: "0"(__NR_sigprocmask), "D"(SIG_SETMASK), "S"(&oldss),
"1"(0)
: "rcx", "r10", "r11", "memory", "cc");
// a zero return in %rax is treated as success
assert(!ax);
} else {
// openbsd variant: the mask is passed by value (first word of oldss)
// rather than by pointer; `cf` presumably captures the carry flag
// that signals a failed system call -- TODO confirm
asm volatile(CFLAG_ASM("syscall")
: CFLAG_CONSTRAINT(cf), "=a"(ax), "=d"(dx)
: "1"(__NR_sigprocmask), "D"(SIG_SETMASK),
"S"(oldss.__bits[0])
: "rcx", "r11", "memory");
assert(!cf);
}
}
// logged last, after protections and the signal mask have been put back
STRACE("__morph_end()");
}