Make _Thread_local more seamlessly working

This commit is contained in:
Justine Tunney 2022-07-10 08:27:50 -07:00
parent 5f4f6b0e69
commit 5fa77f1e8f
23 changed files with 217 additions and 283 deletions

View file

@ -64,10 +64,7 @@ struct CloneArgs {
uint32_t utid;
int64_t tid64;
};
union {
char lock;
void *oldrsp;
};
char lock;
int *ptid;
int *ctid;
int *ztid;
@ -287,6 +284,15 @@ static int CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
////////////////////////////////////////////////////////////////////////////////
// OPEN BESIYATA DISHMAYA
// Stack address captured once by OpenbsdGetSafeRsp(). It is passed as a
// memory operand to the thread-exit assembly in OpenbsdThreadMain().
static void *oldrsp;
// Constructor that records a frame address for exiting threads to use
// as a temporary stack pointer. It takes no arguments and returns
// nothing; its only effect is the store to `oldrsp`.
__attribute__((__constructor__)) static void OpenbsdGetSafeRsp(void) {
// main thread stack should never be freed during process lifetime. we
// won't actually change this stack below. we just need a place
// where threads can park RSP for a few instructions while dying.
// NOTE(review): assumes constructors run on the main thread's stack —
// confirm against the runtime's startup sequence.
oldrsp = __builtin_frame_address(0);
}
static wontreturn void OpenbsdThreadMain(void *p) {
struct CloneArgs *wt = p;
*wt->ptid = wt->tid;
@ -303,7 +309,7 @@ static wontreturn void OpenbsdThreadMain(void *p) {
"movl\t$0,(%%rdi)\n\t" // *wt->ztid = 0
"syscall" // __threxit()
: "=m"(*wt->ztid)
: "a"(302), "m"(wt->oldrsp), "D"(wt->ztid)
: "a"(302), "m"(oldrsp), "D"(wt->ztid)
: "rcx", "r11", "memory");
unreachable;
}
@ -325,7 +331,6 @@ static int CloneOpenbsd(int (*func)(void *, int), char *stk, size_t stksz,
wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
wt->oldrsp = __builtin_frame_address(0);
wt->arg = arg;
wt->func = func;
tf->tf_stack = (char *)wt - 8;
@ -591,13 +596,8 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
int rc;
struct CloneArgs *wt;
if ((flags & CLONE_SETTLS) && !__tls_enabled) {
__enable_tls();
}
if ((flags & CLONE_THREAD) && !__threaded) {
__enable_threads();
}
if (flags & CLONE_SETTLS) __enable_tls();
if (flags & CLONE_THREAD) __enable_threads();
if (!func) {
rc = einval();

View file

@ -76,6 +76,23 @@ cosmo: push %rbp
ret
.endfn cosmo,weak
#if !IsTiny()
// Enables TLS early if _Thread_local is used.
// In MODE=tiny you may need to explicitly call __enable_tls() yourself,
// because this stanza is compiled out when IsTiny() is true.
// Otherwise this would bloat life.com from 16kb to 32kb D:
.init.start 304,_init_tls
// _tls_content is a linker-script symbol; when its address is zero no
// _Thread_local content was linked, so the call below is skipped.
mov $_tls_content,%eax
test %eax,%eax
jz 1f
// Preserve %rdi and %rsi across the call.
// NOTE(review): presumably the surrounding init sequence keeps live
// state in these registers — confirm against the .init.start macro.
push %rdi
push %rsi
call __enable_tls
pop %rsi
pop %rdi
// NOTE(review): this branch targets the label on the very next line, so
// execution continues at 1: whether or not it is taken.
jz 1f
1: .init.end 304,_init_tls
#endif
#if !IsTiny()
// Creates deterministically addressed stack we can use
//

View file

@ -0,0 +1,57 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2022 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/calls/strace.internal.h"
#include "libc/nexgen32e/threaded.h"
#include "libc/runtime/runtime.h"
extern int __threadcalls_end[];
extern int __threadcalls_start[];
/**
 * Switches the program into threaded mode.
 *
 * Does nothing when `__threaded` is already set. Otherwise it stores
 * the result of gettid() in `__threaded` and then, between
 * __morph_begin() and __morph_end(), rewrites every patch site listed
 * in the table spanning `__threadcalls_start` to `__threadcalls_end`.
 */
privileged void __enable_threads(void) {
  int *site;
  if (!__threaded) {
    STRACE("__enable_threads()");
    __threaded = gettid();
    __morph_begin();
    /*
     * _NOPL("__threadcalls", func)
     *
     * Cosmopolitan Libc uses the macro above to ensure locking
     * primitives (e.g. flockfile, funlockfile) cost nothing in speed or
     * binary size until threads are actually in play.
     *
     * Each recorded site starts out as
     *
     *     0f 1f 05 b1 19 00 00    nopl func(%rip)
     *
     * and is rewritten into
     *
     *     67 67 e8 b1 19 00 00    addr32 addr32 call func
     *
     * Only the first three bytes change; the displacement is reused.
     * The macro stored the offset of every such instruction in the
     * binary, so no scanning is needed here.
     */
    site = __threadcalls_start;
    while (site < __threadcalls_end) {
      // each table entry is an offset relative to _base
      _base[*site + 0] = 0x67;
      _base[*site + 1] = 0x67;
      _base[*site + 2] = 0xe8;
      ++site;
    }
    __morph_end();
  }
}

View file

@ -16,17 +16,14 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/assert.h"
#include "libc/bits/bits.h"
#include "libc/calls/calls.h"
#include "libc/calls/strace.internal.h"
#include "libc/calls/syscall-sysv.internal.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/intrin/kprintf.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/threaded.h"
#include "libc/nt/thread.h"
#include "libc/nt/thunk/msabi.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/runtime.h"
#include "libc/str/str.h"
@ -48,14 +45,14 @@
#define _TLDZ ((intptr_t)_tdata_size)
#define _TIBZ sizeof(struct cthread_descriptor_t)
extern int __threadcalls_end[];
extern int __threadcalls_start[];
extern unsigned char __get_tls_nt_rax[];
__msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;
extern unsigned char __tls_mov_nt_rax[];
extern unsigned char __tls_add_nt_rax[];
privileged void __enable_tls(void) {
assert(!__threaded);
assert(!__tls_enabled);
if (__tls_enabled) return;
STRACE("__enable_tls()");
// allocate tls memory for main process
//
@ -119,6 +116,7 @@ privileged void __enable_tls(void) {
* -mno-tls-direct-seg-refs flag which generates code like this
*
* 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
* 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R
*
* Which on Mac we can replace with this:
*
@ -127,17 +125,22 @@ privileged void __enable_tls(void) {
* Whereas on Windows we'll replace it with this:
*
* 0f 1f 40 00 fatnop4
* e8 xx xx xx xx call __get_tls_nt_%R
* e8 xx xx xx xx call __tls_mov_nt_%R
*
* Since we have no idea where the TLS instructions exist in the
* binary, we need to disassemble the whole program image. This'll
* potentially take a few milliseconds for some larger programs.
*
* We check `_tls_content` which is generated by the linker script
* since it lets us determine ahead of time if _Thread_local vars
* have actually been linked into this program.
*
* TODO(jart): compute probability this is just overkill
*/
if (IsWindows() || IsXnu()) {
if ((intptr_t)_tls_content && (IsWindows() || IsXnu())) {
int n, reg, dis;
unsigned char *p;
const unsigned char *impl;
struct XedDecodedInst xedd;
__morph_begin();
@ -154,7 +157,8 @@ privileged void __enable_tls(void) {
if (xedd.length == 9 && //
0144 == p[0] && // fs
0110 == p[1] && // rex.w (64-bit operand size)
0213 == p[2] && // mov reg/mem → reg (word-sized)
(0213 == p[2] || // mov reg/mem → reg (word-sized)
0003 == p[2]) && // add reg/mem → reg (word-sized)
0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
0000 == p[5] && // displacement (von Neumann endian)
@ -175,8 +179,13 @@ privileged void __enable_tls(void) {
// that's the same as the mov destination. When setting
// function displacement, &CALL+5+DISP must equal &FUNC.
else {
if (p[2] == 3) {
impl = __tls_add_nt_rax;
} else {
impl = __tls_mov_nt_rax;
}
reg = (p[3] & 070) >> 3;
dis = (__get_tls_nt_rax + reg * 18) - (p + 9);
dis = (impl + reg * 18) - (p + 9);
p[0] = 0017; // map1
p[1] = 0037; // nopl (onl if reg=0)
p[2] = 0100; // mod/rm (%rax)+disp8
@ -202,35 +211,6 @@ privileged void __enable_tls(void) {
}
// we are now allowed to use tls
// setting this variable
__tls_enabled = true;
}
// Switches the program into threaded mode by activating the lock call
// sites. This version asserts that `__threaded` is not yet set, stores
// the result of gettid() in `__threaded`, and rewrites the patch sites
// between __morph_begin() and __morph_end().
privileged void __enable_threads(void) {
// unlike an early return, this assertion rejects a repeated call
assert(!__threaded);
__threaded = gettid();
__morph_begin();
/*
* _NOPL("__threadcalls", func)
*
* The big ugly macro above is used by Cosmopolitan Libc to ensure
* locking primitives (e.g. flockfile, funlockfile) have zero impact on
* performance and binary size when threads aren't actually in play.
*
* we have this
*
* 0f 1f 05 b1 19 00 00 nopl func(%rip)
*
* we're going to turn it into this
*
* 67 67 e8 b1 19 00 00 addr32 addr32 call func
*
* This is cheap and fast because the big ugly macro stored in the
* binary the offsets of all the instructions we need to change.
*/
// each table entry is an offset from _base; only the first three bytes
// of the instruction are overwritten, the displacement is kept as-is
for (int *p = __threadcalls_start; p < __threadcalls_end; ++p) {
_base[*p + 0] = 0x67;
_base[*p + 1] = 0x67;
_base[*p + 2] = 0xe8;
}
__morph_end();
}

View file

@ -24,6 +24,7 @@ extern unsigned char _tdata_size[];
extern unsigned char _tbss_start[];
extern unsigned char _tbss_end[];
extern unsigned char _tls_size[];
extern unsigned char _tls_content[];
void _init(void) hidden;
void __enable_tls(void) hidden;

View file

@ -19,6 +19,7 @@
#define ShouldUseMsabiAttribute() 1
#include "libc/bits/asmflag.h"
#include "libc/calls/internal.h"
#include "libc/calls/strace.internal.h"
#include "libc/calls/struct/sigset.h"
#include "libc/dce.h"
#include "libc/errno.h"
@ -58,6 +59,7 @@ static privileged void __morph_mprotect(void *addr, size_t size, int prot,
*/
privileged void __morph_begin(void) {
sigset_t ss = {{-1, -1}};
STRACE("__morph_begin()");
if (!IsWindows()) {
sys_sigprocmask(SIG_BLOCK, &ss, &oldss);
}
@ -74,4 +76,5 @@ privileged void __morph_end(void) {
if (!IsWindows()) {
sys_sigprocmask(SIG_SETMASK, &oldss, 0);
}
STRACE("__morph_end()");
}