mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-06-30 16:28:30 +00:00
Make _Thread_local more seamlessly working
This commit is contained in:
parent
5f4f6b0e69
commit
5fa77f1e8f
23 changed files with 217 additions and 283 deletions
|
@ -64,10 +64,7 @@ struct CloneArgs {
|
|||
uint32_t utid;
|
||||
int64_t tid64;
|
||||
};
|
||||
union {
|
||||
char lock;
|
||||
void *oldrsp;
|
||||
};
|
||||
char lock;
|
||||
int *ptid;
|
||||
int *ctid;
|
||||
int *ztid;
|
||||
|
@ -287,6 +284,15 @@ static int CloneFreebsd(int (*func)(void *, int), char *stk, size_t stksz,
|
|||
////////////////////////////////////////////////////////////////////////////////
|
||||
// OPEN BESIYATA DISHMAYA
|
||||
|
||||
static void *oldrsp;
|
||||
|
||||
// Startup constructor that stores the frame address it is running on
// into the file-scope `oldrsp`, which OpenbsdThreadMain() later passes
// to its thread-exit assembly as a memory operand.
__attribute__((__constructor__)) static void OpenbsdGetSafeRsp(void) {
|
||||
// main thread stack should never be freed during process lifetime. we
|
||||
// won't actually change this stack below. we just need a place
|
||||
// where threads can park RSP for a few instructions while dying.
|
||||
oldrsp = __builtin_frame_address(0);
|
||||
}
|
||||
|
||||
static wontreturn void OpenbsdThreadMain(void *p) {
|
||||
struct CloneArgs *wt = p;
|
||||
*wt->ptid = wt->tid;
|
||||
|
@ -303,7 +309,7 @@ static wontreturn void OpenbsdThreadMain(void *p) {
|
|||
"movl\t$0,(%%rdi)\n\t" // *wt->ztid = 0
|
||||
"syscall" // __threxit()
|
||||
: "=m"(*wt->ztid)
|
||||
: "a"(302), "m"(wt->oldrsp), "D"(wt->ztid)
|
||||
: "a"(302), "m"(oldrsp), "D"(wt->ztid)
|
||||
: "rcx", "r11", "memory");
|
||||
unreachable;
|
||||
}
|
||||
|
@ -325,7 +331,6 @@ static int CloneOpenbsd(int (*func)(void *, int), char *stk, size_t stksz,
|
|||
wt->ptid = flags & CLONE_PARENT_SETTID ? ptid : &wt->tid;
|
||||
wt->ctid = flags & CLONE_CHILD_SETTID ? ctid : &wt->tid;
|
||||
wt->ztid = flags & CLONE_CHILD_CLEARTID ? ctid : &wt->tid;
|
||||
wt->oldrsp = __builtin_frame_address(0);
|
||||
wt->arg = arg;
|
||||
wt->func = func;
|
||||
tf->tf_stack = (char *)wt - 8;
|
||||
|
@ -591,13 +596,8 @@ int clone(void *func, void *stk, size_t stksz, int flags, void *arg, int *ptid,
|
|||
int rc;
|
||||
struct CloneArgs *wt;
|
||||
|
||||
if ((flags & CLONE_SETTLS) && !__tls_enabled) {
|
||||
__enable_tls();
|
||||
}
|
||||
|
||||
if ((flags & CLONE_THREAD) && !__threaded) {
|
||||
__enable_threads();
|
||||
}
|
||||
if (flags & CLONE_SETTLS) __enable_tls();
|
||||
if (flags & CLONE_THREAD) __enable_threads();
|
||||
|
||||
if (!func) {
|
||||
rc = einval();
|
||||
|
|
|
@ -76,6 +76,23 @@ cosmo: push %rbp
|
|||
ret
|
||||
.endfn cosmo,weak
|
||||
|
||||
#if !IsTiny()
|
||||
// Enable TLS early if _Thread_local is used
|
||||
// In MODE=tiny you may need to explicitly call __enable_tls()
|
||||
// Otherwise this would bloat life.com from 16kb → 32kb D:
|
||||
//
// This fragment calls __enable_tls() only when the `_tls_content`
// symbol is nonzero; otherwise it branches straight to label 1.
.init.start 304,_init_tls
|
||||
mov $_tls_content,%eax
|
||||
test %eax,%eax
|
||||
// zero presumably means no _Thread_local content was linked, so
// there is nothing to set up
jz 1f
|
||||
// %rdi and %rsi are saved around the call; presumably the
// surrounding .init code keeps live values in them (TODO confirm)
push %rdi
|
||||
push %rsi
|
||||
call __enable_tls
|
||||
pop %rsi
|
||||
pop %rdi
|
||||
// no branch is needed here: label 1 is the next instruction, and the
// flags left behind by the call carry no meaning for this fragment
1: .init.end 304,_init_tls
|
||||
#endif
|
||||
|
||||
#if !IsTiny()
|
||||
// Creates deterministically addressed stack we can use
|
||||
//
|
||||
|
|
57
libc/runtime/enable_threads.c
Normal file
57
libc/runtime/enable_threads.c
Normal file
|
@ -0,0 +1,57 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2022 Justine Alexandra Roberts Tunney │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/assert.h"
|
||||
#include "libc/calls/calls.h"
|
||||
#include "libc/calls/strace.internal.h"
|
||||
#include "libc/nexgen32e/threaded.h"
|
||||
#include "libc/runtime/runtime.h"
|
||||
|
||||
extern int __threadcalls_end[];
|
||||
extern int __threadcalls_start[];
|
||||
|
||||
/**
 * Switches the process into threaded mode.
 *
 * Does nothing if `__threaded` is already nonzero. Otherwise it stores
 * the result of gettid() in `__threaded` and then, between
 * __morph_begin() and __morph_end(), overwrites the first three bytes
 * of every instruction whose offset from `_base` is listed in the
 * `__threadcalls_start` .. `__threadcalls_end` table.
 */
privileged void __enable_threads(void) {
|
||||
if (__threaded) return;
|
||||
STRACE("__enable_threads()");
|
||||
__threaded = gettid();
|
||||
__morph_begin();
|
||||
/*
|
||||
* _NOPL("__threadcalls", func)
|
||||
*
|
||||
* The big ugly macro above is used by Cosmopolitan Libc to ensure
|
||||
* locking primitives (e.g. flockfile, funlockfile) have zero impact on
|
||||
* performance and binary size when threads aren't actually in play.
|
||||
*
|
||||
* we have this
|
||||
*
|
||||
* 0f 1f 05 b1 19 00 00 nopl func(%rip)
|
||||
*
|
||||
* we're going to turn it into this
|
||||
*
|
||||
* 67 67 e8 b1 19 00 00 addr32 addr32 call func
|
||||
*
|
||||
* This is cheap and fast because the big ugly macro stored in the
|
||||
* binary the offsets of all the instructions we need to change.
|
||||
*/
|
||||
// each table entry is an offset from _base; only the three opcode
// bytes are rewritten, the trailing 32-bit displacement is left as is
for (int *p = __threadcalls_start; p < __threadcalls_end; ++p) {
|
||||
_base[*p + 0] = 0x67;
|
||||
_base[*p + 1] = 0x67;
|
||||
_base[*p + 2] = 0xe8;
|
||||
}
|
||||
__morph_end();
|
||||
}
|
|
@ -16,17 +16,14 @@
|
|||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "libc/assert.h"
|
||||
#include "libc/bits/bits.h"
|
||||
#include "libc/calls/calls.h"
|
||||
#include "libc/calls/strace.internal.h"
|
||||
#include "libc/calls/syscall-sysv.internal.h"
|
||||
#include "libc/dce.h"
|
||||
#include "libc/errno.h"
|
||||
#include "libc/intrin/kprintf.h"
|
||||
#include "libc/macros.internal.h"
|
||||
#include "libc/nexgen32e/threaded.h"
|
||||
#include "libc/nt/thread.h"
|
||||
#include "libc/nt/thunk/msabi.h"
|
||||
#include "libc/runtime/internal.h"
|
||||
#include "libc/runtime/runtime.h"
|
||||
#include "libc/str/str.h"
|
||||
|
@ -48,14 +45,14 @@
|
|||
#define _TLDZ ((intptr_t)_tdata_size)
|
||||
#define _TIBZ sizeof(struct cthread_descriptor_t)
|
||||
|
||||
extern int __threadcalls_end[];
|
||||
extern int __threadcalls_start[];
|
||||
extern unsigned char __get_tls_nt_rax[];
|
||||
__msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;
|
||||
|
||||
extern unsigned char __tls_mov_nt_rax[];
|
||||
extern unsigned char __tls_add_nt_rax[];
|
||||
|
||||
privileged void __enable_tls(void) {
|
||||
assert(!__threaded);
|
||||
assert(!__tls_enabled);
|
||||
if (__tls_enabled) return;
|
||||
STRACE("__enable_tls()");
|
||||
|
||||
// allocate tls memory for main process
|
||||
//
|
||||
|
@ -119,6 +116,7 @@ privileged void __enable_tls(void) {
|
|||
* -mno-tls-direct-seg-refs flag which generates code like this
|
||||
*
|
||||
* 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
|
||||
* 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R
|
||||
*
|
||||
* Which on Mac we can replace with this:
|
||||
*
|
||||
|
@ -127,17 +125,22 @@ privileged void __enable_tls(void) {
|
|||
* Whereas on Windows we'll replace it with this:
|
||||
*
|
||||
* 0f 1f 40 00 fatnop4
|
||||
* e8 xx xx xx xx call __get_tls_nt_%R
|
||||
* e8 xx xx xx xx call __tls_mov_nt_%R
|
||||
*
|
||||
* Since we have no idea where the TLS instructions exist in the
|
||||
* binary, we need to disassemble the whole program image. This'll
|
||||
* potentially take a few milliseconds for some larger programs.
|
||||
*
|
||||
* We check `_tls_content` which is generated by the linker script
|
||||
* since it lets us determine ahead of time if _Thread_local vars
|
||||
* have actually been linked into this program.
|
||||
*
|
||||
* TODO(jart): compute probability this is just overkill
|
||||
*/
|
||||
if (IsWindows() || IsXnu()) {
|
||||
if ((intptr_t)_tls_content && (IsWindows() || IsXnu())) {
|
||||
int n, reg, dis;
|
||||
unsigned char *p;
|
||||
const unsigned char *impl;
|
||||
struct XedDecodedInst xedd;
|
||||
__morph_begin();
|
||||
|
||||
|
@ -154,7 +157,8 @@ privileged void __enable_tls(void) {
|
|||
if (xedd.length == 9 && //
|
||||
0144 == p[0] && // fs
|
||||
0110 == p[1] && // rex.w (64-bit operand size)
|
||||
0213 == p[2] && // mov reg/mem → reg (word-sized)
|
||||
(0213 == p[2] || // mov reg/mem → reg (word-sized)
|
||||
0003 == p[2]) && // add reg/mem → reg (word-sized)
|
||||
0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
|
||||
0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
|
||||
0000 == p[5] && // displacement (von Neumann endian)
|
||||
|
@ -175,8 +179,13 @@ privileged void __enable_tls(void) {
|
|||
// that's the same as the mov destination. When setting
|
||||
// function displacement, &CALL+5+DISP must equal &FUNC.
|
||||
else {
|
||||
if (p[2] == 3) {
|
||||
impl = __tls_add_nt_rax;
|
||||
} else {
|
||||
impl = __tls_mov_nt_rax;
|
||||
}
|
||||
reg = (p[3] & 070) >> 3;
|
||||
dis = (__get_tls_nt_rax + reg * 18) - (p + 9);
|
||||
dis = (impl + reg * 18) - (p + 9);
|
||||
p[0] = 0017; // map1
|
||||
p[1] = 0037; // nopl (only if reg=0)
|
||||
p[2] = 0100; // mod/rm (%rax)+disp8
|
||||
|
@ -202,35 +211,6 @@ privileged void __enable_tls(void) {
|
|||
}
|
||||
|
||||
// we are now allowed to use tls
|
||||
// setting this variable
|
||||
__tls_enabled = true;
|
||||
}
|
||||
|
||||
/**
 * Switches the process into threaded mode.
 *
 * Asserts that `__threaded` is still zero, stores the result of
 * gettid() in it, and then, between __morph_begin() and __morph_end(),
 * overwrites the first three bytes of every instruction whose offset
 * from `_base` is listed in the `__threadcalls_start` ..
 * `__threadcalls_end` table.
 */
privileged void __enable_threads(void) {
|
||||
assert(!__threaded);
|
||||
__threaded = gettid();
|
||||
__morph_begin();
|
||||
/*
|
||||
* _NOPL("__threadcalls", func)
|
||||
*
|
||||
* The big ugly macro above is used by Cosmopolitan Libc to ensure
|
||||
* locking primitives (e.g. flockfile, funlockfile) have zero impact on
|
||||
* performance and binary size when threads aren't actually in play.
|
||||
*
|
||||
* we have this
|
||||
*
|
||||
* 0f 1f 05 b1 19 00 00 nopl func(%rip)
|
||||
*
|
||||
* we're going to turn it into this
|
||||
*
|
||||
* 67 67 e8 b1 19 00 00 addr32 addr32 call func
|
||||
*
|
||||
* This is cheap and fast because the big ugly macro stored in the
|
||||
* binary the offsets of all the instructions we need to change.
|
||||
*/
|
||||
// each table entry is an offset from _base; only the three opcode
// bytes are rewritten, the trailing 32-bit displacement is left as is
for (int *p = __threadcalls_start; p < __threadcalls_end; ++p) {
|
||||
_base[*p + 0] = 0x67;
|
||||
_base[*p + 1] = 0x67;
|
||||
_base[*p + 2] = 0xe8;
|
||||
}
|
||||
__morph_end();
|
||||
}
|
|
@ -24,6 +24,7 @@ extern unsigned char _tdata_size[];
|
|||
extern unsigned char _tbss_start[];
|
||||
extern unsigned char _tbss_end[];
|
||||
extern unsigned char _tls_size[];
|
||||
extern unsigned char _tls_content[];
|
||||
|
||||
void _init(void) hidden;
|
||||
void __enable_tls(void) hidden;
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#define ShouldUseMsabiAttribute() 1
|
||||
#include "libc/bits/asmflag.h"
|
||||
#include "libc/calls/internal.h"
|
||||
#include "libc/calls/strace.internal.h"
|
||||
#include "libc/calls/struct/sigset.h"
|
||||
#include "libc/dce.h"
|
||||
#include "libc/errno.h"
|
||||
|
@ -58,6 +59,7 @@ static privileged void __morph_mprotect(void *addr, size_t size, int prot,
|
|||
*/
|
||||
privileged void __morph_begin(void) {
|
||||
sigset_t ss = {{-1, -1}};
|
||||
STRACE("__morph_begin()");
|
||||
if (!IsWindows()) {
|
||||
sys_sigprocmask(SIG_BLOCK, &ss, &oldss);
|
||||
}
|
||||
|
@ -74,4 +76,5 @@ privileged void __morph_end(void) {
|
|||
if (!IsWindows()) {
|
||||
sys_sigprocmask(SIG_SETMASK, &oldss, 0);
|
||||
}
|
||||
STRACE("__morph_end()");
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue