mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-06-27 14:58:30 +00:00
Make more threading improvements
- ASAN memory morgue is now lockless - Make C11 atomics header more portable - Rewrote pthread keys support to be lockless - Simplify Python's unicode table unpacking code - Make crash report write(2) closer to being atomic - Make it possible to strace/ftrace a single thread - ASAN now checks nul-terminated strings fast and properly - Windows fork() now restores TLS memory of calling thread
This commit is contained in:
parent
d7b88734cd
commit
e522aa3a07
189 changed files with 1363 additions and 1217 deletions
|
@ -16,52 +16,24 @@
|
|||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "ape/sections.internal.h"
|
||||
#include "libc/assert.h"
|
||||
#include "libc/calls/calls.h"
|
||||
#include "libc/calls/struct/sigset.h"
|
||||
#include "libc/calls/syscall-sysv.internal.h"
|
||||
#include "libc/dce.h"
|
||||
#include "libc/errno.h"
|
||||
#include "libc/intrin/asan.internal.h"
|
||||
#include "libc/intrin/asancodes.h"
|
||||
#include "libc/intrin/atomic.h"
|
||||
#include "libc/intrin/bits.h"
|
||||
#include "libc/intrin/weaken.h"
|
||||
#include "libc/log/libfatal.internal.h"
|
||||
#include "libc/macros.internal.h"
|
||||
#include "libc/nexgen32e/msr.h"
|
||||
#include "libc/nt/thread.h"
|
||||
#include "libc/runtime/internal.h"
|
||||
#include "libc/runtime/morph.h"
|
||||
#include "libc/runtime/runtime.h"
|
||||
#include "libc/stdalign.internal.h"
|
||||
#include "libc/str/str.h"
|
||||
#include "libc/sysv/consts/nrlinux.h"
|
||||
#include "libc/thread/posixthread.internal.h"
|
||||
#include "libc/thread/tls.h"
|
||||
#include "third_party/xed/x86.h"
|
||||
|
||||
#define __NR_sysarch 0x000000a5 // freebsd+netbsd
|
||||
#define AMD64_SET_GSBASE 131 // freebsd
|
||||
#define AMD64_SET_FSBASE 129 // freebsd
|
||||
#define X86_SET_GSBASE 16 // netbsd
|
||||
#define X86_SET_FSBASE 17 // netbsd
|
||||
|
||||
#define __NR___set_tcb 0x00000149
|
||||
#define __NR__lwp_setprivate 0x0000013d
|
||||
#define __NR_thread_fast_set_cthread_self 0x03000003
|
||||
|
||||
#define _TLSZ ((intptr_t)_tls_size)
|
||||
#define _TLDZ ((intptr_t)_tdata_size)
|
||||
#define _TIBZ sizeof(struct CosmoTib)
|
||||
|
||||
int sys_enable_tls();
|
||||
|
||||
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
|
||||
|
||||
__msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;
|
||||
|
||||
struct PosixThread _pthread_main;
|
||||
extern unsigned char __tls_mov_nt_rax[];
|
||||
extern unsigned char __tls_add_nt_rax[];
|
||||
|
@ -102,12 +74,12 @@ _Alignas(TLS_ALIGNMENT) static char __static_tls[5008];
|
|||
* arch_prctl() function. However, such programs might not be portable
|
||||
* and your `errno` variable also won't be thread safe anymore.
|
||||
*/
|
||||
privileged void __enable_tls(void) {
|
||||
void __enable_tls(void) {
|
||||
int tid;
|
||||
size_t siz;
|
||||
struct CosmoTib *tib;
|
||||
char *mem, *tls;
|
||||
siz = ROUNDUP(_TLSZ + _TIBZ, alignof(__static_tls));
|
||||
siz = ROUNDUP(_TLSZ + _TIBZ, _Alignof(__static_tls));
|
||||
if (siz <= sizeof(__static_tls)) {
|
||||
// if tls requirement is small then use the static tls block
|
||||
// which helps avoid a system call for appes with little tls
|
||||
|
@ -134,6 +106,8 @@ privileged void __enable_tls(void) {
|
|||
tib->tib_self = tib;
|
||||
tib->tib_self2 = tib;
|
||||
tib->tib_errno = __errno;
|
||||
tib->tib_strace = __strace;
|
||||
tib->tib_ftrace = __ftrace;
|
||||
tib->tib_pthread = (pthread_t)&_pthread_main;
|
||||
if (IsLinux()) {
|
||||
// gnu/systemd guarantees pid==tid for the main thread so we can
|
||||
|
@ -149,124 +123,10 @@ privileged void __enable_tls(void) {
|
|||
__repmovsb(tls, _tdata_start, _TLDZ);
|
||||
|
||||
// ask the operating system to change the x86 segment register
|
||||
int ax, dx;
|
||||
if (IsWindows()) {
|
||||
__tls_index = __imp_TlsAlloc();
|
||||
_npassert(0 <= __tls_index && __tls_index < 64);
|
||||
asm("mov\t%1,%%gs:%0" : "=m"(*((long *)0x1480 + __tls_index)) : "r"(tib));
|
||||
} else if (IsFreebsd()) {
|
||||
sys_enable_tls(AMD64_SET_FSBASE, tib);
|
||||
} else if (IsLinux()) {
|
||||
sys_enable_tls(ARCH_SET_FS, tib);
|
||||
} else if (IsNetbsd()) {
|
||||
// netbsd has sysarch(X86_SET_FSBASE) but we can't use that because
|
||||
// signal handlers will cause it to be reset due to not setting the
|
||||
// _mc_tlsbase field in struct mcontext_netbsd.
|
||||
sys_enable_tls(tib);
|
||||
} else if (IsOpenbsd()) {
|
||||
sys_enable_tls(tib);
|
||||
} else if (IsXnu()) {
|
||||
// thread_fast_set_cthread_self has a weird ABI
|
||||
int e = errno;
|
||||
sys_enable_tls((intptr_t)tib - 0x30);
|
||||
errno = e;
|
||||
} else {
|
||||
uint64_t val = (uint64_t)tib;
|
||||
asm volatile("wrmsr"
|
||||
: /* no outputs */
|
||||
: "c"(MSR_IA32_FS_BASE), "a"((uint32_t)val),
|
||||
"d"((uint32_t)(val >> 32)));
|
||||
}
|
||||
__set_tls(tib);
|
||||
|
||||
// We need to rewrite SysV _Thread_local code. You MUST use the
|
||||
// -mno-tls-direct-seg-refs flag which generates code like this
|
||||
//
|
||||
// 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
|
||||
// 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R
|
||||
//
|
||||
// Which on Mac we can replace with this:
|
||||
//
|
||||
// 65 48 8b 0R4 25 30 00 00 00 mov %gs:0x30,%R
|
||||
//
|
||||
// Since we have no idea where the TLS instructions exist in the
|
||||
// binary, we need to disassemble the whole program image. This'll
|
||||
// potentially take a few milliseconds for some larger programs.
|
||||
//
|
||||
// We check `_tls_content` which is generated by the linker script
|
||||
// since it lets us determine ahead of time if _Thread_local vars
|
||||
// have actually been linked into this program.
|
||||
if ((intptr_t)_tls_content && (IsWindows() || IsXnu())) {
|
||||
int n;
|
||||
uint64_t w;
|
||||
sigset_t mask;
|
||||
unsigned m, dis;
|
||||
unsigned char *p;
|
||||
__morph_begin(&mask);
|
||||
|
||||
if (IsXnu()) {
|
||||
// Apple is quite straightforward to patch. We basically
|
||||
// just change the segment register, and the linear slot
|
||||
// address 0x30 was promised to us, according to Go team
|
||||
// https://github.com/golang/go/issues/23617
|
||||
dis = 0x30;
|
||||
} else {
|
||||
// MSVC __declspec(thread) generates binary code for this
|
||||
// %gs:0x1480 abi. So long as TlsAlloc() isn't called >64
|
||||
// times we should be good.
|
||||
dis = 0x1480 + __tls_index * 8;
|
||||
}
|
||||
|
||||
// iterate over modifiable code looking for 9 byte instruction
|
||||
// this would take 30 ms using xed to enable tls on python.com
|
||||
for (p = _ereal; p + 9 <= __privileged_start; p += n) {
|
||||
|
||||
// use sse to zoom zoom to fs register prefixes
|
||||
// that way it'll take 1 ms to morph python.com
|
||||
while (p + 9 + 16 <= __privileged_start) {
|
||||
if ((m = __builtin_ia32_pmovmskb128(
|
||||
*(xmm_t *)p == (xmm_t){0144, 0144, 0144, 0144, 0144, 0144,
|
||||
0144, 0144, 0144, 0144, 0144, 0144,
|
||||
0144, 0144, 0144, 0144}))) {
|
||||
m = __builtin_ctzll(m);
|
||||
p += m;
|
||||
break;
|
||||
} else {
|
||||
p += 16;
|
||||
}
|
||||
}
|
||||
|
||||
// we're checking for the following expression:
|
||||
// 0144 == p[0] && // %fs
|
||||
// 0110 == (p[1] & 0373) && // rex.w (and ignore rex.r)
|
||||
// (0213 == p[2] || // mov reg/mem → reg (word-sized)
|
||||
// 0003 == p[2]) && // add reg/mem → reg (word-sized)
|
||||
// 0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
|
||||
// 0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
|
||||
// 0000 == p[5] && // displacement (von Neumann endian)
|
||||
// 0000 == p[6] && // displacement
|
||||
// 0000 == p[7] && // displacement
|
||||
// 0000 == p[8] // displacement
|
||||
w = READ64LE(p) & READ64LE("\377\373\377\307\377\377\377\377");
|
||||
if ((w == READ64LE("\144\110\213\004\045\000\000\000") ||
|
||||
w == READ64LE("\144\110\003\004\045\000\000\000")) &&
|
||||
!p[8]) {
|
||||
|
||||
// now change the code
|
||||
p[0] = 0145; // change %fs to %gs
|
||||
p[5] = (dis & 0x000000ff) >> 000; // displacement
|
||||
p[6] = (dis & 0x0000ff00) >> 010; // displacement
|
||||
p[7] = (dis & 0x00ff0000) >> 020; // displacement
|
||||
p[8] = (dis & 0xff000000) >> 030; // displacement
|
||||
|
||||
// advance to the next instruction
|
||||
n = 9;
|
||||
} else {
|
||||
n = 1;
|
||||
}
|
||||
}
|
||||
|
||||
__morph_end(&mask);
|
||||
}
|
||||
// rewrite the executable tls opcodes in memory
|
||||
__morph_tls();
|
||||
|
||||
// we are now allowed to use tls
|
||||
__tls_enabled = true;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue