Make more threading improvements

- ASAN memory morgue is now lockless
- Make C11 atomics header more portable
- Rewrote pthread keys support to be lockless
- Simplify Python's unicode table unpacking code
- Make crash report write(2) closer to being atomic
- Make it possible to strace/ftrace a single thread
- ASAN now checks nul-terminated strings fast and properly
- Windows fork() now restores TLS memory of calling thread
Justine Tunney 2022-11-01 22:36:03 -07:00
parent d7b88734cd
commit e522aa3a07
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
189 changed files with 1363 additions and 1217 deletions
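
The two "lockless" bullets above describe replacing mutex-guarded state with C11 atomics. As a rough illustration only, here is a minimal sketch of that pattern for a fixed-size morgue ring; the names (morgue_push, MORGUE_SIZE) are illustrative and not the identifiers used in the tree:

#include <stdatomic.h>

#define MORGUE_SIZE 64  // hypothetical capacity, power of two

static _Atomic(void *) g_morgue[MORGUE_SIZE];  // quarantined allocations
static atomic_uint g_morgue_index;             // ever-increasing cursor

// Quarantine a freed pointer and return whatever the reused slot held
// before, so the caller can release that one for real. No mutex is
// needed: fetch_add hands each caller its own slot, and the slot
// update is a single atomic exchange.
static void *morgue_push(void *p) {
  unsigned i = atomic_fetch_add_explicit(&g_morgue_index, 1,
                                         memory_order_relaxed);
  return atomic_exchange_explicit(&g_morgue[i % MORGUE_SIZE], p,
                                  memory_order_acq_rel);
}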

@@ -16,52 +16,24 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "ape/sections.internal.h"
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/calls/struct/sigset.h"
#include "libc/calls/syscall-sysv.internal.h"
#include "libc/dce.h"
#include "libc/errno.h"
#include "libc/intrin/asan.internal.h"
#include "libc/intrin/asancodes.h"
#include "libc/intrin/atomic.h"
#include "libc/intrin/bits.h"
#include "libc/intrin/weaken.h"
#include "libc/log/libfatal.internal.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/msr.h"
#include "libc/nt/thread.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/morph.h"
#include "libc/runtime/runtime.h"
#include "libc/stdalign.internal.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/nrlinux.h"
#include "libc/thread/posixthread.internal.h"
#include "libc/thread/tls.h"
#include "third_party/xed/x86.h"
#define __NR_sysarch 0x000000a5 // freebsd+netbsd
#define AMD64_SET_GSBASE 131 // freebsd
#define AMD64_SET_FSBASE 129 // freebsd
#define X86_SET_GSBASE 16 // netbsd
#define X86_SET_FSBASE 17 // netbsd
#define __NR___set_tcb 0x00000149
#define __NR__lwp_setprivate 0x0000013d
#define __NR_thread_fast_set_cthread_self 0x03000003
#define _TLSZ ((intptr_t)_tls_size)
#define _TLDZ ((intptr_t)_tdata_size)
#define _TIBZ sizeof(struct CosmoTib)
int sys_enable_tls();
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
__msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;
struct PosixThread _pthread_main;
extern unsigned char __tls_mov_nt_rax[];
extern unsigned char __tls_add_nt_rax[];
@@ -102,12 +74,12 @@ _Alignas(TLS_ALIGNMENT) static char __static_tls[5008];
* arch_prctl() function. However, such programs might not be portable
* and your `errno` variable also won't be thread safe anymore.
*/
privileged void __enable_tls(void) {
void __enable_tls(void) {
int tid;
size_t siz;
struct CosmoTib *tib;
char *mem, *tls;
siz = ROUNDUP(_TLSZ + _TIBZ, alignof(__static_tls));
siz = ROUNDUP(_TLSZ + _TIBZ, _Alignof(__static_tls));
if (siz <= sizeof(__static_tls)) {
// if tls requirement is small then use the static tls block
// which helps avoid a system call for apes with little tls
@@ -134,6 +106,8 @@ privileged void __enable_tls(void) {
tib->tib_self = tib;
tib->tib_self2 = tib;
tib->tib_errno = __errno;
tib->tib_strace = __strace;
tib->tib_ftrace = __ftrace;
tib->tib_pthread = (pthread_t)&_pthread_main;
if (IsLinux()) {
// gnu/systemd guarantees pid==tid for the main thread so we can
@@ -149,124 +123,10 @@ privileged void __enable_tls(void) {
__repmovsb(tls, _tdata_start, _TLDZ);
// ask the operating system to change the x86 segment register
int ax, dx;
if (IsWindows()) {
__tls_index = __imp_TlsAlloc();
_npassert(0 <= __tls_index && __tls_index < 64);
asm("mov\t%1,%%gs:%0" : "=m"(*((long *)0x1480 + __tls_index)) : "r"(tib));
} else if (IsFreebsd()) {
sys_enable_tls(AMD64_SET_FSBASE, tib);
} else if (IsLinux()) {
sys_enable_tls(ARCH_SET_FS, tib);
} else if (IsNetbsd()) {
// netbsd has sysarch(X86_SET_FSBASE) but we can't use that because
// signal handlers will cause it to be reset due to not setting the
// _mc_tlsbase field in struct mcontext_netbsd.
sys_enable_tls(tib);
} else if (IsOpenbsd()) {
sys_enable_tls(tib);
} else if (IsXnu()) {
// thread_fast_set_cthread_self has a weird ABI
int e = errno;
sys_enable_tls((intptr_t)tib - 0x30);
errno = e;
} else {
uint64_t val = (uint64_t)tib;
asm volatile("wrmsr"
: /* no outputs */
: "c"(MSR_IA32_FS_BASE), "a"((uint32_t)val),
"d"((uint32_t)(val >> 32)));
}
__set_tls(tib);
// We need to rewrite SysV _Thread_local code. You MUST use the
// -mno-tls-direct-seg-refs flag which generates code like this
//
// 64 48 8b 0R4 25 00 00 00 00 mov %fs:0,%R
// 64 48 03 0R4 25 00 00 00 00 add %fs:0,%R
//
// Which on Mac we can replace with this:
//
// 65 48 8b 0R4 25 30 00 00 00 mov %gs:0x30,%R
//
// Since we have no idea where the TLS instructions exist in the
// binary, we need to disassemble the whole program image. This'll
// potentially take a few milliseconds for some larger programs.
//
// We check `_tls_content` which is generated by the linker script
// since it lets us determine ahead of time if _Thread_local vars
// have actually been linked into this program.
if ((intptr_t)_tls_content && (IsWindows() || IsXnu())) {
int n;
uint64_t w;
sigset_t mask;
unsigned m, dis;
unsigned char *p;
__morph_begin(&mask);
if (IsXnu()) {
// Apple is quite straightforward to patch. We basically
// just change the segment register, and the linear slot
// address 0x30 was promised to us, according to Go team
// https://github.com/golang/go/issues/23617
dis = 0x30;
} else {
// MSVC __declspec(thread) generates binary code for this
// %gs:0x1480 abi. So long as TlsAlloc() isn't called >64
// times we should be good.
dis = 0x1480 + __tls_index * 8;
}
// iterate over modifiable code looking for 9 byte instruction
// this would take 30 ms using xed to enable tls on python.com
for (p = _ereal; p + 9 <= __privileged_start; p += n) {
// use sse to zoom zoom to fs register prefixes
// that way it'll take 1 ms to morph python.com
while (p + 9 + 16 <= __privileged_start) {
if ((m = __builtin_ia32_pmovmskb128(
*(xmm_t *)p == (xmm_t){0144, 0144, 0144, 0144, 0144, 0144,
0144, 0144, 0144, 0144, 0144, 0144,
0144, 0144, 0144, 0144}))) {
m = __builtin_ctzll(m);
p += m;
break;
} else {
p += 16;
}
}
// we're checking for the following expression:
// 0144 == p[0] && // %fs
// 0110 == (p[1] & 0373) && // rex.w (and ignore rex.r)
// (0213 == p[2] || // mov reg/mem → reg (word-sized)
// 0003 == p[2]) && // add reg/mem → reg (word-sized)
// 0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
// 0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
// 0000 == p[5] && // displacement (von Neumann endian)
// 0000 == p[6] && // displacement
// 0000 == p[7] && // displacement
// 0000 == p[8] // displacement
w = READ64LE(p) & READ64LE("\377\373\377\307\377\377\377\377");
if ((w == READ64LE("\144\110\213\004\045\000\000\000") ||
w == READ64LE("\144\110\003\004\045\000\000\000")) &&
!p[8]) {
// now change the code
p[0] = 0145; // change %fs to %gs
p[5] = (dis & 0x000000ff) >> 000; // displacement
p[6] = (dis & 0x0000ff00) >> 010; // displacement
p[7] = (dis & 0x00ff0000) >> 020; // displacement
p[8] = (dis & 0xff000000) >> 030; // displacement
// advance to the next instruction
n = 9;
} else {
n = 1;
}
}
__morph_end(&mask);
}
// rewrite the executable tls opcodes in memory
__morph_tls();
// we are now allowed to use tls
__tls_enabled = true;
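
For reference, the nine-byte rewrite that was deleted above and presumably now lives behind __morph_tls() boils down to a single standalone check. This is a sketch, not code from the tree: tls_morph_one and load64le are illustrative names, with load64le standing in for the READ64LE little-endian load used above.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

// Little-endian 64-bit load, standing in for READ64LE.
static uint64_t load64le(const void *p) {
  uint64_t w;
  memcpy(&w, p, 8);
  return w;  // x86-64 is little-endian, so a plain copy suffices
}

// If the nine bytes at p encode `mov %fs:0,%reg` or `add %fs:0,%reg`
// (the -mno-tls-direct-seg-refs pattern), patch them in place to read
// %gs with the 32-bit displacement `dis` instead, and return true.
static bool tls_morph_one(unsigned char *p, unsigned dis) {
  uint64_t w = load64le(p) & load64le("\377\373\377\307\377\377\377\377");
  if ((w == load64le("\144\110\213\004\045\000\000\000") ||
       w == load64le("\144\110\003\004\045\000\000\000")) &&
      !p[8]) {
    p[0] = 0145;                       // change the %fs prefix to %gs
    p[5] = (dis & 0x000000ff) >> 000;  // little-endian disp32
    p[6] = (dis & 0x0000ff00) >> 010;
    p[7] = (dis & 0x00ff0000) >> 020;
    p[8] = (dis & 0xff000000) >> 030;
    return true;
  }
  return false;
}

Per the deleted branches above, `dis` would be 0x30 on XNU and 0x1480 + __tls_index * 8 on Windows.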