Fix thread-local storage bugs on aarch64

This change fixes an issue where .tbss memory might not be initialized.
Justine Tunney 2024-05-08 04:03:51 -07:00
commit ae2a7ac844 (parent 793393a341)
6 changed files with 93 additions and 65 deletions
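To make the bug concrete, here is an illustration of the guarantee at stake (my sketch, not code from this commit): a `_Thread_local` variable declared without an initializer is placed in .tbss and must read as zero in every thread, while initialized ones are copied in from .tdata.

#include <assert.h>
#include <pthread.h>

_Thread_local int counter;    // no initializer: lives in .tbss
_Thread_local int base = 42;  // initialized: lives in .tdata

static void *worker(void *arg) {
  assert(counter == 0);  // breaks if .tbss isn't zeroed for this thread
  assert(base == 42);    // breaks if .tdata isn't copied in
  return arg;
}

int main(void) {
  pthread_t th;
  worker(0);  // the main thread's TLS is set up by __enable_tls()
  pthread_create(&th, 0, worker, 0);
  pthread_join(&th, 0);
}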


@@ -293,7 +293,6 @@ _tdata_size = _tdata_end - _tdata_start;
 _tbss_size = _tbss_end - _tbss_start;
 _tbss_offset = _tbss_start - _tdata_start;
 _tls_content = (_tdata_end - _tdata_start) + (_tbss_end - _tbss_start);
-_tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss));
-ASSERT(ALIGNOF(.tdata) <= TLS_ALIGNMENT && ALIGNOF(.tbss) <= TLS_ALIGNMENT,
-       "_Thread_local _Alignof can't exceed TLS_ALIGNMENT");
+_tdata_align = ALIGNOF(.tdata);
+_tbss_align = ALIGNOF(.tbss);
+_tls_align = MAX(TLS_ALIGNMENT, MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)));
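In other words, the script stops treating over-aligned thread locals as a link error: instead of asserting that `_Alignof` never exceeds TLS_ALIGNMENT, it exports each section's alignment and clamps `_tls_align` from below. A sketch of the new rule as plain C (mine, with the script's inputs turned into parameters):

#define MAX(x, y) ((x) > (y) ? (x) : (y))

unsigned long ComputeTlsAlign(unsigned long platform_min,   // TLS_ALIGNMENT
                              unsigned long tdata_align,    // ALIGNOF(.tdata)
                              unsigned long tbss_align) {   // ALIGNOF(.tbss)
  // never below the platform minimum, never below either section
  return MAX(platform_min, MAX(tdata_align, tbss_align));
}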


@@ -553,7 +553,9 @@ _tdata_size = _tdata_end - _tdata_start;
 _tbss_size = _tbss_end - _tbss_start;
 _tbss_offset = _tbss_start - _tdata_start;
 _tls_content = (_tdata_end - _tdata_start) + (_tbss_end - _tbss_start);
-_tls_align = 1;
+_tdata_align = ALIGNOF(.tdata);
+_tbss_align = ALIGNOF(.tbss);
+_tls_align = MAX(TLS_ALIGNMENT, MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)));
 ape_cod_offset = 0;
 ape_cod_vaddr = ADDR(.head);


@@ -16,6 +16,8 @@ extern unsigned char _tdata_end[] __attribute__((__weak__));
 extern unsigned char _tbss_start[] __attribute__((__weak__));
 extern unsigned char _tbss_end[] __attribute__((__weak__));
 extern unsigned char _tls_align[] __attribute__((__weak__));
+extern unsigned char _tdata_align[] __attribute__((__weak__));
+extern unsigned char _tbss_align[] __attribute__((__weak__));
 extern unsigned char __test_start[] __attribute__((__weak__));
 extern unsigned char __ro[] __attribute__((__weak__));
 extern unsigned char __data_start[] __attribute__((__weak__));
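These declarations follow the usual trick for linker-script symbols: the script assigns each symbol a value, so the runtime takes the symbol's address rather than reading any storage. A sketch of how that works, assuming the `I()` macro seen elsewhere in this diff is a plain integer cast:

#include <stdint.h>

// the linker script assigns e.g. _tbss_align = ALIGNOF(.tbss), so the
// value lives in the symbol's address; the array has no storage at all
extern unsigned char _tbss_align[] __attribute__((__weak__));

#define I(x) ((uintptr_t)(x))  // assumed definition of the I() macro

uintptr_t GetTbssAlign(void) {
  // weak: evaluates to 0 when no such symbol was linked in
  return I(_tbss_align);
}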


@@ -255,7 +255,7 @@ static int __sigaction(int sig, const struct sigaction *act,
     // xnu silicon claims to support sa_resethand but it does nothing
     // this can be tested, since it clears the bit from flags as well
     if (!rc && oldact &&
-        (((struct sigaction_silicon *)ap)->sa_flags & SA_RESETHAND)) {
+        (((struct sigaction_silicon *)oldact)->sa_flags & SA_RESETHAND)) {
       ((struct sigaction_silicon *)oldact)->sa_flags |= SA_RESETHAND;
     }
   }
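The two comment lines above describe a detectable quirk; a standalone probe (my sketch, not code from this commit) would look like this:

#include <signal.h>
#include <stdio.h>
#include <string.h>

static void OnSig(int sig) {
  (void)sig;
}

int main(void) {
  struct sigaction sa, old;
  memset(&sa, 0, sizeof(sa));
  sa.sa_handler = OnSig;
  sa.sa_flags = SA_RESETHAND;
  sigaction(SIGUSR1, &sa, 0);   // ask for one-shot disposition
  sigaction(SIGUSR1, 0, &old);  // read back what the kernel stored
  if (!(old.sa_flags & SA_RESETHAND)) {
    puts("kernel dropped SA_RESETHAND");  // the xnu silicon behavior
  }
  return 0;
}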


@@ -112,10 +112,6 @@ static unsigned long ParseMask(const char *str) {
  * and your `errno` variable also won't be thread safe anymore.
  */
 textstartup void __enable_tls(void) {
-  int tid;
-  size_t siz;
-  char *mem, *tls;
-  struct CosmoTib *tib;
   // Here's the layout we're currently using:
   //
@@ -138,7 +134,8 @@ textstartup void __enable_tls(void) {
 #ifdef __x86_64__
-  siz = ROUNDUP(I(_tls_size) + sizeof(*tib), TLS_ALIGNMENT);
+  char *mem;
+  size_t siz = ROUNDUP(I(_tls_size) + sizeof(struct CosmoTib), TLS_ALIGNMENT);
   if (siz <= sizeof(__static_tls)) {
     // if tls requirement is small then use the static tls block
     // which helps avoid a system call for apes with little tls
@@ -158,28 +155,44 @@ textstartup void __enable_tls(void) {
                   kAsanProtected);
   }
-  tib = (struct CosmoTib *)(mem + siz - sizeof(*tib));
-  tls = mem + siz - sizeof(*tib) - I(_tls_size);
+  struct CosmoTib *tib = (struct CosmoTib *)(mem + siz - sizeof(*tib));
+  char *tls = mem + siz - sizeof(*tib) - I(_tls_size);
+  // copy in initialized data section
+  if (I(_tdata_size)) {
+    if (IsAsan()) {
+      __asan_memcpy(tls, _tdata_start, I(_tdata_size));
+    } else {
+      memcpy(tls, _tdata_start, I(_tdata_size));
+    }
+  }
 #elif defined(__aarch64__)
-  size_t hiz = ROUNDUP(sizeof(*tib) + 2 * sizeof(void *), I(_tls_align));
-  siz = hiz + I(_tls_size);
-  if (siz <= sizeof(__static_tls)) {
+  uintptr_t size = ROUNDUP(sizeof(struct CosmoTib), I(_tls_align)) +  //
+                   ROUNDUP(sizeof(uintptr_t) * 2, I(_tdata_align)) +  //
+                   ROUNDUP(I(_tdata_size), I(_tbss_align)) +          //
+                   I(_tbss_size);
+  char *mem;
+  if (I(_tls_align) <= TLS_ALIGNMENT && size <= sizeof(__static_tls)) {
     mem = __static_tls;
   } else {
-    mem = _weaken(_mapanon)(siz);
+    mem = _weaken(_mapanon)(size);
   }
-  if (IsAsan()) {
-    // there's a roundup(pagesize) gap between .tdata and .tbss
-    // poison that empty space
-    __asan_poison(mem + hiz + I(_tdata_size), I(_tbss_offset) - I(_tdata_size),
-                  kAsanProtected);
-  }
+  struct CosmoTib *tib =
+      (struct CosmoTib *)(mem +
+                          ROUNDUP(sizeof(struct CosmoTib), I(_tls_align)) -
+                          sizeof(struct CosmoTib));
-  tib = (struct CosmoTib *)mem;
-  tls = mem + hiz;
+  uintptr_t *dtv = (uintptr_t *)(tib + 1);
+  size_t dtv_size = sizeof(uintptr_t) * 2;
+  char *tdata = (char *)dtv + ROUNDUP(dtv_size, I(_tdata_align));
+  if (I(_tdata_size)) {
+    memmove(tdata, _tdata_start, I(_tdata_size));
+  }
   // Set the DTV.
   //
@@ -189,8 +202,8 @@ textstartup void __enable_tls(void) {
   //
   // @see musl/src/env/__init_tls.c
   // @see https://chao-tic.github.io/blog/2018/12/25/tls
-  ((uintptr_t *)tls)[-2] = 1;
-  ((void **)tls)[-1] = tls;
+  dtv[0] = 1;
+  dtv[1] = (uintptr_t)tdata;
 #else
 #error "unsupported architecture"
@@ -213,6 +226,8 @@ textstartup void __enable_tls(void) {
   } else if (IsXnuSilicon()) {
     tib->tib_syshand = __syslib->__pthread_self();
   }
+  int tid;
   if (IsLinux() || IsXnuSilicon()) {
     // gnu/systemd guarantees pid==tid for the main thread so we can
     // avoid issuing a superfluous system call at startup in program
@@ -237,15 +252,6 @@ textstartup void __enable_tls(void) {
   _pthread_list = &_pthread_static.list;
   atomic_store_explicit(&_pthread_static.ptid, tid, memory_order_relaxed);
-  // copy in initialized data section
-  if (I(_tdata_size)) {
-    if (IsAsan()) {
-      __asan_memcpy(tls, _tdata_start, I(_tdata_size));
-    } else {
-      memcpy(tls, _tdata_start, I(_tdata_size));
-    }
-  }
   // ask the operating system to change the x86 segment register
   __set_tls(tib);
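A note on why the new aarch64 arithmetic fixes the .tbss problem, so far as the diff shows: the old path sized the block as `hiz + _tls_size` and kept the executable image's page-rounded gap between .tdata and .tbss (see the removed `__asan_poison` comment), whereas the new path packs tib, the two-word DTV, .tdata, and .tbss back to back, rounding each boundary up to the alignment of the piece that follows, so .tbss always lands inside the freshly zeroed allocation. A standalone sketch of the same layout math (mine, with `sizeof(struct CosmoTib)` abstracted into a parameter); because `_tls_align` is now the max of all the alignments, every rounded boundary here is genuinely aligned:

#include <stdint.h>

// power-of-two rounding, same contract as the ROUNDUP() used in the diff
#define ROUNDUP(x, a) (((uintptr_t)(x) + ((a) - 1)) & ~((uintptr_t)(a) - 1))

struct TlsOffsets {
  uintptr_t tib, dtv, tdata, tbss, total;
};

// mirrors the diff's layout |pad|tib|dtv|.tdata|.tbss| with each
// boundary rounded up to the alignment of the piece that follows
struct TlsOffsets LayoutAarch64Tls(uintptr_t tib_size,      // sizeof(struct CosmoTib)
                                   uintptr_t tls_align,     // I(_tls_align)
                                   uintptr_t tdata_size,    // I(_tdata_size)
                                   uintptr_t tdata_align,   // I(_tdata_align)
                                   uintptr_t tbss_size,     // I(_tbss_size)
                                   uintptr_t tbss_align) {  // I(_tbss_align)
  struct TlsOffsets o;
  o.tib = ROUNDUP(tib_size, tls_align) - tib_size;  // tib ends on an aligned boundary
  o.dtv = o.tib + tib_size;                         // dtv = (uintptr_t *)(tib + 1)
  o.tdata = o.dtv + ROUNDUP(sizeof(uintptr_t) * 2, tdata_align);
  o.tbss = o.tdata + ROUNDUP(tdata_size, tbss_align);
  o.total = o.tbss + tbss_size;  // equals the `size` computed in the diff
  return o;
}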


@@ -25,6 +25,7 @@
 #include "libc/mem/mem.h"
 #include "libc/runtime/internal.h"
 #include "libc/runtime/runtime.h"
+#include "libc/stdio/sysparam.h"
 #include "libc/str/locale.h"
 #include "libc/str/str.h"
 #include "libc/thread/tls.h"
@@ -54,6 +55,19 @@ static char *_mktls_below(struct CosmoTib **out_tib) {
   char *mem, *tls;
   struct CosmoTib *tib;
+  // Here's the TLS memory layout on x86_64
+  //
+  //                   __get_tls()
+  //                             │
+  //          %fs OpenBSD/NetBSD │
+  //   _Thread_local             │
+  //   ┌───┬──────────┬──────────┼───┐
+  //   │pad│  .tdata  │  .tbss   │tib│
+  //   └───┴──────────┴──────────┼───┘
+  //                             │
+  //   Linux/FreeBSD/Windows/Mac %gs
+  //
   siz = ROUNDUP(I(_tls_size) + sizeof(*tib), _Alignof(struct CosmoTib));
   siz = ROUNDUP(siz, _Alignof(struct CosmoTib));
   mem = memalign(_Alignof(struct CosmoTib), siz);
@@ -77,6 +91,7 @@ static char *_mktls_below(struct CosmoTib **out_tib) {
   }
   // clear .tbss
+  if (I(_tbss_size))
     bzero(tls + I(_tbss_offset), I(_tbss_size));
   // set up thread information block
@@ -84,46 +99,50 @@ static char *_mktls_below(struct CosmoTib **out_tib) {
 }
 
 static char *_mktls_above(struct CosmoTib **out_tib) {
-  size_t hiz, siz;
-  struct CosmoTib *tib;
-  char *mem, *dtv, *tls;
-  // allocate memory for tdata, tbss, and tib
-  hiz = ROUNDUP(sizeof(*tib) + 2 * sizeof(void *), I(_tls_align));
-  siz = hiz + I(_tls_size);
-  mem = memalign(TLS_ALIGNMENT, siz);
+  // Here's the TLS memory layout on aarch64
+  //
+  //         x28
+  //     %tpidr_el0
+  //          │
+  //          │  _Thread_local
+  //      ┌───┼───┬──────────┬──────────┐
+  //      │tib│dtv│  .tdata  │  .tbss   │
+  //      ├───┴───┴──────────┴──────────┘
+  //      │
+  //  __get_tls()
+  //
+  size_t size = ROUNDUP(sizeof(struct CosmoTib), I(_tls_align)) +  //
+                ROUNDUP(sizeof(uintptr_t) * 2, I(_tdata_align)) +  //
+                ROUNDUP(I(_tdata_size), I(_tbss_align)) +          //
+                I(_tbss_size);
+  char *mem = memalign(I(_tls_align), size);
   if (!mem)
     return 0;
-  // poison memory between tdata and tbss
-  if (IsAsan()) {
-    __asan_poison(mem + hiz + I(_tdata_size), I(_tbss_offset) - I(_tdata_size),
-                  kAsanProtected);
-  }
+  struct CosmoTib *tib =
+      (struct CosmoTib *)(mem +
+                          ROUNDUP(sizeof(struct CosmoTib), I(_tls_align)) -
+                          sizeof(struct CosmoTib));
-  tib = (struct CosmoTib *)mem;
-  dtv = mem + sizeof(*tib);
-  tls = mem + hiz;
+  uintptr_t *dtv = (uintptr_t *)(tib + 1);
+  size_t dtv_size = sizeof(uintptr_t) * 2;
-  // set dtv
-  ((uintptr_t *)dtv)[0] = 1;
-  ((void **)dtv)[1] = tls;
   // initialize .tdata
+  char *tdata = (char *)dtv + ROUNDUP(dtv_size, I(_tdata_align));
   if (I(_tdata_size)) {
-    if (IsAsan()) {
-      __asan_memcpy(tls, _tdata_start, I(_tdata_size));
-    } else {
-      memmove(tls, _tdata_start, I(_tdata_size));
-    }
+    memmove(tdata, _tdata_start, I(_tdata_size));
   }
   // clear .tbss
+  char *tbss = tdata + ROUNDUP(I(_tdata_size), I(_tbss_align));
   if (I(_tbss_size)) {
-    bzero(tls + I(_tbss_offset), I(_tbss_size));
+    bzero(tbss, I(_tbss_size));
   }
   // set up thread information block
+  dtv[0] = 1;
+  dtv[1] = (uintptr_t)tdata;
   return _mktls_finish(out_tib, mem, tib);
 }
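For readers unfamiliar with the DTV convention both functions now share (per the musl links cited in the diff): slot 0 holds a generation counter and slot 1 holds the TLS base of the main executable, which is module 1. A hypothetical resolver showing how a general-dynamic TLS access would consume this structure (the names below are mine, not cosmopolitan's; a fully static binary doesn't need the real __tls_get_addr machinery):

#include <stdint.h>

// hypothetical mirror of the {module, offset} pair a compiler emits
// for general-dynamic TLS accesses
struct TlsIndex {
  uintptr_t module;  // 1 for the executable itself
  uintptr_t offset;  // offset of the variable within that module's block
};

void *TlsGetAddr(const uintptr_t *dtv, const struct TlsIndex *ti) {
  // dtv[0] is the generation count (fixed at 1 here, since nothing is
  // ever dlopen'd); dtv[m] is the TLS base of module m, so with
  // dtv[1] = tdata this resolves the executable's _Thread_local data
  return (char *)dtv[ti->module] + ti->offset;
}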