Fix breakages in Linux-only build modes

- compile.com now polyfills -march=native which gcc/clang removed
- Guarantee zero Windows code is linked into non-Windows binaries
- MODE=tinylinux binaries are now back to being as tiny as ~4kb
- Improve the runtime's stack allocation / alignment hack
- GitHub Actions now tests Linux modes for assurance
This commit is contained in:
Justine Tunney 2023-07-09 19:47:46 -07:00
parent 0e4c828a8e
commit 3dc86ce154
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
32 changed files with 283 additions and 104 deletions

View file

@ -45,77 +45,75 @@ privileged void __morph_tls(void) {
// We check `_tls_content` which is generated by the linker script
// since it lets us determine ahead of time if _Thread_local vars
// have actually been linked into this program.
if (IsWindows() || IsXnu()) {
int n;
uint64_t w;
sigset_t mask;
unsigned m, dis;
unsigned char *p;
__morph_begin(&mask);
int n;
uint64_t w;
sigset_t mask;
unsigned m, dis;
unsigned char *p;
__morph_begin(&mask);
if (IsXnu()) {
// Apple is quite straightforward to patch. We basically
// just change the segment register, and the linear slot
// address 0x30 was promised to us, according to Go team
// https://github.com/golang/go/issues/23617
dis = 0x30;
} else {
// MSVC __declspec(thread) generates binary code for this
// %gs:0x1480 abi. So long as TlsAlloc() isn't called >64
// times we should be good.
dis = 0x1480 + __tls_index * 8;
}
// iterate over modifiable code looking for 9 byte instruction
// this would take 30 ms using xed to enable tls on python.com
for (p = _ereal; p + 9 <= __privileged_start; p += n) {
// use sse to zoom zoom to fs register prefixes
// that way it'll take 1 ms to morph python.com
while (p + 9 + 16 <= __privileged_start) {
if ((m = __builtin_ia32_pmovmskb128(
*(xmm_t *)p == (xmm_t){0144, 0144, 0144, 0144, 0144, 0144,
0144, 0144, 0144, 0144, 0144, 0144,
0144, 0144, 0144, 0144}))) {
m = __builtin_ctzll(m);
p += m;
break;
} else {
p += 16;
}
}
// we're checking for the following expression:
// 0144 == p[0] && // %fs
// 0110 == (p[1] & 0373) && // rex.w (and ignore rex.r)
// (0213 == p[2] || // mov reg/mem → reg (word-sized)
// 0003 == p[2]) && // add reg/mem → reg (word-sized)
// 0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
// 0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
// 0000 == p[5] && // displacement (von Neumann endian)
// 0000 == p[6] && // displacement
// 0000 == p[7] && // displacement
// 0000 == p[8] // displacement
w = READ64LE(p) & READ64LE("\377\373\377\307\377\377\377\377");
if ((w == READ64LE("\144\110\213\004\045\000\000\000") ||
w == READ64LE("\144\110\003\004\045\000\000\000")) &&
!p[8]) {
// now change the code
p[0] = 0145; // change %fs to %gs
p[5] = (dis & 0x000000ff) >> 000; // displacement
p[6] = (dis & 0x0000ff00) >> 010; // displacement
p[7] = (dis & 0x00ff0000) >> 020; // displacement
p[8] = (dis & 0xff000000) >> 030; // displacement
// advance to the next instruction
n = 9;
} else {
n = 1;
}
}
__morph_end(&mask);
if (IsXnu()) {
// Apple is quite straightforward to patch. We basically
// just change the segment register, and the linear slot
// address 0x30 was promised to us, according to Go team
// https://github.com/golang/go/issues/23617
dis = 0x30;
} else {
// MSVC __declspec(thread) generates binary code for this
// %gs:0x1480 abi. So long as TlsAlloc() isn't called >64
// times we should be good.
dis = 0x1480 + __tls_index * 8;
}
// iterate over modifiable code looking for 9 byte instruction
// this would take 30 ms using xed to enable tls on python.com
for (p = _ereal; p + 9 <= __privileged_start; p += n) {
// use sse to zoom zoom to fs register prefixes
// that way it'll take 1 ms to morph python.com
while (p + 9 + 16 <= __privileged_start) {
if ((m = __builtin_ia32_pmovmskb128(
*(xmm_t *)p == (xmm_t){0144, 0144, 0144, 0144, 0144, 0144, 0144,
0144, 0144, 0144, 0144, 0144, 0144, 0144,
0144, 0144}))) {
m = __builtin_ctzll(m);
p += m;
break;
} else {
p += 16;
}
}
// we're checking for the following expression:
// 0144 == p[0] && // %fs
// 0110 == (p[1] & 0373) && // rex.w (and ignore rex.r)
// (0213 == p[2] || // mov reg/mem → reg (word-sized)
// 0003 == p[2]) && // add reg/mem → reg (word-sized)
// 0004 == (p[3] & 0307) && // mod/rm (4,reg,0) means sib → reg
// 0045 == p[4] && // sib (5,4,0) → (rbp,rsp,0) → disp32
// 0000 == p[5] && // displacement (von Neumann endian)
// 0000 == p[6] && // displacement
// 0000 == p[7] && // displacement
// 0000 == p[8] // displacement
w = READ64LE(p) & READ64LE("\377\373\377\307\377\377\377\377");
if ((w == READ64LE("\144\110\213\004\045\000\000\000") ||
w == READ64LE("\144\110\003\004\045\000\000\000")) &&
!p[8]) {
// now change the code
p[0] = 0145; // change %fs to %gs
p[5] = (dis & 0x000000ff) >> 000; // displacement
p[6] = (dis & 0x0000ff00) >> 010; // displacement
p[7] = (dis & 0x00ff0000) >> 020; // displacement
p[8] = (dis & 0xff000000) >> 030; // displacement
// advance to the next instruction
n = 9;
} else {
n = 1;
}
}
__morph_end(&mask);
#endif
}