Get --ftrace working on aarch64

This change implements a new approach to function call logging that's
based on the GCC flag -fpatchable-function-entry. Read the commentary
in build/config.mk to learn how it works.
Justine Tunney 2023-06-05 23:35:31 -07:00
parent 5b908bc756
commit eb40cb371d
934 changed files with 2259 additions and 1268 deletions
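
To make the mechanism concrete, here is a sketch of what that flag does; this
illustrates GCC's documented behavior and is not code from the commit. With
-fpatchable-function-entry=N,M the compiler emits N nops per function, M of
them before the symbol and N-M at the entry point, which is exactly the dead
space the new hooking code below scans for and rewrites:

    // Hypothetical example: the function-level attribute equivalent of
    // building with -fpatchable-function-entry=7,6 on aarch64.
    //
    //         nop            <- func[-6] ... func[-1]: six nops
    //         ...
    //   func: nop            <- func[+0]: one nop at the entry point
    //
    // On x86_64 the flag is =11,9: nine nop bytes before the symbol and a
    // two-byte nop at the entry, big enough for a rel8 jmp into the pad.
    __attribute__((patchable_function_entry(7, 6)))
    int example(int x) {
      return x + 1;
    }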


@@ -191,7 +191,7 @@ cosmo: push %rbp
         .init.end 305,_init_symbols
 #endif
-#ifdef __PG__
+#ifdef FTRACE
         .init.start 306,_init_ftrace
         push %rdi
         push %rsi


@@ -154,7 +154,7 @@ textstartup void cosmo(long *sp, struct Syslib *m1) {
   environ = envp;
   if (argc) program_invocation_name = argv[0];
 
-  // run initialization callbacks
+  // initialize program
   _init();
   __enable_tls();
 #ifdef SYSDEBUG
@@ -163,6 +163,9 @@ textstartup void cosmo(long *sp, struct Syslib *m1) {
   for (fp = __init_array_end; fp-- > __init_array_start;) {
     (*fp)(argc, argv, envp, auxv);
   }
+#ifdef FTRACE
+  argc = ftrace_init();
+#endif
 
   // run program
   if (!IsTiny()) __wipe(0);


@@ -19,10 +19,11 @@
 #include "libc/macros.internal.h"
 
 // Re-initializes FPU.
+        .ftrace1
 fpreset:
 _fpreset:
+        .ftrace2
         .leafprologue
-        .profilable
         finit
         .leafepilogue
         .endfn _fpreset,globl


@@ -67,43 +67,30 @@ ftrace_hook:
         movaps 0x60(%rsp),%xmm6
         movaps 0x70(%rsp),%xmm7
         leave
+        ret
 #elif defined(__aarch64__)
-        stp x0,x1,[sp,#-16]!
-        stp x2,x3,[sp,#-16]!
-        stp x4,x5,[sp,#-16]!
-        stp x6,x7,[sp,#-16]!
-        stp x8,x9,[sp,#-16]!
-        stp x10,x11,[sp,#-16]!
-        stp x12,x13,[sp,#-16]!
-        stp x14,x15,[sp,#-16]!
-        stp x16,x17,[sp,#-16]!
-        stp x18,x19,[sp,#-16]!
-        stp x20,x21,[sp,#-16]!
-        stp x22,x23,[sp,#-16]!
-        stp x24,x25,[sp,#-16]!
-        stp x26,x27,[sp,#-16]!
-        stp x28,x29,[sp,#-16]!
-        str x30,[sp,#-16]!
+        adrp x9,__ftrace
+        ldr w9,[x9,#:lo12:__ftrace]
+        cmp w9,1
+        bge 1f
+        ret
+1:      stp x29,x30,[sp,-96]!
+        mov x29,sp
+        stp x0,x1,[sp,16]
+        stp x2,x3,[sp,32]
+        stp x4,x5,[sp,48]
+        stp x6,x7,[sp,64]
+        str x8,[sp,80]
         bl ftracer
-        ldr x30,[sp,#16]!
-        ldp x28,x29,[sp,#16]!
-        ldp x26,x27,[sp,#16]!
-        ldp x24,x25,[sp,#16]!
-        ldp x22,x23,[sp,#16]!
-        ldp x20,x21,[sp,#16]!
-        ldp x18,x19,[sp,#16]!
-        ldp x16,x17,[sp,#16]!
-        ldp x14,x15,[sp,#16]!
-        ldp x12,x13,[sp,#16]!
-        ldp x10,x11,[sp,#16]!
-        ldp x8,x9,[sp,#16]!
-        ldp x6,x7,[sp,#16]!
-        ldp x4,x5,[sp,#16]!
-        ldp x2,x3,[sp,#16]!
-        ldp x0,x1,[sp,#16]!
+        ldr x8,[sp,80]
+        ldp x6,x7,[sp,64]
+        ldp x4,x5,[sp,48]
+        ldp x2,x3,[sp,32]
+        ldp x0,x1,[sp,16]
+        ldp x29,x30,[sp],96
+        ret
 #endif /* __x86_64__ */
-        ret
         .endfn ftrace_hook,globl
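
A note on the register set above: the rewritten aarch64 path no longer dumps
every general-purpose register, only x0-x7 and x8, the argument and
indirect-result registers of the AAPCS64 calling convention, plus the x29/x30
frame pair. That suffices because ftrace_hook runs before the hooked
function's own prologue and must merely look like a no-op to it. Roughly, the
contract is as follows (my paraphrase in C, not text from the commit):

    // ftrace_hook() is injected ahead of every function body, so any
    // register an incoming call may be using to pass data must survive it.
    void ftrace_hook(void);

    // Example: all eight register arguments (x0-x7 on aarch64) still reach
    // the function body intact even though the hook ran first.
    long Sum8(long a, long b, long c, long d, long e, long f, long g,
              long h) {
      return a + b + c + d + e + f + g + h;
    }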


@@ -31,8 +31,6 @@
 #include "libc/thread/tls.h"
 #include "libc/thread/tls2.h"
 
-#define MAX_NESTING 512
-
 /**
  * @fileoverview Plain-text function call logging.
  *
@@ -41,6 +39,14 @@
  * into gzip.
  */
 
+#define MAX_NESTING 512
+
+#ifdef __x86_64__
+#define DETOUR_SKEW 2
+#elif defined(__aarch64__)
+#define DETOUR_SKEW 8
+#endif
+
 void ftrace_hook(void);
 
 static int g_stackdigs;
@@ -72,6 +78,7 @@ static privileged inline int GetNestingLevel(struct CosmoFtrace *ft,
  * according to the System Five NexGen32e ABI.
  */
 privileged void ftracer(void) {
+  uintptr_t fn;
   long stackuse;
   struct CosmoTib *tib;
   struct StackFrame *sf;
@@ -91,11 +98,12 @@ privileged void ftracer(void) {
   if (_cmpxchg(&ft->ft_noreentry, false, true)) {
     sf = __builtin_frame_address(0);
     sf = sf->next;
-    if (sf->addr != ft->ft_lastaddr) {
+    fn = sf->addr + DETOUR_SKEW;
+    if (fn != ft->ft_lastaddr) {
       stackuse = GetStackAddr() + GetStackSize() - (intptr_t)sf;
       kprintf("%rFUN %6P %'13T %'*ld %*s%t\n", g_stackdigs, stackuse,
-              GetNestingLevel(ft, sf) * 2, "", sf->addr);
-      ft->ft_lastaddr = sf->addr;
+              GetNestingLevel(ft, sf) * 2, "", fn);
+      ft->ft_lastaddr = fn;
     }
     ft->ft_noreentry = false;
   }
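
Where do the DETOUR_SKEW constants come from? They mirror the detour layout
that HookFunction() writes (see the next file): the call into ftrace_hook
sits in the pad just before the symbol, so the return address recorded in the
stack frame lands a little before the function itself. A sketch of that
arithmetic, as I read the layout (an inference, not text from the commit):

    #include <stdint.h>

    // x86_64: the 5-byte call starts at func-7, so the pushed return
    // address is func-2; adding 2 recovers func.
    // aarch64: the bl occupies bytes func-12..func-9, so x30 comes back
    // as func-8; adding 8 recovers func.
    #ifdef __x86_64__
    #define DETOUR_SKEW 2
    #else
    #define DETOUR_SKEW 8
    #endif

    // Normalizes a frame's return address to the symbol address that the
    // kprintf %t directive resolves against the symbol table.
    static inline uintptr_t ReturnAddrToSymbol(uintptr_t addr) {
      return addr + DETOUR_SKEW;
    }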


@@ -18,99 +18,114 @@
  */
 #include "ape/sections.internal.h"
 #include "libc/calls/struct/sigset.h"
+#include "libc/limits.h"
+#include "libc/macros.internal.h"
 #include "libc/runtime/morph.h"
 #include "libc/runtime/runtime.h"
 #include "libc/runtime/symbols.internal.h"
 
+#ifdef __x86_64__
+typedef uint8_t code_t;
+#elif defined(__aarch64__)
+typedef uint32_t code_t;
+#else
+#error "unsupported architecture"
+#endif
+
+static privileged bool IsVirginFunction(const code_t *func) {
+#ifdef __x86_64__
+  long i;
+  // function must be preceded by 9 nops
+  for (i = -9; i < 0; ++i) {
+    if (func[i] != 0x90) return false;
+  }
+  // function must start with `nop nop` or `xchg %ax,%ax`
+  if (func[0] == 0x90 && func[1] == 0x90) return true;
+  if (func[0] == 0x66 && func[1] == 0x90) return true;
+  return false;
+#elif defined(__aarch64__)
+  long i;
+  // function must be preceded by 6 nops
+  for (i = -6; i < 0; ++i) {
+    if (func[i] != 0xd503201f) return false;
+  }
+  // function must start with one nop
+  return func[0] == 0xd503201f;
+#endif
+}
+
+static privileged void HookFunction(code_t *func, void *dest) {
+  long dp;
+#ifdef __x86_64__
+  dp = (intptr_t)dest - (intptr_t)(func - 7 + 5);
+  if (!(INT32_MIN <= dp && dp <= INT32_MAX)) return;
+  // emit `ud2` signature for safety and checkability
+  func[-9] = 0x0f;
+  func[-8] = 0x0b;
+  // emit `call dest` instruction
+  func[-7] = 0xe8;
+  func[-6] = dp;
+  func[-5] = dp >> 8;
+  func[-4] = dp >> 16;
+  func[-3] = dp >> 24;
+  // emit `jmp +2` instruction to re-enter hooked function
+  func[-2] = 0xeb;
+  func[-1] = 0x02;
+  // emit `jmp -9` instruction to enter detour
+  func[+0] = 0xeb;
+  func[+1] = -7 - 2;
+#elif defined(__aarch64__)
+  dp = (code_t *)dest - (func - 3);
+  if (!(-33554432 <= dp && dp <= +33554431)) return;
+  func[-6] = 0xd4200000 | (31337 << 5);       // brk #31337
+  func[-5] = 0xa9bf7bfd;                      // stp x29,x30,[sp,#-16]!
+  func[-4] = 0x910003fd;                      // mov x29,sp
+  func[-3] = 0x94000000 | (dp & 0x03ffffff);  // bl dest
+  func[-2] = 0xa8c17bfd;                      // ldp x29,x30,[sp],#16
+  func[-1] = 0x14000000 | (+2 & 0x03ffffff);  // b +1
+  func[+0] = 0x14000000 | (-5 & 0x03ffffff);  // b -5
+#endif
+}
+
 /**
  * Rewrites code in memory to hook function calls.
  *
- * We do this by searching each function for the nop instruction
- * inserted by GCC when we use the -pg -mnop-mcount flags. There's no
- * risk of corrupting data since the linker scripts won't mix code and
- * data.
+ * On x86_64 you need the compiler flag:
+ *
- * Modules built with -O3 and without the profiling flags might have
- * these same nop instructions, but that shouldn't be problematic since
- * they're only there for the purposes of aligning jumps, and therefore
- * aren't actually executed. However codebases that use huge function
- * alignments with wide-nop slides could pose minor issues. Further note
- * that Cosmopolitan sources are almost never intentionally written to
- * use code alignment, since we've only seen a few cases where it helps.
+ *     -fpatchable-function-entry=11,9
+ *
+ * On Aarch64 you need the compiler flag:
+ *
+ *     -fpatchable-function-entry=7,6
+ *
+ * This function can currently only be called once.
+ *
+ * @param dest is the address of the target function, which all hookable
+ *     functions shall be reprogrammed to call from their prologues; and
+ *     must be sufficiently close in memory to the program image, in
+ *     order to meet ISA displacement requirements
+ * @param st can be obtained using `GetSymbolTable()`
  * @see ape/ape.lds
  */
-privileged noinstrument noasan int __hook(void *ifunc,
-                                          struct SymbolTable *symbols) {
-  int rc;
-  size_t i;
-  char *p, *pe;
-  intptr_t addr;
+privileged noinstrument noasan int __hook(void *dest, struct SymbolTable *st) {
+  long i;
   sigset_t mask;
-  uint64_t code, mcode;
-  intptr_t kMcount = (intptr_t)&mcount;
-  intptr_t kProgramCodeStart = (intptr_t)_ereal;
-  intptr_t kPrivilegedStart = (intptr_t)__privileged_addr;
-  if (!symbols) return -1;
+  code_t *p, *pe;
+  intptr_t lowest;
+  if (!st) return -1;
   __morph_begin(&mask);
-  for (i = 0; i < symbols->count; ++i) {
-    if (symbols->addr_base + symbols->symbols[i].x < kProgramCodeStart) {
-      continue;
-    }
-    if (symbols->addr_base + symbols->symbols[i].y >= kPrivilegedStart) {
-      break;
-    }
-    for (p = (char *)symbols->addr_base + symbols->symbols[i].x,
-        pe = (char *)symbols->addr_base + symbols->symbols[i].y;
-         p + 8 - 1 <= pe; ++p) {
-      code = ((uint64_t)(255 & p[7]) << 070 | (uint64_t)(255 & p[6]) << 060 |
-              (uint64_t)(255 & p[5]) << 050 | (uint64_t)(255 & p[4]) << 040 |
-              (uint64_t)(255 & p[3]) << 030 | (uint64_t)(255 & p[2]) << 020 |
-              (uint64_t)(255 & p[1]) << 010 | (uint64_t)(255 & p[0]) << 000);
-      /*
-       * Test for -mrecord-mcount (w/ -fpie or -fpic)
-       *
-       *     nopw 0x00(%rax,%rax,1)  morphed by package.com
-       *     call *mcount(%rip)      linked w/o -static
-       *     addr32 call mcount      relaxed w/ -static
-       *
-       * Note that gcc refuses to insert the six byte nop.
-       */
-      if ((code & 0x0000FFFFFFFFFFFF) == 0x0000441F0F66 ||
-          (code & 0x0000FFFFFFFFFFFF) ==
-              ((((kMcount - ((intptr_t)&p[2] + 4)) << 16) | 0xE867) &
-               0x0000FFFFFFFFFFFF) ||
-          (code & 0x0000FFFFFFFFFFFF) ==
-              ((((kMcount - ((intptr_t)&p[2] + 4)) << 16) | 0xFF15) &
-               0x0000FFFFFFFFFFFF)) {
-        p[0] = 0x67;
-        p[1] = 0xE8;
-        addr = (intptr_t)ifunc - ((intptr_t)&p[2] + 4);
-        p[2] = (addr & 0x000000ff) >> 000;
-        p[3] = (addr & 0x0000ff00) >> 010;
-        p[4] = (addr & 0x00ff0000) >> 020;
-        p[5] = (addr & 0xff000000) >> 030;
-        break;
-      }
-      /*
-       * Test for -mnop-mcount (w/ -fno-pie)
-       */
-      mcode = code & 0x000000FFFFFFFFFF;
-      if ((mcode == 0x0000441F0F /* nopl 0x00(%eax,%eax,1) [canonical] */) ||
-          (mcode == 0x00041F0F67 /* nopl (%eax,%eax,1) [older gcc] */)) {
-        if (p[-1] != 0x66 /* nopw 0x0(%rax,%rax,1) [donotwant] */) {
-          p[0] = 0xE8 /* call Jvds */;
-          addr = (intptr_t)ifunc - ((intptr_t)&p[1] + 4);
-          p[1] = (addr & 0x000000ff) >> 000;
-          p[2] = (addr & 0x0000ff00) >> 010;
-          p[3] = (addr & 0x00ff0000) >> 020;
-          p[4] = (addr & 0xff000000) >> 030;
-        }
-        break;
-      }
-    }
-  }
+  lowest = MAX((intptr_t)__executable_start, (intptr_t)_ereal);
+  for (i = 0; i < st->count; ++i) {
+    if (st->symbols[i].x < 9) continue;
+    if (st->addr_base + st->symbols[i].x < lowest) continue;
+    if (st->addr_base + st->symbols[i].y >= (intptr_t)__privileged_addr) break;
+    p = (code_t *)((char *)st->addr_base + st->symbols[i].x);
+    pe = (code_t *)((char *)st->addr_base + st->symbols[i].y);
+    if (pe - p < 2) continue;
+    if (IsVirginFunction(p)) {
+      // kprintf("hooking %t\n", p);
+      HookFunction(p, dest);
+    } else {
+      // kprintf("can't hook %t at %lx\n", p, p);
+    }
+  }
   __morph_end(&mask);
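
Tying it together: once every virgin function's pad has been rewritten, each
call into a hooked function bounces through its detour into ftrace_hook,
which bails out unless __ftrace is armed. A minimal sketch of how the
runtime's ftrace_init() would be expected to arm everything, assuming
GetSymbolTable() from symbols.internal.h as named in the docstring (the exact
call site isn't shown in this diff):

    #include "libc/runtime/symbols.internal.h"

    void ftrace_hook(void);
    int __hook(void *, struct SymbolTable *);

    // Rewrites the patch pads of every eligible function so that their
    // entry points jump through a detour that calls ftrace_hook().
    static int ArmFunctionTracing(void) {
      return __hook((void *)ftrace_hook, GetSymbolTable());
    }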


@@ -48,11 +48,12 @@
         .section .initprologue,"ax",@progbits
         .type _init,@function
         .globl _init
+        .ftrace1
 _init:
+        .ftrace2
 #ifdef __x86_64__
         push %rbp
         mov %rsp,%rbp
-        .profilable
         ezlea __init_bss_start,di
         ezlea __init_rodata_start,si
 #elif defined(__aarch64__)


@@ -29,8 +29,9 @@ extern unsigned char _tls_size[] __attribute__((__weak__));
 extern unsigned char _tls_content[] __attribute__((__weak__));
 
 void _init(void) _Hide;
-void __morph_tls(void);
-void __enable_tls(void);
+int ftrace_init(void) _Hide;
+void __morph_tls(void) _Hide;
+void __enable_tls(void) _Hide;
 void __enable_threads(void) _Hide;
 void *__cxa_finalize(void *) _Hide;
 void __stack_chk_fail(void) wontreturn relegated;


@@ -18,14 +18,12 @@
  */
 #define ShouldUseMsabiAttribute() 1
 #include "ape/sections.internal.h"
-#include "libc/assert.h"
 #include "libc/calls/internal.h"
 #include "libc/calls/struct/sigset.h"
 #include "libc/dce.h"
 #include "libc/errno.h"
 #include "libc/intrin/asmflag.h"
 #include "libc/intrin/kprintf.h"
-#include "libc/intrin/strace.internal.h"
 #include "libc/nt/enum/pageflags.h"
 #include "libc/nt/memory.h"
 #include "libc/nt/runtime.h"
@@ -38,29 +36,21 @@
 __msabi extern typeof(VirtualProtect) *const __imp_VirtualProtect;
 
-static inline int __morph_rt_sigprocmask(int h, const sigset_t *s, sigset_t *o,
-                                         size_t c) {
 #ifdef __aarch64__
-  register long r0 asm("x0") = (long)h;
-  register long r1 asm("x1") = (long)s;
-  register long r2 asm("x2") = (long)o;
-  register long r3 asm("x3") = (long)c;
-  register long r8 asm("x8") = (long)__NR_sigprocmask;
-  register long res_x0 asm("x0");
+static privileged void __aarch64_sigprocmask(int how, const sigset_t *set,
+                                             sigset_t *oldset) {
+  register int r0 asm("x0") = how;
+  register long r1 asm("x1") = (long)set;
+  register long r2 asm("x2") = (long)oldset;
+  register long r3 asm("x3") = 8;
+  register long r8 asm("x8") = __NR_sigprocmask;
+  register long r16 asm("x16") = __NR_sigprocmask;
   asm volatile("svc\t0"
-               : "=r"(res_x0)
-               : "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r8)
+               : "+r"(r0)
+               : "r"(r1), "r"(r2), "r"(r3), "r"(r8), "r"(r16)
                : "memory");
-  return res_x0;
-#else
-  return 0;
+}
 #endif
-}
-
-static inline int __morph_sigprocmask(int how, const sigset_t *set,
-                                      sigset_t *oldset) {
-  return __morph_rt_sigprocmask(how, set, oldset, sizeof(*set));
-}
 
 static privileged void __morph_mprotect(void *addr, size_t size, int prot,
                                         int ntprot) {
@@ -81,7 +71,7 @@ static privileged void __morph_mprotect(void *addr, size_t size, int prot,
       _Exit(26);
     }
 #endif
-    _npassert(!ax);
+    if (ax) notpossible;
   } else {
     __imp_VirtualProtect(addr, size, ntprot, &op);
   }
@@ -90,12 +80,11 @@ static privileged void __morph_mprotect(void *addr, size_t size, int prot,
   register long r1 asm("x1") = (long)size;
   register long r2 asm("x2") = (long)prot;
   register long r8 asm("x8") = (long)__NR_mprotect;
-  register long res_x0 asm("x0");
+  register long r16 asm("x16") = (long)__NR_mprotect;
   asm volatile("svc\t0"
-               : "=r"(res_x0)
-               : "r"(r0), "r"(r1), "r"(r2), "r"(r8)
+               : "+r"(r0)
+               : "r"(r1), "r"(r2), "r"(r8), "r"(r16)
               : "memory");
-  _npassert(!res_x0);
 #endif
 }
@@ -109,7 +98,6 @@ privileged void __morph_begin(sigset_t *save) {
   bool cf;
   intptr_t dx;
   sigset_t ss = {{-1, -1}};
-  STRACE("__morph_begin()");
 #ifdef __x86_64__
   if (IsOpenbsd()) {
     asm volatile(CFLAG_ASM("syscall")
@@ -117,17 +105,19 @@ privileged void __morph_begin(sigset_t *save) {
                  : "1"(__NR_sigprocmask), "D"(SIG_BLOCK), "S"(-1u)
                  : "rcx", "r8", "r9", "r10", "r11", "memory");
     save->__bits[0] = ax & 0xffffffff;
-    _npassert(!cf);
+    if (cf) notpossible;
   } else if (!IsWindows() && !IsMetal()) {
     asm volatile("mov\t$8,%%r10d\n\t"
                  "syscall"
                  : "=a"(ax), "=d"(dx)
                  : "0"(__NR_sigprocmask), "D"(SIG_BLOCK), "S"(&ss), "1"(save)
                  : "rcx", "r8", "r9", "r10", "r11", "memory", "cc");
-    _npassert(!ax);
+    if (ax) notpossible;
   }
+#elif defined(__aarch64__)
+  __aarch64_sigprocmask(SIG_BLOCK, &ss, save);
 #else
-  __morph_sigprocmask(SIG_BLOCK, &ss, save);
+#error "unsupported architecture"
 #endif
   __morph_mprotect(__executable_start, __privileged_addr - __executable_start,
                    PROT_READ | PROT_WRITE, kNtPageWritecopy);
@@ -148,17 +138,18 @@ privileged void __morph_end(sigset_t *save) {
                  : CFLAG_CONSTRAINT(cf), "=a"(ax), "=d"(dx)
                  : "1"(__NR_sigprocmask), "D"(SIG_SETMASK), "S"(save->__bits[0])
                  : "rcx", "r8", "r9", "r10", "r11", "memory");
-    _npassert(!cf);
+    if (cf) notpossible;
   } else if (!IsWindows() && !IsMetal()) {
     asm volatile("mov\t$8,%%r10d\n\t"
                  "syscall"
                  : "=a"(ax), "=d"(dx)
                  : "0"(__NR_sigprocmask), "D"(SIG_SETMASK), "S"(save), "1"(0L)
                  : "rcx", "r8", "r9", "r10", "r11", "memory", "cc");
-    _npassert(!ax);
+    if (ax) notpossible;
   }
+#elif defined(__aarch64__)
+  __aarch64_sigprocmask(SIG_SETMASK, save, 0);
 #else
-  __morph_sigprocmask(SIG_SETMASK, save, 0);
+#error "unsupported architecture"
 #endif
-  STRACE("__morph_end()");
 }
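
One detail worth calling out in these stubs: the syscall number is loaded
into both x8 and x16. Presumably that's so a single `svc 0` sequence works
across operating systems, since Linux on aarch64 takes the syscall number in
x8 while XNU takes it in x16. A distilled sketch of the pattern (hypothetical
helper, not from the diff):

    // Raw three-argument aarch64 syscall, with the number mirrored into
    // both registers so the same stub satisfies Linux (x8) and XNU (x16).
    static long RawSyscall3(long nr, long a, long b, long c) {
      register long r0 asm("x0") = a;
      register long r1 asm("x1") = b;
      register long r2 asm("x2") = c;
      register long r8 asm("x8") = nr;    // Linux reads the number here
      register long r16 asm("x16") = nr;  // XNU reads it here
      asm volatile("svc\t0"
                   : "+r"(r0)
                   : "r"(r1), "r"(r2), "r"(r8), "r"(r16)
                   : "memory");
      return r0;
    }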


@@ -65,10 +65,10 @@ $(LIBC_RUNTIME_A).pkg: \
 # we can't use function tracing because:
 # this is the function tracing runtime
 o/$(MODE)/libc/runtime/cosmo2.o: private \
-        OVERRIDE_CFLAGS += -O0
+        CFLAGS += -O0
 
 o/$(MODE)/libc/runtime/ftracer.o: private \
-        OVERRIDE_CFLAGS += \
+        CFLAGS += \
                 -x-no-pg \
                 $(MNO_FENTRY) \
                 -ffreestanding \
@@ -94,7 +94,7 @@ o/$(MODE)/libc/runtime/stackchkfail.o \
 o/$(MODE)/libc/runtime/stackchkfaillocal.o \
 o/$(MODE)/libc/runtime/winmain.greg.o \
 o/$(MODE)/libc/runtime/opensymboltable.o: private \
-        OVERRIDE_CFLAGS += \
+        CFLAGS += \
                 -Os \
                 -ffreestanding \
                 $(NO_MAGIC)
@@ -102,11 +102,11 @@ o/$(MODE)/libc/runtime/opensymboltable.o: private \
 # must use alloca()
 # can't use asan or any runtime services
 o/$(MODE)/libc/runtime/fork-nt.o: private \
-        OVERRIDE_CPPFLAGS += \
+        CPPFLAGS += \
                 -DSTACK_FRAME_UNLIMITED
 
 o/$(MODE)/libc/runtime/qsort.o: private \
-        OVERRIDE_CFLAGS += \
+        CFLAGS += \
                 -Og
 
 # make always linked runtimes less huge when it's profitable
@@ -114,13 +114,13 @@ o//libc/runtime/mmap.o \
 o//libc/runtime/munmap.o \
 o//libc/runtime/memtrack.greg.o \
 o//libc/runtime/opensymboltable.greg.o: private \
-        OVERRIDE_CFLAGS += \
+        CFLAGS += \
                 -Os
 
 ifeq ($(ARCH), aarch64)
 o/$(MODE)/libc/runtime/mmap.o \
 o/$(MODE)/libc/runtime/enable_tls.o: private \
-        OVERRIDE_CFLAGS += \
+        CFLAGS += \
                 -mcmodel=large
 endif


@@ -52,14 +52,15 @@
 // @returnstwice
 // @threadsafe
 // @vforksafe
+        .ftrace1
 vfork:
+        .ftrace2
 #ifdef __x86_64__
 #if !IsTiny()
         push %rbp
         mov %rsp,%rbp
-        .profilable
         call __require_tls
 #ifdef SYSDEBUG
         ezlea .Llog,di