From fad1279c61fdc34dcd1f91872caac5c2a1fea3c3 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Fri, 5 Jan 2024 20:36:57 -0800 Subject: [PATCH] Make cosmo_dlopen() safer and faster If cosmo_dlopen() is linked on AMD64 then the runtime will switch to using %gs for thread-local storage. This eliminates the need for the imported symbol trampoline. It's now safer to pass function pointers back and forth with imported libraries. Your program gets recompiled at runtime to make it happen and the overhead is a few milliseconds. --- libc/dlopen/dlopen.c | 17 +++++++ .../set_tls-sysv.S => intrin/sys_set_tls.S} | 0 libc/intrin/tlsmorphed.c | 21 ++++++++ libc/runtime/clone.c | 50 +++++++++++++++---- libc/runtime/morph_tls.c | 7 ++- libc/runtime/set_tls.c | 7 ++- libc/thread/tls.h | 1 + libc/thread/tls2.internal.h | 6 ++- 8 files changed, 93 insertions(+), 16 deletions(-) rename libc/{runtime/set_tls-sysv.S => intrin/sys_set_tls.S} (100%) create mode 100644 libc/intrin/tlsmorphed.c diff --git a/libc/dlopen/dlopen.c b/libc/dlopen/dlopen.c index 9be0c1f56..f1d75fcb5 100644 --- a/libc/dlopen/dlopen.c +++ b/libc/dlopen/dlopen.c @@ -45,6 +45,7 @@ #include "libc/nt/memory.h" #include "libc/nt/runtime.h" #include "libc/proc/posix_spawn.h" +#include "libc/runtime/internal.h" #include "libc/runtime/runtime.h" #include "libc/runtime/syslib.internal.h" #include "libc/serialize.h" @@ -494,6 +495,8 @@ static uint8_t *movimm(uint8_t p[static 16], int reg, uint64_t val) { static void *foreign_thunk_sysv(void *func) { uint8_t *code, *p; #ifdef __x86_64__ + // it is no longer needed + if (1) return func; // movabs $func,%rax // movabs $foreign_tramp,%r10 // jmp *%r10 @@ -896,3 +899,17 @@ char *cosmo_dlerror(void) { STRACE("dlerror() → %#s", res); return res; } + +#ifdef __x86_64__ +static textstartup void dlopen_init() { + if (IsLinux() || IsFreebsd()) { + // switch from %fs to %gs for tls + struct CosmoTib *tib = __get_tls(); + __morph_tls(); + __set_tls(tib); + } +} +const void *const dlopen_ctor[] initarray = { + dlopen_init, +}; +#endif diff --git a/libc/runtime/set_tls-sysv.S b/libc/intrin/sys_set_tls.S similarity index 100% rename from libc/runtime/set_tls-sysv.S rename to libc/intrin/sys_set_tls.S diff --git a/libc/intrin/tlsmorphed.c b/libc/intrin/tlsmorphed.c new file mode 100644 index 000000000..3a805571e --- /dev/null +++ b/libc/intrin/tlsmorphed.c @@ -0,0 +1,21 @@ +/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│ +│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│ +╞══════════════════════════════════════════════════════════════════════════════╡ +│ Copyright 2023 Justine Alexandra Roberts Tunney │ +│ │ +│ Permission to use, copy, modify, and/or distribute this software for │ +│ any purpose with or without fee is hereby granted, provided that the │ +│ above copyright notice and this permission notice appear in all copies. │ +│ │ +│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ +│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ +│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ +│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ +│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ +│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ +│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ +│ PERFORMANCE OF THIS SOFTWARE. │ +╚─────────────────────────────────────────────────────────────────────────────*/ +#include "libc/runtime/internal.h" + +char __tls_morphed; diff --git a/libc/runtime/clone.c b/libc/runtime/clone.c index 72c76a94f..85d3db9df 100644 --- a/libc/runtime/clone.c +++ b/libc/runtime/clone.c @@ -48,6 +48,7 @@ #include "libc/sock/internal.h" #include "libc/stdalign.internal.h" #include "libc/str/str.h" +#include "libc/sysv/consts/arch.h" #include "libc/sysv/consts/clone.h" #include "libc/sysv/consts/futex.h" #include "libc/sysv/consts/nr.h" @@ -63,6 +64,9 @@ #define kMaxThreadIds 32768 #define kMinThreadId 262144 +#define AMD64_SET_FSBASE 129 +#define AMD64_SET_GSBASE 131 + #define __NR_thr_new 455 #define __NR_clone_linux 56 #define __NR__lwp_create 309 @@ -90,6 +94,7 @@ struct CloneArgs { void *arg; }; +int sys_set_tls(); int __stack_call(void *, int, long, long, int (*)(void *, int), void *); static struct CloneArgs *AllocateCloneArgs(char *stk, size_t stksz) { @@ -390,14 +395,14 @@ static int CloneNetbsd(int (*func)(void *, int), char *stk, size_t stksz, //////////////////////////////////////////////////////////////////////////////// // FREE BESIYATA DISHMAYA -void bone(struct CloneArgs *wt) { - *wt->ztid = 0; -} - static wontreturn void FreebsdThreadMain(void *p) { struct CloneArgs *wt = p; #ifdef __aarch64__ asm volatile("mov\tx28,%0" : /* no outputs */ : "r"(wt->tls)); +#elif defined(__x86_64__) + if (__tls_morphed) { + sys_set_tls(AMD64_SET_GSBASE, wt->tls); + } #endif *wt->ctid = wt->tid; wt->func(wt->arg, wt->tid); @@ -534,6 +539,13 @@ static errno_t CloneSilicon(int (*fn)(void *, int), char *stk, size_t stksz, //////////////////////////////////////////////////////////////////////////////// // GNU/SYSTEMD +struct LinuxCloneArgs { + int (*func)(void *, int); + void *arg; + char *tls; + int ctid; +}; + int sys_clone_linux(int flags, // rdi long sp, // rsi int *ptid, // rdx @@ -542,24 +554,40 @@ int sys_clone_linux(int flags, // rdi void *func, // r9 void *arg); // 8(rsp) +static int LinuxThreadEntry(void *arg, int tid) { + struct LinuxCloneArgs *wt = arg; + sys_set_tls(ARCH_SET_GS, wt->tls); + return wt->func(wt->arg, tid); +} + static int CloneLinux(int (*func)(void *arg, int rc), char *stk, size_t stksz, int flags, void *arg, void *tls, int *ptid, int *ctid) { int rc; long sp; + struct LinuxCloneArgs *wt; sp = (intptr_t)(stk + stksz); - if (~flags & CLONE_CHILD_SETTID) { - flags |= CLONE_CHILD_SETTID; - sp -= sizeof(int); - sp = sp & -alignof(int); - ctid = (int *)sp; - sp -= 8; // experiment - } + sp -= sizeof(struct LinuxCloneArgs); // align the stack #ifdef __aarch64__ sp = sp & -128; // for kernel 4.6 and earlier #else sp = sp & -16; #endif + wt = (struct LinuxCloneArgs *)sp; +#ifdef __x86_64__ + if ((flags & CLONE_SETTLS) && __tls_morphed) { + flags &= ~CLONE_SETTLS; + wt->arg = arg; + wt->tls = tls; + wt->func = func; + func = LinuxThreadEntry; + arg = wt; + } +#endif + if (~flags & CLONE_CHILD_SETTID) { + flags |= CLONE_CHILD_SETTID; + ctid = &wt->ctid; + } if ((rc = sys_clone_linux(flags, sp, ptid, ctid, tls, func, arg)) >= 0) { // clone() is documented as setting ptid before return return 0; diff --git a/libc/runtime/morph_tls.c b/libc/runtime/morph_tls.c index 442b746e5..b780533ed 100644 --- a/libc/runtime/morph_tls.c +++ b/libc/runtime/morph_tls.c @@ -17,9 +17,9 @@ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "ape/sections.internal.h" -#include "libc/serialize.h" #include "libc/runtime/internal.h" #include "libc/runtime/runtime.h" +#include "libc/serialize.h" #include "libc/thread/tls.h" typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1))); @@ -55,11 +55,13 @@ privileged void __morph_tls(void) { // address 0x30 was promised to us, according to Go team // https://github.com/golang/go/issues/23617 dis = 0x30; - } else { + } else if (IsWindows()) { // MSVC __declspec(thread) generates binary code for this // %gs:0x1480 abi. So long as TlsAlloc() isn't called >64 // times we should be good. dis = 0x1480 + __tls_index * 8; + } else { + dis = 0; } // iterate over modifiable code looking for 9 byte instruction @@ -112,6 +114,7 @@ privileged void __morph_tls(void) { } } + __tls_morphed = 1; __morph_end(); #endif } diff --git a/libc/runtime/set_tls.c b/libc/runtime/set_tls.c index 3e87f0764..0dd8049c5 100644 --- a/libc/runtime/set_tls.c +++ b/libc/runtime/set_tls.c @@ -26,6 +26,9 @@ #include "libc/thread/tls.h" #include "libc/thread/tls2.internal.h" +#define AMD64_SET_FSBASE 129 +#define AMD64_SET_GSBASE 131 + int sys_set_tls(); // we can't allow --ftrace here because cosmo_dlopen() calls this @@ -37,9 +40,9 @@ dontinstrument textstartup void __set_tls(struct CosmoTib *tib) { if (IsWindows()) { asm("mov\t%1,%%gs:%0" : "=m"(*((long *)0x1480 + __tls_index)) : "r"(tib)); } else if (IsFreebsd()) { - sys_set_tls(129 /*AMD64_SET_FSBASE*/, tib); + sys_set_tls(__tls_morphed ? AMD64_SET_GSBASE : AMD64_SET_FSBASE, tib); } else if (IsLinux()) { - sys_set_tls(ARCH_SET_FS, tib); + sys_set_tls(__tls_morphed ? ARCH_SET_GS : ARCH_SET_FS, tib); } else if (IsNetbsd()) { // netbsd has sysarch(X86_SET_FSBASE) but we can't use that because // signal handlers will cause it to be reset due to not setting the diff --git a/libc/thread/tls.h b/libc/thread/tls.h index 45f1a97ff..a713b050c 100644 --- a/libc/thread/tls.h +++ b/libc/thread/tls.h @@ -42,6 +42,7 @@ struct CosmoTib { }; extern int __threaded; +extern char __tls_morphed; extern unsigned __tls_index; char *_mktls(struct CosmoTib **); diff --git a/libc/thread/tls2.internal.h b/libc/thread/tls2.internal.h index 1bbe95c5e..b96693276 100644 --- a/libc/thread/tls2.internal.h +++ b/libc/thread/tls2.internal.h @@ -14,7 +14,11 @@ COSMOPOLITAN_C_START_ __funline struct CosmoTib *__get_tls_privileged(void) { char *tib, *lin = (char *)0x30; if (IsLinux() || IsFreebsd() || IsNetbsd() || IsOpenbsd() || IsMetal()) { - asm("mov\t%%fs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory"); + if (!__tls_morphed) { + asm("mov\t%%fs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory"); + } else { + asm("mov\t%%gs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory"); + } } else { asm("mov\t%%gs:(%1),%0" : "=a"(tib) : "r"(lin) : "memory"); if (IsWindows()) {