Simplify TLS and reduce startup latency

This change simplifies the thread-local storage support code. On Windows and Mac OS X the startup latency of __enable_tls() has been reduced from 30ms to 1ms. On Windows, TLS memory accesses are now much faster, because the improved self-modifying code avoids a function call and loads our thread information block pointer in a single instruction.
2025-10-13 05:09:10 +00:00 · 2022-07-18 03:33:32 -07:00 · 2022-07-18 03:33:32 -07:00 · b1d9d11be1
commit b1d9d11be1
parent 38c3fa63fe
15 changed files with 136 additions and 312 deletions
--- a/libc/runtime/enable_tls.c
+++ b/libc/runtime/enable_tls.c
@ -16,6 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/bits/bits.h"
 #include "libc/calls/calls.h"
 #include "libc/calls/strace.internal.h"
 #include "libc/calls/syscall-sysv.internal.h"
@ -45,11 +46,16 @@
 #define _TLDZ ((intptr_t)_tdata_size)
 #define _TIBZ sizeof(struct cthread_descriptor_t)

+typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
+
 __msabi extern typeof(TlsAlloc) *const __imp_TlsAlloc;

 extern unsigned char __tls_mov_nt_rax[];
 extern unsigned char __tls_add_nt_rax[];

+/**
+ * Enables thread local storage.
+ */
 privileged void __enable_tls(void) {
  if (__tls_enabled) return;
  STRACE("__enable_tls()");
@ -111,98 +117,93 @@ privileged void __enable_tls(void) {
                 : "rcx", "r11", "memory");
  }

-  /*
-   * We need to rewrite SysV _Thread_local code. You MUST use the
-   * -mno-tls-direct-seg-refs flag which generates code like this
-   *
-   *     64 48 8b 0R4 25 00 00 00 00   mov %fs:0,%R
-   *     64 48 03 0R4 25 00 00 00 00   add %fs:0,%R
-   *
-   * Which on Mac we can replace with this:
-   *
-   *     65 48 8b 0R4 25 30 00 00 00   mov %gs:0x30,%R
-   *
-   * Whereas on Windows we'll replace it with this:
-   *
-   *     0f 1f 40 00     fatnop4
-   *     e8 xx xx xx xx  call __tls_mov_nt_%R
-   *
-   * Since we have no idea where the TLS instructions exist in the
-   * binary, we need to disassemble the whole program image. This'll
-   * potentially take a few milliseconds for some larger programs.
-   *
-   * We check `_tls_content` which is generated by the linker script
-   * since it lets us determine ahead of time if _Thread_local vars
-   * have actually been linked into this program.
-   *
-   * TODO(jart): compute probability this is just overkill
-   */
+  // We need to rewrite SysV _Thread_local code. You MUST use the
+  // -mno-tls-direct-seg-refs flag which generates code like this
+  //
+  //     64 48 8b 0R4 25 00 00 00 00   mov %fs:0,%R
+  //     64 48 03 0R4 25 00 00 00 00   add %fs:0,%R
+  //
+  // Which on Mac we can replace with this:
+  //
+  //     65 48 8b 0R4 25 30 00 00 00   mov %gs:0x30,%R
+  //
+  // Whereas on Windows we'll replace it with this:
+  //
+  //     65 48 8b 0R4 25 xx xx xx xx   mov %gs:0x1480+i*8,%R
+  //     (i is __tls_index; the mov/add opcode byte is kept as is)
+  //
+  // Since we have no idea where the TLS instructions exist in the
+  // binary, we need to scan the whole program image for this byte
+  // pattern, which takes about a millisecond for larger programs.
+  //
+  // We check `_tls_content` which is generated by the linker script
+  // since it lets us determine ahead of time if _Thread_local vars
+  // have actually been linked into this program.
  if ((intptr_t)_tls_content && (IsWindows() || IsXnu())) {
-    int n, reg, dis;
+    int n;
+    uint64_t w;
+    unsigned m, dis;
    unsigned char *p;
-    const unsigned char *impl;
-    struct XedDecodedInst xedd;
    __morph_begin();

-    // The most expensive part of this process is we need to compute the
-    // byte length of each instruction in our program. We'll use Intel's
-    // disassembler for this purpose.
-    for (p = _ereal; p < __privileged_start; p += n) {
-      xed_decoded_inst_zero_set_mode(&xedd, XED_MACHINE_MODE_LONG_64);
-      if (!xed_instruction_length_decode(&xedd, p, 15)) {
+    if (IsXnu()) {
+      // Apple is quite straightforward to patch. We basically
+      // just change the segment register, and the linear slot
+      // address 0x30 was promised to us, according to Go team
+      // https://github.com/golang/go/issues/23617
+      dis = 0x30;
+    } else {
+      // MSVC __declspec(thread) generates binary code for this
+      // %gs:0x1480 abi. So long as TlsAlloc() isn't called >64
+      // times we should be good.
+      dis = 0x1480 + __tls_index * 8;
+    }

-        // We now know p[0] is most likely the first byte of an x86 op.
-        // Let's check and see if it's the GCC linear TIB address load.
-        // We hope and pray GCC won't generate TLS stores to %r8..%r15.
-        if (xedd.length == 9 &&       //
-            0144 == p[0] &&           // fs
-            0110 == p[1] &&           // rex.w (64-bit operand size)
-            (0213 == p[2] ||          // mov reg/mem → reg (word-sized)
-             0003 == p[2]) &&         // add reg/mem → reg (word-sized)
-            0004 == (p[3] & 0307) &&  // mod/rm (4,reg,0) means sib → reg
-            0045 == p[4] &&           // sib (5,4,0) → (rbp,rsp,0) → disp32
-            0000 == p[5] &&           // displacement (von Neumann endian)
-            0000 == p[6] &&           // displacement
-            0000 == p[7] &&           // displacement
-            0000 == p[8]) {           // displacement
+    // iterate over modifiable code looking for 9 byte instruction
+    // this would take 30 ms using xed to enable tls on python.com
+    for (p = _ereal; p + 9 <= __privileged_start; p += n) {

-          // Apple is quite straightforward to patch. We basically
-          // just change the segment register, and the linear slot
-          if (IsXnu()) {
-            p[0] = 0145;  // this changes gs segment to fs segment
-            p[5] = 0x30;  // tib slot index for tib linear address
-          }
-
-          // Windows is kind of complicated. We need to replace the
-          // segment mov instruction with a function call, that (a)
-          // won't clobber registers, and (b) has a return register
-          // that's the same as the mov destination. When setting
-          // function displacement, &CALL+5+DISP must equal &FUNC.
-          else {
-            if (p[2] == 3) {
-              impl = __tls_add_nt_rax;
-            } else {
-              impl = __tls_mov_nt_rax;
-            }
-            reg = (p[3] & 070) >> 3;
-            dis = (impl + reg * 18) - (p + 9);
-            p[0] = 0017;                       // map1
-            p[1] = 0037;                       // nopl (onl if reg=0)
-            p[2] = 0100;                       // mod/rm (%rax)+disp8
-            p[3] = 0000;                       // displacement
-            p[4] = 0350;                       // call
-            p[5] = (dis & 0x000000ff) >> 000;  // displacement
-            p[6] = (dis & 0x0000ff00) >> 010;  // displacement
-            p[7] = (dis & 0x00ff0000) >> 020;  // displacement
-            p[8] = (dis & 0xff000000) >> 030;  // displacement
-          }
+      // use sse to zoom zoom to fs register prefixes
+      // that way it'll take 1 ms to morph python.com
+      while (p + 9 + 16 <= __privileged_start) {
+        if ((m = __builtin_ia32_pmovmskb128(
+                 *(xmm_t *)p == (xmm_t){0144, 0144, 0144, 0144, 0144, 0144,
+                                        0144, 0144, 0144, 0144, 0144, 0144,
+                                        0144, 0144, 0144, 0144}))) {
+          m = __builtin_ctzll(m);
+          p += m;
+          break;
+        } else {
+          p += 16;
        }
+      }

-        // Move to the next instruction.
-        n = xedd.length;
+      // we're checking for the following expression:
+      //   0144 == p[0] &&           // fs
+      //   0110 == p[1] &&           // rex.w (64-bit operand size)
+      //   (0213 == p[2] ||          // mov reg/mem → reg (word-sized)
+      //   0003 == p[2]) &&          // add reg/mem → reg (word-sized)
+      //   0004 == (p[3] & 0307) &&  // mod/rm (4,reg,0) means sib → reg
+      //   0045 == p[4] &&           // sib (5,4,0) → (rbp,rsp,0) → disp32
+      //   0000 == p[5] &&           // displacement (von Neumann endian)
+      //   0000 == p[6] &&           // displacement
+      //   0000 == p[7] &&           // displacement
+      //   0000 == p[8]              // displacement
+      w = READ64LE(p) & READ64LE("\377\377\377\307\377\377\377\377");
+      if ((w == READ64LE("\144\110\213\004\045\000\000\000") ||
+           w == READ64LE("\144\110\003\004\045\000\000\000")) &&
+          !p[8]) {
+
+        // now change the code
+        p[0] = 0145;  // this changes fs segment to gs segment
+        p[5] = (dis & 0x000000ff) >> 000;  // displacement
+        p[6] = (dis & 0x0000ff00) >> 010;  // displacement
+        p[7] = (dis & 0x00ff0000) >> 020;  // displacement
+        p[8] = (dis & 0xff000000) >> 030;  // displacement
+
+        // advance to the next instruction
+        n = 9;
      } else {
-        // If Xed failed to decode the instruction, then we'll just plow
-        // through memory one byte at a time until Xed's morale improves
        n = 1;
      }
    }