/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2022 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ Permission to use, copy, modify, and/or distribute this software for         │
│ any purpose with or without fee is hereby granted, provided that the         │
│ above copyright notice and this permission notice appear in all copies.      │
│                                                                              │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
│ PERFORMANCE OF THIS SOFTWARE.                                                │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "ape/sections.internal.h"
#include "libc/dce.h"
#include "libc/intrin/bits.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/morph.h"
#include "libc/thread/tls.h"

// 16-byte vector of chars whose required alignment is lowered to one
// byte, so it can be read through a pointer to any byte address
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));

/**
 * Patches compiled thread-local storage accesses for Windows and XNU.
 *
 * On x86-64, when `_tls_content` is nonzero and the host is Windows or
 * XNU, this walks the image from `_ereal` up to `__privileged_start`
 * and rewrites every matching 9-byte `%fs:0` mov/add so it goes through
 * `%gs` at a platform-specific displacement instead. In every other
 * case it does nothing.
 */
privileged void __morph_tls(void) {
#ifdef __x86_64__
  // SysV _Thread_local code has to be rewritten in place. The program
  // MUST be built with -mno-tls-direct-seg-refs, which makes the
  // compiler emit TLS accesses shaped like this
  //
  //     64 48 8b 0R4 25 00 00 00 00   mov %fs:0,%R
  //     64 48 03 0R4 25 00 00 00 00   add %fs:0,%R
  //
  // Which on Mac can be swapped for this:
  //
  //     65 48 8b 0R4 25 30 00 00 00   mov %gs:0x30,%R
  //
  // Nothing records where those instructions live in the binary, so
  // the whole program image gets scanned. For larger programs that
  // may potentially cost a few milliseconds.
  sigset_t saved;
  uint64_t head;
  unsigned hits, slot;
  unsigned char *pc;

  // `_tls_content` is generated by the linker script; it tells us
  // ahead of time whether _Thread_local variables were actually
  // linked into this program.
  if (!(intptr_t)_tls_content) return;

  // the rewrite is only carried out on Windows and XNU
  if (!IsWindows() && !IsXnu()) return;

  __morph_begin(&saved);

  // Choose the displacement that replaces the zero in `%fs:0`.
  //
  // XNU: patching is straightforward, we basically only switch the
  // segment register; linear slot address 0x30 was promised to us
  // according to the Go team
  // https://github.com/golang/go/issues/23617
  //
  // Windows: MSVC __declspec(thread) generates binary code for the
  // %gs:0x1480 abi, so as long as TlsAlloc() isn't called >64 times
  // this should be good.
  slot = IsXnu() ? 0x30 : 0x1480 + __tls_index * 8;

  // walk the modifiable code hunting for the 9 byte instruction
  // (doing this with xed would take 30 ms to enable tls on python.com)
  pc = _ereal;
  while (pc + 9 <= __privileged_start) {

    // let sse race ahead to the next %fs prefix byte (0x64, i.e. 0144
    // octal) sixteen bytes at a time, stopping on the first hit; that
    // brings morphing python.com down to about 1 ms
    for (; pc + 9 + 16 <= __privileged_start; pc += 16) {
      hits = __builtin_ia32_pmovmskb128(
          *(xmm_t *)pc == (xmm_t){0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64,
                                  0x64, 0x64, 0x64, 0x64, 0x64, 0x64, 0x64,
                                  0x64, 0x64});
      if (hits) {
        pc += __builtin_ctzll(hits);
        break;
      }
    }

    // the masked compare below is equivalent to testing:
    //   0144 == pc[0] &&           // %fs
    //   0110 == (pc[1] & 0373) &&  // rex.w (and ignore rex.r)
    //   (0213 == pc[2] ||          // mov reg/mem → reg (word-sized)
    //   0003 == pc[2]) &&          // add reg/mem → reg (word-sized)
    //   0004 == (pc[3] & 0307) &&  // mod/rm (4,reg,0) means sib → reg
    //   0045 == pc[4] &&           // sib (5,4,0) → (rbp,rsp,0) → disp32
    //   0000 == pc[5] &&           // displacement (von Neumann endian)
    //   0000 == pc[6] &&           // displacement
    //   0000 == pc[7] &&           // displacement
    //   0000 == pc[8]              // displacement
    head = READ64LE(pc) & READ64LE("\377\373\377\307\377\377\377\377");
    if ((head != READ64LE("\144\110\213\004\045\000\000\000") &&
         head != READ64LE("\144\110\003\004\045\000\000\000")) ||
        pc[8]) {
      // not our instruction; retry one byte further along
      pc += 1;
      continue;
    }

    // rewrite the instruction in place
    pc[0] = 0x65;                 // %fs prefix (0x64) becomes %gs
    pc[5] = slot & 0xff;          // displacement, lowest byte first
    pc[6] = (slot >> 8) & 0xff;   // displacement
    pc[7] = (slot >> 16) & 0xff;  // displacement
    pc[8] = (slot >> 24) & 0xff;  // displacement

    // hop over the instruction that was just patched
    pc += 9;
  }

  __morph_end(&saved);
#endif
}