This commit is contained in:
Farid Zakaria 2023-07-08 20:57:13 +00:00
parent 6bc04598bf
commit 2b4d6124d9
20 changed files with 63 additions and 6666 deletions


@ -1,331 +0,0 @@
// clang-format off
// Alpha is a 64-bit RISC ISA developed by DEC (Digital Equipment
// Corporation) in the early '90s. It aimed to be an ISA that would last
// 25 years. DEC expected Alpha would become 1000x faster during that time
// span. Since the ISA was developed from scratch for future machines,
// it's 64-bit from the beginning. There's no 32-bit variant.
//
// DEC ported its own Unix (Tru64) to Alpha. Microsoft also ported Windows
// NT to it. But it wasn't a huge commercial success.
//
// DEC was acquired by Compaq in 1997. In the late '90s, Intel and
// Hewlett-Packard were advertising that their upcoming Itanium processor
// would achieve significantly better performance than RISC processors, so
// Compaq decided to discontinue the Alpha processor line to switch to
// Itanium. Itanium resulted in a miserable failure, but it still succeeded
// in wiping out several RISC processors just by promising overly
// optimistic performance numbers. Alpha as an ISA would probably have
// lasted the full 25 years after its introduction (1992 + 25 = 2017), but
// the company and its market didn't last that long.
//
// From the linker's point of view, there are a few peculiarities in its
// psABI as shown below:
//
// - Alpha lacks PC-relative memory load/store instructions, so it uses
// register-relative load/store instructions in position-independent
// code. Specifically, GP (which is an alias for $r29) is always
// maintained to refer to .got+0x8000, and global variables' addresses
// are loaded in a GP-relative manner.
//
// - Even function addresses appear to be first loaded into a register
// in a GP-relative manner before being called. We can relax such an
// instruction sequence into a direct branch instruction, but by
// default, object files don't use a direct branch to call a function.
// Therefore, by default, we don't need to create a PLT. Every function
// call is made by first reading the function's address from the GOT and
// then jumping to that address, as sketched below.
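//
// For example, a function call typically looks like this (an
// illustrative sketch, not code taken from a real object file):
//
//   ldq $27, func($gp)   !literal    # load func's address from the GOT
//   jsr $26, ($27)       !lituse_jsr # indirect call through $27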
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = ALPHA;
// A 32-bit immediate can be materialized in a register with a "load high"
// and a "load low" instruction sequence. The first instruction sets the
// upper 16 bits of a register, and the second one sets the lower 16
// bits. Both instructions sign-extend their immediates. Therefore, if the
// 15th bit of an immediate happens to be 1, adding the sign-extended
// "low half" value subtracts 0x10000 from the upper half that has already
// been set in the register. To compensate for that, we need to add 0x8000
// when computing the upper 16 bits.
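//
// For example, to materialize 0x18000: hi(0x18000) below returns
// bits(0x20000, 31, 16) = 2, and the low half is 0x8000, which
// sign-extends to -0x8000. The result is (2 << 16) - 0x8000 = 0x18000,
// as desired.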
static u32 hi(u32 val) {
return bits(val + 0x8000, 31, 16);
}
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_ALPHA_SREL32:
*(ul32 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
u64 GP = ctx.got->shdr.sh_addr + 0x8000;
switch (rel.r_type) {
case R_ALPHA_REFQUAD:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_ALPHA_GPREL32:
*(ul32 *)loc = S + A - GP;
break;
case R_ALPHA_LITERAL:
if (A)
*(ul16 *)loc = ctx.extra.got->get_addr(sym, A) - GP;
else
*(ul16 *)loc = GOT + G - GP;
break;
case R_ALPHA_BRSGP:
*(ul32 *)loc |= bits(S + A - P - 4, 22, 0);
break;
case R_ALPHA_GPDISP:
*(ul16 *)loc = hi(GP - P);
*(ul16 *)(loc + A) = GP - P;
break;
case R_ALPHA_SREL32:
*(ul32 *)loc = S + A - P;
break;
case R_ALPHA_GPRELHIGH:
*(ul16 *)loc = hi(S + A - GP);
break;
case R_ALPHA_GPRELLOW:
*(ul16 *)loc = S + A - GP;
break;
case R_ALPHA_TLSGD:
*(ul16 *)loc = sym.get_tlsgd_addr(ctx) - GP;
break;
case R_ALPHA_TLSLDM:
*(ul16 *)loc = ctx.got->get_tlsld_addr(ctx) - GP;
break;
case R_ALPHA_DTPRELHI:
*(ul16 *)loc = hi(S + A - ctx.dtp_addr);
break;
case R_ALPHA_DTPRELLO:
*(ul16 *)loc = S + A - ctx.dtp_addr;
break;
case R_ALPHA_GOTTPREL:
*(ul16 *)loc = sym.get_gottp_addr(ctx) + A - GP;
break;
case R_ALPHA_TPRELHI:
*(ul16 *)loc = hi(S + A - ctx.tp_addr);
break;
case R_ALPHA_TPRELLO:
*(ul16 *)loc = S + A - ctx.tp_addr;
break;
case R_ALPHA_LITUSE:
case R_ALPHA_HINT:
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_ALPHA_REFLONG:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A;
break;
case R_ALPHA_REFQUAD:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul64 *)loc = *val;
else
*(ul64 *)loc = S + A;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
Error(ctx) << sym << ": GNU ifunc symbol is not supported on Alpha";
switch (rel.r_type) {
case R_ALPHA_REFQUAD:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_ALPHA_LITERAL:
if (rel.r_addend)
ctx.extra.got->add_symbol(sym, rel.r_addend);
else
sym.flags |= NEEDS_GOT;
break;
case R_ALPHA_SREL32:
scan_pcrel(ctx, sym, rel);
break;
case R_ALPHA_BRSGP:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_ALPHA_TLSGD:
sym.flags |= NEEDS_TLSGD;
break;
case R_ALPHA_TLSLDM:
ctx.needs_tlsld = true;
break;
case R_ALPHA_GOTTPREL:
sym.flags |= NEEDS_GOTTP;
break;
case R_ALPHA_TPRELHI:
case R_ALPHA_TPRELLO:
check_tlsle(ctx, sym, rel);
break;
case R_ALPHA_GPREL32:
case R_ALPHA_LITUSE:
case R_ALPHA_GPDISP:
case R_ALPHA_HINT:
case R_ALPHA_GPRELHIGH:
case R_ALPHA_GPRELLOW:
case R_ALPHA_DTPRELHI:
case R_ALPHA_DTPRELLO:
break;
default:
Fatal(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
// An R_ALPHA_LITERAL relocation may request the linker to create a GOT
// entry for an external symbol with a non-zero addend. This is an unusual
// request which is not found on any other target.
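//
// For example (illustrative), the following two loads need two distinct
// GOT slots even though they refer to the same symbol:
//
//   ldq $1, sym+8($gp)     !literal
//   ldq $2, sym+16($gp)    !literal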
//
// Referring to an external symbol with a non-zero addend is bad
// practice because we need to create as many dynamic relocations as
// there are distinct addends for the same symbol.
//
// We don't want to mess up the implementation of the common GOT section
// for Alpha. So we create another GOT-like section, .alpha_got. Any GOT
// entry for an R_ALPHA_LITERAL reloc with a non-zero addend is created
// not in .got but in .alpha_got.
//
// Since .alpha_got entries are accessed relative to GP, .alpha_got
// needs to be close enough to .got. It's actually placed next to .got.
void AlphaGotSection::add_symbol(Symbol<E> &sym, i64 addend) {
assert(addend);
std::scoped_lock lock(mu);
entries.push_back({&sym, addend});
}
bool operator<(const AlphaGotSection::Entry &a, const AlphaGotSection::Entry &b) {
return std::tuple(a.sym->file->priority, a.sym->sym_idx, a.addend) <
std::tuple(b.sym->file->priority, b.sym->sym_idx, b.addend);
}
u64 AlphaGotSection::get_addr(Symbol<E> &sym, i64 addend) {
auto it = std::lower_bound(entries.begin(), entries.end(), Entry{&sym, addend});
assert(it != entries.end());
return this->shdr.sh_addr + (it - entries.begin()) * sizeof(Word<E>);
}
i64 AlphaGotSection::get_reldyn_size(Context<E> &ctx) const {
i64 n = 0;
for (const Entry &e : entries)
if (e.sym->is_imported || (ctx.arg.pic && !e.sym->is_absolute()))
n++;
return n;
}
void AlphaGotSection::finalize() {
sort(entries);
remove_duplicates(entries);
shdr.sh_size = entries.size() * sizeof(Word<E>);
}
void AlphaGotSection::copy_buf(Context<E> &ctx) {
ElfRel<E> *dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
reldyn_offset);
for (i64 i = 0; i < entries.size(); i++) {
Entry &e = entries[i];
u64 P = this->shdr.sh_addr + sizeof(Word<E>) * i;
ul64 *buf = (ul64 *)(ctx.buf + this->shdr.sh_offset + sizeof(Word<E>) * i);
if (e.sym->is_imported) {
*buf = ctx.arg.apply_dynamic_relocs ? e.addend : 0;
*dynrel++ = ElfRel<E>(P, E::R_ABS, e.sym->get_dynsym_idx(ctx), e.addend);
} else {
*buf = e.sym->get_addr(ctx) + e.addend;
if (ctx.arg.pic && !e.sym->is_absolute())
*dynrel++ = ElfRel<E>(P, E::R_RELATIVE, 0, *buf);
}
}
}
} // namespace mold::elf


@ -1,737 +0,0 @@
// clang-format off
// ARM32 is a bit special from the linker's viewpoint because ARM
// processors support two different instruction encodings: Thumb and
// ARM (in a narrower sense). Thumb instructions are either 16 bits or
// 32 bits, while ARM instructions are all 32 bits. Feature-wise,
// Thumb is a subset of ARM, so not all ARM instructions are
// representable in Thumb.
//
// ARM processors originally supported only ARM instructions. Thumb
// instructions were later added to increase code density.
//
// ARM processors run in either ARM mode or Thumb mode. The mode can
// be switched using BX (branch and exchange)-family instructions.
// We need to use those instructions to, for example, call a function
// encoded in Thumb from a function encoded in ARM. Sometimes, the
// linker even has to emit an interworking thunk to switch modes.
//
// ARM instructions are aligned to 4-byte boundaries; Thumb
// instructions to 2-byte boundaries.
//
// You can distinguish Thumb functions from ARM functions by looking
// at the least significant bit (LSB) of their "address". If the LSB
// is 0, it's ARM; otherwise, Thumb.
//
// For example, if a symbol `foo` is of type STT_FUNC and has value
// 0x2001, `foo` is a function using Thumb instructions whose address
// is 0x2000 (not 0x2001, as Thumb instructions are always 2-byte
// aligned). Likewise, if a function pointer has value 0x2001, it
// refers to a Thumb function at 0x2000.
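//
// In code, the convention boils down to something like this (an
// illustrative sketch, not code from this file):
//
//   bool is_thumb = addr & 1;  // Thumb if the LSB is set
//   u64 entry = addr & ~1;     // the actual entry point address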
//
// https://github.com/ARM-software/abi-aa/blob/main/aaelf32/aaelf32.rst
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = ARM32;
template <>
i64 get_addend(u8 *loc, const ElfRel<E> &rel) {
switch (rel.r_type) {
case R_ARM_ABS32:
case R_ARM_REL32:
case R_ARM_TARGET1:
case R_ARM_BASE_PREL:
case R_ARM_GOTOFF32:
case R_ARM_GOT_PREL:
case R_ARM_GOT_BREL:
case R_ARM_TLS_GD32:
case R_ARM_TLS_LDM32:
case R_ARM_TLS_LDO32:
case R_ARM_TLS_IE32:
case R_ARM_TLS_LE32:
case R_ARM_TLS_GOTDESC:
case R_ARM_TARGET2:
return *(il32 *)loc;
case R_ARM_THM_JUMP11:
return sign_extend(*(ul16 *)loc, 10) << 1;
case R_ARM_THM_CALL:
case R_ARM_THM_JUMP24:
case R_ARM_THM_TLS_CALL: {
u32 S = bit(*(ul16 *)loc, 10);
u32 J1 = bit(*(ul16 *)(loc + 2), 13);
u32 J2 = bit(*(ul16 *)(loc + 2), 11);
u32 I1 = !(J1 ^ S);
u32 I2 = !(J2 ^ S);
u32 imm10 = bits(*(ul16 *)loc, 9, 0);
u32 imm11 = bits(*(ul16 *)(loc + 2), 10, 0);
u32 val = (S << 24) | (I1 << 23) | (I2 << 22) | (imm10 << 12) | (imm11 << 1);
return sign_extend(val, 24);
}
case R_ARM_CALL:
case R_ARM_JUMP24:
case R_ARM_PLT32:
case R_ARM_TLS_CALL:
return sign_extend(*(ul32 *)loc, 23) << 2;
case R_ARM_MOVW_PREL_NC:
case R_ARM_MOVW_ABS_NC:
case R_ARM_MOVT_PREL:
case R_ARM_MOVT_ABS: {
u32 imm12 = bits(*(ul32 *)loc, 11, 0);
u32 imm4 = bits(*(ul32 *)loc, 19, 16);
return sign_extend((imm4 << 12) | imm12, 15);
}
case R_ARM_PREL31:
return sign_extend(*(ul32 *)loc, 30);
case R_ARM_THM_MOVW_PREL_NC:
case R_ARM_THM_MOVW_ABS_NC:
case R_ARM_THM_MOVT_PREL:
case R_ARM_THM_MOVT_ABS: {
u32 imm4 = bits(*(ul16 *)loc, 3, 0);
u32 i = bit(*(ul16 *)loc, 10);
u32 imm3 = bits(*(ul16 *)(loc + 2), 14, 12);
u32 imm8 = bits(*(ul16 *)(loc + 2), 7, 0);
u32 val = (imm4 << 12) | (i << 11) | (imm3 << 8) | imm8;
return sign_extend(val, 15);
}
default:
return 0;
}
}
static void write_mov_imm(u8 *loc, u32 val) {
u32 imm12 = bits(val, 11, 0);
u32 imm4 = bits(val, 15, 12);
*(ul32 *)loc = (*(ul32 *)loc & 0xfff0f000) | (imm4 << 16) | imm12;
}
static void write_thm_b_imm(u8 *loc, u32 val) {
// https://developer.arm.com/documentation/ddi0406/cb/Application-Level-Architecture/Instruction-Details/Alphabetical-list-of-instructions/BL--BLX--immediate-
u32 sign = bit(val, 24);
u32 I1 = bit(val, 23);
u32 I2 = bit(val, 22);
u32 J1 = !I1 ^ sign;
u32 J2 = !I2 ^ sign;
u32 imm10 = bits(val, 21, 12);
u32 imm11 = bits(val, 11, 1);
ul16 *buf = (ul16 *)loc;
buf[0] = (buf[0] & 0b1111'1000'0000'0000) | (sign << 10) | imm10;
buf[1] = (buf[1] & 0b1101'0000'0000'0000) | (J1 << 13) | (J2 << 11) | imm11;
}
static void write_thm_mov_imm(u8 *loc, u32 val) {
// https://developer.arm.com/documentation/ddi0406/cb/Application-Level-Architecture/Instruction-Details/Alphabetical-list-of-instructions/MOVT
u32 imm4 = bits(val, 15, 12);
u32 i = bit(val, 11);
u32 imm3 = bits(val, 10, 8);
u32 imm8 = bits(val, 7, 0);
ul16 *buf = (ul16 *)loc;
buf[0] = (buf[0] & 0b1111'1011'1111'0000) | (i << 10) | imm4;
buf[1] = (buf[1] & 0b1000'1111'0000'0000) | (imm3 << 12) | imm8;
}
template <>
void write_addend(u8 *loc, i64 val, const ElfRel<E> &rel) {
switch (rel.r_type) {
case R_ARM_NONE:
break;
case R_ARM_ABS32:
case R_ARM_REL32:
case R_ARM_TARGET1:
case R_ARM_BASE_PREL:
case R_ARM_GOTOFF32:
case R_ARM_GOT_PREL:
case R_ARM_GOT_BREL:
case R_ARM_TLS_GD32:
case R_ARM_TLS_LDM32:
case R_ARM_TLS_LDO32:
case R_ARM_TLS_IE32:
case R_ARM_TLS_LE32:
case R_ARM_TLS_GOTDESC:
case R_ARM_TARGET2:
*(ul32 *)loc = val;
break;
case R_ARM_THM_JUMP11:
*(ul16 *)loc = (*(ul16 *)loc & 0xf800) | bits(val, 11, 1);
break;
case R_ARM_THM_CALL:
case R_ARM_THM_JUMP24:
case R_ARM_THM_TLS_CALL:
write_thm_b_imm(loc, val);
break;
case R_ARM_CALL:
case R_ARM_JUMP24:
case R_ARM_PLT32:
*(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
break;
case R_ARM_MOVW_PREL_NC:
case R_ARM_MOVW_ABS_NC:
case R_ARM_MOVT_PREL:
case R_ARM_MOVT_ABS:
write_mov_imm(loc, val);
break;
case R_ARM_PREL31:
*(ul32 *)loc = (*(ul32 *)loc & 0x8000'0000) | (val & 0x7fff'ffff);
break;
case R_ARM_THM_MOVW_PREL_NC:
case R_ARM_THM_MOVW_ABS_NC:
case R_ARM_THM_MOVT_PREL:
case R_ARM_THM_MOVT_ABS:
write_thm_mov_imm(loc, val);
break;
default:
unreachable();
}
}
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static const ul32 insn[] = {
0xe52d'e004, // push {lr}
0xe59f'e004, // ldr lr, 2f
0xe08f'e00e, // 1: add lr, pc, lr
0xe5be'f008, // ldr pc, [lr, #8]!
0x0000'0000, // 2: .word .got.plt - 1b - 8
0xe320'f000, // nop
0xe320'f000, // nop
0xe320'f000, // nop
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 16) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 16;
}
static const ul32 plt_entry[] = {
0xe59f'c004, // 1: ldr ip, 2f
0xe08c'c00f, // add ip, ip, pc
0xe59c'f000, // ldr pc, [ip]
0x0000'0000, // 2: .word sym@GOT - 1b
};
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
memcpy(buf, plt_entry, sizeof(plt_entry));
*(ul32 *)(buf + 12) = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 12;
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
memcpy(buf, plt_entry, sizeof(plt_entry));
*(ul32 *)(buf + 12) = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 12;
}
// ARM does not use .eh_frame for exception handling. Instead, it uses
// .ARM.exidx and .ARM.extab. So this function is empty.
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {}
// ARM and Thumb branch instructions can jump within ±16 MiB.
static bool is_jump_reachable(i64 val) {
return sign_extend(val, 24) == val;
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
auto get_tls_trampoline_addr = [&, i = 0](u64 addr) mutable {
for (; i < output_section->thunks.size(); i++) {
i64 disp = output_section->shdr.sh_addr + output_section->thunks[i]->offset -
addr;
if (is_jump_reachable(disp))
return disp;
}
unreachable();
};
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || rel.r_type == R_ARM_V4BX)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
u64 S = sym.get_addr(ctx);
u64 A = get_addend(*this, rel);
u64 P = get_addr() + rel.r_offset;
u64 T = S & 1;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
auto get_thumb_thunk_addr = [&] { return get_thunk_addr(i); };
auto get_arm_thunk_addr = [&] { return get_thunk_addr(i) + 4; };
switch (rel.r_type) {
case R_ARM_ABS32:
case R_ARM_TARGET1:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_ARM_REL32:
*(ul32 *)loc = S + A - P;
break;
case R_ARM_THM_CALL: {
if (sym.is_remaining_undef_weak()) {
// On ARM, calling a weak undefined symbol jumps to the
// next instruction.
*(ul32 *)loc = 0x8000'f3af; // NOP.W
break;
}
// A THM_CALL relocation refers to either a BL or a BLX instruction.
// They differ in only one bit. We need to use BL if the jump target
// is Thumb; otherwise, BLX.
i64 val = S + A - P;
if (is_jump_reachable(val)) {
if (T) {
write_thm_b_imm(loc, val);
*(ul16 *)(loc + 2) |= 0x1000; // rewrite to BL
} else {
write_thm_b_imm(loc, align_to(val, 4));
*(ul16 *)(loc + 2) &= ~0x1000; // rewrite to BLX
}
} else {
write_thm_b_imm(loc, align_to(get_arm_thunk_addr() + A - P, 4));
*(ul16 *)(loc + 2) &= ~0x1000; // rewrite to BLX
}
break;
}
case R_ARM_BASE_PREL:
*(ul32 *)loc = GOT + A - P;
break;
case R_ARM_GOTOFF32:
*(ul32 *)loc = ((S + A) | T) - GOT;
break;
case R_ARM_GOT_PREL:
case R_ARM_TARGET2:
*(ul32 *)loc = GOT + G + A - P;
break;
case R_ARM_GOT_BREL:
*(ul32 *)loc = G + A;
break;
case R_ARM_CALL: {
if (sym.is_remaining_undef_weak()) {
*(ul32 *)loc = 0xe320'f000; // NOP
break;
}
// Just like THM_CALL, an ARM_CALL relocation refers to either a BL or
// a BLX instruction. We may need to rewrite BL → BLX or BLX → BL.
bool is_bl = ((*(ul32 *)loc & 0xff00'0000) == 0xeb00'0000);
bool is_blx = ((*(ul32 *)loc & 0xfe00'0000) == 0xfa00'0000);
if (!is_bl && !is_blx)
Fatal(ctx) << *this << ": R_ARM_CALL refers neither BL nor BLX";
u64 val = S + A - P;
if (is_jump_reachable(val)) {
if (T) {
*(ul32 *)loc = 0xfa00'0000; // BLX
*(ul32 *)loc |= (bit(val, 1) << 24) | bits(val, 25, 2);
} else {
*(ul32 *)loc = 0xeb00'0000; // BL
*(ul32 *)loc |= bits(val, 25, 2);
}
} else {
*(ul32 *)loc = 0xeb00'0000; // BL
*(ul32 *)loc |= bits(get_arm_thunk_addr() + A - P, 25, 2);
}
break;
}
case R_ARM_JUMP24: {
if (sym.is_remaining_undef_weak()) {
*(ul32 *)loc = 0xe320'f000; // NOP
break;
}
// This reloc refers to a B (unconditional branch) instruction.
// Unlike BL or BLX, we can't rewrite B to BX in place when a
// processor mode switch is required, because BX doesn't take an
// immediate; it takes only a register. So if a mode switch is
// required, we jump to a linker-synthesized thunk which does the
// job with a longer code sequence.
u64 val = S + A - P;
if (!is_jump_reachable(val) || T)
val = get_arm_thunk_addr() + A - P;
*(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
break;
}
case R_ARM_PLT32:
if (sym.is_remaining_undef_weak()) {
*(ul32 *)loc = 0xe320'f000; // NOP
} else {
u64 val = (T ? get_arm_thunk_addr() : S) + A - P;
*(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
}
break;
case R_ARM_THM_JUMP11:
assert(T);
check(S + A - P, -(1 << 11), 1 << 11);
*(ul16 *)loc &= 0xf800;
*(ul16 *)loc |= bits(S + A - P, 11, 1);
break;
case R_ARM_THM_JUMP19: {
i64 val = S + A - P;
check(val, -(1 << 19), 1 << 19);
// sign:J2:J1:imm6:imm11:'0'
u32 sign = bit(val, 20);
u32 J2 = bit(val, 19);
u32 J1 = bit(val, 18);
u32 imm6 = bits(val, 17, 12);
u32 imm11 = bits(val, 11, 1);
*(ul16 *)loc &= 0b1111'1011'1100'0000;
*(ul16 *)loc |= (sign << 10) | imm6;
*(ul16 *)(loc + 2) &= 0b1101'0000'0000'0000;
*(ul16 *)(loc + 2) |= (J2 << 13) | (J1 << 11) | imm11;
break;
}
case R_ARM_THM_JUMP24: {
if (sym.is_remaining_undef_weak()) {
*(ul32 *)loc = 0x8000'f3af; // NOP
break;
}
// Just like R_ARM_JUMP24, we need to jump to a thunk if we need to
// switch processor mode.
u64 val = S + A - P;
if (!is_jump_reachable(val) || !T)
val = get_thumb_thunk_addr() + A - P;
write_thm_b_imm(loc, val);
break;
}
case R_ARM_MOVW_PREL_NC:
write_mov_imm(loc, ((S + A) | T) - P);
break;
case R_ARM_MOVW_ABS_NC:
write_mov_imm(loc, (S + A) | T);
break;
case R_ARM_THM_MOVW_PREL_NC:
write_thm_mov_imm(loc, ((S + A) | T) - P);
break;
case R_ARM_PREL31:
check(S + A - P, -(1LL << 30), 1LL << 30);
*(ul32 *)loc &= 0x8000'0000;
*(ul32 *)loc |= (S + A - P) & 0x7fff'ffff;
break;
case R_ARM_THM_MOVW_ABS_NC:
write_thm_mov_imm(loc, (S + A) | T);
break;
case R_ARM_MOVT_PREL:
write_mov_imm(loc, (S + A - P) >> 16);
break;
case R_ARM_THM_MOVT_PREL:
write_thm_mov_imm(loc, (S + A - P) >> 16);
break;
case R_ARM_MOVT_ABS:
write_mov_imm(loc, (S + A) >> 16);
break;
case R_ARM_THM_MOVT_ABS:
write_thm_mov_imm(loc, (S + A) >> 16);
break;
case R_ARM_TLS_GD32:
*(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - P;
break;
case R_ARM_TLS_LDM32:
*(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - P;
break;
case R_ARM_TLS_LDO32:
*(ul32 *)loc = S + A - ctx.dtp_addr;
break;
case R_ARM_TLS_IE32:
*(ul32 *)loc = sym.get_gottp_addr(ctx) + A - P;
break;
case R_ARM_TLS_LE32:
*(ul32 *)loc = S + A - ctx.tp_addr;
break;
case R_ARM_TLS_GOTDESC:
if (sym.has_tlsdesc(ctx)) {
// A is odd if the corresponding TLS_CALL is Thumb.
if (A & 1)
*(ul32 *)loc = sym.get_tlsdesc_addr(ctx) - P + A - 6;
else
*(ul32 *)loc = sym.get_tlsdesc_addr(ctx) - P + A - 4;
} else {
*(ul32 *)loc = S - ctx.tp_addr;
}
break;
case R_ARM_TLS_CALL:
if (sym.has_tlsdesc(ctx)) {
// BL <tls_trampoline>
*(ul32 *)loc = 0xeb00'0000 | bits(get_tls_trampoline_addr(P + 8), 25, 2);
} else {
// BL -> NOP
*(ul32 *)loc = 0xe320'f000;
}
break;
case R_ARM_THM_TLS_CALL:
if (sym.has_tlsdesc(ctx)) {
u64 val = align_to(get_tls_trampoline_addr(P + 4), 4);
write_thm_b_imm(loc, val);
*(ul16 *)(loc + 2) &= ~0x1000; // rewrite BL with BLX
} else {
// BL -> NOP.W
*(ul32 *)loc = 0x8000'f3af;
}
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : get_addend(*this, rel);
switch (rel.r_type) {
case R_ARM_ABS32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A;
break;
case R_ARM_TLS_LDO32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A - ctx.dtp_addr;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
break;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_ARM_ABS32:
case R_ARM_MOVT_ABS:
case R_ARM_THM_MOVT_ABS:
case R_ARM_TARGET1:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_ARM_THM_CALL:
case R_ARM_CALL:
case R_ARM_JUMP24:
case R_ARM_PLT32:
case R_ARM_THM_JUMP24:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_ARM_GOT_PREL:
case R_ARM_GOT_BREL:
case R_ARM_TARGET2:
sym.flags |= NEEDS_GOT;
break;
case R_ARM_MOVT_PREL:
case R_ARM_THM_MOVT_PREL:
case R_ARM_PREL31:
scan_pcrel(ctx, sym, rel);
break;
case R_ARM_TLS_GD32:
sym.flags |= NEEDS_TLSGD;
break;
case R_ARM_TLS_LDM32:
ctx.needs_tlsld = true;
break;
case R_ARM_TLS_IE32:
sym.flags |= NEEDS_GOTTP;
break;
case R_ARM_TLS_GOTDESC:
if (!relax_tlsdesc(ctx, sym))
sym.flags |= NEEDS_TLSDESC;
break;
case R_ARM_TLS_LE32:
check_tlsle(ctx, sym, rel);
break;
case R_ARM_REL32:
case R_ARM_BASE_PREL:
case R_ARM_GOTOFF32:
case R_ARM_THM_JUMP11:
case R_ARM_THM_JUMP19:
case R_ARM_MOVW_PREL_NC:
case R_ARM_MOVW_ABS_NC:
case R_ARM_THM_MOVW_PREL_NC:
case R_ARM_THM_MOVW_ABS_NC:
case R_ARM_TLS_LDO32:
case R_ARM_TLS_CALL:
case R_ARM_THM_TLS_CALL:
case R_ARM_V4BX:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
// TLS trampoline code. ARM32's TLSDESC is designed so that this
// common piece of code is factored out from object files to reduce
// output size. Since no object file provides it, the linker has to
// synthesize it.
static ul32 hdr[] = {
0xe08e'0000, // add r0, lr, r0
0xe590'1004, // ldr r1, [r0, #4]
0xe12f'ff11, // bx r1
};
// This is a range extension and mode switch thunk.
// It has two entry points: +0 for Thumb and +4 for ARM.
const u8 entry[] = {
// .thumb
0xfc, 0x46, // mov ip, pc
0x60, 0x47, // bx ip # jumps to the following `ldr` insn
// .arm
0x04, 0xc0, 0x9f, 0xe5, // ldr ip, 2f
0x0f, 0xc0, 0x8c, 0xe0, // 1: add ip, ip, pc
0x1c, 0xff, 0x2f, 0xe1, // bx ip
0x00, 0x00, 0x00, 0x00, // 2: .word sym - 1b
};
static_assert(E::thunk_hdr_size == sizeof(hdr));
static_assert(E::thunk_size == sizeof(entry));
memcpy(buf, hdr, sizeof(hdr));
for (i64 i = 0; i < symbols.size(); i++) {
u8 *loc = buf + sizeof(hdr) + i * sizeof(entry);
memcpy(loc, entry, sizeof(entry));
u64 S = symbols[i]->get_addr(ctx);
u64 P = output_section.shdr.sh_addr + offset + sizeof(hdr) + i * sizeof(entry);
*(ul32 *)(loc + 16) = S - P - 16;
}
}
// ARM executables use an .ARM.exidx section to look up an exception
// handling record for the current instruction pointer. The table needs
// to be sorted by address.
//
// Other targets use .eh_frame_hdr for the same purpose instead.
// I don't know why only ARM uses a different mechanism, but it's
// likely due to some historical reason.
//
// This function sorts .ARM.exidx records.
void fixup_arm_exidx_section(Context<E> &ctx) {
Timer t(ctx, "fixup_arm_exidx_section");
OutputSection<E> *osec = find_section(ctx, SHT_ARM_EXIDX);
if (!osec)
return;
// .ARM.exidx records consist of a signed 31-bit relative address
// and a 32-bit value. The relative address indicates the start
// address of the function that the record covers. The value is one of
// the following:
//
// 1. CANTUNWIND indicating that there's no unwinding info for the function,
// 2. a compact unwinding record encoded into a 32-bit value, or
// 3. a 31-bit relative address which points to a larger record in
// the .ARM.extab section.
//
// CANTUNWIND is the value 1. The most significant bit is set in (2) but
// not in (3), so we can distinguish them just by looking at the value.
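//
// For example (illustrative values): 1 means CANTUNWIND, 0x80b0'b0b0
// would be a compact inline record (MSB set), and 0x0000'1234 would be
// a 31-bit relative pointer into .ARM.extab (MSB clear).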
const u32 EXIDX_CANTUNWIND = 1;
struct Entry {
ul32 addr;
ul32 val;
};
if (osec->shdr.sh_size % sizeof(Entry))
Fatal(ctx) << "invalid .ARM.exidx section size";
Entry *ent = (Entry *)(ctx.buf + osec->shdr.sh_offset);
i64 num_entries = osec->shdr.sh_size / sizeof(Entry);
// Each entry's address is relative to the entry itself. In order to
// sort records by address, we first translate the addresses so that
// they are relative to the beginning of the section.
auto is_relative = [](u32 val) {
return val != EXIDX_CANTUNWIND && !(val & 0x8000'0000);
};
tbb::parallel_for((i64)0, num_entries, [&](i64 i) {
i64 offset = sizeof(Entry) * i;
ent[i].addr = sign_extend(ent[i].addr, 30) + offset;
if (is_relative(ent[i].val))
ent[i].val = 0x7fff'ffff & (ent[i].val + offset);
});
tbb::parallel_sort(ent, ent + num_entries, [](const Entry &a, const Entry &b) {
return a.addr < b.addr;
});
// Translate the addresses back so that they are relative to their
// entries again.
tbb::parallel_for((i64)0, num_entries, [&](i64 i) {
i64 offset = sizeof(Entry) * i;
ent[i].addr = 0x7fff'ffff & (ent[i].addr - offset);
if (is_relative(ent[i].val))
ent[i].val = 0x7fff'ffff & (ent[i].val - offset);
});
// .ARM.exidx's sh_link should be set to the .text section index.
// The runtime doesn't care about it, but binutils's strip command does.
if (ctx.shdr) {
if (Chunk<E> *text = find_section(ctx, ".text")) {
osec->shdr.sh_link = text->shndx;
ctx.shdr->copy_buf(ctx);
}
}
}
} // namespace mold::elf


@ -1,595 +0,0 @@
// clang-format off
// This file contains ARM64-specific code. Being new, ARM64's ELF
// psABI doesn't have anything peculiar. ARM64 is a clean RISC
// instruction set that supports PC-relative load/store instructions.
//
// Unlike ARM32, instruction length doesn't vary. All ARM64
// instructions are 4 bytes long.
//
// Branch instructions used for function call can jump within ±128 MiB.
// We need to create range extension thunks to support binaries whose
// .text is larger than that.
//
// Unlike most other targets, the TLSDESC access model is used by default
// for -fPIC to access thread-local variables instead of the less
// efficient GD model. You can still enable GD, but it needs the
// -mtls-dialect=trad flag. Since GD is rarely used, we don't need to
// implement GD → LE relaxation.
//
// https://github.com/ARM-software/abi-aa/blob/main/aaelf64/aaelf64.rst
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = ARM64;
static void write_adrp(u8 *buf, u64 val) {
*(ul32 *)buf |= (bits(val, 13, 12) << 29) | (bits(val, 32, 14) << 5);
}
static void write_adr(u8 *buf, u64 val) {
*(ul32 *)buf |= (bits(val, 1, 0) << 29) | (bits(val, 20, 2) << 5);
}
static void write_movn_movz(u8 *buf, i64 val) {
*(ul32 *)buf &= 0b0000'0000'0110'0000'0000'0000'0001'1111;
if (val >= 0)
*(ul32 *)buf |= 0xd280'0000 | (bits(val, 15, 0) << 5); // rewrite to movz
else
*(ul32 *)buf |= 0x9280'0000 | (bits(~val, 15, 0) << 5); // rewrite to movn
}
static u64 page(u64 val) {
return val & 0xffff'ffff'ffff'f000;
}
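// For example, page(0x2000'1234) == 0x2000'1000: page() simply clears
// the low 12 bits, i.e., the offset within a 4 KiB page.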
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static const ul32 insn[] = {
0xa9bf'7bf0, // stp x16, x30, [sp,#-16]!
0x9000'0010, // adrp x16, .got.plt[2]
0xf940'0211, // ldr x17, [x16, .got.plt[2]]
0x9100'0210, // add x16, x16, .got.plt[2]
0xd61f'0220, // br x17
0xd503'201f, // nop
0xd503'201f, // nop
0xd503'201f, // nop
};
u64 gotplt = ctx.gotplt->shdr.sh_addr + 16;
u64 plt = ctx.plt->shdr.sh_addr;
memcpy(buf, insn, sizeof(insn));
write_adrp(buf + 4, page(gotplt) - page(plt + 4));
*(ul32 *)(buf + 8) |= bits(gotplt, 11, 3) << 10;
*(ul32 *)(buf + 12) |= (gotplt & 0xfff) << 10;
}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static const ul32 insn[] = {
0x9000'0010, // adrp x16, .got.plt[n]
0xf940'0211, // ldr x17, [x16, .got.plt[n]]
0x9100'0210, // add x16, x16, .got.plt[n]
0xd61f'0220, // br x17
};
u64 gotplt = sym.get_gotplt_addr(ctx);
u64 plt = sym.get_plt_addr(ctx);
memcpy(buf, insn, sizeof(insn));
write_adrp(buf, page(gotplt) - page(plt));
*(ul32 *)(buf + 4) |= bits(gotplt, 11, 3) << 10;
*(ul32 *)(buf + 8) |= (gotplt & 0xfff) << 10;
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static const ul32 insn[] = {
0x9000'0010, // adrp x16, GOT[n]
0xf940'0211, // ldr x17, [x16, GOT[n]]
0xd61f'0220, // br x17
0xd503'201f, // nop
};
u64 got = sym.get_got_addr(ctx);
u64 plt = sym.get_plt_addr(ctx);
memcpy(buf, insn, sizeof(insn));
write_adrp(buf, page(got) - page(plt));
*(ul32 *)(buf + 4) |= bits(got, 11, 3) << 10;
}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_AARCH64_ABS64:
*(ul64 *)loc = val;
break;
case R_AARCH64_PREL32:
*(ul32 *)loc = val - this->shdr.sh_addr - offset;
break;
case R_AARCH64_PREL64:
*(ul64 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
static bool is_adrp(u8 *loc) {
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADRP--Form-PC-relative-address-to-4KB-page-
u32 insn = *(ul32 *)loc;
return (bits(insn, 31, 24) & 0b1001'1111) == 0b1001'0000;
}
static bool is_ldr(u8 *loc) {
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
u32 insn = *(ul32 *)loc;
return (bits(insn, 31, 20) & 0b1111'1111'1100) == 0b1111'1001'0100;
}
static bool is_add(u8 *loc) {
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADD--immediate---Add--immediate--
u32 insn = *(ul32 *)loc;
return (bits(insn, 31, 20) & 0b1111'1111'1100) == 0b1001'0001'0000;
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_AARCH64_ABS64:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_AARCH64_LDST8_ABS_LO12_NC:
case R_AARCH64_ADD_ABS_LO12_NC:
*(ul32 *)loc |= bits(S + A, 11, 0) << 10;
break;
case R_AARCH64_LDST16_ABS_LO12_NC:
*(ul32 *)loc |= bits(S + A, 11, 1) << 10;
break;
case R_AARCH64_LDST32_ABS_LO12_NC:
*(ul32 *)loc |= bits(S + A, 11, 2) << 10;
break;
case R_AARCH64_LDST64_ABS_LO12_NC:
*(ul32 *)loc |= bits(S + A, 11, 3) << 10;
break;
case R_AARCH64_LDST128_ABS_LO12_NC:
*(ul32 *)loc |= bits(S + A, 11, 4) << 10;
break;
case R_AARCH64_MOVW_UABS_G0:
check(S + A, 0, 1 << 16);
*(ul32 *)loc |= bits(S + A, 15, 0) << 5;
break;
case R_AARCH64_MOVW_UABS_G0_NC:
*(ul32 *)loc |= bits(S + A, 15, 0) << 5;
break;
case R_AARCH64_MOVW_UABS_G1:
check(S + A, 0, 1LL << 32);
*(ul32 *)loc |= bits(S + A, 31, 16) << 5;
break;
case R_AARCH64_MOVW_UABS_G1_NC:
*(ul32 *)loc |= bits(S + A, 31, 16) << 5;
break;
case R_AARCH64_MOVW_UABS_G2:
check(S + A, 0, 1LL << 48);
*(ul32 *)loc |= bits(S + A, 47, 32) << 5;
break;
case R_AARCH64_MOVW_UABS_G2_NC:
*(ul32 *)loc |= bits(S + A, 47, 32) << 5;
break;
case R_AARCH64_MOVW_UABS_G3:
*(ul32 *)loc |= bits(S + A, 63, 48) << 5;
break;
case R_AARCH64_ADR_GOT_PAGE:
if (sym.has_got(ctx)) {
i64 val = page(G + GOT + A) - page(P);
check(val, -(1LL << 32), 1LL << 32);
write_adrp(loc, val);
} else {
// Relax GOT-loading ADRP+LDR to an immediate ADRP+ADD
i64 val = page(S + A) - page(P);
check(val, -(1LL << 32), 1LL << 32);
write_adrp(loc, val);
u32 reg = bits(*(ul32 *)loc, 4, 0);
*(ul32 *)(loc + 4) = 0x9100'0000 | (reg << 5) | reg; // ADD
*(ul32 *)(loc + 4) |= bits(S + A, 11, 0) << 10;
i++;
}
break;
case R_AARCH64_ADR_PREL_PG_HI21: {
// The ARM64 psABI defines that an `ADRP x0, foo` / `ADD x0, x0,
// :lo12:foo` instruction pair, which materializes a PC-relative
// address in a register, can be relaxed to a `NOP` followed by
// `ADR x0, foo` if foo is within PC ± 1 MiB.
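//
// That is (illustrative):
//
//   before:  adrp x0, foo            after:  nop
//            add  x0, x0, :lo12:foo          adr x0, foo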
if (ctx.arg.relax && i + 1 < rels.size() &&
sign_extend(S + A - P - 4, 20) == S + A - P - 4) {
const ElfRel<E> &rel2 = rels[i + 1];
if (rel2.r_type == R_AARCH64_ADD_ABS_LO12_NC &&
rel2.r_sym == rel.r_sym &&
rel2.r_offset == rel.r_offset + 4 &&
rel2.r_addend == rel.r_addend &&
is_adrp(loc) &&
is_add(loc + 4)) {
u32 reg1 = bits(*(ul32 *)loc, 4, 0);
u32 reg2 = bits(*(ul32 *)(loc + 4), 4, 0);
if (reg1 == reg2) {
*(ul32 *)loc = 0xd503'201f; // nop
*(ul32 *)(loc + 4) = 0x1000'0000 | reg1; // adr
write_adr(loc + 4, S + A - P - 4);
i++;
break;
}
}
}
i64 val = page(S + A) - page(P);
check(val, -(1LL << 32), 1LL << 32);
write_adrp(loc, val);
break;
}
case R_AARCH64_ADR_PREL_LO21:
check(S + A - P, -(1LL << 20), 1LL << 20);
write_adr(loc, S + A - P);
break;
case R_AARCH64_CALL26:
case R_AARCH64_JUMP26: {
if (sym.is_remaining_undef_weak()) {
// On ARM64, calling a weak undefined symbol jumps to the
// next instruction.
*(ul32 *)loc = 0xd503'201f; // nop
break;
}
i64 val = S + A - P;
if (val < -(1 << 27) || (1 << 27) <= val)
val = get_thunk_addr(i) + A - P;
*(ul32 *)loc |= bits(val, 27, 2);
break;
}
case R_AARCH64_PLT32:
check(S + A - P, -(1LL << 31), 1LL << 31);
*(ul32 *)loc = S + A - P;
break;
case R_AARCH64_CONDBR19:
case R_AARCH64_LD_PREL_LO19:
check(S + A - P, -(1LL << 20), 1LL << 20);
*(ul32 *)loc |= bits(S + A - P, 20, 2) << 5;
break;
case R_AARCH64_PREL16:
check(S + A - P, -(1LL << 15), 1LL << 15);
*(ul16 *)loc = S + A - P;
break;
case R_AARCH64_PREL32:
check(S + A - P, -(1LL << 31), 1LL << 32);
*(ul32 *)loc = S + A - P;
break;
case R_AARCH64_PREL64:
*(ul64 *)loc = S + A - P;
break;
case R_AARCH64_LD64_GOT_LO12_NC:
*(ul32 *)loc |= bits(G + GOT + A, 11, 3) << 10;
break;
case R_AARCH64_LD64_GOTPAGE_LO15: {
i64 val = G + GOT + A - page(GOT);
check(val, 0, 1 << 15);
*(ul32 *)loc |= bits(val, 14, 3) << 10;
break;
}
case R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: {
i64 val = page(sym.get_gottp_addr(ctx) + A) - page(P);
check(val, -(1LL << 32), 1LL << 32);
write_adrp(loc, val);
break;
}
case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC:
*(ul32 *)loc |= bits(sym.get_gottp_addr(ctx) + A, 11, 3) << 10;
break;
case R_AARCH64_TLSLE_MOVW_TPREL_G0: {
i64 val = S + A - ctx.tp_addr;
check(val, -(1 << 15), 1 << 15);
write_movn_movz(loc, val);
break;
}
case R_AARCH64_TLSLE_MOVW_TPREL_G0_NC:
*(ul32 *)loc |= bits(S + A - ctx.tp_addr, 15, 0) << 5;
break;
case R_AARCH64_TLSLE_MOVW_TPREL_G1: {
i64 val = S + A - ctx.tp_addr;
check(val, -(1LL << 31), 1LL << 31);
write_movn_movz(loc, val >> 16);
break;
}
case R_AARCH64_TLSLE_MOVW_TPREL_G1_NC:
*(ul32 *)loc |= bits(S + A - ctx.tp_addr, 31, 16) << 5;
break;
case R_AARCH64_TLSLE_MOVW_TPREL_G2: {
i64 val = S + A - ctx.tp_addr;
check(val, -(1LL << 47), 1LL << 47);
write_movn_movz(loc, val >> 32);
break;
}
case R_AARCH64_TLSLE_ADD_TPREL_HI12: {
i64 val = S + A - ctx.tp_addr;
check(val, 0, 1LL << 24);
*(ul32 *)loc |= bits(val, 23, 12) << 10;
break;
}
case R_AARCH64_TLSLE_ADD_TPREL_LO12:
check(S + A - ctx.tp_addr, 0, 1 << 12);
*(ul32 *)loc |= bits(S + A - ctx.tp_addr, 11, 0) << 10;
break;
case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
*(ul32 *)loc |= bits(S + A - ctx.tp_addr, 11, 0) << 10;
break;
case R_AARCH64_TLSGD_ADR_PAGE21: {
i64 val = page(sym.get_tlsgd_addr(ctx) + A) - page(P);
check(val, -(1LL << 32), 1LL << 32);
write_adrp(loc, val);
break;
}
case R_AARCH64_TLSGD_ADD_LO12_NC:
*(ul32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A, 11, 0) << 10;
break;
case R_AARCH64_TLSDESC_ADR_PAGE21:
if (sym.has_tlsdesc(ctx)) {
i64 val = page(sym.get_tlsdesc_addr(ctx) + A) - page(P);
check(val, -(1LL << 32), 1LL << 32);
write_adrp(loc, val);
} else {
// adrp x0, 0 -> movz x0, #tls_offset_hi, lsl #16
i64 val = (S + A - ctx.tp_addr);
check(val, -(1LL << 32), 1LL << 32);
*(ul32 *)loc = 0xd2a0'0000 | (bits(val, 32, 16) << 5);
}
break;
case R_AARCH64_TLSDESC_LD64_LO12:
if (sym.has_tlsdesc(ctx)) {
*(ul32 *)loc |= bits(sym.get_tlsdesc_addr(ctx) + A, 11, 3) << 10;
} else {
// ldr x2, [x0] -> movk x0, #tls_offset_lo
u32 offset_lo = (S + A - ctx.tp_addr) & 0xffff;
*(ul32 *)loc = 0xf280'0000 | (offset_lo << 5);
}
break;
case R_AARCH64_TLSDESC_ADD_LO12:
if (sym.has_tlsdesc(ctx)) {
*(ul32 *)loc |= bits(sym.get_tlsdesc_addr(ctx) + A, 11, 0) << 10;
} else {
// add x0, x0, #0 -> nop
*(ul32 *)loc = 0xd503'201f;
}
break;
case R_AARCH64_TLSDESC_CALL:
if (!sym.has_tlsdesc(ctx)) {
// blr x2 -> nop
*(ul32 *)loc = 0xd503'201f;
}
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_AARCH64_ABS64:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul64 *)loc = *val;
else
*(ul64 *)loc = S + A;
break;
case R_AARCH64_ABS32: {
i64 val = S + A;
check(val, 0, 1LL << 32);
*(ul32 *)loc = val;
break;
}
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
break;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = (u8 *)(contents.data() + rel.r_offset);
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_AARCH64_ABS64:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_AARCH64_ADR_GOT_PAGE:
// An ADR_GOT_PAGE and GOT_LO12_NC relocation pair is used to load a
// symbol's address from the GOT. If the GOT value is a link-time
// constant, we may be able to rewrite the ADRP+LDR instruction pair
// with an ADRP+ADD, eliminating the GOT memory load, as shown below.
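//
// That is (an illustrative sketch):
//
//   before:  adrp x0, :got:foo             after:  adrp x0, foo
//            ldr  x0, [x0, :got_lo12:foo]          add  x0, x0, :lo12:foo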
if (ctx.arg.relax && sym.is_relative() && !sym.is_imported &&
!sym.is_ifunc() && i + 1 < rels.size()) {
// ADRP+LDR must be consecutive and use the same register to relax.
const ElfRel<E> &rel2 = rels[i + 1];
if (rel2.r_type == R_AARCH64_LD64_GOT_LO12_NC &&
rel2.r_offset == rel.r_offset + 4 &&
rel2.r_sym == rel.r_sym &&
rel.r_addend == 0 &&
rel2.r_addend == 0 &&
is_adrp(loc) &&
is_ldr(loc + 4)) {
u32 rd = bits(*(ul32 *)loc, 4, 0);
u32 rn = bits(*(ul32 *)(loc + 4), 9, 5);
u32 rt = bits(*(ul32 *)(loc + 4), 4, 0);
if (rd == rn && rn == rt) {
i++;
break;
}
}
}
sym.flags |= NEEDS_GOT;
break;
case R_AARCH64_LD64_GOT_LO12_NC:
case R_AARCH64_LD64_GOTPAGE_LO15:
sym.flags |= NEEDS_GOT;
break;
case R_AARCH64_CALL26:
case R_AARCH64_JUMP26:
case R_AARCH64_PLT32:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21:
case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC:
sym.flags |= NEEDS_GOTTP;
break;
case R_AARCH64_ADR_PREL_PG_HI21:
scan_pcrel(ctx, sym, rel);
break;
case R_AARCH64_TLSGD_ADR_PAGE21:
sym.flags |= NEEDS_TLSGD;
break;
case R_AARCH64_TLSDESC_ADR_PAGE21:
case R_AARCH64_TLSDESC_LD64_LO12:
case R_AARCH64_TLSDESC_ADD_LO12:
if (!relax_tlsdesc(ctx, sym))
sym.flags |= NEEDS_TLSDESC;
break;
case R_AARCH64_TLSLE_MOVW_TPREL_G0:
case R_AARCH64_TLSLE_MOVW_TPREL_G0_NC:
case R_AARCH64_TLSLE_MOVW_TPREL_G1:
case R_AARCH64_TLSLE_MOVW_TPREL_G1_NC:
case R_AARCH64_TLSLE_MOVW_TPREL_G2:
case R_AARCH64_TLSLE_ADD_TPREL_HI12:
case R_AARCH64_TLSLE_ADD_TPREL_LO12:
case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
check_tlsle(ctx, sym, rel);
break;
case R_AARCH64_ADD_ABS_LO12_NC:
case R_AARCH64_ADR_PREL_LO21:
case R_AARCH64_CONDBR19:
case R_AARCH64_LD_PREL_LO19:
case R_AARCH64_LDST16_ABS_LO12_NC:
case R_AARCH64_LDST32_ABS_LO12_NC:
case R_AARCH64_LDST64_ABS_LO12_NC:
case R_AARCH64_LDST128_ABS_LO12_NC:
case R_AARCH64_LDST8_ABS_LO12_NC:
case R_AARCH64_MOVW_UABS_G0:
case R_AARCH64_MOVW_UABS_G0_NC:
case R_AARCH64_MOVW_UABS_G1:
case R_AARCH64_MOVW_UABS_G1_NC:
case R_AARCH64_MOVW_UABS_G2:
case R_AARCH64_MOVW_UABS_G2_NC:
case R_AARCH64_MOVW_UABS_G3:
case R_AARCH64_PREL16:
case R_AARCH64_PREL32:
case R_AARCH64_PREL64:
case R_AARCH64_TLSGD_ADD_LO12_NC:
case R_AARCH64_TLSDESC_CALL:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
static const ul32 data[] = {
0x9000'0010, // adrp x16, 0 # R_AARCH64_ADR_PREL_PG_HI21
0x9100'0210, // add x16, x16 # R_AARCH64_ADD_ABS_LO12_NC
0xd61f'0200, // br x16
};
static_assert(E::thunk_size == sizeof(data));
for (i64 i = 0; i < symbols.size(); i++) {
u64 S = symbols[i]->get_addr(ctx);
u64 P = output_section.shdr.sh_addr + offset + i * E::thunk_size;
u8 *loc = buf + i * E::thunk_size;
memcpy(loc , data, sizeof(data));
write_adrp(loc, page(S) - page(P));
*(ul32 *)(loc + 4) |= bits(S, 11, 0) << 10;
}
}
} // namespace mold::elf


@ -1,565 +0,0 @@
// clang-format off
// i386 is similar to x86-64 but lacks PC-relative memory access
// instructions. So it's not straightforward to support position-
// independent code (PIC) on that target.
//
// If an object file is compiled with -fPIC, a function that needs to load
// a value from memory first obtains its own address with the following
// code:
//
//   call __x86.get_pc_thunk.bx
//
// where __x86.get_pc_thunk.bx is defined as follows:
//
//   __x86.get_pc_thunk.bx:
//     mov (%esp), %ebx  # move the return address to %ebx
//     ret
//
// With the function's own address (or, more precisely, the address
// immediately after the call instruction), the function can compute the
// absolute address of a variable as that address plus a link-time
// constant, as in the sketch below.
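//
// For example, position-independent code typically computes a global
// variable's address like this (an illustrative sketch of common
// compiler output, not code from this file):
//
//   call __x86.get_pc_thunk.bx          # %ebx = address after the call
//   add  $_GLOBAL_OFFSET_TABLE_, %ebx   # %ebx = address of .got
//   mov  var@GOTOFF(%ebx), %eax         # load `var` at a GOT-relative
//                                       # link-time constant offset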
//
// Executing call-mov-ret isn't very cheap, and allocating one register
// to store the PC isn't cheap either, especially given that i386 has
// only 8 general-purpose registers. But that's the cost of PIC on i386.
// You need to pay it when creating a .so or a position-independent
// executable.
//
// When a position-independent function calls another function, it sets
// %ebx to the address of .got. Position-independent PLT entries use that
// register to load values from .got.plt/.got.
//
// If we are creating a position-dependent executable (PDE), we can't
// assume that %ebx is set to .got. For PDE, we need to create position-
// dependent PLT entries which don't use %ebx.
//
// https://github.com/rui314/psabi/blob/main/i386.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = I386;
template <>
i64 get_addend(u8 *loc, const ElfRel<E> &rel) {
switch (rel.r_type) {
case R_386_8:
case R_386_PC8:
return *loc;
case R_386_16:
case R_386_PC16:
return *(ul16 *)loc;
case R_386_32:
case R_386_PC32:
case R_386_GOT32:
case R_386_GOT32X:
case R_386_PLT32:
case R_386_GOTOFF:
case R_386_GOTPC:
case R_386_TLS_LDM:
case R_386_TLS_GOTIE:
case R_386_TLS_LE:
case R_386_TLS_IE:
case R_386_TLS_GD:
case R_386_TLS_LDO_32:
case R_386_SIZE32:
case R_386_TLS_GOTDESC:
return *(ul32 *)loc;
default:
return 0;
}
}
template <>
void write_addend(u8 *loc, i64 val, const ElfRel<E> &rel) {
switch (rel.r_type) {
case R_386_NONE:
break;
case R_386_8:
case R_386_PC8:
*loc = val;
break;
case R_386_16:
case R_386_PC16:
*(ul16 *)loc = val;
break;
case R_386_32:
case R_386_PC32:
case R_386_GOT32:
case R_386_GOT32X:
case R_386_PLT32:
case R_386_GOTOFF:
case R_386_GOTPC:
case R_386_TLS_LDM:
case R_386_TLS_GOTIE:
case R_386_TLS_LE:
case R_386_TLS_IE:
case R_386_TLS_GD:
case R_386_TLS_LDO_32:
case R_386_SIZE32:
case R_386_TLS_GOTDESC:
*(ul32 *)loc = val;
break;
default:
unreachable();
}
}
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
if (ctx.arg.pic) {
static const u8 insn[] = {
0xf3, 0x0f, 0x1e, 0xfb, // endbr32
0x51, // push %ecx
0x8d, 0x8b, 0, 0, 0, 0, // lea GOTPLT+4(%ebx), %ecx
0xff, 0x31, // push (%ecx)
0xff, 0x61, 0x04, // jmp *0x4(%ecx)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 7) = ctx.gotplt->shdr.sh_addr - ctx.got->shdr.sh_addr + 4;
} else {
static const u8 insn[] = {
0xf3, 0x0f, 0x1e, 0xfb, // endbr32
0x51, // push %ecx
0xb9, 0, 0, 0, 0, // mov GOTPLT+4, %ecx
0xff, 0x31, // push (%ecx)
0xff, 0x61, 0x04, // jmp *0x4(%ecx)
0xcc, // (padding)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 6) = ctx.gotplt->shdr.sh_addr + 4;
}
}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
if (ctx.arg.pic) {
static const u8 insn[] = {
0xf3, 0x0f, 0x1e, 0xfb, // endbr32
0xb9, 0, 0, 0, 0, // mov $reloc_offset, %ecx
0xff, 0xa3, 0, 0, 0, 0, // jmp *foo@GOT(%ebx)
0xcc, // (padding)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
*(ul32 *)(buf + 11) = sym.get_gotplt_addr(ctx) - ctx.got->shdr.sh_addr;
} else {
static const u8 insn[] = {
0xf3, 0x0f, 0x1e, 0xfb, // endbr32
0xb9, 0, 0, 0, 0, // mov $reloc_offset, %ecx
0xff, 0x25, 0, 0, 0, 0, // jmp *foo@GOT
0xcc, // (padding)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
*(ul32 *)(buf + 11) = sym.get_gotplt_addr(ctx);
}
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
if (ctx.arg.pic) {
static const u8 insn[] = {
0xf3, 0x0f, 0x1e, 0xfb, // endbr32
0xff, 0xa3, 0, 0, 0, 0, // jmp *foo@GOT(%ebx)
0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // (padding)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 6) = sym.get_got_addr(ctx) - ctx.got->shdr.sh_addr;
} else {
static const u8 insn[] = {
0xf3, 0x0f, 0x1e, 0xfb, // endbr32
0xff, 0x25, 0, 0, 0, 0, // jmp *foo@GOT
0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // (padding)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 6) = sym.get_got_addr(ctx);
}
}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_386_32:
*(ul32 *)loc = val;
break;
case R_386_PC32:
*(ul32 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
static u32 relax_got32x(u8 *loc) {
// mov imm(%reg1), %reg2 -> lea imm(%reg1), %reg2
if (loc[0] == 0x8b)
return 0x8d00 | loc[1];
return 0;
}
// Relax GD to LE
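//
// A typical GD access sequence and its relaxed LE form look like the
// following (an illustrative sketch; actual register allocation is up
// to the compiler):
//
//   leal sym@tlsgd(,%ebx,1), %eax    # before (R_386_TLS_GD)
//   call __tls_get_addr@plt          # (R_386_PLT32)
//
//   mov %gs:0, %eax                  # after: %eax = thread pointer
//   add $tp_offset, %eax             # add the symbol's TP offset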
static void relax_gd_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
static const u8 insn[] = {
0x65, 0xa1, 0, 0, 0, 0, // mov %gs:0, %eax
0x81, 0xc0, 0, 0, 0, 0, // add $tp_offset, %eax
};
switch (rel.r_type) {
case R_386_PLT32:
case R_386_PC32:
memcpy(loc - 3, insn, sizeof(insn));
*(ul32 *)(loc + 5) = val;
break;
case R_386_GOT32:
case R_386_GOT32X:
memcpy(loc - 2, insn, sizeof(insn));
*(ul32 *)(loc + 6) = val;
break;
default:
unreachable();
}
}
// Relax LD to LE
static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
switch (rel.r_type) {
case R_386_PLT32:
case R_386_PC32: {
static const u8 insn[] = {
0x65, 0xa1, 0, 0, 0, 0, // mov %gs:0, %eax
0x2d, 0, 0, 0, 0, // sub $tls_size, %eax
};
memcpy(loc - 2, insn, sizeof(insn));
*(ul32 *)(loc + 5) = val;
break;
}
case R_386_GOT32:
case R_386_GOT32X: {
static const u8 insn[] = {
0x65, 0xa1, 0, 0, 0, 0, // mov %gs:0, %eax
0x2d, 0, 0, 0, 0, // sub $tls_size, %eax
0x90, // nop
};
memcpy(loc - 2, insn, sizeof(insn));
*(ul32 *)(loc + 5) = val;
break;
}
default:
unreachable();
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
u64 S = sym.get_addr(ctx);
u64 A = get_addend(*this, rel);
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_386_8:
check(S + A, 0, 1 << 8);
*loc = S + A;
break;
case R_386_16:
check(S + A, 0, 1 << 16);
*(ul16 *)loc = S + A;
break;
case R_386_32:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_386_PC8:
check(S + A - P, -(1 << 7), 1 << 7);
*loc = S + A - P;
break;
case R_386_PC16:
check(S + A - P, -(1 << 15), 1 << 15);
*(ul16 *)loc = S + A - P;
break;
case R_386_PC32:
case R_386_PLT32:
*(ul32 *)loc = S + A - P;
break;
case R_386_GOT32:
*(ul32 *)loc = G + A;
break;
case R_386_GOT32X:
if (sym.has_got(ctx)) {
*(ul32 *)loc = G + A;
} else {
u32 insn = relax_got32x(loc - 2);
assert(insn);
loc[-2] = insn >> 8;
loc[-1] = insn;
*(ul32 *)loc = S + A - GOT;
}
break;
case R_386_GOTOFF:
*(ul32 *)loc = S + A - GOT;
break;
case R_386_GOTPC:
*(ul32 *)loc = GOT + A - P;
break;
case R_386_TLS_GOTIE:
*(ul32 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
break;
case R_386_TLS_LE:
*(ul32 *)loc = S + A - ctx.tp_addr;
break;
case R_386_TLS_IE:
*(ul32 *)loc = sym.get_gottp_addr(ctx) + A;
break;
case R_386_TLS_GD:
if (sym.has_tlsgd(ctx)) {
*(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
} else {
relax_gd_to_le(loc, rels[i + 1], S - ctx.tp_addr);
i++;
}
break;
case R_386_TLS_LDM:
if (ctx.got->has_tlsld(ctx)) {
*(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
} else {
relax_ld_to_le(loc, rels[i + 1], ctx.tp_addr - ctx.tls_begin);
i++;
}
break;
case R_386_TLS_LDO_32:
*(ul32 *)loc = S + A - ctx.dtp_addr;
break;
case R_386_SIZE32:
*(ul32 *)loc = sym.esym().st_size + A;
break;
case R_386_TLS_GOTDESC:
if (sym.has_tlsdesc(ctx)) {
*(ul32 *)loc = sym.get_tlsdesc_addr(ctx) + A - GOT;
} else {
static const u8 insn[] = {
0x8d, 0x05, 0, 0, 0, 0, // lea 0, %eax
};
memcpy(loc - 2, insn, sizeof(insn));
*(ul32 *)loc = S + A - ctx.tp_addr;
}
break;
case R_386_TLS_DESC_CALL:
if (!sym.has_tlsdesc(ctx)) {
// call *(%eax) -> nop
loc[0] = 0x66;
loc[1] = 0x90;
}
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : get_addend(*this, rel);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_386_8:
check(S + A, 0, 1 << 8);
*loc = S + A;
break;
case R_386_16:
check(S + A, 0, 1 << 16);
*(ul16 *)loc = S + A;
break;
case R_386_32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A;
break;
case R_386_PC8:
check(S + A, -(1 << 7), 1 << 7);
*loc = S + A;
break;
case R_386_PC16:
check(S + A, -(1 << 15), 1 << 15);
*(ul16 *)loc = S + A;
break;
case R_386_PC32:
*(ul32 *)loc = S + A;
break;
case R_386_GOTPC:
*(ul32 *)loc = GOT + A;
break;
case R_386_GOTOFF:
*(ul32 *)loc = S + A - GOT;
break;
case R_386_TLS_LDO_32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A - ctx.dtp_addr;
break;
case R_386_SIZE32:
*(ul32 *)loc = sym.esym().st_size + A;
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = (u8 *)(contents.data() + rel.r_offset);
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_386_8:
case R_386_16:
scan_absrel(ctx, sym, rel);
break;
case R_386_32:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_386_PC8:
case R_386_PC16:
case R_386_PC32:
scan_pcrel(ctx, sym, rel);
break;
case R_386_GOT32:
case R_386_GOTPC:
sym.flags |= NEEDS_GOT;
break;
case R_386_GOT32X: {
// We always want to relax GOT32X because static PIE doesn't
// work without it.
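// The relaxation (a sketch; see relax_got32x below) turns a GOT load
// such as `movl foo@GOT(%ebx), %eax` into `leal foo@GOTOFF(%ebx), %eax`,
// so that the address is computed GOT-relative instead of being loaded
// from a GOT entry.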
bool do_relax = !sym.is_imported && sym.is_relative() &&
relax_got32x(loc - 2);
if (!do_relax)
sym.flags |= NEEDS_GOT;
break;
}
case R_386_PLT32:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_386_TLS_GOTIE:
case R_386_TLS_IE:
sym.flags |= NEEDS_GOTTP;
break;
case R_386_TLS_GD:
if (i + 1 == rels.size())
Fatal(ctx) << *this << ": TLS_GD reloc must be followed by PLT or GOT32";
if (u32 ty = rels[i + 1].r_type;
ty != R_386_PLT32 && ty != R_386_PC32 &&
ty != R_386_GOT32 && ty != R_386_GOT32X)
Fatal(ctx) << *this << ": TLS_GD reloc must be followed by PLT or GOT32";
// We always relax if -static because libc.a doesn't contain
// __tls_get_addr().
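// Relaxing rewrites the code sequence so that it computes the
// TP-relative address directly (GD -> LE). That also consumes the
// following PLT32/GOT32 relocation, which is why we skip it with i++.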
if (ctx.arg.is_static ||
(ctx.arg.relax && !ctx.arg.shared && !sym.is_imported))
i++;
else
sym.flags |= NEEDS_TLSGD;
break;
case R_386_TLS_LDM:
if (i + 1 == rels.size())
Fatal(ctx) << *this << ": TLS_LDM reloc must be followed by PLT or GOT32";
if (u32 ty = rels[i + 1].r_type;
ty != R_386_PLT32 && ty != R_386_PC32 &&
ty != R_386_GOT32 && ty != R_386_GOT32X)
Fatal(ctx) << *this << ": TLS_LDM reloc must be followed by PLT or GOT32";
// We always relax if -static because libc.a doesn't contain
// __tls_get_addr().
if (ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared))
i++;
else
ctx.needs_tlsld = true;
break;
case R_386_TLS_GOTDESC:
if (!relax_tlsdesc(ctx, sym))
sym.flags |= NEEDS_TLSDESC;
break;
case R_386_TLS_LE:
check_tlsle(ctx, sym, rel);
break;
case R_386_GOTOFF:
case R_386_TLS_LDO_32:
case R_386_SIZE32:
case R_386_TLS_DESC_CALL:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
} // namespace mold::elf

View file

@@ -1,326 +0,0 @@
// clang-format off
// This file contains code for the Motorola 68000 series microprocessors,
// which is often abbreviated as m68k. Running a Unix-like system on a
// m68k-based machine today is probably a retro-computing hobby activity,
// but the processor was a popular choice to build Unix computers during
// the '80s. Early Sun workstations, for example, used m68k. Macintoshes
// were based on m68k until they switched to PowerPC in 1994 (and later
// to x86 and then to ARM).
//
// From the linker's point of view, it is not hard to support m68k. It's
// just a 32-bit big-endian CISC ISA. Compared to the contemporary i386,
// m68k's psABI is actually simpler because m68k has PC-relative memory
// access instructions and therefore can support position-independent
// code without too much hassle.
//
// https://github.com/rui314/psabi/blob/main/m68k.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = M68K;
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static const u8 insn[] = {
0x2f, 0x00, // move.l %d0, -(%sp)
0x2f, 0x3b, 0x01, 0x70, 0, 0, 0, 0, // move.l (GOTPLT+4, %pc), -(%sp)
0x4e, 0xfb, 0x01, 0x71, 0, 0, 0, 0, // jmp ([GOTPLT+8, %pc])
};
memcpy(buf, insn, sizeof(insn));
*(ub32 *)(buf + 6) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr;
*(ub32 *)(buf + 14) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 4;
}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static const u8 insn[] = {
0x20, 0x3c, 0, 0, 0, 0, // move.l PLT_OFFSET, %d0
0x4e, 0xfb, 0x01, 0x71, 0, 0, 0, 0, // jmp ([GOTPLT_ENTRY, %pc])
};
memcpy(buf, insn, sizeof(insn));
*(ub32 *)(buf + 2) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
*(ub32 *)(buf + 10) = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 8;
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static const u8 insn[] = {
0x4e, 0xfb, 0x01, 0x71, 0, 0, 0, 0, // jmp ([GOT_ENTRY, %pc])
};
memcpy(buf, insn, sizeof(insn));
*(ub32 *)(buf + 4) = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 2;
}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_68K_32:
*(ub32 *)loc = val;
break;
case R_68K_PC32:
*(ub32 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
auto write16 = [&](u64 val) {
check(val, 0, 1 << 16);
*(ub16 *)loc = val;
};
auto write16s = [&](u64 val) {
check(val, -(1 << 15), 1 << 15);
*(ub16 *)loc = val;
};
auto write8 = [&](u64 val) {
check(val, 0, 1 << 8);
*loc = val;
};
auto write8s = [&](u64 val) {
check(val, -(1 << 7), 1 << 7);
*loc = val;
};
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_68K_32:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_68K_16:
write16(S + A);
break;
case R_68K_8:
write8(S + A);
break;
case R_68K_PC32:
case R_68K_PLT32:
*(ub32 *)loc = S + A - P;
break;
case R_68K_PC16:
case R_68K_PLT16:
write16s(S + A - P);
break;
case R_68K_PC8:
case R_68K_PLT8:
write8s(S + A - P);
break;
case R_68K_GOTPCREL32:
*(ub32 *)loc = GOT + A - P;
break;
case R_68K_GOTPCREL16:
write16s(GOT + A - P);
break;
case R_68K_GOTPCREL8:
write8s(GOT + A - P);
break;
case R_68K_GOTOFF32:
*(ub32 *)loc = G + A;
break;
case R_68K_GOTOFF16:
write16(G + A);
break;
case R_68K_GOTOFF8:
write8(G + A);
break;
case R_68K_TLS_GD32:
*(ub32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
break;
case R_68K_TLS_GD16:
write16(sym.get_tlsgd_addr(ctx) + A - GOT);
break;
case R_68K_TLS_GD8:
write8(sym.get_tlsgd_addr(ctx) + A - GOT);
break;
case R_68K_TLS_LDM32:
*(ub32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
break;
case R_68K_TLS_LDM16:
write16(ctx.got->get_tlsld_addr(ctx) + A - GOT);
break;
case R_68K_TLS_LDM8:
write8(ctx.got->get_tlsld_addr(ctx) + A - GOT);
break;
case R_68K_TLS_LDO32:
*(ub32 *)loc = S + A - ctx.dtp_addr;
break;
case R_68K_TLS_LDO16:
write16s(S + A - ctx.dtp_addr);
break;
case R_68K_TLS_LDO8:
write8s(S + A - ctx.dtp_addr);
break;
case R_68K_TLS_IE32:
*(ub32 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
break;
case R_68K_TLS_IE16:
write16(sym.get_gottp_addr(ctx) + A - GOT);
break;
case R_68K_TLS_IE8:
write8(sym.get_gottp_addr(ctx) + A - GOT);
break;
case R_68K_TLS_LE32:
*(ub32 *)loc = S + A - ctx.tp_addr;
break;
case R_68K_TLS_LE16:
write16(S + A - ctx.tp_addr);
break;
case R_68K_TLS_LE8:
write8(S + A - ctx.tp_addr);
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_68K_32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ub32 *)loc = *val;
else
*(ub32 *)loc = S + A;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
Error(ctx) << sym << ": GNU ifunc symbol is not supported on m68k";
switch (rel.r_type) {
case R_68K_32:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_68K_16:
case R_68K_8:
scan_absrel(ctx, sym, rel);
break;
case R_68K_PC32:
case R_68K_PC16:
case R_68K_PC8:
scan_pcrel(ctx, sym, rel);
break;
case R_68K_GOTPCREL32:
case R_68K_GOTPCREL16:
case R_68K_GOTPCREL8:
case R_68K_GOTOFF32:
case R_68K_GOTOFF16:
case R_68K_GOTOFF8:
sym.flags |= NEEDS_GOT;
break;
case R_68K_PLT32:
case R_68K_PLT16:
case R_68K_PLT8:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_68K_TLS_GD32:
case R_68K_TLS_GD16:
case R_68K_TLS_GD8:
sym.flags |= NEEDS_TLSGD;
break;
case R_68K_TLS_LDM32:
case R_68K_TLS_LDM16:
case R_68K_TLS_LDM8:
ctx.needs_tlsld = true;
break;
case R_68K_TLS_IE32:
case R_68K_TLS_IE16:
case R_68K_TLS_IE8:
sym.flags |= NEEDS_GOTTP;
break;
case R_68K_TLS_LE32:
case R_68K_TLS_LE16:
case R_68K_TLS_LE8:
check_tlsle(ctx, sym, rel);
break;
case R_68K_TLS_LDO32:
case R_68K_TLS_LDO16:
case R_68K_TLS_LDO8:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
} // namespace mold::elf

View file

@@ -1,452 +0,0 @@
// clang-format off
// This file implements the PowerPC 32-bit ISA. For 64-bit PowerPC, see
// arch-ppc64v1.cpp and arch-ppc64v2.cpp.
//
// PPC32 is a RISC ISA. It has 32 general-purpose registers (GPRs).
// r0, r11 and r12 are reserved for static linkers, so we can use these
// registers in PLTs and range extension thunks. In addition to that, it
// has a few special registers. Notable ones are LR which holds a return
// address and CTR which we can use to store a branch target address.
//
// The PPC32 psABI may feel unnecessarily complicated at first glance,
// but that complexity mainly stems from the fact that the ISA lacks
// PC-relative load/store instructions. Since machine instructions cannot
// load data relative to their own address, it is not straightforward to
// support position-independent code (PIC) on PPC32.
//
// A position-independent function typically contains the following code
// in the prologue to obtain its own address:
//
// mflr r0 // save the current return address to %r0
// bcl 20, 31, 4 // call the next instruction as if it were a function
//    mflr    r12            // save the return address to %r12
// mtlr r0 // restore the original return address
//
// An object file compiled with -fPIC contains a data section named
// `.got2` to store addresses of locally-defined global variables and
// constants. A PIC function usually computes the address of its own
// .got2+0x8000 and sets it in %r30. This scheme allows the function to
// access global objects
// defined in the same input file with a single %r30-relative load/store
// instruction with a 16-bit offset, given that .got2 is smaller than
// 0x10000 (or 65536) bytes.
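//
// For example, loading such a variable boils down to a single
// %r30-relative instruction of the following shape, where OFFSET is a
// link-time constant in [-0x8000, 0x8000) (an illustrative sketch, not
// actual compiler output):
//
//   lwz r3, OFFSET(r30)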
//
// Since each object file has its own .got2, %r30 refers to different
// places in a merged .got2 for two functions that came from different
// input files. Therefore, %r30 makes sense only within a single function.
//
// Technically, we can reuse a %r30 value in our PLT if we create a PLT
// _for each input file_ (that's what GNU ld seems to be doing), but that
// doesn't seem to be worth the complexity. Our PLT simply doesn't rely
// on a %r30 value.
//
// https://github.com/rui314/psabi/blob/main/ppc32.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = PPC32;
static u64 lo(u64 x) { return x & 0xffff; }
static u64 hi(u64 x) { return x >> 16; }
static u64 ha(u64 x) { return (x + 0x8000) >> 16; }
static u64 high(u64 x) { return (x >> 16) & 0xffff; }
static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; }
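// For example (a quick check with an arbitrary value): materializing
// 0x1234'9abc with an addis/addi pair needs ha()/lo() rather than
// hi()/lo(), because addi sign-extends its 16-bit operand:
//
//   lo(0x1234'9abc) = 0x9abc, which sign-extends to -0x6544
//   hi(0x1234'9abc) = 0x1234; 0x1234'0000 - 0x6544 = 0x1233'9abc (wrong)
//   ha(0x1234'9abc) = 0x1235; 0x1235'0000 - 0x6544 = 0x1234'9abc (right)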
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static const ub32 insn[] = {
// Get the address of this PLT section
0x7c08'02a6, // mflr r0
0x429f'0005, // bcl 20, 31, 4
0x7d88'02a6, // 1: mflr r12
0x7c08'03a6, // mtlr r0
// Compute the runtime address of GOTPLT+12
0x3d8c'0000, // addis r12, r12, (GOTPLT - 1b)@higha
0x398c'0000, // addi r12, r12, (GOTPLT - 1b)@lo
// Compute the PLT entry offset
0x7d6c'5850, // sub r11, r11, r12
0x1d6b'0003, // mulli r11, r11, 3
// Load GOTPLT[2] and branch to GOTPLT[1]
0x800c'fff8, // lwz r0, -8(r12)
0x7c09'03a6, // mtctr r0
0x818c'fffc, // lwz r12, -4(r12)
0x4e80'0420, // bctr
0x6000'0000, // nop
0x6000'0000, // nop
0x6000'0000, // nop
0x6000'0000, // nop
};
static_assert(sizeof(insn) == E::plt_hdr_size);
memcpy(buf, insn, sizeof(insn));
ub32 *loc = (ub32 *)buf;
loc[4] |= higha(ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr + 4);
loc[5] |= lo(ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr + 4);
}
static const ub32 plt_entry[] = {
// Get the address of this PLT entry
0x7c08'02a6, // mflr r0
0x429f'0005, // bcl 20, 31, 4
0x7d88'02a6, // mflr r12
0x7c08'03a6, // mtlr r0
// Load an address from the GOT/GOTPLT entry and jump to that address
0x3d6c'0000, // addis r11, r12, OFFSET@higha
0x396b'0000, // addi r11, r11, OFFSET@lo
0x818b'0000, // lwz r12, 0(r11)
0x7d89'03a6, // mtctr r12
0x4e80'0420, // bctr
};
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static_assert(E::plt_size == sizeof(plt_entry));
memcpy(buf, plt_entry, sizeof(plt_entry));
ub32 *loc = (ub32 *)buf;
i64 offset = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 8;
loc[4] |= higha(offset);
loc[5] |= lo(offset);
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static_assert(E::pltgot_size == sizeof(plt_entry));
memcpy(buf, plt_entry, sizeof(plt_entry));
ub32 *loc = (ub32 *)buf;
i64 offset = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 8;
loc[4] |= higha(offset);
loc[5] |= lo(offset);
}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_PPC_ADDR32:
*(ub32 *)loc = val;
break;
case R_PPC_REL32:
*(ub32 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
u64 GOT2 = file.ppc32_got2 ? file.ppc32_got2->get_addr() : 0;
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_PPC_ADDR32:
case R_PPC_UADDR32:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_PPC_ADDR14:
*(ub32 *)loc |= bits(S + A, 15, 2) << 2;
break;
case R_PPC_ADDR16:
case R_PPC_UADDR16:
case R_PPC_ADDR16_LO:
*(ub16 *)loc = lo(S + A);
break;
case R_PPC_ADDR16_HI:
*(ub16 *)loc = hi(S + A);
break;
case R_PPC_ADDR16_HA:
*(ub16 *)loc = ha(S + A);
break;
case R_PPC_ADDR24:
*(ub32 *)loc |= bits(S + A, 25, 2) << 2;
break;
case R_PPC_ADDR30:
*(ub32 *)loc |= bits(S + A, 31, 2) << 2;
break;
case R_PPC_PLT16_LO:
*(ub16 *)loc = lo(G + GOT - A - GOT2);
break;
case R_PPC_PLT16_HI:
*(ub16 *)loc = hi(G + GOT - A - GOT2);
break;
case R_PPC_PLT16_HA:
*(ub16 *)loc = ha(G + GOT - A - GOT2);
break;
case R_PPC_PLT32:
*(ub32 *)loc = G + GOT - A - GOT2;
break;
case R_PPC_REL14:
*(ub32 *)loc |= bits(S + A - P, 15, 2) << 2;
break;
case R_PPC_REL16:
case R_PPC_REL16_LO:
*(ub16 *)loc = lo(S + A - P);
break;
case R_PPC_REL16_HI:
*(ub16 *)loc = hi(S + A - P);
break;
case R_PPC_REL16_HA:
*(ub16 *)loc = ha(S + A - P);
break;
case R_PPC_REL24:
case R_PPC_LOCAL24PC: {
i64 val = S + A - P;
if (sign_extend(val, 25) != val)
val = get_thunk_addr(i) - P;
*(ub32 *)loc |= bits(val, 25, 2) << 2;
break;
}
case R_PPC_PLTREL24: {
i64 val = S - P;
if (sym.has_plt(ctx) || sign_extend(val, 25) != val)
val = get_thunk_addr(i) - P;
*(ub32 *)loc |= bits(val, 25, 2) << 2;
break;
}
case R_PPC_REL32:
case R_PPC_PLTREL32:
*(ub32 *)loc = S + A - P;
break;
case R_PPC_GOT16:
case R_PPC_GOT16_LO:
*(ub16 *)loc = lo(G + A);
break;
case R_PPC_GOT16_HI:
*(ub16 *)loc = hi(G + A);
break;
case R_PPC_GOT16_HA:
*(ub16 *)loc = ha(G + A);
break;
case R_PPC_TPREL16_LO:
*(ub16 *)loc = lo(S + A - ctx.tp_addr);
break;
case R_PPC_TPREL16_HI:
*(ub16 *)loc = hi(S + A - ctx.tp_addr);
break;
case R_PPC_TPREL16_HA:
*(ub16 *)loc = ha(S + A - ctx.tp_addr);
break;
case R_PPC_DTPREL16_LO:
*(ub16 *)loc = lo(S + A - ctx.dtp_addr);
break;
case R_PPC_DTPREL16_HI:
*(ub16 *)loc = hi(S + A - ctx.dtp_addr);
break;
case R_PPC_DTPREL16_HA:
*(ub16 *)loc = ha(S + A - ctx.dtp_addr);
break;
case R_PPC_GOT_TLSGD16:
*(ub16 *)loc = sym.get_tlsgd_addr(ctx) - GOT;
break;
case R_PPC_GOT_TLSLD16:
*(ub16 *)loc = ctx.got->get_tlsld_addr(ctx) - GOT;
break;
case R_PPC_GOT_TPREL16:
*(ub16 *)loc = sym.get_gottp_addr(ctx) - GOT;
break;
case R_PPC_TLS:
case R_PPC_TLSGD:
case R_PPC_TLSLD:
case R_PPC_PLTSEQ:
case R_PPC_PLTCALL:
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_PPC_ADDR32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ub32 *)loc = *val;
else
*(ub32 *)loc = S + A;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_PPC_ADDR32:
case R_PPC_UADDR32:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_PPC_ADDR14:
case R_PPC_ADDR16:
case R_PPC_UADDR16:
case R_PPC_ADDR16_LO:
case R_PPC_ADDR16_HI:
case R_PPC_ADDR16_HA:
case R_PPC_ADDR24:
case R_PPC_ADDR30:
scan_absrel(ctx, sym, rel);
break;
case R_PPC_REL14:
case R_PPC_REL16:
case R_PPC_REL16_LO:
case R_PPC_REL16_HI:
case R_PPC_REL16_HA:
case R_PPC_REL32:
scan_pcrel(ctx, sym, rel);
break;
case R_PPC_GOT16:
case R_PPC_GOT16_LO:
case R_PPC_GOT16_HI:
case R_PPC_GOT16_HA:
case R_PPC_PLT16_LO:
case R_PPC_PLT16_HI:
case R_PPC_PLT16_HA:
case R_PPC_PLT32:
sym.flags |= NEEDS_GOT;
break;
case R_PPC_REL24:
case R_PPC_PLTREL24:
case R_PPC_PLTREL32:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_PPC_GOT_TLSGD16:
sym.flags |= NEEDS_TLSGD;
break;
case R_PPC_GOT_TLSLD16:
ctx.needs_tlsld = true;
break;
case R_PPC_GOT_TPREL16:
sym.flags |= NEEDS_GOTTP;
break;
case R_PPC_TPREL16_LO:
case R_PPC_TPREL16_HI:
case R_PPC_TPREL16_HA:
check_tlsle(ctx, sym, rel);
break;
case R_PPC_LOCAL24PC:
case R_PPC_TLS:
case R_PPC_TLSGD:
case R_PPC_TLSLD:
case R_PPC_DTPREL16_LO:
case R_PPC_DTPREL16_HI:
case R_PPC_DTPREL16_HA:
case R_PPC_PLTSEQ:
case R_PPC_PLTCALL:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
static const ub32 local_thunk[] = {
// Get this thunk's address
0x7c08'02a6, // mflr r0
0x429f'0005, // bcl 20, 31, 4
0x7d88'02a6, // mflr r12
0x7c08'03a6, // mtlr r0
// Materialize the destination's address in %r11 and jump to that address
0x3d6c'0000, // addis r11, r12, OFFSET@higha
0x396b'0000, // addi r11, r11, OFFSET@lo
0x7d69'03a6, // mtctr r11
0x4e80'0420, // bctr
0x6000'0000, // nop
};
static_assert(E::thunk_size == sizeof(plt_entry));
static_assert(E::thunk_size == sizeof(local_thunk));
for (i64 i = 0; i < symbols.size(); i++) {
ub32 *loc = (ub32 *)(buf + i * E::thunk_size);
Symbol<E> &sym = *symbols[i];
if (sym.has_plt(ctx)) {
memcpy(loc, plt_entry, sizeof(plt_entry));
u64 got = sym.has_got(ctx) ? sym.get_got_addr(ctx) : sym.get_gotplt_addr(ctx);
i64 val = got - get_addr(i) - 8;
loc[4] |= higha(val);
loc[5] |= lo(val);
} else {
memcpy(loc, local_thunk, sizeof(local_thunk));
i64 val = sym.get_addr(ctx) - get_addr(i) - 8;
loc[4] |= higha(val);
loc[5] |= lo(val);
}
}
}
} // namespace mold::elf

View file

@@ -1,687 +0,0 @@
// clang-format off
// This file contains code for the 64-bit PowerPC ELFv1 ABI that is
// commonly used for big-endian PPC systems. Modern PPC systems that use
// the processor in the little-endian mode use the ELFv2 ABI instead. For
// ELFv2, see arch-ppc64v2.cc.
//
// Even though they are similar, ELFv1 differs from ELFv2 in more than
// endianness. The most notable difference is that, in ELFv1, a function
// pointer doesn't directly refer to the entry point of a function but
// instead refers to a data structure called a "function descriptor".
//
// The function descriptor is essentially a pair of a function entry point
// address and a value that should be set to %r2 before calling that
// function. There is also a third member for "the environment pointer for
// languages such as Pascal and PL/1" according to the psABI, but it looks
// like no one actually uses it. In total, the function descriptor is 24
// bytes long. Here is why we need it.
//
// PPC generally lacks PC-relative data access instructions. Position-
// independent code sets %r2 to GOT + 0x8000 and accesses global
// variables relative to %r2.
//
// Each ELF file has its own GOT. If a function calls another function in
// the same ELF file, it doesn't have to reset %r2. However, if the
// callee is in another file (e.g. another .so), it has to set a new
// value in %r2 so that
// the register contains the callee's GOT + 0x8000.
//
// In this way, you can't call a function just by knowing the function's
// entry point address. You also need to know a proper %r2 value for the
// function. This is why a function pointer refers to a tuple of an
// address and a %r2 value.
//
// If a function call is made through PLT, PLT takes care of restoring %r2.
// Therefore, the caller has to restore %r2 only for function calls
// through function pointers.
//
// .opd (short for "official procedure descriptors") contains function
// descriptors.
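//
// In C terms, each .opd entry is laid out as follows (a sketch; the
// struct and field names are made up for illustration):
//
//   struct FunctionDescriptor {
//     u64 entry_addr; // address of the function's first instruction
//     u64 toc;        // the value the function expects in %r2
//     u64 env;        // environment pointer; practically unused
//   };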
//
// You can think of OPD this way: even on other targets, a function can
// have a few different addresses for different purposes. It may not only
// have an entry point address but may also have PLT and/or GOT addresses.
// In PPC64 ELFv1, it may have an OPD address in addition to these. The OPD
// address is used for relocations that refer to the address of a function
// as a function pointer.
//
// https://github.com/rui314/psabi/blob/main/ppc64v1.pdf
#include "third_party/mold/elf/mold.h"
#include "third_party/libcxx/algorithm"
// MISSING #include <tbb/parallel_for_each.h>
namespace mold::elf {
using E = PPC64V1;
static u64 lo(u64 x) { return x & 0xffff; }
static u64 hi(u64 x) { return x >> 16; }
static u64 ha(u64 x) { return (x + 0x8000) >> 16; }
static u64 high(u64 x) { return (x >> 16) & 0xffff; }
static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; }
// .plt is used only for lazy symbol resolution on PPC64. All PLT
// calls are made via range extension thunks even if they are within
// reach. Thunks read addresses from .got.plt and jump there.
// Therefore, once PLT symbols are resolved and final addresses are
// written to .got.plt, thunks just skip .plt and directly jump to the
// resolved addresses.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static const ub32 insn[] = {
0x7d88'02a6, // mflr r12
0x429f'0005, // bcl 20, 31, 4 // obtain PC
0x7d68'02a6, // mflr r11
0xe84b'0024, // ld r2,36(r11)
0x7d88'03a6, // mtlr r12
0x7d62'5a14, // add r11,r2,r11
0xe98b'0000, // ld r12,0(r11)
0xe84b'0008, // ld r2,8(r11)
0x7d89'03a6, // mtctr r12
0xe96b'0010, // ld r11,16(r11)
0x4e80'0420, // bctr
// .quad .got.plt - .plt - 8
0x0000'0000,
0x0000'0000,
};
static_assert(sizeof(insn) == E::plt_hdr_size);
memcpy(buf, insn, sizeof(insn));
*(ub64 *)(buf + 44) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 8;
}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
ub32 *loc = (ub32 *)buf;
i64 idx = sym.get_plt_idx(ctx);
// The PPC64 ELFv1 ABI requires PLT entries to vary in size depending
// on their indices. Unlike other targets, .got.plt is filled not by us
// but by the loader, so we don't have control over where the initial
// call to a PLT entry jumps to. We therefore need to strictly follow
// the PLT section layout that the loader expects.
if (idx < 0x8000) {
static const ub32 insn[] = {
0x3800'0000, // li r0, PLT_INDEX
0x4b00'0000, // b plt0
};
memcpy(loc, insn, sizeof(insn));
loc[0] |= idx;
loc[1] |= (ctx.plt->shdr.sh_addr - sym.get_plt_addr(ctx) - 4) & 0x00ff'ffff;
} else {
static const ub32 insn[] = {
0x3c00'0000, // lis r0, PLT_INDEX@high
0x6000'0000, // ori r0, r0, PLT_INDEX@lo
0x4b00'0000, // b plt0
};
memcpy(loc, insn, sizeof(insn));
loc[0] |= high(idx);
loc[1] |= lo(idx);
loc[2] |= (ctx.plt->shdr.sh_addr - sym.get_plt_addr(ctx) - 8) & 0x00ff'ffff;
}
}
// .plt.got is not necessary on PPC64 because range extension thunks
// directly read GOT entries and jump there.
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_PPC64_ADDR64:
*(ub64 *)loc = val;
break;
case R_PPC64_REL32:
*(ub32 *)loc = val - this->shdr.sh_addr - offset;
break;
case R_PPC64_REL64:
*(ub64 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
u64 TOC = ctx.extra.TOC->value;
switch (rel.r_type) {
case R_PPC64_ADDR64:
apply_toc_rel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_PPC64_TOC:
apply_toc_rel(ctx, *ctx.extra.TOC, rel, loc, TOC, A, P, dynrel);
break;
case R_PPC64_TOC16_HA:
*(ub16 *)loc = ha(S + A - TOC);
break;
case R_PPC64_TOC16_LO:
*(ub16 *)loc = lo(S + A - TOC);
break;
case R_PPC64_TOC16_DS:
check(S + A - TOC, -(1 << 15), 1 << 15);
*(ub16 *)loc |= (S + A - TOC) & 0xfffc;
break;
case R_PPC64_TOC16_LO_DS:
*(ub16 *)loc |= (S + A - TOC) & 0xfffc;
break;
case R_PPC64_REL24: {
i64 val = sym.get_addr(ctx, NO_OPD) + A - P;
if (sym.has_plt(ctx) || sign_extend(val, 25) != val)
val = get_thunk_addr(i) + A - P;
check(val, -(1 << 25), 1 << 25);
*(ub32 *)loc |= bits(val, 25, 2) << 2;
// If a callee is an external function, PLT saves %r2 to the
// caller's r2 save slot. We need to restore it after function
// return. To do so, there's usually a NOP as a placeholder
// after a BL. 0x6000'0000 is a NOP.
if (sym.has_plt(ctx) && *(ub32 *)(loc + 4) == 0x6000'0000)
*(ub32 *)(loc + 4) = 0xe841'0028; // ld r2, 40(r1)
break;
}
case R_PPC64_REL32:
*(ub32 *)loc = S + A - P;
break;
case R_PPC64_REL64:
*(ub64 *)loc = S + A - P;
break;
case R_PPC64_REL16_HA:
*(ub16 *)loc = ha(S + A - P);
break;
case R_PPC64_REL16_LO:
*(ub16 *)loc = lo(S + A - P);
break;
case R_PPC64_PLT16_HA:
*(ub16 *)loc = ha(G + GOT - TOC);
break;
case R_PPC64_PLT16_HI:
*(ub16 *)loc = hi(G + GOT - TOC);
break;
case R_PPC64_PLT16_LO:
*(ub16 *)loc = lo(G + GOT - TOC);
break;
case R_PPC64_PLT16_LO_DS:
*(ub16 *)loc |= (G + GOT - TOC) & 0xfffc;
break;
case R_PPC64_GOT_TPREL16_HA:
*(ub16 *)loc = ha(sym.get_gottp_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSGD16_HA:
*(ub16 *)loc = ha(sym.get_tlsgd_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSGD16_LO:
*(ub16 *)loc = lo(sym.get_tlsgd_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSLD16_HA:
*(ub16 *)loc = ha(ctx.got->get_tlsld_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSLD16_LO:
*(ub16 *)loc = lo(ctx.got->get_tlsld_addr(ctx) - TOC);
break;
case R_PPC64_DTPREL16_HA:
*(ub16 *)loc = ha(S + A - ctx.dtp_addr);
break;
case R_PPC64_DTPREL16_LO:
*(ub16 *)loc = lo(S + A - ctx.dtp_addr);
break;
case R_PPC64_TPREL16_HA:
*(ub16 *)loc = ha(S + A - ctx.tp_addr);
break;
case R_PPC64_TPREL16_LO:
*(ub16 *)loc = lo(S + A - ctx.tp_addr);
break;
case R_PPC64_GOT_TPREL16_LO_DS:
*(ub16 *)loc |= (sym.get_gottp_addr(ctx) - TOC) & 0xfffc;
break;
case R_PPC64_PLTSEQ:
case R_PPC64_PLTCALL:
case R_PPC64_TLS:
case R_PPC64_TLSGD:
case R_PPC64_TLSLD:
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_PPC64_ADDR64:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ub64 *)loc = *val;
else
*(ub64 *)loc = S + A;
break;
case R_PPC64_ADDR32: {
i64 val = S + A;
check(val, 0, 1LL << 32);
*(ub32 *)loc = val;
break;
}
case R_PPC64_DTPREL64:
*(ub64 *)loc = S + A - ctx.dtp_addr;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT | NEEDS_PPC_OPD;
// Any relocation except R_PPC64_REL24 is considered an
// address-taking relocation.
if (rel.r_type != R_PPC64_REL24 && sym.get_type() == STT_FUNC)
sym.flags |= NEEDS_PPC_OPD;
switch (rel.r_type) {
case R_PPC64_ADDR64:
case R_PPC64_TOC:
scan_toc_rel(ctx, sym, rel);
break;
case R_PPC64_GOT_TPREL16_HA:
sym.flags |= NEEDS_GOTTP;
break;
case R_PPC64_REL24:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_PPC64_PLT16_HA:
sym.flags |= NEEDS_GOT;
break;
case R_PPC64_GOT_TLSGD16_HA:
sym.flags |= NEEDS_TLSGD;
break;
case R_PPC64_GOT_TLSLD16_HA:
ctx.needs_tlsld = true;
break;
case R_PPC64_TPREL16_HA:
case R_PPC64_TPREL16_LO:
check_tlsle(ctx, sym, rel);
break;
case R_PPC64_REL32:
case R_PPC64_REL64:
case R_PPC64_TOC16_HA:
case R_PPC64_TOC16_LO:
case R_PPC64_TOC16_LO_DS:
case R_PPC64_TOC16_DS:
case R_PPC64_REL16_HA:
case R_PPC64_REL16_LO:
case R_PPC64_PLT16_HI:
case R_PPC64_PLT16_LO:
case R_PPC64_PLT16_LO_DS:
case R_PPC64_PLTSEQ:
case R_PPC64_PLTCALL:
case R_PPC64_GOT_TPREL16_LO_DS:
case R_PPC64_GOT_TLSGD16_LO:
case R_PPC64_GOT_TLSLD16_LO:
case R_PPC64_TLS:
case R_PPC64_TLSGD:
case R_PPC64_TLSLD:
case R_PPC64_DTPREL16_HA:
case R_PPC64_DTPREL16_LO:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
// If the destination is .plt.got, we save the current r2, read an
// address of a function descriptor from .got, restore %r2 and jump
// to the function.
static const ub32 pltgot_thunk[] = {
// Store the caller's %r2
0xf841'0028, // std %r2, 40(%r1)
// Load an address of a function descriptor
0x3d82'0000, // addis %r12, %r2, foo@got@toc@ha
0xe98c'0000, // ld %r12, foo@got@toc@lo(%r12)
// Restore the callee's %r2
0xe84c'0008, // ld %r2, 8(%r12)
// Jump to the function
0xe98c'0000, // ld %r12, 0(%r12)
0x7d89'03a6, // mtctr %r12
0x4e80'0420, // bctr
};
// If the destination is .plt, read a function descriptor from .got.plt.
static const ub32 plt_thunk[] = {
// Store the caller's %r2
0xf841'0028, // std %r2, 40(%r1)
// Materialize an address of a function descriptor
0x3d82'0000, // addis %r12, %r2, foo@gotplt@toc@ha
0x398c'0000, // addi %r12, %r12, foo@gotplt@toc@lo
// Restore the callee's %r2
0xe84c'0008, // ld %r2, 8(%r12)
// Jump to the function
0xe98c'0000, // ld %r12, 0(%r12)
0x7d89'03a6, // mtctr %r12
0x4e80'0420, // bctr
};
// If the destination is a non-imported function, we directly jump
// to the function entry address.
static const ub32 local_thunk[] = {
0x3d82'0000, // addis r12, r2, foo@toc@ha
0x398c'0000, // addi r12, r12, foo@toc@lo
0x7d89'03a6, // mtctr r12
0x4e80'0420, // bctr
0x6000'0000, // nop
0x6000'0000, // nop
0x6000'0000, // nop
};
static_assert(E::thunk_size == sizeof(pltgot_thunk));
static_assert(E::thunk_size == sizeof(plt_thunk));
static_assert(E::thunk_size == sizeof(local_thunk));
for (i64 i = 0; i < symbols.size(); i++) {
Symbol<E> &sym = *symbols[i];
ub32 *loc = (ub32 *)(buf + i * E::thunk_size);
if (sym.has_got(ctx)) {
memcpy(loc, pltgot_thunk, sizeof(pltgot_thunk));
i64 val = sym.get_got_addr(ctx) - ctx.extra.TOC->value;
loc[1] |= higha(val);
loc[2] |= lo(val);
} else if(sym.has_plt(ctx)) {
memcpy(loc, plt_thunk, sizeof(plt_thunk));
i64 val = sym.get_gotplt_addr(ctx) - ctx.extra.TOC->value;
loc[1] |= higha(val);
loc[2] |= lo(val);
} else {
memcpy(loc, local_thunk, sizeof(local_thunk));
i64 val = sym.get_addr(ctx, NO_OPD) - ctx.extra.TOC->value;
loc[0] |= higha(val);
loc[1] |= lo(val);
}
}
}
static InputSection<E> *get_opd_section(ObjectFile<E> &file) {
for (std::unique_ptr<InputSection<E>> &isec : file.sections)
if (isec && isec->name() == ".opd")
return isec.get();
return nullptr;
}
static ElfRel<E> *
get_relocation_at(Context<E> &ctx, InputSection<E> &isec, i64 offset) {
std::span<ElfRel<E>> rels = isec.get_rels(ctx);
auto it = std::lower_bound(rels.begin(), rels.end(), offset,
[](const ElfRel<E> &r, i64 offset) {
return r.r_offset < offset;
});
if (it == rels.end())
return nullptr;
if (it->r_offset != offset)
return nullptr;
return &*it;
}
struct OpdSymbol {
bool operator<(const OpdSymbol &x) const { return r_offset < x.r_offset; }
u64 r_offset = 0;
Symbol<E> *sym = nullptr;
};
static Symbol<E> *
get_opd_sym_at(Context<E> &ctx, std::span<OpdSymbol> syms, u64 offset) {
auto it = std::lower_bound(syms.begin(), syms.end(), OpdSymbol{offset});
if (it == syms.end())
return nullptr;
if (it->r_offset != offset)
return nullptr;
return it->sym;
}
// The compiler creates an .opd entry for each function symbol. The intention
// is to make it possible to create an output .opd section just by linking
// input .opd sections in the same manner as we do to other normal input
// sections.
//
// However, in reality, .opd isn't a normal input section. It needs many
// special treatments as follows:
//
// 1. A function symbol refers not to .text but to .opd. Its address
// works fine for address-taking relocations such as R_PPC64_ADDR64.
// However, R_PPC64_REL24 (which is used for branch instruction) needs
// a function's real address instead of the function's .opd address.
// We need to read .opd contents to find out a function entry point
// address to apply R_PPC64_REL24.
//
// 2. Output .opd entries are needed only for functions whose addresses
// are taken. Just copying input .opd sections to an output would
//    produce lots of dead .opd entries.
//
// 3. In this design, all function symbols refer to an .opd section, and
// that doesn't work well with graph traversal optimizations such as
//    garbage collection or identical comdat folding. For example, the
//    garbage collector would mark .opd alive, which in turn marks all
//    functions referenced by .opd as alive, effectively keeping all
//    functions alive.
//
// The problem is that the compiler creates a half-baked .opd section, and
// the linker has to figure out what all these .opd entries and
// relocations are trying to achieve. It's as if the compiler emitted a
// half-baked .plt section in an object file and the linker had to deal
// with it. That's not a good design.
//
// So, in this function, we undo what the compiler did to .opd. We remove
// function symbols from .opd and reattach them to their function entry
// points. We also rewrite relocations that directly refer to an input
// .opd section so that they refer to function symbols instead. We then
// mark input .opd sections as dead.
//
// After this function, we mark symbols with the NEEDS_PPC_OPD flag if the
// symbol needs an .opd entry. We then create an output .opd just like we
// do for .plt or .got.
void ppc64v1_rewrite_opd(Context<E> &ctx) {
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
InputSection<E> *opd = get_opd_section(*file);
if (!opd)
return;
opd->is_alive = false;
// Move symbols from .opd to .text.
std::vector<OpdSymbol> opd_syms;
for (Symbol<E> *sym : file->symbols) {
if (sym->file != file || sym->get_input_section() != opd)
continue;
if (u32 ty = sym->get_type(); ty != STT_FUNC && ty != STT_GNU_IFUNC)
continue;
ElfRel<E> *rel = get_relocation_at(ctx, *opd, sym->value);
if (!rel)
Fatal(ctx) << *file << ": cannot find a relocation in .opd for "
<< *sym << " at offset 0x" << std::hex << (u64)sym->value;
Symbol<E> *sym2 = file->symbols[rel->r_sym];
if (sym2->get_type() != STT_SECTION)
Fatal(ctx) << *file << ": bad relocation in .opd referring " << *sym2;
opd_syms.push_back({sym->value, sym});
sym->set_input_section(sym2->get_input_section());
sym->value = rel->r_addend;
}
// Sort symbols so that get_opd_sym_at() can do binary search.
sort(opd_syms);
// Rewrite relocations that refer to .opd so that they directly refer
// to function symbols.
for (std::unique_ptr<InputSection<E>> &isec : file->sections) {
if (!isec || !isec->is_alive || isec.get() == opd)
continue;
for (ElfRel<E> &r : isec->get_rels(ctx)) {
Symbol<E> &sym = *file->symbols[r.r_sym];
if (sym.get_input_section() != opd)
continue;
Symbol<E> *real_sym = get_opd_sym_at(ctx, opd_syms, r.r_addend);
if (!real_sym)
Fatal(ctx) << *isec << ": cannot find a symbol in .opd for " << r
<< " at offset 0x" << std::hex << (u64)r.r_addend;
r.r_sym = real_sym->sym_idx;
r.r_addend = 0;
}
}
});
}
// When a function is exported, the dynamic symbol for the function should
// refer to the function's .opd entry. This function marks such symbols
// with NEEDS_PPC_OPD.
void ppc64v1_scan_symbols(Context<E> &ctx) {
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (Symbol<E> *sym : file->symbols)
if (sym->file == file && sym->is_exported)
if (u32 ty = sym->get_type(); ty == STT_FUNC || ty == STT_GNU_IFUNC)
sym->flags |= NEEDS_PPC_OPD;
});
// Functions referenced by the ELF header also have to have .opd entries.
auto mark = [&](std::string_view name) {
if (!name.empty())
if (Symbol<E> &sym = *get_symbol(ctx, name); !sym.is_imported)
sym.flags |= NEEDS_PPC_OPD;
};
mark(ctx.arg.entry);
mark(ctx.arg.init);
mark(ctx.arg.fini);
}
void PPC64OpdSection::add_symbol(Context<E> &ctx, Symbol<E> *sym) {
sym->set_opd_idx(ctx, symbols.size());
symbols.push_back(sym);
this->shdr.sh_size += ENTRY_SIZE;
}
i64 PPC64OpdSection::get_reldyn_size(Context<E> &ctx) const {
if (ctx.arg.pic)
return symbols.size() * 2;
return 0;
}
void PPC64OpdSection::copy_buf(Context<E> &ctx) {
ub64 *buf = (ub64 *)(ctx.buf + this->shdr.sh_offset);
ElfRel<E> *rel = nullptr;
if (ctx.arg.pic)
rel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset + reldyn_offset);
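// Each .opd entry consists of three 64-bit words: the function's entry
// point, the TOC base, and an (unused) environment pointer.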
for (Symbol<E> *sym : symbols) {
u64 addr = sym->get_addr(ctx, NO_PLT | NO_OPD);
*buf++ = addr;
*buf++ = ctx.extra.TOC->value;
*buf++ = 0;
if (ctx.arg.pic) {
u64 loc = sym->get_opd_addr(ctx);
*rel++ = ElfRel<E>(loc, E::R_RELATIVE, 0, addr);
*rel++ = ElfRel<E>(loc + 8, E::R_RELATIVE, 0, ctx.extra.TOC->value);
}
}
}
} // namespace mold::elf

View file

@@ -1,555 +0,0 @@
// clang-format off
// This file implements the PowerPC ELFv2 ABI which was standardized in
// 2014. Modern little-endian PowerPC systems are based on this ABI.
// The ABI is often referred to as "ppc64le". This shouldn't be confused
// with "ppc64" which refers to the original, big-endian PowerPC systems.
//
// PPC64 is a bit tricky to support because PC-relative load/store
// instructions hadn't been available until Power10, which debuted in 2021.
// Prior to Power10, it wasn't trivial for position-independent code (PIC)
// to load a value from, for example, .got, as we can't do that with [PC +
// the offset to the .got entry].
//
// In the following, I'll explain how PIC is supported on pre-Power10
// systems first and then explain what has changed with Power10.
//
//
// Position-independent code on Power9 or earlier:
//
// We can get the program counter on older PPC64 systems with the
// following four instructions
//
// mflr r1 // save the current link register to r1
// bl .+4 // branch to the next instruction as if it were a function
// mflr r0 // copy the return address to r0
// mtlr r1 // restore the original link register value
//
// , but it's too expensive to do this for each load/store.
//
// As a workaround, most functions are compiled in such a way that r2 is
// assumed to always contain the address of .got + 0x8000. With this, we
// can for example load the first entry of .got with a single instruction
// `ld r0, -0x8000(r2)`. r2 is called the TOC pointer.
//
// There's only one .got for each ELF module. Therefore, if a callee is in
// the same ELF module, r2 doesn't have to be recomputed. Most function
// calls are usually within the same ELF module, so this mechanism is
// efficient.
//
// A function compiled for pre-Power10 usually has two entry points,
// global and local. The global entry point usually precedes the local
// entry point by 8 bytes. In between are the following instructions:
//
// addis r2, r12, .TOC.@ha
//   addi  r2, r2, .TOC.@lo + 4
//
// The global entry point assumes that its own address is in r12,
// and it computes its own TOC pointer from r12. It's easy to do so for
// the callee because the offset between its .got + 0x8000 and the
// function is known at link-time. The above code sequence then falls
// through to the local entry point that assumes r2 is .got + 0x8000.
//
// So, if a callee's TOC pointer is different from the current one
// (e.g. calling a function in another .so), we first load the callee's
// address to r12 (e.g. from .got.plt with a r2-relative load) and branch
// to that address. Then the callee computes its own TOC pointer using
// r12.
//
//
// Position-independent code on Power10:
//
// Power10 added 8-bytes-long instructions to the ISA. Some of them are
// PC-relative load/store instructions that take 34 bits offsets.
// Functions compiled with `-mcpu=power10` use these instructions for PIC.
// r2 does not have a special meaning in such functions.
//
// When a function compiled for Power10 calls a function that uses the
// TOC pointer, we need to compute a correct TOC value and set it in r2
// before transferring the control to the callee. Thunks are responsible
// for doing it.
//
// `_NOTOC` relocations such as `R_PPC64_REL24_NOTOC` indicate that the
// callee does not use TOC (i.e. compiled with `-mcpu=power10`). If a
// function using TOC is referenced via a `_NOTOC` relocation, that call
// is made through a range extension thunk.
//
//
// Note on section names: the PPC64 psABI uses a weird naming convention
// which calls .got.plt .plt. We ignored that part because it's just
// confusing. Since the runtime only cares about segments, we should be
// able to name sections whatever we want.
//
// https://github.com/rui314/psabi/blob/main/ppc64v2.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = PPC64V2;
static u64 lo(u64 x) { return x & 0xffff; }
static u64 hi(u64 x) { return x >> 16; }
static u64 ha(u64 x) { return (x + 0x8000) >> 16; }
static u64 high(u64 x) { return (x >> 16) & 0xffff; }
static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; }
static u64 prefix34(u64 x) {
return bits(x, 33, 16) | (bits(x, 15, 0) << 32);
}
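// prefix34() scatters a 34-bit displacement into the immediate fields of
// an 8-byte Power10 prefixed instruction: the upper 18 bits go into the
// prefix word and the lower 16 bits into the suffix word. Since both
// words are written with a single little-endian 64-bit store, the
// suffix's bits land in the upper half of the value. For example,
// prefix34(0x2'1234'5678) == 0x5678'0002'1234.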
// .plt is used only for lazy symbol resolution on PPC64. All PLT
// calls are made via range extension thunks even if they are within
// reach. Thunks read addresses from .got.plt and jump there.
// Therefore, once PLT symbols are resolved and final addresses are
// written to .got.plt, thunks just skip .plt and directly jump to the
// resolved addresses.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static const ul32 insn[] = {
// Get PC
0x7c08'02a6, // mflr r0
0x429f'0005, // bcl 20, 31, 4 // obtain PC
0x7d68'02a6, // mflr r11
0x7c08'03a6, // mtlr r0
// Compute the PLT entry index
0xe80b'002c, // ld r0, 44(r11)
0x7d8b'6050, // subf r12, r11, r12
0x7d60'5a14, // add r11, r0, r11
0x380c'ffcc, // addi r0, r12, -52
0x7800'f082, // rldicl r0, r0, 62, 2
// Load .got.plt[0] and .got.plt[1] and branch to .got.plt[0]
0xe98b'0000, // ld r12, 0(r11)
0x7d89'03a6, // mtctr r12
0xe96b'0008, // ld r11, 8(r11)
0x4e80'0420, // bctr
// .quad .got.plt - .plt - 8
0x0000'0000,
0x0000'0000,
};
memcpy(buf, insn, sizeof(insn));
*(ul64 *)(buf + 52) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 8;
}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
// When the control is transferred to a PLT entry, the PLT entry's
// address is already set to %r12 by the caller.
i64 offset = ctx.plt->shdr.sh_addr - sym.get_plt_addr(ctx);
*(ul32 *)buf = 0x4b00'0000 | (offset & 0x00ff'ffff); // b plt0
}
// .plt.got is not necessary on PPC64 because range extension thunks
// directly read GOT entries and jump there.
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_PPC64_ADDR64:
*(ul64 *)loc = val;
break;
case R_PPC64_REL32:
*(ul32 *)loc = val - this->shdr.sh_addr - offset;
break;
case R_PPC64_REL64:
*(ul64 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
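// A symbol's ppc_local_entry field (part of st_other) encodes the local
// entry offset as a power of two: 0 and 1 mean the local entry point
// coincides with the global one, 2 means +4 bytes, 3 means +8 bytes (the
// common two-instruction prologue), and so on up to 6 meaning +64.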
static u64 get_local_entry_offset(Context<E> &ctx, Symbol<E> &sym) {
i64 val = sym.esym().ppc_local_entry;
assert(val <= 7);
if (val == 7)
Fatal(ctx) << sym << ": local entry offset 7 is reserved";
if (val == 0 || val == 1)
return 0;
return 1 << val;
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
u64 TOC = ctx.extra.TOC->value;
auto r2save_thunk_addr = [&] { return get_thunk_addr(i); };
auto no_r2save_thunk_addr = [&] { return get_thunk_addr(i) + 4; };
switch (rel.r_type) {
case R_PPC64_ADDR64:
if (name() == ".toc")
apply_toc_rel(ctx, sym, rel, loc, S, A, P, dynrel);
else
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_PPC64_TOC16_HA:
*(ul16 *)loc = ha(S + A - TOC);
break;
case R_PPC64_TOC16_LO:
*(ul16 *)loc = lo(S + A - TOC);
break;
case R_PPC64_TOC16_DS:
case R_PPC64_TOC16_LO_DS:
*(ul16 *)loc |= (S + A - TOC) & 0xfffc;
break;
case R_PPC64_REL24:
if (sym.has_plt(ctx) || !sym.esym().preserves_r2()) {
i64 val = r2save_thunk_addr() + A - P;
*(ul32 *)loc |= bits(val, 25, 2) << 2;
// The thunk saves %r2 to the caller's r2 save slot. We need to
// restore it after function return. To do so, there's usually a
// NOP as a placeholder after a BL. 0x6000'0000 is a NOP.
if (*(ul32 *)(loc + 4) == 0x6000'0000)
*(ul32 *)(loc + 4) = 0xe841'0018; // ld r2, 24(r1)
} else {
i64 val = S + get_local_entry_offset(ctx, sym) + A - P;
if (sign_extend(val, 25) != val)
val = no_r2save_thunk_addr() + A - P;
*(ul32 *)loc |= bits(val, 25, 2) << 2;
}
break;
case R_PPC64_REL24_NOTOC:
if (sym.has_plt(ctx) || sym.esym().uses_toc()) {
i64 val = no_r2save_thunk_addr() + A - P;
*(ul32 *)loc |= bits(val, 25, 2) << 2;
} else {
i64 val = S + A - P;
if (sign_extend(val, 25) != val)
val = no_r2save_thunk_addr() + A - P;
*(ul32 *)loc |= bits(val, 25, 2) << 2;
}
break;
case R_PPC64_REL32:
*(ul32 *)loc = S + A - P;
break;
case R_PPC64_REL64:
*(ul64 *)loc = S + A - P;
break;
case R_PPC64_REL16_HA:
*(ul16 *)loc = ha(S + A - P);
break;
case R_PPC64_REL16_LO:
*(ul16 *)loc = lo(S + A - P);
break;
case R_PPC64_PLT16_HA:
*(ul16 *)loc = ha(G + GOT - TOC);
break;
case R_PPC64_PLT16_HI:
*(ul16 *)loc = hi(G + GOT - TOC);
break;
case R_PPC64_PLT16_LO:
*(ul16 *)loc = lo(G + GOT - TOC);
break;
case R_PPC64_PLT16_LO_DS:
*(ul16 *)loc |= (G + GOT - TOC) & 0xfffc;
break;
case R_PPC64_PLT_PCREL34:
case R_PPC64_PLT_PCREL34_NOTOC:
case R_PPC64_GOT_PCREL34:
*(ul64 *)loc |= prefix34(G + GOT - P);
break;
case R_PPC64_PCREL34:
*(ul64 *)loc |= prefix34(S + A - P);
break;
case R_PPC64_GOT_TPREL16_HA:
*(ul16 *)loc = ha(sym.get_gottp_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TPREL16_LO_DS:
*(ul16 *)loc |= (sym.get_gottp_addr(ctx) - TOC) & 0xfffc;
break;
case R_PPC64_GOT_TPREL_PCREL34:
*(ul64 *)loc |= prefix34(sym.get_gottp_addr(ctx) - P);
break;
case R_PPC64_GOT_TLSGD16_HA:
*(ul16 *)loc = ha(sym.get_tlsgd_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSGD16_LO:
*(ul16 *)loc = lo(sym.get_tlsgd_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSGD_PCREL34:
*(ul64 *)loc |= prefix34(sym.get_tlsgd_addr(ctx) - P);
break;
case R_PPC64_GOT_TLSLD16_HA:
*(ul16 *)loc = ha(ctx.got->get_tlsld_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSLD16_LO:
*(ul16 *)loc = lo(ctx.got->get_tlsld_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSLD_PCREL34:
*(ul64 *)loc |= prefix34(ctx.got->get_tlsld_addr(ctx) - P);
break;
case R_PPC64_DTPREL16_HA:
*(ul16 *)loc = ha(S + A - ctx.dtp_addr);
break;
case R_PPC64_DTPREL16_LO:
*(ul16 *)loc = lo(S + A - ctx.dtp_addr);
break;
case R_PPC64_DTPREL34:
*(ul64 *)loc |= prefix34(S + A - ctx.dtp_addr);
break;
case R_PPC64_TPREL16_HA:
*(ul16 *)loc = ha(S + A - ctx.tp_addr);
break;
case R_PPC64_TPREL16_LO:
*(ul16 *)loc = lo(S + A - ctx.tp_addr);
break;
case R_PPC64_PLTSEQ:
case R_PPC64_PLTSEQ_NOTOC:
case R_PPC64_PLTCALL:
case R_PPC64_PLTCALL_NOTOC:
case R_PPC64_TLS:
case R_PPC64_TLSGD:
case R_PPC64_TLSLD:
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_PPC64_ADDR64:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul64 *)loc = *val;
else
*(ul64 *)loc = S + A;
break;
case R_PPC64_ADDR32: {
i64 val = S + A;
check(val, 0, 1LL << 32);
*(ul32 *)loc = val;
break;
}
case R_PPC64_DTPREL64:
*(ul64 *)loc = S + A - ctx.dtp_addr;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_PPC64_ADDR64:
if (name() == ".toc")
scan_toc_rel(ctx, sym, rel);
else
scan_dyn_absrel(ctx, sym, rel);
break;
case R_PPC64_GOT_TPREL16_HA:
case R_PPC64_GOT_TPREL_PCREL34:
sym.flags |= NEEDS_GOTTP;
break;
case R_PPC64_REL24:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_PPC64_REL24_NOTOC:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
ctx.extra.is_power10 = true;
break;
case R_PPC64_PLT16_HA:
case R_PPC64_PLT_PCREL34:
case R_PPC64_PLT_PCREL34_NOTOC:
case R_PPC64_GOT_PCREL34:
sym.flags |= NEEDS_GOT;
break;
case R_PPC64_GOT_TLSGD16_HA:
case R_PPC64_GOT_TLSGD_PCREL34:
sym.flags |= NEEDS_TLSGD;
break;
case R_PPC64_GOT_TLSLD16_HA:
case R_PPC64_GOT_TLSLD_PCREL34:
ctx.needs_tlsld = true;
break;
case R_PPC64_TPREL16_HA:
case R_PPC64_TPREL16_LO:
check_tlsle(ctx, sym, rel);
break;
case R_PPC64_REL32:
case R_PPC64_REL64:
case R_PPC64_TOC16_HA:
case R_PPC64_TOC16_LO:
case R_PPC64_TOC16_LO_DS:
case R_PPC64_TOC16_DS:
case R_PPC64_REL16_HA:
case R_PPC64_REL16_LO:
case R_PPC64_PLT16_HI:
case R_PPC64_PLT16_LO:
case R_PPC64_PLT16_LO_DS:
case R_PPC64_PCREL34:
case R_PPC64_PLTSEQ:
case R_PPC64_PLTSEQ_NOTOC:
case R_PPC64_PLTCALL:
case R_PPC64_PLTCALL_NOTOC:
case R_PPC64_GOT_TPREL16_LO_DS:
case R_PPC64_GOT_TLSGD16_LO:
case R_PPC64_GOT_TLSLD16_LO:
case R_PPC64_TLS:
case R_PPC64_TLSGD:
case R_PPC64_TLSLD:
case R_PPC64_DTPREL16_HA:
case R_PPC64_DTPREL16_LO:
case R_PPC64_DTPREL34:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
// If the destination is PLT, we read an address from .got.plt or .got
// and jump there.
static const ul32 plt_thunk[] = {
0xf841'0018, // std r2, 24(r1)
0x3d82'0000, // addis r12, r2, foo@gotplt@toc@ha
0xe98c'0000, // ld r12, foo@gotplt@toc@lo(r12)
0x7d89'03a6, // mtctr r12
0x4e80'0420, // bctr
};
static const ul32 plt_thunk_power10[] = {
0xf841'0018, // std r2, 24(r1)
0x0410'0000, // pld r12, foo@gotplt@pcrel
0xe580'0000,
0x7d89'03a6, // mtctr r12
0x4e80'0420, // bctr
};
// If the destination is a non-imported function, we directly jump
// to its local entry point.
static const ul32 local_thunk[] = {
0xf841'0018, // std r2, 24(r1)
0x3d82'0000, // addis r12, r2, foo@toc@ha
0x398c'0000, // addi r12, r12, foo@toc@lo
0x7d89'03a6, // mtctr r12
0x4e80'0420, // bctr
};
static const ul32 local_thunk_power10[] = {
0xf841'0018, // std r2, 24(r1)
0x0610'0000, // pla r12, foo@pcrel
0x3980'0000,
0x7d89'03a6, // mtctr r12
0x4e80'0420, // bctr
};
static_assert(E::thunk_size == sizeof(plt_thunk));
static_assert(E::thunk_size == sizeof(plt_thunk_power10));
static_assert(E::thunk_size == sizeof(local_thunk));
static_assert(E::thunk_size == sizeof(local_thunk_power10));
for (i64 i = 0; i < symbols.size(); i++) {
Symbol<E> &sym = *symbols[i];
ul32 *loc = (ul32 *)(buf + i * E::thunk_size);
if (sym.has_plt(ctx)) {
u64 got = sym.has_got(ctx) ? sym.get_got_addr(ctx) : sym.get_gotplt_addr(ctx);
if (ctx.extra.is_power10) {
memcpy(loc, plt_thunk_power10, E::thunk_size);
*(ul64 *)(loc + 1) |= prefix34(got - get_addr(i) - 4);
} else {
i64 val = got - ctx.extra.TOC->value;
memcpy(loc, plt_thunk, E::thunk_size);
loc[1] |= higha(val);
loc[2] |= lo(val);
}
} else {
if (ctx.extra.is_power10) {
memcpy(loc, local_thunk_power10, E::thunk_size);
*(ul64 *)(loc + 1) |= prefix34(sym.get_addr(ctx) - get_addr(i) - 4);
} else {
i64 val = sym.get_addr(ctx) - ctx.extra.TOC->value;
memcpy(loc, local_thunk, E::thunk_size);
loc[1] |= higha(val);
loc[2] |= lo(val);
}
}
}
}
} // namespace mold::elf


@@ -1,938 +0,0 @@
// clang-format off
// RISC-V is a clean RISC ISA. It supports PC-relative load/store for
// position-independent code. Its 32-bit and 64-bit ISAs are almost
// identical. That is, you can think of RV32 as RV64 without the 64-bit
// operations. In this file, we support both RV64 and RV32.
//
// RISC-V is essentially little-endian, but the big-endian version is
// available as an extension. GCC supports `-mbig-endian` to generate
// big-endian code. Even in big-endian mode, machine instructions are
// defined to be encoded in little-endian, though. Only the behavior
// of load/store instructions differs between LE RISC-V and BE RISC-V.
//
// From the linker's point of view, the RISC-V's psABI is unique because
// sections in input object files can be shrunk while being copied to the
// output file. That is contrary to other psABIs in which sections are an
// atomic unit of copying. Let me explain it in more detail.
//
// Since RISC-V instructions are 16 or 32 bits long, there's no way to
// embed a very large immediate into a branch instruction. In fact, the
// JAL (jump and link) instruction can jump only within PC ± 1 MiB
// because its immediate is only 21 bits long. If the destination is out
// of its reach, we need to use two instructions instead; the first
// instruction being AUIPC, which sets the upper 20 bits of a register,
// and the second being JALR with a 12-bit immediate and the register.
// Combined, they specify a 32-bit displacement.
//
// Other RISC ISAs have the same limitation, and they solved the problem
// by letting the linker create so-called "range extension thunks". It
// works as follows: the compiler optimistically emits single jump
// instructions for function calls. If the linker finds that a branch
// target is out of reach, it emits a small piece of machine code near
// the branch instruction and redirects the branch to that
// linker-synthesized code. The code constructs a full 32-bit address in
// a register and jumps to the destination. Such linker-synthesized code
// is called a "range extension thunk" or just a "thunk".
//
// The RISC-V psABI is unique in that it works the other way around.
// That is, for RISC-V, the compiler always emits two instructions
// (AUIPC + JALR) for function calls. If the linker finds the
// destination is reachable with a single instruction, it replaces the
// two instructions with the single one and shrinks the section by one
// instruction length, instead of filling the gap with a nop.
//
// With the presence of this relaxation, sections can no longer be
// considered as an atomic unit. If we delete 4 bytes from the middle of
// a section, all contents after that point need to be shifted by 4.
// Symbol values and relocation offsets have to be adjusted accordingly
// if they refer to anything past the deleted bytes.
//
// In mold, we use `r_deltas` to record how many bytes have been removed
// before each relocation. For symbols, we directly mutate their `value`
// member.
//
// RISC-V object files tend to have way more relocations than those for
// other targets. This is because all branches, including ones that jump
// within the same section, are explicitly expressed with relocations.
// Here is why we need them: all control-flow statements such as `if` or
// `for` are implemented using branch instructions. For other targets,
// the compiler doesn't emit relocations for such branches because it
// knows at compile time exactly how many bytes have to be skipped.
// That's not true for RISC-V because the linker may delete bytes
// between a branch and its destination. Therefore, all branches,
// including in-section ones, have to be explicitly expressed with
// relocations.
//
// Note that this mechanism only shrinks sections and never enlarges
// them, as the compiler always emits the longest instruction sequence. This
// makes the linker implementation a bit simpler because we don't need
// to worry about oscillation.
//
// https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc
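//
// For example (an illustrative, hand-written sketch rather than actual
// compiler output), a function call is emitted as the pair
//
//   auipc ra, %pcrel_hi(foo)        # R_RISCV_CALL_PLT + R_RISCV_RELAX
//   jalr  ra, %pcrel_lo(foo)(ra)
//
// and, if foo turns out to be within PC ± 1 MiB of the call site, the
// linker rewrites the pair into a single
//
//   jal ra, foo
//
// deleting 4 bytes from the section.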
#include "third_party/mold/elf/mold.h"
// MISSING #include <tbb/parallel_for.h>
// MISSING #include <tbb/parallel_for_each.h>
namespace mold::elf {
static void write_itype(u8 *loc, u32 val) {
*(ul32 *)loc &= 0b000000'00000'11111'111'11111'1111111;
*(ul32 *)loc |= bits(val, 11, 0) << 20;
}
static void write_stype(u8 *loc, u32 val) {
*(ul32 *)loc &= 0b000000'11111'11111'111'00000'1111111;
*(ul32 *)loc |= bits(val, 11, 5) << 25 | bits(val, 4, 0) << 7;
}
static void write_btype(u8 *loc, u32 val) {
*(ul32 *)loc &= 0b000000'11111'11111'111'00000'1111111;
*(ul32 *)loc |= bit(val, 12) << 31 | bits(val, 10, 5) << 25 |
bits(val, 4, 1) << 8 | bit(val, 11) << 7;
}
static void write_utype(u8 *loc, u32 val) {
*(ul32 *)loc &= 0b000000'00000'00000'000'11111'1111111;
// U-type instructions are used in combination with I-type
// instructions. U-type insn sets an immediate to the upper 20-bits
// of a register. I-type insn sign-extends a 12-bits immediate and
// adds it to a register value to construct a complete value. 0x800
// is added here to compensate for the sign-extension.
*(ul32 *)loc |= (val + 0x800) & 0xffff'f000;
}
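// A hand-computed illustration of the 0x800 bias (not from the original
// source): materializing 0x12fff takes `lui rd, 0x13` plus
// `addi rd, rd, -1`, because the I-type immediate 0xfff sign-extends to
// -1 and 0x13000 - 1 == 0x12fff.
static_assert(((0x12fff + 0x800) & 0xffff'f000) == 0x13000);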
static void write_jtype(u8 *loc, u32 val) {
*(ul32 *)loc &= 0b000000'00000'00000'000'11111'1111111;
*(ul32 *)loc |= bit(val, 20) << 31 | bits(val, 10, 1) << 21 |
bit(val, 11) << 20 | bits(val, 19, 12) << 12;
}
static void write_cbtype(u8 *loc, u32 val) {
*(ul16 *)loc &= 0b111'000'111'00000'11;
*(ul16 *)loc |= bit(val, 8) << 12 | bit(val, 4) << 11 | bit(val, 3) << 10 |
bit(val, 7) << 6 | bit(val, 6) << 5 | bit(val, 2) << 4 |
bit(val, 1) << 3 | bit(val, 5) << 2;
}
static void write_cjtype(u8 *loc, u32 val) {
*(ul16 *)loc &= 0b111'00000000000'11;
*(ul16 *)loc |= bit(val, 11) << 12 | bit(val, 4) << 11 | bit(val, 9) << 10 |
bit(val, 8) << 9 | bit(val, 10) << 8 | bit(val, 6) << 7 |
bit(val, 7) << 6 | bit(val, 3) << 5 | bit(val, 2) << 4 |
bit(val, 1) << 3 | bit(val, 5) << 2;
}
// Rewrite an existing ULEB128 value in place, preserving the length of
// the original encoding.
static void overwrite_uleb(u8 *loc, u64 val) {
  while (*loc & 0b1000'0000) {
    *loc++ = 0b1000'0000 | (val & 0b0111'1111);
    val >>= 7;
  }
  // The last byte of the existing encoding has its continuation bit
  // clear; write the remaining bits there.
  *loc = val & 0b0111'1111;
}
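// For example, 624485 encodes in ULEB128 as the byte sequence e5 8e 26:
// low-order 7-bit groups first, with the high bit marking continuation
// (a hand-computed illustration). Note that overwrite_uleb() keeps the
// length of the existing encoding, so the new value must fit in the
// same number of bytes.
static_assert((624485 & 0x7f) == 0x65 && ((624485 >> 7) & 0x7f) == 0x0e &&
              (624485 >> 14) == 0x26);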
// Returns the rd register of an R/I/U/J-type instruction.
static u32 get_rd(u32 val) {
return bits(val, 11, 7);
}
static void set_rs1(u8 *loc, u32 rs1) {
assert(rs1 < 32);
*(ul32 *)loc &= 0b111111'11111'00000'111'11111'1111111;
*(ul32 *)loc |= rs1 << 15;
}
template <typename E>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static const ul32 insn_64[] = {
0x0000'0397, // auipc t2, %pcrel_hi(.got.plt)
0x41c3'0333, // sub t1, t1, t3 # .plt entry + hdr + 12
0x0003'be03, // ld t3, %pcrel_lo(1b)(t2) # _dl_runtime_resolve
0xfd43'0313, // addi t1, t1, -44 # .plt entry
0x0003'8293, // addi t0, t2, %pcrel_lo(1b) # &.got.plt
0x0013'5313, // srli t1, t1, 1 # .plt entry offset
0x0082'b283, // ld t0, 8(t0) # link map
0x000e'0067, // jr t3
};
static const ul32 insn_32[] = {
0x0000'0397, // auipc t2, %pcrel_hi(.got.plt)
0x41c3'0333, // sub t1, t1, t3 # .plt entry + hdr + 12
0x0003'ae03, // lw t3, %pcrel_lo(1b)(t2) # _dl_runtime_resolve
0xfd43'0313, // addi t1, t1, -44 # .plt entry
0x0003'8293, // addi t0, t2, %pcrel_lo(1b) # &.got.plt
0x0023'5313, // srli t1, t1, 2 # .plt entry offset
0x0042'a283, // lw t0, 4(t0) # link map
0x000e'0067, // jr t3
};
if constexpr (E::is_64)
memcpy(buf, insn_64, sizeof(insn_64));
else
memcpy(buf, insn_32, sizeof(insn_32));
u64 gotplt = ctx.gotplt->shdr.sh_addr;
u64 plt = ctx.plt->shdr.sh_addr;
write_utype(buf, gotplt - plt);
write_itype(buf + 8, gotplt - plt);
write_itype(buf + 16, gotplt - plt);
}
static const ul32 plt_entry_64[] = {
0x0000'0e17, // auipc t3, %pcrel_hi(function@.got.plt)
0x000e'3e03, // ld t3, %pcrel_lo(1b)(t3)
0x000e'0367, // jalr t1, t3
0x0000'0013, // nop
};
static const ul32 plt_entry_32[] = {
0x0000'0e17, // auipc t3, %pcrel_hi(function@.got.plt)
0x000e'2e03, // lw t3, %pcrel_lo(1b)(t3)
0x000e'0367, // jalr t1, t3
0x0000'0013, // nop
};
template <typename E>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
if constexpr (E::is_64)
memcpy(buf, plt_entry_64, sizeof(plt_entry_64));
else
memcpy(buf, plt_entry_32, sizeof(plt_entry_32));
u64 gotplt = sym.get_gotplt_addr(ctx);
u64 plt = sym.get_plt_addr(ctx);
write_utype(buf, gotplt - plt);
write_itype(buf + 4, gotplt - plt);
}
template <typename E>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
if constexpr (E::is_64)
memcpy(buf, plt_entry_64, sizeof(plt_entry_64));
else
memcpy(buf, plt_entry_32, sizeof(plt_entry_32));
u64 got = sym.get_got_addr(ctx);
u64 plt = sym.get_plt_addr(ctx);
write_utype(buf, got - plt);
write_itype(buf + 4, got - plt);
}
template <typename E>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_RISCV_ADD32:
*(U32<E> *)loc += val;
break;
case R_RISCV_SUB8:
*loc -= val;
break;
case R_RISCV_SUB16:
*(U16<E> *)loc -= val;
break;
case R_RISCV_SUB32:
*(U32<E> *)loc -= val;
break;
case R_RISCV_SUB6:
*loc = (*loc & 0b1100'0000) | ((*loc - val) & 0b0011'1111);
break;
case R_RISCV_SET6:
*loc = (*loc & 0b1100'0000) | (val & 0b0011'1111);
break;
case R_RISCV_SET8:
*loc = val;
break;
case R_RISCV_SET16:
*(U16<E> *)loc = val;
break;
case R_RISCV_SET32:
*(U32<E> *)loc = val;
break;
case R_RISCV_32_PCREL:
*(U32<E> *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <typename E>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
auto get_r_delta = [&](i64 idx) {
return extra.r_deltas.empty() ? 0 : extra.r_deltas[idx];
};
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || rel.r_type == R_RISCV_RELAX)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
i64 r_offset = rel.r_offset - get_r_delta(i);
i64 removed_bytes = get_r_delta(i + 1) - get_r_delta(i);
u8 *loc = base + r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
auto find_paired_reloc = [&] {
Symbol<E> &sym = *file.symbols[rels[i].r_sym];
assert(sym.get_input_section() == this);
if (sym.value < r_offset) {
for (i64 j = i - 1; j >= 0; j--)
if (u32 ty = rels[j].r_type;
ty == R_RISCV_GOT_HI20 || ty == R_RISCV_TLS_GOT_HI20 ||
ty == R_RISCV_TLS_GD_HI20 || ty == R_RISCV_PCREL_HI20)
if (sym.value == rels[j].r_offset - get_r_delta(j))
return j;
} else {
for (i64 j = i + 1; j < rels.size(); j++)
if (u32 ty = rels[j].r_type;
ty == R_RISCV_GOT_HI20 || ty == R_RISCV_TLS_GOT_HI20 ||
ty == R_RISCV_TLS_GD_HI20 || ty == R_RISCV_PCREL_HI20)
if (sym.value == rels[j].r_offset - get_r_delta(j))
return j;
}
Fatal(ctx) << *this << ": paired relocation is missing: " << i;
};
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_RISCV_32:
if constexpr (E::is_64)
*(U32<E> *)loc = S + A;
else
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_RISCV_64:
assert(E::is_64);
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_RISCV_BRANCH:
check(S + A - P, -(1 << 12), 1 << 12);
write_btype(loc, S + A - P);
break;
case R_RISCV_JAL:
check(S + A - P, -(1 << 20), 1 << 20);
write_jtype(loc, S + A - P);
break;
case R_RISCV_CALL:
case R_RISCV_CALL_PLT: {
u32 rd = get_rd(*(ul32 *)(contents.data() + rel.r_offset + 4));
if (removed_bytes == 4) {
// auipc + jalr -> jal
*(ul32 *)loc = (rd << 7) | 0b1101111;
write_jtype(loc, S + A - P);
} else if (removed_bytes == 6 && rd == 0) {
// auipc + jalr -> c.j
*(ul16 *)loc = 0b101'00000000000'01;
write_cjtype(loc, S + A - P);
} else if (removed_bytes == 6 && rd == 1) {
// auipc + jalr -> c.jal
assert(!E::is_64);
*(ul16 *)loc = 0b001'00000000000'01;
write_cjtype(loc, S + A - P);
} else {
assert(removed_bytes == 0);
// Calling an undefined weak symbol does not make sense.
// We turn such a call into an infinite loop. This should
// help with debugging a faulty program.
u64 val = sym.esym().is_undef_weak() ? 0 : S + A - P;
check(val, -(1LL << 31), 1LL << 31);
write_utype(loc, val);
write_itype(loc + 4, val);
}
break;
}
case R_RISCV_GOT_HI20:
write_utype(loc, G + GOT + A - P);
break;
case R_RISCV_TLS_GOT_HI20:
write_utype(loc, sym.get_gottp_addr(ctx) + A - P);
break;
case R_RISCV_TLS_GD_HI20:
write_utype(loc, sym.get_tlsgd_addr(ctx) + A - P);
break;
case R_RISCV_PCREL_HI20:
write_utype(loc, S + A - P);
break;
case R_RISCV_PCREL_LO12_I:
case R_RISCV_PCREL_LO12_S: {
i64 idx2 = find_paired_reloc();
const ElfRel<E> &rel2 = rels[idx2];
Symbol<E> &sym2 = *file.symbols[rel2.r_sym];
u64 S = sym2.get_addr(ctx);
u64 A = rel2.r_addend;
u64 P = get_addr() + rel2.r_offset - get_r_delta(idx2);
u64 G = sym2.get_got_idx(ctx) * sizeof(Word<E>);
u64 val;
switch (rel2.r_type) {
case R_RISCV_GOT_HI20:
val = G + GOT + A - P;
break;
case R_RISCV_TLS_GOT_HI20:
val = sym2.get_gottp_addr(ctx) + A - P;
break;
case R_RISCV_TLS_GD_HI20:
val = sym2.get_tlsgd_addr(ctx) + A - P;
break;
case R_RISCV_PCREL_HI20:
val = S + A - P;
break;
default:
unreachable();
}
if (rel.r_type == R_RISCV_PCREL_LO12_I)
write_itype(loc, val);
else
write_stype(loc, val);
break;
}
case R_RISCV_HI20:
assert(removed_bytes == 0 || removed_bytes == 4);
if (removed_bytes == 0) {
check(S + A, -(1LL << 31), 1LL << 31);
write_utype(loc, S + A);
}
break;
case R_RISCV_LO12_I:
case R_RISCV_LO12_S:
if (rel.r_type == R_RISCV_LO12_I)
write_itype(loc, S + A);
else
write_stype(loc, S + A);
// Rewrite `lw t1, 0(t0)` with `lw t1, 0(x0)` if the address is
// accessible relative to the zero register. If the upper 20 bits
// are all zero, the corresponding LUI might have been removed.
if (bits(S + A, 31, 12) == 0)
set_rs1(loc, 0);
break;
case R_RISCV_TPREL_HI20:
assert(removed_bytes == 0 || removed_bytes == 4);
if (removed_bytes == 0)
write_utype(loc, S + A - ctx.tp_addr);
break;
case R_RISCV_TPREL_ADD:
// This relocation just annotates an ADD instruction that can be
// removed when a TPREL is relaxed. No value is needed to be
// written.
assert(removed_bytes == 0 || removed_bytes == 4);
break;
case R_RISCV_TPREL_LO12_I:
case R_RISCV_TPREL_LO12_S: {
i64 val = S + A - ctx.tp_addr;
if (rel.r_type == R_RISCV_TPREL_LO12_I)
write_itype(loc, val);
else
write_stype(loc, val);
// Rewrite `lw t1, 0(t0)` with `lw t1, 0(tp)` if the address is
// directly accessible using tp. tp is x4.
if (sign_extend(val, 11) == val)
set_rs1(loc, 4);
break;
}
case R_RISCV_ADD8:
*loc += S + A;
break;
case R_RISCV_ADD16:
*(U16<E> *)loc += S + A;
break;
case R_RISCV_ADD32:
*(U32<E> *)loc += S + A;
break;
case R_RISCV_ADD64:
*(U64<E> *)loc += S + A;
break;
case R_RISCV_SUB8:
*loc -= S + A;
break;
case R_RISCV_SUB16:
*(U16<E> *)loc -= S + A;
break;
case R_RISCV_SUB32:
*(U32<E> *)loc -= S + A;
break;
case R_RISCV_SUB64:
*(U64<E> *)loc -= S + A;
break;
case R_RISCV_ALIGN: {
// A R_RISCV_ALIGN is followed by a NOP sequence. We need to remove
// zero or more bytes so that the instruction after R_RISCV_ALIGN is
// aligned to a given alignment boundary.
//
// We need to guarantee that the NOP sequence is valid after byte
// removal (e.g. we can't remove the first 2 bytes of a 4-byte NOP).
// For the sake of simplicity, we always rewrite the entire NOP sequence.
i64 padding_bytes = rel.r_addend - removed_bytes;
assert((padding_bytes & 1) == 0);
i64 i = 0;
for (; i <= padding_bytes - 4; i += 4)
*(ul32 *)(loc + i) = 0x0000'0013; // nop
if (i < padding_bytes)
*(ul16 *)(loc + i) = 0x0001; // c.nop
break;
}
case R_RISCV_RVC_BRANCH:
check(S + A - P, -(1 << 8), 1 << 8);
write_cbtype(loc, S + A - P);
break;
case R_RISCV_RVC_JUMP:
check(S + A - P, -(1 << 11), 1 << 11);
write_cjtype(loc, S + A - P);
break;
case R_RISCV_SUB6:
*loc = (*loc & 0b1100'0000) | ((*loc - (S + A)) & 0b0011'1111);
break;
case R_RISCV_SET6:
*loc = (*loc & 0b1100'0000) | ((S + A) & 0b0011'1111);
break;
case R_RISCV_SET8:
*loc = S + A;
break;
case R_RISCV_SET16:
*(U16<E> *)loc = S + A;
break;
case R_RISCV_SET32:
*(U32<E> *)loc = S + A;
break;
case R_RISCV_PLT32:
case R_RISCV_32_PCREL:
*(U32<E> *)loc = S + A - P;
break;
default:
unreachable();
}
}
}
template <typename E>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_RISCV_32:
*(U32<E> *)loc = S + A;
break;
case R_RISCV_64:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(U64<E> *)loc = *val;
else
*(U64<E> *)loc = S + A;
break;
case R_RISCV_ADD8:
*loc += S + A;
break;
case R_RISCV_ADD16:
*(U16<E> *)loc += S + A;
break;
case R_RISCV_ADD32:
*(U32<E> *)loc += S + A;
break;
case R_RISCV_ADD64:
*(U64<E> *)loc += S + A;
break;
case R_RISCV_SUB8:
*loc -= S + A;
break;
case R_RISCV_SUB16:
*(U16<E> *)loc -= S + A;
break;
case R_RISCV_SUB32:
*(U32<E> *)loc -= S + A;
break;
case R_RISCV_SUB64:
*(U64<E> *)loc -= S + A;
break;
case R_RISCV_SUB6:
*loc = (*loc & 0b1100'0000) | ((*loc - (S + A)) & 0b0011'1111);
break;
case R_RISCV_SET6:
*loc = (*loc & 0b1100'0000) | ((S + A) & 0b0011'1111);
break;
case R_RISCV_SET8:
*loc = S + A;
break;
case R_RISCV_SET16:
*(U16<E> *)loc = S + A;
break;
case R_RISCV_SET32:
*(U32<E> *)loc = S + A;
break;
case R_RISCV_SET_ULEB128:
overwrite_uleb(loc, S + A);
break;
case R_RISCV_SUB_ULEB128: {
u8 *p = loc;
u64 val = read_uleb(p);
overwrite_uleb(loc, val - S - A);
break;
}
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
break;
}
}
}
template <typename E>
void InputSection<E>::copy_contents_riscv(Context<E> &ctx, u8 *buf) {
// If a section is not relaxed, we can copy it as one big chunk.
if (extra.r_deltas.empty()) {
uncompress_to(ctx, buf);
return;
}
// A relaxed section is copied piece-wise.
std::span<const ElfRel<E>> rels = get_rels(ctx);
i64 pos = 0;
for (i64 i = 0; i < rels.size(); i++) {
i64 delta = extra.r_deltas[i + 1] - extra.r_deltas[i];
if (delta == 0)
continue;
assert(delta > 0);
const ElfRel<E> &r = rels[i];
memcpy(buf, contents.data() + pos, r.r_offset - pos);
buf += r.r_offset - pos;
pos = r.r_offset + delta;
}
memcpy(buf, contents.data() + pos, contents.size() - pos);
}
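// For example (hand-worked): if a 16-byte section has one relocation at
// offset 4 with 4 bytes deleted there (r_deltas = {0, 4}), we copy
// bytes [0, 4) and then bytes [8, 16), writing 12 contiguous bytes to
// the output.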
template <typename E>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_RISCV_32:
if constexpr (E::is_64)
scan_absrel(ctx, sym, rel);
else
scan_dyn_absrel(ctx, sym, rel);
break;
case R_RISCV_HI20:
scan_absrel(ctx, sym, rel);
break;
case R_RISCV_64:
if constexpr (!E::is_64)
Fatal(ctx) << *this << ": R_RISCV_64 cannot be used on RV32";
scan_dyn_absrel(ctx, sym, rel);
break;
case R_RISCV_CALL:
case R_RISCV_CALL_PLT:
case R_RISCV_PLT32:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_RISCV_GOT_HI20:
sym.flags |= NEEDS_GOT;
break;
case R_RISCV_TLS_GOT_HI20:
sym.flags |= NEEDS_GOTTP;
break;
case R_RISCV_TLS_GD_HI20:
sym.flags |= NEEDS_TLSGD;
break;
case R_RISCV_32_PCREL:
scan_pcrel(ctx, sym, rel);
break;
case R_RISCV_TPREL_HI20:
case R_RISCV_TPREL_LO12_I:
case R_RISCV_TPREL_LO12_S:
case R_RISCV_TPREL_ADD:
check_tlsle(ctx, sym, rel);
break;
case R_RISCV_BRANCH:
case R_RISCV_JAL:
case R_RISCV_PCREL_HI20:
case R_RISCV_PCREL_LO12_I:
case R_RISCV_PCREL_LO12_S:
case R_RISCV_LO12_I:
case R_RISCV_LO12_S:
case R_RISCV_ADD8:
case R_RISCV_ADD16:
case R_RISCV_ADD32:
case R_RISCV_ADD64:
case R_RISCV_SUB8:
case R_RISCV_SUB16:
case R_RISCV_SUB32:
case R_RISCV_SUB64:
case R_RISCV_ALIGN:
case R_RISCV_RVC_BRANCH:
case R_RISCV_RVC_JUMP:
case R_RISCV_RELAX:
case R_RISCV_SUB6:
case R_RISCV_SET6:
case R_RISCV_SET8:
case R_RISCV_SET16:
case R_RISCV_SET32:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <typename E>
static bool is_resizable(Context<E> &ctx, InputSection<E> *isec) {
return isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC) &&
(isec->shdr().sh_flags & SHF_EXECINSTR);
}
// Returns the distance between a relocated place and a symbol.
template <typename E>
static i64 compute_distance(Context<E> &ctx, Symbol<E> &sym,
InputSection<E> &isec, const ElfRel<E> &rel) {
// We handle absolute symbols as if they were infinitely far away
// because `shrink_section` may increase the distance between a branch
// instruction and an absolute symbol. Branching to an absolute
// location is extremely rare in real code, though.
if (sym.is_absolute())
return INT32_MAX;
// Likewise, relocations against weak undefined symbols won't be relaxed.
if (sym.esym().is_undef_weak())
return INT32_MAX;
// Compute a distance between the relocated place and the symbol.
i64 S = sym.get_addr(ctx);
i64 A = rel.r_addend;
i64 P = isec.get_addr() + rel.r_offset;
return S + A - P;
}
// Scan relocations to shrink sections.
template <typename E>
static void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
std::span<const ElfRel<E>> rels = isec.get_rels(ctx);
isec.extra.r_deltas.resize(rels.size() + 1);
i64 delta = 0;
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &r = rels[i];
Symbol<E> &sym = *isec.file.symbols[r.r_sym];
isec.extra.r_deltas[i] = delta;
// Handling R_RISCV_ALIGN is mandatory.
//
// R_RISCV_ALIGN refers to NOP instructions. We need to eliminate some
// or all of the instructions so that the instruction that immediately
// follows the NOPs is aligned to a specified alignment boundary.
if (r.r_type == R_RISCV_ALIGN) {
// The total number of NOP bytes is stored in r_addend, so the next
// instruction is r_addend bytes away.
u64 loc = isec.get_addr() + r.r_offset - delta;
u64 next_loc = loc + r.r_addend;
u64 alignment = bit_ceil(r.r_addend + 1);
assert(alignment <= (1 << isec.p2align));
delta += next_loc - align_to(loc, alignment);
continue;
}
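// Hand-worked example: with r_addend = 6 and the shifted location at
// 0x1002, alignment = bit_ceil(7) = 8, next_loc = 0x1008 and
// align_to(0x1002, 8) = 0x1008, so delta grows by 0 and every NOP
// stays. If the location were 0x1000 instead, align_to(0x1000, 8) =
// 0x1000 and delta would grow by 6, deleting the whole NOP sequence.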
// Handling other relocations is optional.
if (!ctx.arg.relax || i == rels.size() - 1 ||
rels[i + 1].r_type != R_RISCV_RELAX)
continue;
// Linker-synthesized symbols haven't been assigned their final
// values when we are shrinking sections because actual values can
// be computed only after we fix the file layout. Therefore, we
// assume that relocations against such symbols are always
// non-relaxable.
if (sym.file == ctx.internal_obj)
continue;
switch (r.r_type) {
case R_RISCV_CALL:
case R_RISCV_CALL_PLT: {
// These relocations refer to an AUIPC + JALR instruction pair that
// allows jumping anywhere within PC ± 2 GiB. If the jump target is
// close enough to PC, we can use C.J, C.JAL or JAL instead.
i64 dist = compute_distance(ctx, sym, isec, r);
if (dist & 1)
break;
i64 rd = get_rd(*(ul32 *)(isec.contents.data() + r.r_offset + 4));
if (rd == 0 && sign_extend(dist, 11) == dist && use_rvc) {
// If rd is x0 and the jump target is within ±2 KiB, we can use
// C.J, saving 6 bytes.
delta += 6;
} else if (rd == 1 && sign_extend(dist, 11) == dist && use_rvc && !E::is_64) {
// If rd is x1 and the jump target is within ±2 KiB, we can use
// C.JAL. This is RV32-only because C.JAL is an RV32-only instruction.
delta += 6;
} else if (sign_extend(dist, 20) == dist) {
// If the jump target is within ±1 MiB, we can use JAL.
delta += 4;
}
break;
}
case R_RISCV_HI20:
// If the upper 20 bits are all zero, we can remove LUI.
// The corresponding instructions referred to by LO12_I/LO12_S
// relocations will use the zero register instead.
if (bits(sym.get_addr(ctx), 31, 12) == 0)
delta += 4;
break;
case R_RISCV_TPREL_HI20:
case R_RISCV_TPREL_ADD:
// These relocations are used to add a high 20-bit value to the
// thread pointer. The following two instructions materialize
// TP + HI20(foo) in a5, for example.
//
// lui a5,%tprel_hi(foo) # R_RISCV_TPREL_HI20 (symbol)
// add a5,a5,tp,%tprel_add(foo) # R_RISCV_TPREL_ADD (symbol)
//
// Then thread-local variable `foo` is accessed with a low 12-bit
// offset like this:
//
// sw t0,%tprel_lo(foo)(a5) # R_RISCV_TPREL_LO12_S (symbol)
//
// However, if the variable is at TP ±2 KiB, TP + HI20(foo) is the
// same as TP, so we can instead access the thread-local variable
// directly using TP like this:
//
// sw t0,%tprel_lo(foo)(tp)
//
// Here, we remove `lui` and `add` if the offset is within ±2 KiB.
if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr;
sign_extend(val, 11) == val)
delta += 4;
break;
}
}
isec.extra.r_deltas[rels.size()] = delta;
isec.sh_size -= delta;
}
// Shrink sections by interpreting relocations.
//
// This operation may seem optional because, by default, the longest
// instruction sequences are used. However, calling this function is actually
// mandatory because of R_RISCV_ALIGN. R_RISCV_ALIGN is a directive to the
// linker to align the location referred to by the relocation to a
// specified byte boundary. We at least have to interpret them to satisfy
// the alignment constraints.
template <typename E>
i64 riscv_resize_sections(Context<E> &ctx) {
Timer t(ctx, "riscv_resize_sections");
// True if we can use the 2-byte instructions. This is usually true on
// Unix because RV64GC is generally considered the baseline hardware.
bool use_rvc = get_eflags(ctx) & EF_RISCV_RVC;
// Find all the relocations that can be relaxed.
// This step should only shrink sections.
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (std::unique_ptr<InputSection<E>> &isec : file->sections)
if (is_resizable(ctx, isec.get()))
shrink_section(ctx, *isec, use_rvc);
});
// Fix symbol values.
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (Symbol<E> *sym : file->symbols) {
if (sym->file != file)
continue;
InputSection<E> *isec = sym->get_input_section();
if (!isec || isec->extra.r_deltas.empty())
continue;
std::span<const ElfRel<E>> rels = isec->get_rels(ctx);
auto it = std::lower_bound(rels.begin(), rels.end(), sym->value,
[&](const ElfRel<E> &r, u64 val) {
return r.r_offset < val;
});
sym->value -= isec->extra.r_deltas[it - rels.begin()];
}
});
// Re-compute section offsets to finalize them.
compute_section_sizes(ctx);
return set_osec_offsets(ctx);
}
#define INSTANTIATE(E) \
template void write_plt_header(Context<E> &, u8 *); \
template void write_plt_entry(Context<E> &, u8 *, Symbol<E> &); \
template void write_pltgot_entry(Context<E> &, u8 *, Symbol<E> &); \
template void \
EhFrameSection<E>::apply_reloc(Context<E> &, const ElfRel<E> &, u64, u64); \
template void InputSection<E>::apply_reloc_alloc(Context<E> &, u8 *); \
template void InputSection<E>::apply_reloc_nonalloc(Context<E> &, u8 *); \
template void InputSection<E>::copy_contents_riscv(Context<E> &, u8 *); \
template void InputSection<E>::scan_relocations(Context<E> &); \
template i64 riscv_resize_sections(Context<E> &);
INSTANTIATE(RV64LE);
INSTANTIATE(RV64BE);
INSTANTIATE(RV32LE);
INSTANTIATE(RV32BE);
} // namespace mold::elf


@@ -1,491 +0,0 @@
// clang-format off
// This file contains code for the IBM z/Architecture 64-bit ISA, which is
// commonly referred to as "s390x" on Linux.
//
// z/Architecture is a 64-bit CISC ISA developed by IBM around 2000 for
// IBM's "big iron" mainframe computers. The computers are direct
// descendants of the IBM System/360 from all the way back in 1966. I've
// never actually seen a mainframe, and you probably haven't either, but
// it looks like the mainframe market is still large enough to sustain
// its ecosystem. Ubuntu, for example, provides official support for
// s390x as of 2022.
// Since they are being actively maintained, we need to support them.
//
// As an instruction set, s390x isn't particularly odd. It has 16 general-
// purpose registers. Instructions are 2, 4 or 6 bytes long and always
// aligned to 2-byte boundaries. Despite the unfamiliarity, I found that
// it just feels like an x86-64 in a parallel universe.
//
// Here is the register usage in this ABI:
//
// r0-r1: reserved as scratch registers so we can use them in our PLT
// r2: parameter passing and return values
// r3-r6: parameter passing
// r12: address of GOT if position-independent code
// r14: return address
// r15: stack pointer
// a1: upper 32 bits of TP (thread pointer)
// a2: lower 32 bits of TP (thread pointer)
//
// Thread-local storage (TLS) is supported on s390x in the same way as
// it is on other targets, with one exception. On other targets,
// __tls_get_addr is used to get the address of a thread-local variable.
// On s390x, __tls_get_offset is used instead. The difference is that
// __tls_get_offset returns the address of a thread-local variable as an
// offset from TP, so we need to add TP to the return value before use.
// I don't know why it is different, but that is the way it is.
//
// https://github.com/rui314/psabi/blob/main/s390x.pdf
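//
// In other words (an illustrative sketch, not code from this file):
// where other targets compute `p = __tls_get_addr(arg)`, s390x
// computes `p = tp + __tls_get_offset(arg)`.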
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = S390X;
static void write_mid20(u8 *loc, u64 val) {
*(ub32 *)loc |= (bits(val, 11, 0) << 16) | (bits(val, 19, 12) << 8);
}
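// Hand-worked example of the split (not from the original source): for
// val = 0xabcde, the low 12 bits (the DL field, 0xcde) land in bits
// 27..16 of the big-endian word and the high 8 bits (the DH field,
// 0xab) in bits 15..8, matching the long-displacement DL/DH layout.
static_assert((((0xabcde & 0xfff) << 16) | (((0xabcde >> 12) & 0xff) << 8)) ==
              0x0cde'ab00);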
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static u8 insn[] = {
0xe3, 0x00, 0xf0, 0x38, 0x00, 0x24, // stg %r0, 56(%r15)
0xc0, 0x10, 0, 0, 0, 0, // larl %r1, GOTPLT_OFFSET
0xd2, 0x07, 0xf0, 0x30, 0x10, 0x08, // mvc 48(8, %r15), 8(%r1)
0xe3, 0x10, 0x10, 0x10, 0x00, 0x04, // lg %r1, 16(%r1)
0x07, 0xf1, // br %r1
0x07, 0x00, 0x07, 0x00, 0x07, 0x00, // nopr; nopr; nopr
};
memcpy(buf, insn, sizeof(insn));
*(ub32 *)(buf + 8) = (ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 6) >> 1;
}
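// Note that LARL takes its operand as a signed number of halfwords
// relative to the instruction address, hence the `>> 1` above and in
// the PLT entries below; the `- 6` accounts for LARL being the second
// instruction, 6 bytes into the header.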
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static u8 insn[] = {
0xc0, 0x10, 0, 0, 0, 0, // larl %r1, GOTPLT_ENTRY_OFFSET
0xe3, 0x10, 0x10, 0x00, 0x00, 0x04, // lg %r1, (%r1)
0xc0, 0x01, 0, 0, 0, 0, // lgfi %r0, PLT_INDEX
0x07, 0xf1, // br %r1
0x07, 0x00, 0x07, 0x00, 0x07, 0x00, // nopr; nopr; nopr
0x07, 0x00, 0x07, 0x00, 0x07, 0x00, // nopr; nopr; nopr
};
memcpy(buf, insn, sizeof(insn));
*(ub32 *)(buf + 2) = (sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx)) >> 1;
*(ub32 *)(buf + 14) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static u8 insn[] = {
0xc0, 0x10, 0, 0, 0, 0, // larl %r1, GOT_ENTRY_OFFSET
0xe3, 0x10, 0x10, 0x00, 0x00, 0x04, // lg %r1, (%r1)
0x07, 0xf1, // br %r1
0x07, 0x00, // nopr
};
memcpy(buf, insn, sizeof(insn));
*(ub32 *)(buf + 2) = (sym.get_got_addr(ctx) - sym.get_plt_addr(ctx)) >> 1;
}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_390_PC32:
*(ub32 *)loc = val - this->shdr.sh_addr - offset;
break;
case R_390_64:
*(ub64 *)loc = val;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
auto check_dbl = [&](i64 val, i64 lo, i64 hi) {
check(val, lo, hi);
// R_390_*DBL relocs should never refer to a symbol at an odd address
if (val & 1)
Error(ctx) << *this << ": misaligned symbol " << sym
<< " for relocation " << rel;
};
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_390_64:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_390_8:
check(S + A, 0, 1 << 8);
*loc = S + A;
break;
case R_390_12:
check(S + A, 0, 1 << 12);
*(ul16 *)loc |= bits(S + A, 11, 0);
break;
case R_390_16:
check(S + A, 0, 1 << 16);
*(ub16 *)loc = S + A;
break;
case R_390_20:
check(S + A, 0, 1 << 20);
write_mid20(loc, S + A);
break;
case R_390_32:
case R_390_PLT32:
check(S + A, 0, 1LL << 32);
*(ub32 *)loc = S + A;
break;
case R_390_PLT64:
*(ub64 *)loc = S + A;
break;
case R_390_PC12DBL:
case R_390_PLT12DBL:
check_dbl(S + A - P, -(1 << 12), 1 << 12);
*(ul16 *)loc |= bits(S + A - P, 12, 1);
break;
case R_390_PC16:
check(S + A - P, -(1 << 15), 1 << 15);
*(ub16 *)loc = S + A - P;
break;
case R_390_PC32:
check(S + A - P, -(1LL << 31), 1LL << 31);
*(ub32 *)loc = S + A - P;
break;
case R_390_PC64:
*(ub64 *)loc = S + A - P;
break;
case R_390_PC16DBL:
case R_390_PLT16DBL:
check_dbl(S + A - P, -(1 << 16), 1 << 16);
*(ub16 *)loc = (S + A - P) >> 1;
break;
case R_390_PC24DBL:
case R_390_PLT24DBL:
check_dbl(S + A - P, -(1 << 24), 1 << 24);
*(ub32 *)loc |= bits(S + A - P, 24, 1);
break;
case R_390_PC32DBL:
case R_390_PLT32DBL:
check_dbl(S + A - P, -(1LL << 32), 1LL << 32);
*(ub32 *)loc = (S + A - P) >> 1;
break;
case R_390_GOT12:
case R_390_GOTPLT12:
check(G + A, 0, 1 << 12);
*(ul16 *)loc |= bits(G + A, 11, 0);
break;
case R_390_GOT16:
case R_390_GOTPLT16:
check(G + A, 0, 1 << 16);
*(ub16 *)loc = G + A;
break;
case R_390_GOT20:
case R_390_GOTPLT20:
check(G + A, 0, 1 << 20);
write_mid20(loc, G + A);
break;
case R_390_GOT32:
case R_390_GOTPLT32:
check(G + A, 0, 1LL << 32);
*(ub32 *)loc = G + A;
break;
case R_390_GOT64:
case R_390_GOTPLT64:
*(ub64 *)loc = G + A;
break;
case R_390_GOTOFF16:
case R_390_PLTOFF16:
check(S + A - GOT, -(1 << 15), 1 << 15);
*(ub16 *)loc = S + A - GOT;
break;
case R_390_GOTOFF32:
case R_390_PLTOFF32:
check(S + A - GOT, -(1LL << 31), 1LL << 31);
*(ub32 *)loc = S + A - GOT;
break;
case R_390_GOTOFF64:
case R_390_PLTOFF64:
*(ub64 *)loc = S + A - GOT;
break;
case R_390_GOTPC:
*(ub64 *)loc = GOT + A - P;
break;
case R_390_GOTPCDBL:
check_dbl(GOT + A - P, -(1LL << 32), 1LL << 32);
*(ub32 *)loc = (GOT + A - P) >> 1;
break;
case R_390_GOTENT:
check(GOT + G + A - P, -(1LL << 32), 1LL << 32);
*(ub32 *)loc = (GOT + G + A - P) >> 1;
break;
case R_390_TLS_LE32:
*(ub32 *)loc = S + A - ctx.tp_addr;
break;
case R_390_TLS_LE64:
*(ub64 *)loc = S + A - ctx.tp_addr;
break;
case R_390_TLS_GOTIE20:
write_mid20(loc, sym.get_gottp_addr(ctx) + A - GOT);
break;
case R_390_TLS_IEENT:
*(ub32 *)loc = (sym.get_gottp_addr(ctx) + A - P) >> 1;
break;
case R_390_TLS_GD32:
if (sym.has_tlsgd(ctx))
*(ub32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
else if (sym.has_gottp(ctx))
*(ub32 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
else
*(ub32 *)loc = S + A - ctx.tp_addr;
break;
case R_390_TLS_GD64:
if (sym.has_tlsgd(ctx))
*(ub64 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
else if (sym.has_gottp(ctx))
*(ub64 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
else
*(ub64 *)loc = S + A - ctx.tp_addr;
break;
case R_390_TLS_GDCALL:
if (sym.has_tlsgd(ctx)) {
// do nothing
} else if (sym.has_gottp(ctx)) {
// lg %r2, 0(%r2, %r12)
static u8 insn[] = { 0xe3, 0x22, 0xc0, 0x00, 0x00, 0x04 };
memcpy(loc, insn, sizeof(insn));
} else {
// nop
static u8 insn[] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 };
memcpy(loc, insn, sizeof(insn));
}
break;
case R_390_TLS_LDM32:
if (ctx.got->has_tlsld(ctx))
*(ub32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
break;
case R_390_TLS_LDM64:
if (ctx.got->has_tlsld(ctx))
*(ub64 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
break;
case R_390_TLS_LDO32:
if (ctx.got->has_tlsld(ctx))
*(ub32 *)loc = S + A - ctx.dtp_addr;
else
*(ub32 *)loc = S + A - ctx.tp_addr;
break;
case R_390_TLS_LDO64:
if (ctx.got->has_tlsld(ctx))
*(ub64 *)loc = S + A - ctx.dtp_addr;
else
*(ub64 *)loc = S + A - ctx.tp_addr;
break;
case R_390_TLS_LDCALL:
if (!ctx.got->has_tlsld(ctx)) {
// nop
static u8 insn[] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 };
memcpy(loc, insn, sizeof(insn));
}
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_390_32: {
i64 val = S + A;
check(val, 0, 1LL << 32);
*(ub32 *)loc = val;
break;
}
case R_390_64:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ub64 *)loc = *val;
else
*(ub64 *)loc = S + A;
break;
case R_390_TLS_LDO64:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ub64 *)loc = *val;
else
*(ub64 *)loc = S + A - ctx.dtp_addr;
break;
default:
Fatal(ctx) << *this << ": apply_reloc_nonalloc: " << rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_390_64:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_390_8:
case R_390_12:
case R_390_16:
case R_390_20:
case R_390_32:
scan_absrel(ctx, sym, rel);
break;
case R_390_PC16:
case R_390_PC16DBL:
case R_390_PC32:
case R_390_PC32DBL:
case R_390_PC64:
scan_pcrel(ctx, sym, rel);
break;
case R_390_GOT12:
case R_390_GOT16:
case R_390_GOT20:
case R_390_GOT32:
case R_390_GOT64:
case R_390_GOTOFF16:
case R_390_GOTOFF32:
case R_390_GOTOFF64:
case R_390_GOTPLT12:
case R_390_GOTPLT16:
case R_390_GOTPLT20:
case R_390_GOTPLT32:
case R_390_GOTPLT64:
case R_390_GOTPC:
case R_390_GOTPCDBL:
case R_390_GOTENT:
sym.flags |= NEEDS_GOT;
break;
case R_390_PLT12DBL:
case R_390_PLT16DBL:
case R_390_PLT24DBL:
case R_390_PLT32:
case R_390_PLT32DBL:
case R_390_PLT64:
case R_390_PLTOFF16:
case R_390_PLTOFF32:
case R_390_PLTOFF64:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_390_TLS_GOTIE20:
case R_390_TLS_IEENT:
sym.flags |= NEEDS_GOTTP;
break;
case R_390_TLS_GD32:
case R_390_TLS_GD64:
// We always want to relax calls to __tls_get_offset() in statically-
// linked executables because __tls_get_offset() in libc.a just calls
// abort().
if (ctx.arg.is_static ||
(ctx.arg.relax && !sym.is_imported && !ctx.arg.shared)) {
// do nothing
} else if (ctx.arg.relax && !sym.is_imported && ctx.arg.shared &&
!ctx.arg.z_dlopen) {
sym.flags |= NEEDS_GOTTP;
} else {
sym.flags |= NEEDS_TLSGD;
}
break;
case R_390_TLS_LDM32:
case R_390_TLS_LDM64: {
bool do_relax = ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared);
if (!do_relax)
ctx.needs_tlsld = true;
break;
}
case R_390_TLS_LE32:
case R_390_TLS_LE64:
check_tlsle(ctx, sym, rel);
break;
case R_390_TLS_LDO32:
case R_390_TLS_LDO64:
case R_390_TLS_GDCALL:
case R_390_TLS_LDCALL:
break;
default:
Fatal(ctx) << *this << ": scan_relocations: " << rel;
}
}
}
} // namespace mold::elf


@@ -1,355 +0,0 @@
// clang-format off
// SH-4 (SuperH 4) is a 32-bit RISC ISA developed by Hitachi in the early
// '90s. Some relatively powerful systems were developed with SH-4.
// A notable example is Sega's Dreamcast game console which debuted in 1998.
// Hitachi later spun off its semiconductor division as an independent
// company, Renesas, and Renesas is still selling SH-4 processors for the
// embedded market. It has never been as popular as ARM, though, and
// its popularity continues to decline.
//
// SH-4's most distinctive feature compared to other RISC ISAs is that its
// instructions are 16 bits in length instead of more common 32 bits for
// better code density. This difference affects various aspects of its
// instruction set as shown below:
//
// - SH-4 has 16 general-purpose registers (GPRs) instead of the more
// common 32-GPR configuration, saving one bit when specifying a register.
//
// - Binary instructions such as ADD normally take three registers in
// RISC ISAs (e.g. x ← y ⊕ z where x, y and z are registers), but
// SH-4's instructions take only two registers. The result of an
// operation is written to one of the source registers (e.g. x ← x ⊕ y).
//
// - Usual RISC ISAs have "load high" and "load low" instructions to set
// an immediate to most significant and least significant bits in a
// register to construct a full 32-bit value in a register. This
// technique is hard to use in SH-4, as 16 bit instructions are too
// small to contain large immediates. On SH-4, large immediates are
// loaded from memory using `mov.l` PC-relative load instruction.
//
// - Many RISC ISAs are, despite their name, actually fairly complex.
// They tend to have hundreds if not thousands of different instructions.
// SH-4 doesn't really have that many instructions because its 16-bit
// machine code simply can't encode many different opcodes. As a
// result, the number of relocations the linker has to support is also
// small.
//
// Besides these, SH-4 has a branch delay slot just like contemporary
// MIPS and SPARC. That is, the instruction after a branch instruction
// is always executed even if the branch is taken. The delay slot allows
// a pipelined CPU to start and finish executing an instruction after a
// branch regardless of the branch's condition, simplifying the
// processor's implementation. It's considered a bad premature
// optimization nowadays, though. Modern RISC processors don't have it.
//
// Here are notes about the SH-4 psABI:
//
// - If a source file is compiled with -fPIC, each function starts
// with a piece of code to store the address of .got to %r12.
// We can use the register in our PLT for position-independent output.
//
// - Even though it uses RELA-type relocations, relocation addends are
// stored not in the r_addend field but in the relocated section
// contents for some reason. Therefore, it's effectively REL.
//
// - It looks like the ecosystem has bit-rotted. Some tests, especially
// ones using C++ exceptions, don't pass even with GNU ld.
//
// - GCC/SH4 tends to write dynamically-relocated data into .text, so the
// output from the linker contains lots of text relocations. That's not
// a problem with embedded programming, I guess.
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = SH4;
// Even though SH-4 uses RELA-type relocations, addends are stored in
// the relocated places for some reason.
template <>
i64 get_addend(u8 *loc, const ElfRel<E> &rel) {
switch (rel.r_type) {
case R_SH_DIR32:
case R_SH_REL32:
case R_SH_TLS_GD_32:
case R_SH_TLS_LD_32:
case R_SH_TLS_LDO_32:
case R_SH_TLS_IE_32:
case R_SH_TLS_LE_32:
case R_SH_TLS_DTPMOD32:
case R_SH_TLS_DTPOFF32:
case R_SH_TLS_TPOFF32:
case R_SH_GOT32:
case R_SH_PLT32:
case R_SH_GOTOFF:
case R_SH_GOTPC:
case R_SH_GOTPLT32:
return *(ul32 *)loc;
default:
return 0;
}
}
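// For example (illustrative, not from this file), for an assembly
// directive such as
//
//   .long foo + 16   # emits R_SH_DIR32 against foo
//
// the constant 16 is stored at the relocated place itself, and the
// relocation's r_addend field is left unused.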
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
if (ctx.arg.pic) {
static const u8 insn[] = {
0x02, 0xd2, // mov.l 1f, r2
0xcc, 0x32, // add r12, r2
0x22, 0x50, // mov.l @(8, r2), r0
0x21, 0x52, // mov.l @(4, r2), r2
0x2b, 0x40, // jmp @r0
0x00, 0xe0, // mov #0, r0
0, 0, 0, 0, // 1: .long GOTPLT
};
static_assert(sizeof(insn) == E::plt_hdr_size);
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 12) = ctx.gotplt->shdr.sh_addr - ctx.got->shdr.sh_addr;
} else {
static const u8 insn[] = {
0x02, 0xd2, // mov.l 1f, r2
0x22, 0x50, // mov.l @(8, r2), r0
0x21, 0x52, // mov.l @(4, r2), r2
0x2b, 0x40, // jmp @r0
0x00, 0xe0, // mov #0, r0
0x09, 0x00, // nop
0, 0, 0, 0, // 1: .long GOTPLT
};
static_assert(sizeof(insn) == E::plt_hdr_size);
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 12) = ctx.gotplt->shdr.sh_addr;
}
}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
if (ctx.arg.pic) {
static const u8 insn[] = {
0x01, 0xd0, // mov.l 1f, r0
0xce, 0x00, // mov.l @(r0, r12), r0
0x2b, 0x40, // jmp @r0
0x01, 0xd1, // mov.l 2f, r1
0, 0, 0, 0, // 1: .long GOTPLT_ENTRY
0, 0, 0, 0, // 2: .long INDEX_IN_RELPLT
};
static_assert(sizeof(insn) == E::plt_size);
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 8) = sym.get_gotplt_addr(ctx) - ctx.got->shdr.sh_addr;
*(ul32 *)(buf + 12) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
} else {
static const u8 insn[] = {
0x01, 0xd0, // mov.l 1f, r0
0x02, 0x60, // mov.l @r0, r0
0x2b, 0x40, // jmp @r0
0x01, 0xd1, // mov.l 2f, r1
0, 0, 0, 0, // 1: .long GOTPLT_ENTRY
0, 0, 0, 0, // 2: .long INDEX_IN_RELPLT
};
static_assert(sizeof(insn) == E::plt_size);
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 8) = sym.get_gotplt_addr(ctx);
*(ul32 *)(buf + 12) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
}
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
if (ctx.arg.pic) {
static const u8 insn[] = {
0x01, 0xd0, // mov.l 1f, r0
0xce, 0x00, // mov.l @(r0, r12), r0
0x2b, 0x40, // jmp @r0
0x09, 0x00, // nop
0, 0, 0, 0, // 1: .long GOT_ENTRY
};
static_assert(sizeof(insn) == E::pltgot_size);
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 8) = sym.get_got_addr(ctx) - ctx.got->shdr.sh_addr;
} else {
static const u8 insn[] = {
0x01, 0xd0, // mov.l 1f, r0
0x02, 0x60, // mov.l @r0, r0
0x2b, 0x40, // jmp @r0
0x09, 0x00, // nop
0, 0, 0, 0, // 1: .long GOT_ENTRY
};
static_assert(sizeof(insn) == E::pltgot_size);
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 8) = sym.get_got_addr(ctx);
}
}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_SH_DIR32:
*(ul32 *)loc = val;
break;
case R_SH_REL32:
*(ul32 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
u64 S = sym.get_addr(ctx);
u64 A = get_addend(loc, rel);
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_SH_DIR32:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_SH_REL32:
case R_SH_PLT32:
*(ul32 *)loc = S + A - P;
break;
case R_SH_GOT32:
*(ul32 *)loc = G;
break;
case R_SH_GOTPC:
*(ul32 *)loc = GOT + A - P;
break;
case R_SH_GOTOFF:
*(ul32 *)loc = S + A - GOT;
break;
case R_SH_TLS_GD_32:
*(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
break;
case R_SH_TLS_LD_32:
*(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
break;
case R_SH_TLS_LDO_32:
*(ul32 *)loc = S + A - ctx.dtp_addr;
break;
case R_SH_TLS_IE_32:
*(ul32 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
break;
case R_SH_TLS_LE_32:
*(ul32 *)loc = S + A - ctx.tp_addr;
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : get_addend(loc, rel);
switch (rel.r_type) {
case R_SH_DIR32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
Error(ctx) << sym << ": GNU ifunc symbol is not supported on sh4";
switch (rel.r_type) {
case R_SH_DIR32:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_SH_REL32:
scan_pcrel(ctx, sym, rel);
break;
case R_SH_GOT32:
sym.flags |= NEEDS_GOT;
break;
case R_SH_PLT32:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_SH_TLS_GD_32:
sym.flags |= NEEDS_TLSGD;
break;
case R_SH_TLS_LD_32:
ctx.needs_tlsld = true;
break;
case R_SH_TLS_IE_32:
sym.flags |= NEEDS_GOTTP;
break;
case R_SH_TLS_LE_32:
check_tlsle(ctx, sym, rel);
break;
case R_SH_GOTPC:
case R_SH_GOTOFF:
case R_SH_TLS_LDO_32:
break;
default:
Fatal(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
} // namespace mold::elf


@@ -1,622 +0,0 @@
// clang-format off
// SPARC is a RISC ISA developed by Sun Microsystems.
//
// The byte order of the processor is big-endian. Anything larger than a
// byte is stored in the "reverse" order compared to little-endian
// processors such as x86-64.
//
// All instructions are 4 bytes long and aligned to 4-byte boundaries.
//
// A notable feature of SPARC is that, unlike other RISC ISAs, it
// doesn't need range extension thunks. That is because SPARC's CALL
// instruction contains a whopping 30-bit immediate. The processor
// scales it by 4 to extend it to 32 bits (this is doable because all
// instructions are aligned to 4-byte boundaries, so the least
// significant two bits are always zero). That means CALL's reach is
// PC ± 2 GiB, eliminating the need for range extension thunks. It comes
// at the cost that the CALL instruction alone takes 1/4th of the
// instruction encoding space, though.
//
// SPARC has 32 general-purpose registers. The CALL instruction saves
// the return address in %o7, which is an alias for %r15. The thread
// pointer is stored in %g7, which is %r7.
//
// SPARC does not have PC-relative load/store instructions. To access data
// in the position-independent manner, we usually first set the address of
// .got to, for example, %l7, with the following piece of code
//
// sethi %hi(. - _GLOBAL_OFFSET_TABLE_), %l7
// add %l7, %lo(. - _GLOBAL_OFFSET_TABLE_), %l7
// call __sparc_get_pc_thunk.l7
// nop
//
// where __sparc_get_pc_thunk.l7 is defined as
//
// retl
// add %o7, %l7, %l7
//
// SETHI and the following ADD materialize a 32-bit offset to .got. The
// CALL instruction sets the return address in %o7, and the subsequent
// ADD adds it to the GOT offset to materialize the absolute address of
// .got.
//
// Note that we have a NOP after CALL and an ADD after RETL because of
// SPARC's branch delay slots. That is, the SPARC processor always
// executes one instruction after a branch even if the branch is taken.
// This may seem like an odd behavior, and indeed it is considered as such
// (that's a premature optimization for the early pipelined SPARC
// processors), but that's been a part of the ISA's spec so that's what it
// is.
//
// Note also that the .got address obtained this way is not shared between
// functions, so functions can use an arbitrary register to hold the .got
// address. That also means each function needs to execute the above piece
// of code to become position-independent.
//
// This scheme is very similar to i386. That may not be a coincidence
// because the i386 ELF psABI is created by Sun Microsystems too.
//
// https://github.com/rui314/psabi/blob/main/sparc.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = SPARC64;
// SPARC's PLT section is writable despite containing executable code.
// We don't need to write the PLT header entry because the dynamic loader
// will do that for us.
//
// We also don't need a .got.plt section to store the result of lazy PLT
// symbol resolution because the dynamic symbol resolver directly mutates
// instructions in PLT so that they jump to the right places next time.
// That's why each PLT entry contains lots of NOPs; they are a placeholder
// for the runtime to add more instructions.
//
// Self-modifying code is nowadays considered really bad from the security
// point of view, though.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
memset(buf, 0, E::plt_hdr_size);
}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static ub32 insn[] = {
0x0300'0000, // sethi (. - .PLT0), %g1
0x3068'0000, // ba,a %xcc, .PLT1
0x0100'0000, // nop
0x0100'0000, // nop
0x0100'0000, // nop
0x0100'0000, // nop
0x0100'0000, // nop
0x0100'0000, // nop
};
u64 plt0 = ctx.plt->shdr.sh_addr;
u64 plt1 = ctx.plt->shdr.sh_addr + E::plt_size;
u64 entry = sym.get_plt_addr(ctx);
memcpy(buf, insn, sizeof(insn));
*(ub32 *)buf |= bits(entry - plt0, 21, 0);
*(ub32 *)(buf + 4) |= bits(plt1 - entry - 4, 20, 2);
}
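// Note that the branch displacement in `ba,a` is counted in 4-byte
// words, so bits(..., 20, 2) above drops the two always-zero low bits;
// the `- 4` is there because the displacement is relative to the branch
// instruction itself, which is the second word of the entry.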
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static ub32 entry[] = {
0x8a10'000f, // mov %o7, %g5
0x4000'0002, // call . + 8
0xc25b'e014, // ldx [ %o7 + 20 ], %g1
0xc25b'c001, // ldx [ %o7 + %g1 ], %g1
0x81c0'4000, // jmp %g1
0x9e10'0005, // mov %g5, %o7
0x0000'0000, // .quad $got_entry - $plt_entry - 4
0x0000'0000,
};
memcpy(buf, entry, sizeof(entry));
*(ub64 *)(buf + 24) = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 4;
}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_SPARC_64:
case R_SPARC_UA64:
*(ub64 *)loc = val;
break;
case R_SPARC_DISP32:
*(ub32 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
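// Standard psABI notation: S is the symbol value, A the addend, P the
// place (address) being relocated, G the symbol's offset within the
// GOT, and GOT the address of .got itself.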
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = (get_addr() + rel.r_offset);
u64 G = (sym.get_got_idx(ctx) * sizeof(Word<E>));
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_SPARC_64:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_SPARC_5:
check(S + A, 0, 1 << 5);
*(ub32 *)loc |= bits(S + A, 4, 0);
break;
case R_SPARC_6:
check(S + A, 0, 1 << 6);
*(ub32 *)loc |= bits(S + A, 5, 0);
break;
case R_SPARC_7:
check(S + A, 0, 1 << 7);
*(ub32 *)loc |= bits(S + A, 6, 0);
break;
case R_SPARC_8:
check(S + A, 0, 1 << 8);
*(u8 *)loc = S + A;
break;
case R_SPARC_10:
check(S + A, 0, 1 << 10);
*(ub32 *)loc |= bits(S + A, 9, 0);
break;
case R_SPARC_LO10:
case R_SPARC_LOPLT10:
*(ub32 *)loc |= bits(S + A, 9, 0);
break;
case R_SPARC_11:
check(S + A, 0, 1 << 11);
*(ub32 *)loc |= bits(S + A, 10, 0);
break;
case R_SPARC_13:
check(S + A, 0, 1 << 13);
*(ub32 *)loc |= bits(S + A, 12, 0);
break;
case R_SPARC_16:
case R_SPARC_UA16:
check(S + A, 0, 1 << 16);
*(ub16 *)loc = S + A;
break;
case R_SPARC_22:
check(S + A, 0, 1 << 22);
*(ub32 *)loc |= bits(S + A, 21, 0);
break;
case R_SPARC_32:
case R_SPARC_UA32:
case R_SPARC_PLT32:
check(S + A, 0, 1LL << 32);
*(ub32 *)loc = S + A;
break;
case R_SPARC_PLT64:
case R_SPARC_UA64:
case R_SPARC_REGISTER:
*(ub64 *)loc = S + A;
break;
case R_SPARC_DISP8:
check(S + A - P, -(1 << 7), 1 << 7);
*(u8 *)loc = S + A - P;
break;
case R_SPARC_DISP16:
check(S + A - P, -(1 << 15), 1 << 15);
*(ub16 *)loc = S + A - P;
break;
case R_SPARC_DISP32:
case R_SPARC_PCPLT32:
check(S + A - P, -(1LL << 31), 1LL << 31);
*(ub32 *)loc = S + A - P;
break;
case R_SPARC_DISP64:
*(ub64 *)loc = S + A - P;
break;
case R_SPARC_WDISP16: {
i64 val = S + A - P;
check(val, -(1 << 16), 1 << 16);
*(ub32 *)loc |= (bit(val, 16) << 21) | bits(val, 15, 2);
break;
}
case R_SPARC_WDISP19:
check(S + A - P, -(1 << 20), 1 << 20);
*(ub32 *)loc |= bits(S + A - P, 20, 2);
break;
case R_SPARC_WDISP22:
check(S + A - P, -(1 << 23), 1 << 23);
*(ub32 *)loc |= bits(S + A - P, 23, 2);
break;
case R_SPARC_WDISP30:
case R_SPARC_WPLT30:
check(S + A - P, -(1LL << 31), 1LL << 31);
*(ub32 *)loc |= bits(S + A - P, 31, 2);
break;
case R_SPARC_HI22:
case R_SPARC_HIPLT22:
case R_SPARC_LM22:
*(ub32 *)loc |= bits(S + A, 31, 10);
break;
case R_SPARC_GOT10:
*(ub32 *)loc |= bits(G, 9, 0);
break;
case R_SPARC_GOT13:
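// The 13-bit immediate field is sign-extended, so only 12 bits are
// usable for a non-negative GOT offset.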
check(G, 0, 1 << 12);
*(ub32 *)loc |= bits(G, 12, 0);
break;
case R_SPARC_GOT22:
*(ub32 *)loc |= bits(G, 31, 10);
break;
case R_SPARC_GOTDATA_HIX22: {
i64 val = S + A - GOT;
*(ub32 *)loc |= bits(val < 0 ? ~val : val, 31, 10);
break;
}
case R_SPARC_GOTDATA_LOX10: {
i64 val = S + A - GOT;
*(ub32 *)loc |= bits(val, 9, 0) | (val < 0 ? 0b1'1100'0000'0000 : 0);
break;
}
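// For the HIX22/LOX10-style pairs here and below: if the value is
// negative, SETHI materializes the complemented upper bits, and the
// paired XOR's 13-bit immediate has its top three bits set (that's the
// 0b1'1100'0000'0000 constant), so sign extension fills all the bits
// above bit 9 with ones. XOR'ing the two thus reconstructs a correctly
// sign-extended value.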
case R_SPARC_GOTDATA_OP_HIX22:
// We always have to relax a GOT load to a load immediate if a
// symbol is local, because R_SPARC_GOTDATA_OP cannot represent
// an addend for a local symbol.
if (sym.is_imported || sym.is_ifunc()) {
*(ub32 *)loc |= bits(G, 31, 10);
} else if (sym.is_absolute()) {
i64 val = S + A;
*(ub32 *)loc |= bits(val < 0 ? ~val : val, 31, 10);
} else {
i64 val = S + A - GOT;
*(ub32 *)loc |= bits(val < 0 ? ~val : val, 31, 10);
}
break;
case R_SPARC_GOTDATA_OP_LOX10: {
if (sym.is_imported || sym.is_ifunc()) {
*(ub32 *)loc |= bits(G, 9, 0);
} else if (sym.is_absolute()) {
i64 val = S + A;
*(ub32 *)loc |= bits(val, 9, 0) | (val < 0 ? 0b1'1100'0000'0000 : 0);
} else {
i64 val = S + A - GOT;
*(ub32 *)loc |= bits(val, 9, 0) | (val < 0 ? 0b1'1100'0000'0000 : 0);
}
break;
}
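// The GOTDATA_OP* relocations annotate a three-instruction GOT load such
// as
//
//   sethi %gdop_hix22(x), %g1
//   xor %g1, %gdop_lox10(x), %g1
//   ldx [ %g2 + %g1 ], %g1 ! tagged with R_SPARC_GOTDATA_OP
//
// where %g2 holds the GOT address. For a non-imported symbol, the two
// cases above have already materialized the GOT-relative or absolute
// value directly, so the LDX can be relaxed to an ADD or a NOP below.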
case R_SPARC_GOTDATA_OP:
if (sym.is_imported || sym.is_ifunc())
break;
if (sym.is_absolute()) {
// ldx [ %g2 + %g1 ], %g1 → nop
*(ub32 *)loc = 0x0100'0000;
} else {
// ldx [ %g2 + %g1 ], %g1 → add %g2, %g1, %g1
*(ub32 *)loc &= 0b00'11111'000000'11111'1'11111111'11111;
*(ub32 *)loc |= 0b10'00000'000000'00000'0'00000000'00000;
}
break;
case R_SPARC_PC10:
case R_SPARC_PCPLT10:
*(ub32 *)loc |= bits(S + A - P, 9, 0);
break;
case R_SPARC_PC22:
case R_SPARC_PCPLT22:
case R_SPARC_PC_LM22:
*(ub32 *)loc |= bits(S + A - P, 31, 10);
break;
case R_SPARC_OLO10:
*(ub32 *)loc |= bits(bits(S + A, 9, 0) + rel.r_type_data, 12, 0);
break;
case R_SPARC_HH22:
*(ub32 *)loc |= bits(S + A, 63, 42);
break;
case R_SPARC_HM10:
*(ub32 *)loc |= bits(S + A, 41, 32);
break;
case R_SPARC_PC_HH22:
*(ub32 *)loc |= bits(S + A - P, 63, 42);
break;
case R_SPARC_PC_HM10:
*(ub32 *)loc |= bits(S + A - P, 41, 32);
break;
case R_SPARC_HIX22:
*(ub32 *)loc |= bits(~(S + A), 31, 10);
break;
case R_SPARC_LOX10:
*(ub32 *)loc |= bits(S + A, 9, 0) | 0b1'1100'0000'0000;
break;
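// H44/M44/L44 materialize a 44-bit absolute address in three pieces of
// 22, 10, and 12 bits, corresponding to SPARC's medium code model.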
case R_SPARC_H44:
*(ub32 *)loc |= bits(S + A, 43, 22);
break;
case R_SPARC_M44:
*(ub32 *)loc |= bits(S + A, 21, 12);
break;
case R_SPARC_L44:
*(ub32 *)loc |= bits(S + A, 11, 0);
break;
case R_SPARC_TLS_GD_HI22:
*(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 31, 10);
break;
case R_SPARC_TLS_GD_LO10:
*(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 9, 0);
break;
case R_SPARC_TLS_GD_CALL:
case R_SPARC_TLS_LDM_CALL: {
u64 addr;
if (ctx.arg.is_static)
addr = ctx.extra.tls_get_addr_sec->shdr.sh_addr;
else
addr = ctx.extra.tls_get_addr_sym->get_addr(ctx);
*(ub32 *)loc |= bits(addr + A - P, 31, 2);
break;
}
case R_SPARC_TLS_LDM_HI22:
*(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 31, 10);
break;
case R_SPARC_TLS_LDM_LO10:
*(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 9, 0);
break;
case R_SPARC_TLS_LDO_HIX22:
*(ub32 *)loc |= bits(S + A - ctx.dtp_addr, 31, 10);
break;
case R_SPARC_TLS_LDO_LOX10:
*(ub32 *)loc |= bits(S + A - ctx.dtp_addr, 9, 0);
break;
case R_SPARC_TLS_IE_HI22:
*(ub32 *)loc |= bits(sym.get_gottp_addr(ctx) + A - GOT, 31, 10);
break;
case R_SPARC_TLS_IE_LO10:
*(ub32 *)loc |= bits(sym.get_gottp_addr(ctx) + A - GOT, 9, 0);
break;
case R_SPARC_TLS_LE_HIX22:
*(ub32 *)loc |= bits(~(S + A - ctx.tp_addr), 31, 10);
break;
case R_SPARC_TLS_LE_LOX10:
*(ub32 *)loc |= bits(S + A - ctx.tp_addr, 9, 0) | 0b1'1100'0000'0000;
break;
case R_SPARC_SIZE32:
*(ub32 *)loc = sym.esym().st_size + A;
break;
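// The following relocation types merely tag instructions that a linker
// could rewrite under TLS relaxation. We don't rewrite them, so there's
// nothing to do here.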
case R_SPARC_TLS_GD_ADD:
case R_SPARC_TLS_LDM_ADD:
case R_SPARC_TLS_LDO_ADD:
case R_SPARC_TLS_IE_LD:
case R_SPARC_TLS_IE_LDX:
case R_SPARC_TLS_IE_ADD:
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_SPARC_64:
case R_SPARC_UA64:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ub64 *)loc = *val;
else
*(ub64 *)loc = S + A;
break;
case R_SPARC_32:
case R_SPARC_UA32: {
i64 val = S + A;
check(val, 0, 1LL << 32);
*(ub32 *)loc = val;
break;
}
case R_SPARC_TLS_DTPOFF32:
*(ub32 *)loc = S + A - ctx.dtp_addr;
break;
case R_SPARC_TLS_DTPOFF64:
*(ub64 *)loc = S + A - ctx.dtp_addr;
break;
default:
Fatal(ctx) << *this << ": apply_reloc_nonalloc: " << rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_SPARC_64:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_SPARC_8:
case R_SPARC_5:
case R_SPARC_6:
case R_SPARC_7:
case R_SPARC_10:
case R_SPARC_11:
case R_SPARC_13:
case R_SPARC_16:
case R_SPARC_22:
case R_SPARC_32:
case R_SPARC_REGISTER:
case R_SPARC_UA16:
case R_SPARC_UA32:
case R_SPARC_UA64:
case R_SPARC_PC_HM10:
case R_SPARC_OLO10:
case R_SPARC_LOX10:
case R_SPARC_HM10:
case R_SPARC_M44:
case R_SPARC_HIX22:
case R_SPARC_LO10:
case R_SPARC_L44:
case R_SPARC_LM22:
case R_SPARC_HI22:
case R_SPARC_H44:
case R_SPARC_HH22:
scan_absrel(ctx, sym, rel);
break;
case R_SPARC_PLT32:
case R_SPARC_WPLT30:
case R_SPARC_WDISP30:
case R_SPARC_HIPLT22:
case R_SPARC_LOPLT10:
case R_SPARC_PCPLT32:
case R_SPARC_PCPLT22:
case R_SPARC_PCPLT10:
case R_SPARC_PLT64:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_SPARC_GOT13:
case R_SPARC_GOT10:
case R_SPARC_GOT22:
case R_SPARC_GOTDATA_HIX22:
sym.flags |= NEEDS_GOT;
break;
case R_SPARC_GOTDATA_OP_HIX22:
if (sym.is_imported)
sym.flags |= NEEDS_GOT;
break;
case R_SPARC_DISP16:
case R_SPARC_DISP32:
case R_SPARC_DISP64:
case R_SPARC_DISP8:
case R_SPARC_PC10:
case R_SPARC_PC22:
case R_SPARC_PC_LM22:
case R_SPARC_WDISP16:
case R_SPARC_WDISP19:
case R_SPARC_WDISP22:
case R_SPARC_PC_HH22:
scan_pcrel(ctx, sym, rel);
break;
case R_SPARC_TLS_GD_HI22:
sym.flags |= NEEDS_TLSGD;
break;
case R_SPARC_TLS_LDM_HI22:
ctx.needs_tlsld = true;
break;
case R_SPARC_TLS_IE_HI22:
sym.flags |= NEEDS_GOTTP;
break;
case R_SPARC_TLS_GD_CALL:
case R_SPARC_TLS_LDM_CALL:
if (!ctx.arg.is_static && ctx.extra.tls_get_addr_sym->is_imported)
ctx.extra.tls_get_addr_sym->flags |= NEEDS_PLT;
break;
case R_SPARC_TLS_LE_HIX22:
case R_SPARC_TLS_LE_LOX10:
check_tlsle(ctx, sym, rel);
break;
case R_SPARC_GOTDATA_OP_LOX10:
case R_SPARC_GOTDATA_OP:
case R_SPARC_GOTDATA_LOX10:
case R_SPARC_TLS_GD_LO10:
case R_SPARC_TLS_GD_ADD:
case R_SPARC_TLS_LDM_LO10:
case R_SPARC_TLS_LDM_ADD:
case R_SPARC_TLS_LDO_HIX22:
case R_SPARC_TLS_LDO_LOX10:
case R_SPARC_TLS_LDO_ADD:
case R_SPARC_TLS_IE_ADD:
case R_SPARC_TLS_IE_LD:
case R_SPARC_TLS_IE_LDX:
case R_SPARC_TLS_IE_LO10:
case R_SPARC_SIZE32:
break;
default:
Fatal(ctx) << *this << ": scan_relocations: " << rel;
}
}
}
// __tls_get_addr is not defined by libc.a, so we can't use that function
// in statically-linked executables. This section provides a replacement.
void SparcTlsGetAddrSection::copy_buf(Context<E> &ctx) {
ub32 *buf = (ub32 *)(ctx.buf + this->shdr.sh_offset);
static const ub32 insn[] = {
0x0300'0000, // sethi %hi(TP_SIZE), %g1
0x8210'6000, // or %g1, %lo(TP_SIZE), %g1
0x8221'c001, // sub %g7, %g1, %g1
0xd05a'2008, // ldx [ %o0 + 8 ], %o0
0x81c3'e008, // retl
0x9000'4008, // add %g1, %o0, %o0
};
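// %g7 holds the thread pointer. Subtracting TP_SIZE (patched in below as
// tp_addr - tls_begin) yields the start of the static TLS block, and the
// offset loaded from the second word of the tls_index object passed in
// %o0 is then added to form the variable's address.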
assert(this->shdr.sh_size == sizeof(insn));
memcpy(buf, insn, sizeof(insn));
buf[0] |= bits(ctx.tp_addr - ctx.tls_begin, 31, 10);
buf[1] |= bits(ctx.tp_addr - ctx.tls_begin, 9, 0);
}
} // namespace mold::elf

View file

@@ -1,6 +1,6 @@
// clang-format off
#include "third_party/mold/elf/mold.h"
// MISSING #include "../common/cmdline.h"
#include "third_party/mold/cmdline.h"
#include "third_party/libcxx/regex"
#include "third_party/libcxx/sstream"
@@ -36,7 +36,6 @@
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/ok.h"
#include "libc/time/time.h"
#include "third_party/getopt/getopt.internal.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/lockf.h"
#endif

View file

@@ -1,7 +1,7 @@
// clang-format off
#pragma once
// MISSING #include "../common/integers.h"
#include "third_party/mold/integers.h"
#include "third_party/libcxx/ostream"
#include "third_party/libcxx/string"

View file

@@ -1,9 +1,8 @@
// clang-format off
#include "third_party/mold/elf/mold.h"
// MISSING #include "../common/archive-file.h"
// MISSING #include "../common/cmdline.h"
// MISSING #include "../common/output-file.h"
#include "third_party/mold/archive-file.h"
#include "third_party/mold/cmdline.h"
#include "third_party/mold/output-file.h"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/functional"
#include "third_party/libcxx/iomanip"

View file

@@ -2,7 +2,7 @@
#pragma once
#include "third_party/mold/elf/elf.h"
// MISSING #include "../common/common.h"
#include "third_party/mold/common.h"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/bitset"
@@ -15,16 +15,19 @@
#include "third_party/libcxx/memory"
#include "third_party/libcxx/mutex"
#include "third_party/libcxx/optional"
// MISSING #include <span>
#include "third_party/libcxx/span"
#include "third_party/libcxx/sstream"
#include "third_party/libcxx/string"
#include "third_party/libcxx/string_view"
#include "third_party/mold/fake_tbb.h"
// MISSING #include <tbb/concurrent_hash_map.h>
// MISSING #include <tbb/concurrent_unordered_map.h>
// MISSING #include <tbb/concurrent_vector.h>
// MISSING #include <tbb/enumerable_thread_specific.h>
// MISSING #include <tbb/spin_mutex.h>
// MISSING #include <tbb/task_group.h>
#include "third_party/libcxx/type_traits"
#include "third_party/libcxx/unordered_map"
#include "third_party/libcxx/unordered_set"
@@ -42,7 +45,6 @@
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/ok.h"
#include "libc/time/time.h"
#include "third_party/getopt/getopt.internal.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/lockf.h"
#endif

View file

@@ -6,6 +6,16 @@ namespace tbb {
template <typename T>
using concurrent_vector = std::vector<T>;
template <
class Key,
class T,
class Hash = std::hash<Key>,
class KeyEqual = std::equal_to<Key>,
class Allocator = std::allocator< std::pair<const Key, T> > >
using concurrent_hash_map = std::unordered_map<Key, T, Hash, KeyEqual, Allocator>;
using spin_mutex = std::mutex;
template<typename InputIterator, typename Function>
void parallel_for_each(InputIterator first, InputIterator last, const Function& f) {
}
@@ -22,5 +32,35 @@ namespace tbb {
void parallel_for(Index first, Index last, const Function& f) {
}
enum task_group_status {
not_complete,
complete,
canceled
};
class task_group {
public:
task_group() {};
~task_group() {};
template<typename Func>
void run( Func&& f ) {
};
template<typename Func>
task_group_status run_and_wait( const Func& f ) {
return task_group_status::complete;
};
task_group_status wait() {
return task_group_status::complete;
};
void cancel() {
};
};
}
#endif

third_party/mold/git-hash.cc vendored Normal file
View file

@@ -0,0 +1,5 @@
#include "third_party/libcxx/string"
namespace mold {
std::string mold_git_hash = "d4d93d7fb72dd19c44aafa4dd5397e35787d33ad";
}

View file

@@ -5,9 +5,10 @@
// For more info, read
// https://engineering.fb.com/2018/12/13/data-infrastructure/hyperloglog
// TODO(fzakaria): changed from libcxx because pow symbol wasn't present.
#include "third_party/mold/common.h"
#include "third_party/libcxx/cmath"
#include "libc/math.h"
namespace mold {

View file

@@ -6,7 +6,7 @@ PKGS += THIRD_PARTY_MOLD
THIRD_PARTY_MOLD_ARTIFACTS += THIRD_PARTY_MOLD_A
THIRD_PARTY_MOLD = $(THIRD_PARTY_MOLD_A_DEPS) $(THIRD_PARTY_MOLD_A)
THIRD_PARTY_MOLD_A = o/$(MODE)/third_party/mold/mold.a
THIRD_PARTY_MOLD_FILES := $(wildcard third_party/mold/*)
THIRD_PARTY_MOLD_FILES := $(wildcard third_party/mold/*) $(wildcard third_party/mold/elf/*)
THIRD_PARTY_MOLD_HDRS = $(filter %.h,$(THIRD_PARTY_MOLD_FILES))
THIRD_PARTY_MOLD_SRCS = $(filter %.cc,$(THIRD_PARTY_MOLD_FILES))
THIRD_PARTY_MOLD_OBJS = $(THIRD_PARTY_MOLD_SRCS:%.cc=o/$(MODE)/%.o)
@@ -16,6 +16,9 @@ THIRD_PARTY_MOLD_A_DIRECTDEPS = \
LIBC_STR \
LIBC_INTRIN \
LIBC_STDIO \
LIBC_CALLS \
LIBC_TINYMATH \
LIBC_SYSV \
LIBC_RUNTIME \
THIRD_PARTY_ZSTD \
THIRD_PARTY_XXHASH \
@@ -35,6 +38,8 @@ $(THIRD_PARTY_MOLD_OBJS): private \
-fno-asynchronous-unwind-tables \
-Wno-sign-compare \
-Wno-unused-function \
-DMOLD_X86_64=1 \
-DMOLD_TARGET=X86_64
THIRD_PARTY_MOLD_CHECKS = \
$(THIRD_PARTY_MOLD_A).pkg \