This commit is contained in:
Farid Zakaria 2023-06-26 16:32:01 +00:00
parent 95fbdb4f76
commit 29b8816f1f
391 changed files with 41166 additions and 6 deletions

View file

@ -781,7 +781,7 @@ bool __cxx_atomic_compare_exchange_strong(
}
template <typename _Tp>
_LIBCPP_INLINE_VISIBILITY
_LIBCPP_INLINE_VISIBILITY inline
bool __cxx_atomic_compare_exchange_strong(
__cxx_atomic_base_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order __success,
memory_order __failure) {
@ -835,7 +835,7 @@ _Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_base_impl<_Tp>* __a,
}
template <typename _Tp, typename _Td>
_LIBCPP_INLINE_VISIBILITY
_LIBCPP_INLINE_VISIBILITY inline
_Tp __cxx_atomic_fetch_add(__cxx_atomic_base_impl<_Tp>* __a, _Td __delta,
memory_order __order) {
return __atomic_fetch_add(&__a->__a_value, __delta * __skip_amt<_Tp>::value,
@ -851,7 +851,7 @@ _Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_base_impl<_Tp>* __a,
}
template <typename _Tp, typename _Td>
_LIBCPP_INLINE_VISIBILITY
_LIBCPP_INLINE_VISIBILITY inline
_Tp __cxx_atomic_fetch_sub(__cxx_atomic_base_impl<_Tp>* __a, _Td __delta,
memory_order __order) {
return __atomic_fetch_sub(&__a->__a_value, __delta * __skip_amt<_Tp>::value,
@ -867,7 +867,7 @@ _Tp __cxx_atomic_fetch_and(volatile __cxx_atomic_base_impl<_Tp>* __a,
}
template <typename _Tp>
_LIBCPP_INLINE_VISIBILITY
_LIBCPP_INLINE_VISIBILITY inline
_Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp>* __a,
_Tp __pattern, memory_order __order) {
return __atomic_fetch_and(&__a->__a_value, __pattern,
@ -875,7 +875,7 @@ _Tp __cxx_atomic_fetch_and(__cxx_atomic_base_impl<_Tp>* __a,
}
template <typename _Tp>
_LIBCPP_INLINE_VISIBILITY
_LIBCPP_INLINE_VISIBILITY inline
_Tp __cxx_atomic_fetch_or(volatile __cxx_atomic_base_impl<_Tp>* __a,
_Tp __pattern, memory_order __order) {
return __atomic_fetch_or(&__a->__a_value, __pattern,
@ -883,7 +883,7 @@ _Tp __cxx_atomic_fetch_or(volatile __cxx_atomic_base_impl<_Tp>* __a,
}
template <typename _Tp>
_LIBCPP_INLINE_VISIBILITY
_LIBCPP_INLINE_VISIBILITY inline
_Tp __cxx_atomic_fetch_or(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern,
memory_order __order) {
return __atomic_fetch_or(&__a->__a_value, __pattern,

View file

@ -110,6 +110,7 @@ THIRD_PARTY_LIBCXX_A_HDRS = \
third_party/libcxx/refstring.hh \
third_party/libcxx/regex \
third_party/libcxx/scoped_allocator \
third_party/libcxx/set \
third_party/libcxx/span \
third_party/libcxx/sstream \

View file

@ -1,5 +1,8 @@
// -*- C++ -*-
// clang-format off
//===------------------------------ span ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
@ -130,10 +133,17 @@ template<class Container>
*/
#include "third_party/libcxx/__config"
#include "third_party/libcxx/iterator" // for iterators
#include "third_party/libcxx/array" // for array
#include "third_party/libcxx/type_traits" // for remove_cv, etc
#include "third_party/libcxx/cstddef" // for byte
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
#pragma GCC system_header
@ -588,4 +598,8 @@ template<class _Container>
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP_SPAN

18
third_party/mold/README.cosmo vendored Normal file
View file

@ -0,0 +1,18 @@
DESCRIPTION
Mold: A Modern Linker 🦠
mold is a faster drop-in replacement for existing Unix linkers.
It is several times quicker than the LLVM lld linker, the second-fastest open-source linker,
which I initially developed a few years ago. mold aims to enhance developer productivity by minimizing build time,
particularly in rapid debug-edit-rebuild cycles.
SOURCE
https://github.com/rui314/mold
commit d4d93d7fb72dd19c44aafa4dd5397e35787d33ad
Author: Rui Ueyama <ruiu@bluewhale.systems>
Date: Mon Jun 19 12:35:20 2023 +0900
Format

178
third_party/mold/archive-file.h vendored Normal file
View file

@ -0,0 +1,178 @@
// clang-format off
// This file contains functions to read an archive file (.a file).
// An archive file is just a bundle of object files. It's similar to
// tar or zip, but the contents are not compressed.
//
// An archive file is either "regular" or "thin". A regular archive
// contains object files directly, while a thin archive contains only
// pathnames. In the latter case, actual file contents have to be read
// from given pathnames. A regular archive is sometimes called "fat"
// archive as opposed to "thin".
//
// If an archive file is given to the linker, the linker pulls out
// object files that are needed to resolve undefined symbols. So,
// bunding object files as an archive and giving that archive to the
// linker has a different meaning than directly giving the same set of
// object files to the linker. The former links only needed object
// files, while the latter links all the given object files.
//
// Therefore, if you link libc.a for example, not all the libc
// functions are linked to your binary. Instead, only object files
// that provides functions and variables used in your program get
// linked. To make this efficient, static library functions are
// usually separated to each object file in an archive file. You can
// see the contents of libc.a by running `ar t
// /usr/lib/x86_64-linux-gnu/libc.a`.
#pragma once
#include "third_party/mold/common.h"
#include "third_party/mold/filetype.h"
namespace mold {
// On-disk header preceding each member of a Unix ar(1) archive.
// All fields are fixed-width ASCII; numeric fields hold decimal text.
struct ArHdr {
  char ar_name[16];
  char ar_date[12];
  char ar_uid[6];
  char ar_gid[6];
  char ar_mode[8];
  char ar_size[10];
  char ar_fmag[2];

  // Returns true if the name field begins with the given prefix.
  bool starts_with(std::string_view s) const {
    return std::string_view(ar_name, s.size()) == s;
  }

  // "// " marks the SysV long-filename string table member.
  bool is_strtab() const {
    return starts_with("// ");
  }

  // "/ " and "/SYM64/ " mark the archive symbol table members.
  bool is_symtab() const {
    return starts_with("/ ") || starts_with("/SYM64/ ");
  }

  // Decodes this member's filename. `strtab` is the SysV long-name
  // string table; `ptr` points just past the header and is advanced
  // past an inline BSD-style name if one is present.
  std::string read_name(std::string_view strtab, u8 *&ptr) const {
    // BSD-style long filename: "#1/<len>", with the name stored inline
    // right after the header.
    if (starts_with("#1/")) {
      int namelen = atoi(ar_name + 3);
      std::string name{(char *)ptr, (size_t)namelen};
      ptr += namelen;

      // Trim NUL padding. NOTE(review): a name whose first byte is NUL
      // makes find() return 0 (falsy) and skips the trim — confirm such
      // names cannot occur in practice.
      if (size_t pos = name.find('\0'))
        name = name.substr(0, pos);
      return name;
    }

    // SysV-style long filename: "/<offset>" into the string table,
    // where each entry is terminated by "/\n".
    if (starts_with("/")) {
      const char *start = strtab.data() + atoi(ar_name + 1);
      return {start, (const char *)strstr(start, "/\n")};
    }

    // Short filename stored directly in ar_name, '/'-terminated.
    if (const char *end = (char *)memchr(ar_name, '/', sizeof(ar_name)))
      return {ar_name, end};
    return {ar_name, sizeof(ar_name)};
  }
};
// Reads a thin archive, in which each member is a pathname rather than
// file contents. Every referenced file is opened and returned.
template <typename Context, typename MappedFile>
std::vector<MappedFile *>
read_thin_archive_members(Context &ctx, MappedFile *mf) {
  u8 *begin = mf->data;
  // Skip the 8-byte archive magic.
  u8 *data = begin + 8;
  std::vector<MappedFile *> vec;
  std::string_view strtab;

  while (data < begin + mf->size) {
    // Each header is aligned to a 2 byte boundary.
    if ((begin - data) % 2)
      data++;

    ArHdr &hdr = *(ArHdr *)data;
    u8 *body = data + sizeof(hdr);
    u64 size = atol(hdr.ar_size);

    // Read a string table.
    if (hdr.is_strtab()) {
      strtab = {(char *)body, (size_t)size};
      data = body + size;
      continue;
    }

    // Skip a symbol table.
    if (hdr.is_symtab()) {
      data = body + size;
      continue;
    }

    // Thin archives must store every member name as a long filename.
    if (!hdr.starts_with("#1/") && !hdr.starts_with("/"))
      Fatal(ctx) << mf->name << ": filename is not stored as a long filename";

    std::string name = hdr.read_name(strtab, body);

    // Skip if symbol table. NOTE(review): this `continue` does not
    // advance `data`, so a "__.SYMDEF" member reaching this point would
    // loop forever — verify against upstream mold.
    if (name == "__.SYMDEF" || name == "__.SYMDEF SORTED")
      continue;

    // Resolve a relative member pathname against the archive's directory.
    std::string path = name.starts_with('/') ?
      name : (filepath(mf->name).parent_path() / name).string();
    vec.push_back(MappedFile::must_open(ctx, path));
    vec.back()->thin_parent = mf;

    // read_name() advanced `body` past an inline BSD-style name, so the
    // next header starts at `body`.
    data = body;
  }
  return vec;
}
// Reads a regular ("fat") archive, in which member contents are stored
// inline. Members are returned as slices of `mf` (no extra I/O).
template <typename Context, typename MappedFile>
std::vector<MappedFile *> read_fat_archive_members(Context &ctx, MappedFile *mf) {
  u8 *begin = mf->data;
  // Skip the 8-byte archive magic.
  u8 *data = begin + 8;
  std::vector<MappedFile *> vec;
  std::string_view strtab;

  // Stop once fewer than two bytes (the alignment unit) remain.
  while (begin + mf->size - data >= 2) {
    // Each header is aligned to a 2 byte boundary.
    if ((begin - data) % 2)
      data++;

    ArHdr &hdr = *(ArHdr *)data;
    u8 *body = data + sizeof(hdr);
    u64 size = atol(hdr.ar_size);
    // Advance the cursor past this member's contents.
    data = body + size;

    // Read if string table
    if (hdr.is_strtab()) {
      strtab = {(char *)body, (size_t)size};
      continue;
    }

    // Skip if symbol table
    if (hdr.is_symtab())
      continue;

    // Read the name field
    std::string name = hdr.read_name(strtab, body);

    // Skip if symbol table
    if (name == "__.SYMDEF" || name == "__.SYMDEF SORTED")
      continue;

    // The member's contents span [body, data); record them as a slice.
    vec.push_back(mf->slice(ctx, name, body - begin, data - body));
  }
  return vec;
}
template <typename Context, typename MappedFile>
std::vector<MappedFile *> read_archive_members(Context &ctx, MappedFile *mf) {
switch (get_file_type(ctx, mf)) {
case FileType::AR:
return read_fat_archive_members(ctx, mf);
case FileType::THIN_AR:
return read_thin_archive_members(ctx, mf);
default:
unreachable();
}
}
} // namespace mold

91
third_party/mold/cmdline.h vendored Normal file
View file

@ -0,0 +1,91 @@
// clang-format off
#pragma once
#include "third_party/mold/common.h"
namespace mold {
// Reads a linker "response file" and splits its contents into
// shell-like arguments. Single quotes, double quotes, and backslash
// escapes are honored. The returned views are owned by ctx's string
// pool (via save_string), not by the mapped file.
template <typename Context>
std::vector<std::string_view>
read_response_file(Context &ctx, std::string_view path) {
  std::vector<std::string_view> vec;
  MappedFile<Context> *mf = MappedFile<Context>::must_open(ctx, std::string(path));
  u8 *data = mf->data;

  // Consumes a quoted token starting at `i` (just past the opening
  // quote); returns the index just past the closing quote.
  auto read_quoted = [&](i64 i, char quote) {
    std::string buf;
    while (i < mf->size && data[i] != quote) {
      if (data[i] == '\\') {
        // Backslash escapes the next byte, including the quote char.
        buf.append(1, data[i + 1]);
        i += 2;
      } else {
        buf.append(1, data[i++]);
      }
    }
    // A missing closing quote is a hard error.
    if (i >= mf->size)
      Fatal(ctx) << path << ": premature end of input";
    vec.push_back(save_string(ctx, buf));
    return i + 1;
  };

  // Consumes an unquoted token starting at `i`; whitespace ends it.
  auto read_unquoted = [&](i64 i) {
    std::string buf;
    while (i < mf->size) {
      if (data[i] == '\\' && i + 1 < mf->size) {
        buf.append(1, data[i + 1]);
        i += 2;
        continue;
      }
      if (!isspace(data[i])) {
        buf.append(1, data[i++]);
        continue;
      }
      break;
    }
    vec.push_back(save_string(ctx, buf));
    return i;
  };

  // Tokenize the whole file.
  for (i64 i = 0; i < mf->size;) {
    if (isspace(data[i]))
      i++;
    else if (data[i] == '\'')
      i = read_quoted(i + 1, '\'');
    else if (data[i] == '\"')
      i = read_quoted(i + 1, '\"');
    else
      i = read_unquoted(i);
  }
  return vec;
}
// Replace "@path/to/some/text/file" with its file contents.
template <typename Context>
std::vector<std::string_view> expand_response_files(Context &ctx, char **argv) {
std::vector<std::string_view> vec;
for (i64 i = 0; argv[i]; i++) {
if (argv[i][0] == '@')
append(vec, read_response_file(ctx, argv[i] + 1));
else
vec.push_back(argv[i]);
}
return vec;
}
// Returns `str` with leading and trailing spaces and tabs removed.
// The result is a view into the original buffer; no copy is made.
static inline std::string_view string_trim(std::string_view str) {
  size_t first = str.find_first_not_of(" \t");
  if (first == str.npos)
    return "";
  size_t last = str.find_last_not_of(" \t");
  return str.substr(first, last - first + 1);
}
} // namespace mold

1001
third_party/mold/common.h vendored Normal file

File diff suppressed because it is too large Load diff

186
third_party/mold/compress.cc vendored Normal file
View file

@ -0,0 +1,186 @@
// clang-format off
// This file implements a multi-threaded zlib and zstd compression
// routine.
//
// zlib-compressed data can be merged just by concatenation as long as
// each piece of data is flushed with Z_SYNC_FLUSH. In this file, we
// split input data into multiple shards, compress them individually
// and concatenate them. We then append a header, a trailer and a
// checksum so that the concatenated data is valid zlib-format data.
//
// zstd-compressed data can be merged in the same way.
//
// Using threads to compress data has a downside. Since the dictionary
// is reset on boundaries of shards, compression ratio is sacrificed
// a little bit. However, if a shard size is large enough, that loss
// is negligible in practice.
#include "third_party/mold/common.h"
// MISSING #include <tbb/parallel_for_each.h>
// MISSING #include <zlib.h>
// MISSING #include <zstd.h>
// Asserts that a zlib call returned Z_OK (no-op in release builds).
#define CHECK(fn) \
do { \
[[maybe_unused]] int r = (fn); \
assert(r == Z_OK); \
} while (0)
namespace mold {
// Unit of parallel compression.
static constexpr i64 SHARD_SIZE = 1024 * 1024;

// Cuts `input` into SHARD_SIZE-byte views; the last shard carries any
// remainder (omitted only when `input` splits evenly).
static std::vector<std::string_view> split(std::string_view input) {
  std::vector<std::string_view> shards;
  for (; input.size() >= SHARD_SIZE; input = input.substr(SHARD_SIZE))
    shards.push_back(input.substr(0, SHARD_SIZE));
  if (!input.empty())
    shards.push_back(input);
  return shards;
}
// Compresses one shard with raw deflate (no per-shard zlib header) and
// ends it with Z_SYNC_FLUSH so shards can be concatenated byte-wise.
static std::vector<u8> zlib_compress(std::string_view input) {
  // Initialize zlib stream. Since debug info is generally compressed
  // pretty well with lower compression levels, we chose compression
  // level 1. windowBits is -15, i.e. raw deflate without header.
  z_stream strm;
  strm.zalloc = Z_NULL;
  strm.zfree = Z_NULL;
  strm.opaque = Z_NULL;
  CHECK(deflateInit2(&strm, 1, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY));

  // Set an input buffer
  strm.avail_in = input.size();
  strm.next_in = (u8 *)input.data();

  // Set an output buffer. deflateBound() returns an upper bound
  // on the compression size. +16 for Z_SYNC_FLUSH.
  std::vector<u8> buf(deflateBound(&strm, strm.avail_in) + 16);

  // Compress data. It writes all compressed bytes except the last
  // partial byte, so up to 7 bits can be held to be written to the
  // buffer.
  strm.avail_out = buf.size();
  strm.next_out = buf.data();
  CHECK(deflate(&strm, Z_BLOCK));

  // This is a workaround for libbacktrace before 2022-04-06.
  //
  // Zlib is a bit stream, and what Z_SYNC_FLUSH does is to write a
  // three bit value indicating the start of an uncompressed data
  // block followed by four byte data 00 00 ff ff which indicates that
  // the length of the block is zero. libbacktrace uses its own zlib
  // inflate routine, and it had a bug that if that particular three
  // bit value happens to end at a byte boundary, it accidentally
  // skipped the next byte.
  //
  // In order to avoid triggering that bug, we should avoid calling
  // deflate() with Z_SYNC_FLUSH if the current bit position is 5.
  // If it's 5, we insert an empty block consisting of 10 bits so
  // that the bit position is 7 in the next byte.
  //
  // https://github.com/ianlancetaylor/libbacktrace/pull/87
  int nbits;
  deflatePending(&strm, Z_NULL, &nbits);
  if (nbits == 5)
    CHECK(deflatePrime(&strm, 10, 2));
  CHECK(deflate(&strm, Z_SYNC_FLUSH));

  // Trim the buffer to the bytes actually produced.
  assert(strm.avail_out > 0);
  buf.resize(buf.size() - strm.avail_out);
  buf.shrink_to_fit();
  deflateEnd(&strm);
  return buf;
}
// Splits [buf, buf+size) into shards and compresses them in parallel,
// computing the combined adler32 checksum and the final output size.
ZlibCompressor::ZlibCompressor(u8 *buf, i64 size) {
  std::string_view input{(char *)buf, (size_t)size};
  std::vector<std::string_view> inputs = split(input);
  std::vector<u64> adlers(inputs.size());
  shards.resize(inputs.size());

  // Compress each shard
  tbb::parallel_for((i64)0, (i64)inputs.size(), [&](i64 i) {
    adlers[i] = adler32(1, (u8 *)inputs[i].data(), inputs[i].size());
    shards[i] = zlib_compress(inputs[i]);
  });

  // Combine checksums. NOTE(review): assumes at least one shard;
  // adlers[0] is out of bounds when size == 0 — confirm callers never
  // pass empty input.
  checksum = adlers[0];
  for (i64 i = 1; i < inputs.size(); i++)
    checksum = adler32_combine(checksum, adlers[i], inputs[i].size());

  // Compute the total size
  compressed_size = 8; // the header and the trailer
  for (std::vector<u8> &shard : shards)
    compressed_size += shard.size();
}
// Writes the complete zlib stream (header + shards + trailer) to `buf`,
// which must have room for `compressed_size` bytes.
void ZlibCompressor::write_to(u8 *buf) {
  // Write a zlib-format header
  buf[0] = 0x78;
  buf[1] = 0x9c;

  // Copy compressed data. Each shard's start offset is the running sum
  // of the preceding shard sizes.
  std::vector<i64> offsets(shards.size());
  offsets[0] = 2; // +2 for header
  for (i64 i = 1; i < shards.size(); i++)
    offsets[i] = offsets[i - 1] + shards[i - 1].size();

  tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) {
    memcpy(&buf[offsets[i]], shards[i].data(), shards[i].size());
  });

  // Write a trailer (two bytes terminating the deflate stream; see the
  // file header comment on Z_SYNC_FLUSH concatenation).
  u8 *end = buf + compressed_size;
  end[-6] = 3;
  end[-5] = 0;

  // Write a checksum (the combined adler32, big-endian per zlib format).
  *(ub32 *)(end - 4) = checksum;
}
// Compresses one shard with zstd and returns the compressed bytes.
static std::vector<u8> zstd_compress(std::string_view input) {
  // ZSTD_COMPRESSBOUND gives the worst-case output size.
  std::vector<u8> buf(ZSTD_COMPRESSBOUND(input.size()));
  constexpr int level = 3; // compression level; must be between 1 to 22
  size_t sz = ZSTD_compress(buf.data(), buf.size(), input.data(), input.size(),
                            level);
  assert(!ZSTD_isError(sz));
  // Trim to the bytes actually produced.
  buf.resize(sz);
  buf.shrink_to_fit();
  return buf;
}
// Splits [buf, buf+size) into shards and zstd-compresses them in
// parallel; zstd frames can be concatenated directly.
ZstdCompressor::ZstdCompressor(u8 *buf, i64 size) {
  std::string_view input{(char *)buf, (size_t)size};
  std::vector<std::string_view> inputs = split(input);
  shards.resize(inputs.size());

  // Compress each shard
  tbb::parallel_for((i64)0, (i64)inputs.size(), [&](i64 i) {
    shards[i] = zstd_compress(inputs[i]);
  });

  // Total output is just the concatenation of the shards.
  compressed_size = 0;
  for (std::vector<u8> &shard : shards)
    compressed_size += shard.size();
}
// Writes the concatenated zstd frames to `buf`, which must have room
// for `compressed_size` bytes.
void ZstdCompressor::write_to(u8 *buf) {
  // Copy compressed data. offsets[0] is zero thanks to the vector's
  // value-initialization; the rest are running sums of shard sizes.
  std::vector<i64> offsets(shards.size());
  for (i64 i = 1; i < shards.size(); i++)
    offsets[i] = offsets[i - 1] + shards[i - 1].size();
  tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) {
    memcpy(&buf[offsets[i]], shards[i].data(), shards[i].size());
  });
}
} // namespace mold

4
third_party/mold/config.h.in vendored Normal file
View file

@ -0,0 +1,4 @@
// clang-format off
#define MOLD_VERSION "@mold_VERSION@"
#define MOLD_LIBDIR "@CMAKE_INSTALL_FULL_LIBDIR@"
#cmakedefine01 MOLD_IS_SOLD

52
third_party/mold/demangle.cc vendored Normal file
View file

@ -0,0 +1,52 @@
// clang-format off
#include "third_party/mold/common.h"
#include "third_party/libcxx/cstdlib"
#ifndef _WIN32
// MISSING #include <cxxabi.h>
#endif
// MISSING #include "../third-party/rust-demangle/rust-demangle.h"
namespace mold {
// Demangles `name`, trying Rust first and then C++; returns `name`
// unchanged if neither applies. The returned view may point into a
// thread-local buffer that is freed by the NEXT call on the same
// thread, so consume the result before demangling again.
std::string_view demangle(std::string_view name) {
  static thread_local char *p;

  // Free the buffer returned by the previous call on this thread.
  if (p)
    free(p);

  // Try to demangle as a Rust symbol. Since legacy-style Rust symbols
  // are also valid as a C++ mangled name, we need to call this before
  // cpp_demangle.
  p = rust_demangle(std::string(name).c_str(), 0);
  if (p)
    return p;

  // Try to demangle as a C++ symbol.
  if (std::optional<std::string_view> s = cpp_demangle(name))
    return *s;
  return name;
}
// Demangles an Itanium-ABI C++ symbol ("_Z..."); returns nullopt if
// `name` is not mangled or demangling fails. The thread-local buffer
// is reused (and may be reallocated) across calls on the same thread.
std::optional<std::string_view> cpp_demangle(std::string_view name) {
  static thread_local char *buf;
  static thread_local size_t buflen;

  // TODO(cwasser): Actually demangle Symbols on Windows using e.g.
  // `UnDecorateSymbolName` from Dbghelp, maybe even Itanium symbols?
  #ifndef _WIN32
  if (name.starts_with("_Z")) {
    int status;
    // __cxa_demangle may realloc `buf`; it returns the new buffer.
    char *p = abi::__cxa_demangle(std::string(name).c_str(), buf, &buflen, &status);
    if (status == 0) {
      buf = p;
      return p;
    }
  }
  #endif

  return {};
}
} // namespace mold

331
third_party/mold/elf/arch-alpha.cc vendored Normal file
View file

@ -0,0 +1,331 @@
// clang-format off
// Alpha is a 64-bit RISC ISA developed by DEC (Digital Equipment
// Corporation) in the early '90s. It aimed to be an ISA that would last
// 25 years. DEC expected Alpha would become 1000x faster during that time
// span. Since the ISA was developed from scratch for future machines,
// it's 64-bit from the beginning. There's no 32-bit variant.
//
// DEC ported its own Unix (Tru64) to Alpha. Microsoft also ported Windows
// NT to it. But it wasn't a huge commercial success.
//
// DEC was acquired by Compaq in 1997. In the late '90s, Intel and
// Hewlett-Packard were advertising that their upcoming Itanium processor
// would achieve significantly better performance than RISC processors, so
// Compaq decided to discontinue the Alpha processor line to switch to
// Itanium. Itanium resulted in a miserable failure, but it still suceeded
// to wipe out several RISC processors just by promising overly optimistic
// perf numbers. Alpha as an ISA would probably have been fine after 25
// years since its introduction (which is 1992 + 25 = 2017), but the
// company and its market didn't last that long.
//
// From the linker's point of view, there are a few peculiarities in its
// psABI as shown below:
//
// - Alpha lacks PC-relative memory load/store instructions, so it uses
// register-relative load/store instructions in position-independent
// code. Specifically, GP (which is an alias for $r29) is always
// maintained to refer to .got+0x8000, and global variables' addresses
// are loaded in a GP-relative manner.
//
// - It looks like even function addresses are first loaded to register
// in a GP-relative manner before calling it. We can relax it to
// convert the instruction sequence with a direct branch instruction,
// but by default, object files don't use a direct branch to call a
// function. Therefore, by default, we don't need to create a PLT.
// Any function call is made by first reading its address from GOT and
// jump to the address.
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = ALPHA;
// A 32-bit immediate can be materialized in a register with a "load high"
// and a "load low" instruction sequence. The first instruction sets the
// upper 16 bits in a register, and the second one set the lower 16
// bits. When doing so, they sign-extend an immediate. Therefore, if the
// 15th bit of an immediate happens to be 1, setting a "low half" value
// negates the upper 16 bit values that has already been set in a
// register. To compensate that, we need to add 0x8000 when setting the
// upper 16 bits.
// Returns the upper 16 bits of `val`, pre-biased by 0x8000 so that a
// subsequent sign-extending "load low" of the lower 16 bits
// reconstructs the full value (see the comment above).
static u32 hi(u32 val) {
  return (val + 0x8000) >> 16;
}
// Alpha reaches functions through GOT entries by default (see the file
// header comment), so no PLT header or entries are emitted.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {}

template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {}

template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {}
// Applies a relocation inside .eh_frame. On Alpha only R_NONE and the
// 32-bit self-relative R_ALPHA_SREL32 are expected there.
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
                                    u64 offset, u64 val) {
  u8 *loc = ctx.buf + this->shdr.sh_offset + offset;

  switch (rel.r_type) {
  case R_NONE:
    break;
  case R_ALPHA_SREL32:
    // PC-relative: target minus the address of the relocated word.
    *(ul32 *)loc = val - this->shdr.sh_addr - offset;
    break;
  default:
    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
  }
}
// Applies relocations to an allocated (SHF_ALLOC) input section that
// has been copied into the output buffer at `base`.
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  // Cursor into this file's slot of .rela.dyn for emitted dynamic relocs.
  ElfRel<E> *dynrel = nullptr;
  if (ctx.reldyn)
    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
                           file.reldyn_offset + this->reldyn_offset);

  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE)
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;

    // Standard psABI relocation inputs.
    u64 S = sym.get_addr(ctx);                      // symbol value
    u64 A = rel.r_addend;                           // addend
    u64 P = get_addr() + rel.r_offset;              // place being relocated
    u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>); // GOT slot offset
    u64 GOT = ctx.got->shdr.sh_addr;
    u64 GP = ctx.got->shdr.sh_addr + 0x8000;        // Alpha global pointer

    switch (rel.r_type) {
    case R_ALPHA_REFQUAD:
      // 64-bit absolute; may need a dynamic relocation in PIC output.
      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
      break;
    case R_ALPHA_GPREL32:
      *(ul32 *)loc = S + A - GP;
      break;
    case R_ALPHA_LITERAL:
      // GP-relative displacement of the symbol's GOT slot. A non-zero
      // addend uses the special .alpha_got section (see the comment at
      // the bottom of this file).
      if (A)
        *(ul16 *)loc = ctx.extra.got->get_addr(sym, A) - GP;
      else
        *(ul16 *)loc = GOT + G - GP;
      break;
    case R_ALPHA_BRSGP:
      // 21-bit branch displacement, in instruction words.
      *(ul32 *)loc |= bits(S + A - P - 4, 22, 0);
      break;
    case R_ALPHA_GPDISP:
      // Materialize GP from PC: "load high" here, "load low" A bytes away.
      *(ul16 *)loc = hi(GP - P);
      *(ul16 *)(loc + A) = GP - P;
      break;
    case R_ALPHA_SREL32:
      *(ul32 *)loc = S + A - P;
      break;
    case R_ALPHA_GPRELHIGH:
      *(ul16 *)loc = hi(S + A - GP);
      break;
    case R_ALPHA_GPRELLOW:
      *(ul16 *)loc = S + A - GP;
      break;
    case R_ALPHA_TLSGD:
      *(ul16 *)loc = sym.get_tlsgd_addr(ctx) - GP;
      break;
    case R_ALPHA_TLSLDM:
      *(ul16 *)loc = ctx.got->get_tlsld_addr(ctx) - GP;
      break;
    case R_ALPHA_DTPRELHI:
      *(ul16 *)loc = hi(S + A - ctx.dtp_addr);
      break;
    case R_ALPHA_DTPRELLO:
      *(ul16 *)loc = S + A - ctx.dtp_addr;
      break;
    case R_ALPHA_GOTTPREL:
      *(ul16 *)loc = sym.get_gottp_addr(ctx) + A - GP;
      break;
    case R_ALPHA_TPRELHI:
      *(ul16 *)loc = hi(S + A - ctx.tp_addr);
      break;
    case R_ALPHA_TPRELLO:
      *(ul16 *)loc = S + A - ctx.tp_addr;
      break;
    case R_ALPHA_LITUSE:
    case R_ALPHA_HINT:
      // Informational; nothing to patch.
      break;
    default:
      unreachable();
    }
  }
}
// Applies relocations to a non-allocated section (e.g. debug info).
// Dead mergeable-section fragments are replaced by tombstone values.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;

    // If the target is a section fragment, use its address and the
    // fragment-relative addend instead of the symbol's.
    SectionFragment<E> *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);

    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : (i64)rel.r_addend;

    switch (rel.r_type) {
    case R_ALPHA_REFLONG:
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ul32 *)loc = *val;
      else
        *(ul32 *)loc = S + A;
      break;
    case R_ALPHA_REFQUAD:
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ul64 *)loc = *val;
      else
        *(ul64 *)loc = S + A;
      break;
    default:
      Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
                 << rel;
    }
  }
}
// First pass over relocations: records what each symbol will need
// (GOT/PLT/TLS entries, dynamic relocations) before layout is decided.
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);

  // Reserve this section's slice of the file's .rela.dyn output.
  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];

    if (sym.is_ifunc())
      Error(ctx) << sym << ": GNU ifunc symbol is not supported on Alpha";

    switch (rel.r_type) {
    case R_ALPHA_REFQUAD:
      scan_dyn_absrel(ctx, sym, rel);
      break;
    case R_ALPHA_LITERAL:
      // Non-zero addends go to the special .alpha_got (see the comment
      // at the bottom of this file); plain ones use the regular GOT.
      if (rel.r_addend)
        ctx.extra.got->add_symbol(sym, rel.r_addend);
      else
        sym.flags |= NEEDS_GOT;
      break;
    case R_ALPHA_SREL32:
      scan_pcrel(ctx, sym, rel);
      break;
    case R_ALPHA_BRSGP:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;
    case R_ALPHA_TLSGD:
      sym.flags |= NEEDS_TLSGD;
      break;
    case R_ALPHA_TLSLDM:
      ctx.needs_tlsld = true;
      break;
    case R_ALPHA_GOTTPREL:
      sym.flags |= NEEDS_GOTTP;
      break;
    case R_ALPHA_TPRELHI:
    case R_ALPHA_TPRELLO:
      // TLS local-exec is only valid in a main executable.
      check_tlsle(ctx, sym, rel);
      break;
    case R_ALPHA_GPREL32:
    case R_ALPHA_LITUSE:
    case R_ALPHA_GPDISP:
    case R_ALPHA_HINT:
    case R_ALPHA_GPRELHIGH:
    case R_ALPHA_GPRELLOW:
    case R_ALPHA_DTPRELHI:
    case R_ALPHA_DTPRELLO:
      // Resolved entirely at apply time; nothing to record.
      break;
    default:
      Fatal(ctx) << *this << ": unknown relocation: " << rel;
    }
  }
}
// An R_ALPHA_LITERAL relocation may request the linker to create a GOT
// entry for an external symbol with a non-zero addend. This is an unusual
// request which is not found in any other targets.
//
// Referring an external symbol with a non-zero addend is a bad practice
// because we need to create as many dynamic relocations as the number of
// distinctive addends for the same symbol.
//
// We don't want to mess up the implementation of the common GOT section
// for Alpha. So we create another GOT-like section, .alpha_got. Any GOT
// entry for an R_ALPHA_LITERAL reloc with a non-zero addend is created
// not in .got but in .alpha_got.
//
// Since .alpha_got entries are accessed relative to GP, .alpha_got
// needs to be close enough to .got. It's actually placed next to .got.
// Registers a GOT entry request for (sym, addend). Thread-safe; called
// concurrently from scan_relocations. Zero addends use the regular GOT.
void AlphaGotSection::add_symbol(Symbol<E> &sym, i64 addend) {
  assert(addend);
  std::scoped_lock lock(mu);
  entries.push_back({&sym, addend});
}
// Orders entries deterministically by (file priority, symbol index,
// addend) so finalize() can sort and deduplicate, and get_addr() can
// binary-search.
bool operator<(const AlphaGotSection::Entry &a, const AlphaGotSection::Entry &b) {
  return std::tuple(a.sym->file->priority, a.sym->sym_idx, a.addend) <
         std::tuple(b.sym->file->priority, b.sym->sym_idx, b.addend);
};
// Returns the virtual address of the entry for (sym, addend).
// NOTE(review): must be called after finalize() (entries sorted), and
// the pair must have been registered via add_symbol — lower_bound alone
// does not verify an exact match.
u64 AlphaGotSection::get_addr(Symbol<E> &sym, i64 addend) {
  auto it = std::lower_bound(entries.begin(), entries.end(), Entry{&sym, addend});
  assert(it != entries.end());
  return this->shdr.sh_addr + (it - entries.begin()) * sizeof(Word<E>);
}
// Counts how many entries will need a dynamic relocation: imported
// symbols always do, and in PIC output so does every non-absolute one.
i64 AlphaGotSection::get_reldyn_size(Context<E> &ctx) const {
  i64 count = 0;
  for (const Entry &e : entries) {
    bool needs_dynrel =
      e.sym->is_imported || (ctx.arg.pic && !e.sym->is_absolute());
    if (needs_dynrel)
      count++;
  }
  return count;
}
// Sorts and deduplicates the collected entries and fixes the section
// size. Must run before get_addr() or copy_buf().
void AlphaGotSection::finalize() {
  sort(entries);
  remove_duplicates(entries);
  shdr.sh_size = entries.size() * sizeof(Word<E>);
}
// Writes the .alpha_got contents and the accompanying dynamic
// relocations into the output buffer.
void AlphaGotSection::copy_buf(Context<E> &ctx) {
  // Cursor into this section's slice of .rela.dyn.
  ElfRel<E> *dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
                                    reldyn_offset);

  for (i64 i = 0; i < entries.size(); i++) {
    Entry &e = entries[i];
    u64 P = this->shdr.sh_addr + sizeof(Word<E>) * i;
    ul64 *buf = (ul64 *)(ctx.buf + this->shdr.sh_offset + sizeof(Word<E>) * i);

    if (e.sym->is_imported) {
      // The dynamic linker fills in the symbol's address; we pre-store
      // the addend only if dynamic relocs are applied at link time.
      *buf = ctx.arg.apply_dynamic_relocs ? e.addend : 0;
      *dynrel++ = ElfRel<E>(P, E::R_ABS, e.sym->get_dynsym_idx(ctx), e.addend);
    } else {
      *buf = e.sym->get_addr(ctx) + e.addend;
      // Local symbols in PIC output still need a base-relative fixup.
      if (ctx.arg.pic && !e.sym->is_absolute())
        *dynrel++ = ElfRel<E>(P, E::R_RELATIVE, 0, *buf);
    }
  }
}
} // namespace mold::elf

737
third_party/mold/elf/arch-arm32.cc vendored Normal file
View file

@ -0,0 +1,737 @@
// clang-format off
// ARM32 is a bit special from the linker's viewpoint because ARM
// processors support two different instruction encodings: Thumb and
// ARM (in a narrower sense). Thumb instructions are either 16 bits or
// 32 bits, while ARM instructions are all 32 bits. Feature-wise,
// thumb is a subset of ARM, so not all ARM instructions are
// representable in Thumb.
//
// ARM processors originally supported only ARM instructions. Thumb
// instructions were later added to increase code density.
//
// ARM processors runs in either ARM mode or Thumb mode. The mode can
// be switched using BX (branch and mode exchange)-family instructions.
// We need to use that instructions to, for example, call a function
// encoded in Thumb from a function encoded in ARM. Sometimes, the
// linker even has to emit an interworking thunk code to switch mode.
//
// ARM instructions are aligned to 4 byte boundaries. Thumb are to 2
// byte boundaries.
//
// You can distinguish Thumb functions from ARM functions by looking
// at the least significant bit (LSB) of its "address". If LSB is 0,
// it's ARM; otherwise, Thumb.
//
// For example, if a symbol `foo` is of type STT_FUNC and has value
// 0x2001, `foo` is a function using Thumb instructions whose address
// is 0x2000 (not 0x2001, as Thumb instructions are always 2-byte
// aligned). Likewise, if a function pointer has value 0x2001, it
// refers a Thumb function at 0x2000.
//
// https://github.com/ARM-software/abi-aa/blob/main/aaelf32/aaelf32.rst
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = ARM32;
// Extracts the implicit addend encoded in the instruction/data bytes at
// `loc` for the given ARM32 relocation type (REL-format relocations
// store the addend in place). Returns 0 for types with no addend.
template <>
i64 get_addend(u8 *loc, const ElfRel<E> &rel) {
  switch (rel.r_type) {
  // Plain 32-bit data addends.
  case R_ARM_ABS32:
  case R_ARM_REL32:
  case R_ARM_TARGET1:
  case R_ARM_BASE_PREL:
  case R_ARM_GOTOFF32:
  case R_ARM_GOT_PREL:
  case R_ARM_GOT_BREL:
  case R_ARM_TLS_GD32:
  case R_ARM_TLS_LDM32:
  case R_ARM_TLS_LDO32:
  case R_ARM_TLS_IE32:
  case R_ARM_TLS_LE32:
  case R_ARM_TLS_GOTDESC:
  case R_ARM_TARGET2:
    return *(il32 *)loc;
  case R_ARM_THM_JUMP11:
    // 11-bit Thumb branch offset, stored in halfword units.
    return sign_extend(*(ul16 *)loc, 10) << 1;
  case R_ARM_THM_CALL:
  case R_ARM_THM_JUMP24:
  case R_ARM_THM_TLS_CALL: {
    // 32-bit Thumb BL/B.W: reassemble S:I1:I2:imm10:imm11 from the two
    // halfwords (I1/I2 are J1/J2 XOR-folded with the sign bit).
    u32 S = bit(*(ul16 *)loc, 10);
    u32 J1 = bit(*(ul16 *)(loc + 2), 13);
    u32 J2 = bit(*(ul16 *)(loc + 2), 11);
    u32 I1 = !(J1 ^ S);
    u32 I2 = !(J2 ^ S);
    u32 imm10 = bits(*(ul16 *)loc, 9, 0);
    u32 imm11 = bits(*(ul16 *)(loc + 2), 10, 0);
    u32 val = (S << 24) | (I1 << 23) | (I2 << 22) | (imm10 << 12) | (imm11 << 1);
    return sign_extend(val, 24);
  }
  case R_ARM_CALL:
  case R_ARM_JUMP24:
  case R_ARM_PLT32:
  case R_ARM_TLS_CALL:
    // 24-bit ARM branch offset, stored in word units.
    return sign_extend(*(ul32 *)loc, 23) << 2;
  case R_ARM_MOVW_PREL_NC:
  case R_ARM_MOVW_ABS_NC:
  case R_ARM_MOVT_PREL:
  case R_ARM_MOVT_ABS: {
    // ARM MOVW/MOVT: 16-bit immediate split as imm4:imm12.
    u32 imm12 = bits(*(ul32 *)loc, 11, 0);
    u32 imm4 = bits(*(ul32 *)loc, 19, 16);
    return sign_extend((imm4 << 12) | imm12, 15);
  }
  case R_ARM_PREL31:
    return sign_extend(*(ul32 *)loc, 30);
  case R_ARM_THM_MOVW_PREL_NC:
  case R_ARM_THM_MOVW_ABS_NC:
  case R_ARM_THM_MOVT_PREL:
  case R_ARM_THM_MOVT_ABS: {
    // Thumb MOVW/MOVT: 16-bit immediate split as imm4:i:imm3:imm8.
    u32 imm4 = bits(*(ul16 *)loc, 3, 0);
    u32 i = bit(*(ul16 *)loc, 10);
    u32 imm3 = bits(*(ul16 *)(loc + 2), 14, 12);
    u32 imm8 = bits(*(ul16 *)(loc + 2), 7, 0);
    u32 val = (imm4 << 12) | (i << 11) | (imm3 << 8) | imm8;
    return sign_extend(val, 15);
  }
  default:
    return 0;
  }
}
// Patches the 16-bit immediate of an ARM MOVW/MOVT instruction, which
// is split into imm4 (bits 19-16) and imm12 (bits 11-0) fields.
static void write_mov_imm(u8 *loc, u32 val) {
  u32 imm12 = bits(val, 11, 0);
  u32 imm4 = bits(val, 15, 12);
  *(ul32 *)loc = (*(ul32 *)loc & 0xfff0f000) | (imm4 << 16) | imm12;
}
// Patches the 25-bit branch displacement of a 32-bit Thumb BL/B.W
// instruction, splitting `val` into the S:J1:J2:imm10:imm11 fields
// across the two halfwords at `loc`.
static void write_thm_b_imm(u8 *loc, u32 val) {
  // https://developer.arm.com/documentation/ddi0406/cb/Application-Level-Architecture/Instruction-Details/Alphabetical-list-of-instructions/BL--BLX--immediate-
  u32 sign = bit(val, 24);
  u32 I1 = bit(val, 23);
  u32 I2 = bit(val, 22);
  u32 J1 = !I1 ^ sign;
  u32 J2 = !I2 ^ sign;
  u32 imm10 = bits(val, 21, 12);
  u32 imm11 = bits(val, 11, 1);

  ul16 *buf = (ul16 *)loc;
  buf[0] = (buf[0] & 0b1111'1000'0000'0000) | (sign << 10) | imm10;
  buf[1] = (buf[1] & 0b1101'0000'0000'0000) | (J1 << 13) | (J2 << 11) | imm11;
}
// Patches the 16-bit immediate of a Thumb MOVW/MOVT instruction, which
// is split into imm4:i:imm3:imm8 fields across the two halfwords.
static void write_thm_mov_imm(u8 *loc, u32 val) {
  // https://developer.arm.com/documentation/ddi0406/cb/Application-Level-Architecture/Instruction-Details/Alphabetical-list-of-instructions/MOVT
  u32 imm4 = bits(val, 15, 12);
  u32 i = bit(val, 11);
  u32 imm3 = bits(val, 10, 8);
  u32 imm8 = bits(val, 7, 0);
  ul16 *buf = (ul16 *)loc;
  buf[0] = (buf[0] & 0b1111'1011'1111'0000) | (i << 10) | imm4;
  buf[1] = (buf[1] & 0b1000'1111'0000'0000) | (imm3 << 12) | imm8;
}
// Writes the addend `val` into the word or instruction immediate
// fields at `loc`, using the encoding required by the relocation type.
// This is the counterpart of get_addend above: for every type handled
// here, get_addend reads back the same value this function writes.
template <>
void write_addend(u8 *loc, i64 val, const ElfRel<E> &rel) {
  switch (rel.r_type) {
  case R_ARM_NONE:
    break;
  case R_ARM_ABS32:
  case R_ARM_REL32:
  case R_ARM_TARGET1:
  case R_ARM_BASE_PREL:
  case R_ARM_GOTOFF32:
  case R_ARM_GOT_PREL:
  case R_ARM_GOT_BREL:
  case R_ARM_TLS_GD32:
  case R_ARM_TLS_LDM32:
  case R_ARM_TLS_LDO32:
  case R_ARM_TLS_IE32:
  case R_ARM_TLS_LE32:
  case R_ARM_TLS_GOTDESC:
  case R_ARM_TARGET2:
    // Plain 32-bit data relocations.
    *(ul32 *)loc = val;
    break;
  case R_ARM_THM_JUMP11:
    // 11-bit halfword-aligned Thumb branch displacement.
    *(ul16 *)loc = (*(ul16 *)loc & 0xf800) | bits(val, 11, 1);
    break;
  case R_ARM_THM_CALL:
  case R_ARM_THM_JUMP24:
  case R_ARM_THM_TLS_CALL:
    write_thm_b_imm(loc, val);
    break;
  case R_ARM_CALL:
  case R_ARM_JUMP24:
  case R_ARM_PLT32:
    // 24-bit word-aligned ARM branch displacement.
    *(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
    break;
  case R_ARM_MOVW_PREL_NC:
  case R_ARM_MOVW_ABS_NC:
  case R_ARM_MOVT_PREL:
  case R_ARM_MOVT_ABS:
    write_mov_imm(loc, val);
    break;
  case R_ARM_PREL31:
    // Keep the top bit; only the low 31 bits hold the offset.
    *(ul32 *)loc = (*(ul32 *)loc & 0x8000'0000) | (val & 0x7fff'ffff);
    break;
  case R_ARM_THM_MOVW_PREL_NC:
  case R_ARM_THM_MOVW_ABS_NC:
  case R_ARM_THM_MOVT_PREL:
  case R_ARM_THM_MOVT_ABS:
    write_thm_mov_imm(loc, val);
    break;
  default:
    unreachable();
  }
}
// Writes the PLT header code. It saves lr, materializes the absolute
// address of .got.plt from the PC-relative .word at label 2 (patched
// below) and jumps to the resolver address stored in .got.plt.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
  static const ul32 insn[] = {
    0xe52d'e004, // push {lr}
    0xe59f'e004, // ldr lr, 2f
    0xe08f'e00e, // 1: add lr, pc, lr
    0xe5be'f008, // ldr pc, [lr, #8]!
    0x0000'0000, // 2: .word .got.plt - 1b - 8
    0xe320'f000, // nop
    0xe320'f000, // nop
    0xe320'f000, // nop
  };
  memcpy(buf, insn, sizeof(insn));
  // Patch the .word at offset 16 with ".got.plt - 1b - 8"
  // (label 1 is at offset 8, hence the total bias of 16).
  *(ul32 *)(buf + 16) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 16;
}
// Instruction template for a single PLT entry. The trailing .word is
// patched by write_plt_entry/write_pltgot_entry below with the
// PC-relative displacement to the symbol's .got.plt/GOT slot.
static const ul32 plt_entry[] = {
  0xe59f'c004, // 1: ldr ip, 2f
  0xe08c'c00f, // add ip, ip, pc
  0xe59c'f000, // ldr pc, [ip]
  0x0000'0000, // 2: .word sym@GOT - 1b
};
// Writes the PLT entry for `sym` by stamping out plt_entry and patching
// its trailing .word. The -12 bias makes the displacement relative to
// the PC seen by the `add` instruction (ARM reads PC as the instruction
// address + 8; the add is at entry offset 4).
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  memcpy(buf, plt_entry, sizeof(plt_entry));
  *(ul32 *)(buf + 12) = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 12;
}
// Same as write_plt_entry, but the entry jumps through the symbol's
// regular GOT slot instead of a .got.plt slot.
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  memcpy(buf, plt_entry, sizeof(plt_entry));
  *(ul32 *)(buf + 12) = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 12;
}
// ARM does not use .eh_frame for exception handling. Instead, it uses
// .ARM.exidx and .ARM.extab. So this function is empty.
// (.ARM.exidx post-processing is done in fixup_arm_exidx_section below.)
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {}
// ARM and Thumb branch instructions can jump within ±16 MiB, i.e. the
// displacement must survive a round-trip through a 25-bit sign extension.
static bool is_jump_reachable(i64 val) {
  return val == sign_extend(val, 24);
}
// Applies this section's relocations to the output buffer (`base`
// points at this section's bytes in the output). Variable naming
// follows the "ELF for the ARM Architecture" convention:
// S = symbol address, A = addend, P = place (address being relocated),
// T = 1 if the target is Thumb code, G = GOT slot offset,
// GOT = GOT base address.
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  ElfRel<E> *dynrel = nullptr;
  if (ctx.reldyn)
    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
  // Returns the displacement from `addr` to a reachable TLS trampoline.
  // The scan position `i` persists across calls (the lambda is mutable),
  // so each call resumes where the previous one stopped.
  auto get_tls_trampoline_addr = [&, i = 0](u64 addr) mutable {
    for (; i < output_section->thunks.size(); i++) {
      i64 disp = output_section->shdr.sh_addr + output_section->thunks[i]->offset -
addr;
      if (is_jump_reachable(disp))
        return disp;
    }
    unreachable();
  };
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || rel.r_type == R_ARM_V4BX)
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // Reports an error if `val` does not fit in [lo, hi).
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
    };
    u64 S = sym.get_addr(ctx);
    u64 A = get_addend(*this, rel);
    u64 P = get_addr() + rel.r_offset;
    u64 T = S & 1; // bit 0 of a Thumb function's address is set
    u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
    u64 GOT = ctx.got->shdr.sh_addr;
    // A range-extension thunk entry has a Thumb entry point at +0 and
    // an ARM entry point at +4 (see RangeExtensionThunk::copy_buf).
    auto get_thumb_thunk_addr = [&] { return get_thunk_addr(i); };
    auto get_arm_thunk_addr = [&] { return get_thunk_addr(i) + 4; };
    switch (rel.r_type) {
    case R_ARM_ABS32:
    case R_ARM_TARGET1:
      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
      break;
    case R_ARM_REL32:
      *(ul32 *)loc = S + A - P;
      break;
    case R_ARM_THM_CALL: {
      if (sym.is_remaining_undef_weak()) {
        // On ARM, calling a weak undefined symbol jumps to the
        // next instruction.
        *(ul32 *)loc = 0x8000'f3af; // NOP.W
        break;
      }
      // THM_CALL relocation refers either BL or BLX instruction.
      // They are different in only one bit. We need to use BL if
      // the jump target is Thumb. Otherwise, use BLX.
      i64 val = S + A - P;
      if (is_jump_reachable(val)) {
        if (T) {
          write_thm_b_imm(loc, val);
          *(ul16 *)(loc + 2) |= 0x1000; // rewrite to BL
        } else {
          write_thm_b_imm(loc, align_to(val, 4));
          *(ul16 *)(loc + 2) &= ~0x1000; // rewrite to BLX
        }
      } else {
        // Out of range: branch to the ARM entry point of the thunk.
        write_thm_b_imm(loc, align_to(get_arm_thunk_addr() + A - P, 4));
        *(ul16 *)(loc + 2) &= ~0x1000; // rewrite to BLX
      }
      break;
    }
    case R_ARM_BASE_PREL:
      *(ul32 *)loc = GOT + A - P;
      break;
    case R_ARM_GOTOFF32:
      *(ul32 *)loc = ((S + A) | T) - GOT;
      break;
    case R_ARM_GOT_PREL:
    case R_ARM_TARGET2:
      *(ul32 *)loc = GOT + G + A - P;
      break;
    case R_ARM_GOT_BREL:
      *(ul32 *)loc = G + A;
      break;
    case R_ARM_CALL: {
      if (sym.is_remaining_undef_weak()) {
        *(ul32 *)loc = 0xe320'f000; // NOP
        break;
      }
      // Just like THM_CALL, ARM_CALL relocation refers either BL or
      // BLX instruction. We may need to rewrite BL → BLX or BLX → BL.
      bool is_bl = ((*(ul32 *)loc & 0xff00'0000) == 0xeb00'0000);
      bool is_blx = ((*(ul32 *)loc & 0xfe00'0000) == 0xfa00'0000);
      if (!is_bl && !is_blx)
        Fatal(ctx) << *this << ": R_ARM_CALL refers neither BL nor BLX";
      u64 val = S + A - P;
      if (is_jump_reachable(val)) {
        if (T) {
          *(ul32 *)loc = 0xfa00'0000; // BLX
          *(ul32 *)loc |= (bit(val, 1) << 24) | bits(val, 25, 2);
        } else {
          *(ul32 *)loc = 0xeb00'0000; // BL
          *(ul32 *)loc |= bits(val, 25, 2);
        }
      } else {
        // Out of range: branch to the ARM entry point of the thunk.
        *(ul32 *)loc = 0xeb00'0000; // BL
        *(ul32 *)loc |= bits(get_arm_thunk_addr() + A - P, 25, 2);
      }
      break;
    }
    case R_ARM_JUMP24: {
      if (sym.is_remaining_undef_weak()) {
        *(ul32 *)loc = 0xe320'f000; // NOP
        break;
      }
      // These relocs refer to a B (unconditional branch) instruction.
      // Unlike BL or BLX, we can't rewrite B to BX in place when the
      // processor mode switch is required because BX doesn't take an
      // immediate; it takes only a register. So if mode switch is
      // required, we jump to a linker-synthesized thunk which does the
      // job with a longer code sequence.
      u64 val = S + A - P;
      if (!is_jump_reachable(val) || T)
        val = get_arm_thunk_addr() + A - P;
      *(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
      break;
    }
    case R_ARM_PLT32:
      if (sym.is_remaining_undef_weak()) {
        *(ul32 *)loc = 0xe320'f000; // NOP
      } else {
        // If the target is Thumb, go through the thunk's ARM entry point.
        u64 val = (T ? get_arm_thunk_addr() : S) + A - P;
        *(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
      }
      break;
    case R_ARM_THM_JUMP11:
      assert(T);
      check(S + A - P, -(1 << 11), 1 << 11);
      *(ul16 *)loc &= 0xf800;
      *(ul16 *)loc |= bits(S + A - P, 11, 1);
      break;
    case R_ARM_THM_JUMP19: {
      i64 val = S + A - P;
      check(val, -(1 << 19), 1 << 19);
      // sign:J2:J1:imm6:imm11:'0'
      u32 sign = bit(val, 20);
      u32 J2 = bit(val, 19);
      u32 J1 = bit(val, 18);
      u32 imm6 = bits(val, 17, 12);
      u32 imm11 = bits(val, 11, 1);
      *(ul16 *)loc &= 0b1111'1011'1100'0000;
      *(ul16 *)loc |= (sign << 10) | imm6;
      *(ul16 *)(loc + 2) &= 0b1101'0000'0000'0000;
      *(ul16 *)(loc + 2) |= (J2 << 13) | (J1 << 11) | imm11;
      break;
    }
    case R_ARM_THM_JUMP24: {
      if (sym.is_remaining_undef_weak()) {
        *(ul32 *)loc = 0x8000'f3af; // NOP
        break;
      }
      // Just like R_ARM_JUMP24, we need to jump to a thunk if we need to
      // switch processor mode.
      u64 val = S + A - P;
      if (!is_jump_reachable(val) || !T)
        val = get_thumb_thunk_addr() + A - P;
      write_thm_b_imm(loc, val);
      break;
    }
    case R_ARM_MOVW_PREL_NC:
      write_mov_imm(loc, ((S + A) | T) - P);
      break;
    case R_ARM_MOVW_ABS_NC:
      write_mov_imm(loc, (S + A) | T);
      break;
    case R_ARM_THM_MOVW_PREL_NC:
      write_thm_mov_imm(loc, ((S + A) | T) - P);
      break;
    case R_ARM_PREL31:
      check(S + A - P, -(1LL << 30), 1LL << 30);
      *(ul32 *)loc &= 0x8000'0000;
      *(ul32 *)loc |= (S + A - P) & 0x7fff'ffff;
      break;
    case R_ARM_THM_MOVW_ABS_NC:
      write_thm_mov_imm(loc, (S + A) | T);
      break;
    case R_ARM_MOVT_PREL:
      write_mov_imm(loc, (S + A - P) >> 16);
      break;
    case R_ARM_THM_MOVT_PREL:
      write_thm_mov_imm(loc, (S + A - P) >> 16);
      break;
    case R_ARM_MOVT_ABS:
      write_mov_imm(loc, (S + A) >> 16);
      break;
    case R_ARM_THM_MOVT_ABS:
      write_thm_mov_imm(loc, (S + A) >> 16);
      break;
    case R_ARM_TLS_GD32:
      *(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - P;
      break;
    case R_ARM_TLS_LDM32:
      *(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - P;
      break;
    case R_ARM_TLS_LDO32:
      *(ul32 *)loc = S + A - ctx.dtp_addr;
      break;
    case R_ARM_TLS_IE32:
      *(ul32 *)loc = sym.get_gottp_addr(ctx) + A - P;
      break;
    case R_ARM_TLS_LE32:
      *(ul32 *)loc = S + A - ctx.tp_addr;
      break;
    case R_ARM_TLS_GOTDESC:
      if (sym.has_tlsdesc(ctx)) {
        // A is odd if the corresponding TLS_CALL is Thumb.
        if (A & 1)
          *(ul32 *)loc = sym.get_tlsdesc_addr(ctx) - P + A - 6;
        else
          *(ul32 *)loc = sym.get_tlsdesc_addr(ctx) - P + A - 4;
      } else {
        // Relaxed to the static model: store the TP offset directly.
        *(ul32 *)loc = S - ctx.tp_addr;
      }
      break;
    case R_ARM_TLS_CALL:
      if (sym.has_tlsdesc(ctx)) {
        // BL <tls_trampoline>
        *(ul32 *)loc = 0xeb00'0000 | bits(get_tls_trampoline_addr(P + 8), 25, 2);
      } else {
        // BL -> NOP
        *(ul32 *)loc = 0xe320'f000;
      }
      break;
    case R_ARM_THM_TLS_CALL:
      if (sym.has_tlsdesc(ctx)) {
        u64 val = align_to(get_tls_trampoline_addr(P + 4), 4);
        write_thm_b_imm(loc, val);
        *(ul16 *)(loc + 2) &= ~0x1000; // rewrite BL with BLX
      } else {
        // BL -> NOP.W
        *(ul32 *)loc = 0x8000'f3af;
      }
      break;
    default:
      Error(ctx) << *this << ": unknown relocation: " << rel;
    }
  }
}
// Applies relocations to a non-SHF_ALLOC section (e.g. debug info).
// Only plain data relocation types are meaningful here. References to
// dead section fragments are replaced with tombstone values.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // If the relocation refers a mergeable section fragment, use the
    // fragment's address and addend instead of the symbol's.
    SectionFragment<E> *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);
    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : get_addend(*this, rel);
    switch (rel.r_type) {
    case R_ARM_ABS32:
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ul32 *)loc = *val;
      else
        *(ul32 *)loc = S + A;
      break;
    case R_ARM_TLS_LDO32:
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ul32 *)loc = *val;
      else
        *(ul32 *)loc = S + A - ctx.dtp_addr;
      break;
    default:
      Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
      break;
    }
  }
}
// Scans this section's relocations to record, before layout is
// finalized, which auxiliary entries each referenced symbol needs
// (GOT/PLT slots, TLS GD/LD/IE/DESC entries, dynamic relocations).
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);
  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  // Scan relocations
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    // An ifunc symbol is always materialized through GOT and PLT.
    if (sym.is_ifunc())
      sym.flags |= NEEDS_GOT | NEEDS_PLT;
    switch (rel.r_type) {
    case R_ARM_ABS32:
    case R_ARM_MOVT_ABS:
    case R_ARM_THM_MOVT_ABS:
    case R_ARM_TARGET1:
      scan_dyn_absrel(ctx, sym, rel);
      break;
    case R_ARM_THM_CALL:
    case R_ARM_CALL:
    case R_ARM_JUMP24:
    case R_ARM_PLT32:
    case R_ARM_THM_JUMP24:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;
    case R_ARM_GOT_PREL:
    case R_ARM_GOT_BREL:
    case R_ARM_TARGET2:
      sym.flags |= NEEDS_GOT;
      break;
    case R_ARM_MOVT_PREL:
    case R_ARM_THM_MOVT_PREL:
    case R_ARM_PREL31:
      scan_pcrel(ctx, sym, rel);
      break;
    case R_ARM_TLS_GD32:
      sym.flags |= NEEDS_TLSGD;
      break;
    case R_ARM_TLS_LDM32:
      ctx.needs_tlsld = true;
      break;
    case R_ARM_TLS_IE32:
      sym.flags |= NEEDS_GOTTP;
      break;
    case R_ARM_TLS_GOTDESC:
      if (!relax_tlsdesc(ctx, sym))
        sym.flags |= NEEDS_TLSDESC;
      break;
    case R_ARM_TLS_LE32:
      check_tlsle(ctx, sym, rel);
      break;
    // The following relocation types need no extra data structures;
    // they are fully resolved in apply_reloc_alloc.
    case R_ARM_REL32:
    case R_ARM_BASE_PREL:
    case R_ARM_GOTOFF32:
    case R_ARM_THM_JUMP11:
    case R_ARM_THM_JUMP19:
    case R_ARM_MOVW_PREL_NC:
    case R_ARM_MOVW_ABS_NC:
    case R_ARM_THM_MOVW_PREL_NC:
    case R_ARM_THM_MOVW_ABS_NC:
    case R_ARM_TLS_LDO32:
    case R_ARM_TLS_CALL:
    case R_ARM_THM_TLS_CALL:
    case R_ARM_V4BX:
      break;
    default:
      Error(ctx) << *this << ": unknown relocation: " << rel;
    }
  }
}
// Writes the thunk contents: a shared TLS trampoline header followed by
// one range-extension/mode-switch entry per symbol.
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
  u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
  // TLS trampoline code. ARM32's TLSDESC is designed so that this
  // common piece of code is factored out from object files to reduce
  // output size. Since no one provides it, the linker has to synthesize it.
  static ul32 hdr[] = {
    0xe08e'0000, // add r0, lr, r0
    0xe590'1004, // ldr r1, [r0, #4]
    0xe12f'ff11, // bx r1
  };
  // This is a range extension and mode switch thunk.
  // It has two entry points: +0 for Thumb and +4 for ARM.
  const u8 entry[] = {
    // .thumb
    0xfc, 0x46, // mov ip, pc
    0x60, 0x47, // bx ip # jumps to the following `ldr` insn
    // .arm
    0x04, 0xc0, 0x9f, 0xe5, // ldr ip, 2f
    0x0f, 0xc0, 0x8c, 0xe0, // 1: add ip, ip, pc
    0x1c, 0xff, 0x2f, 0xe1, // bx ip
    0x00, 0x00, 0x00, 0x00, // 2: .word sym - 1b
  };
  static_assert(E::thunk_hdr_size == sizeof(hdr));
  static_assert(E::thunk_size == sizeof(entry));
  memcpy(buf, hdr, sizeof(hdr));
  for (i64 i = 0; i < symbols.size(); i++) {
    u8 *loc = buf + sizeof(hdr) + i * sizeof(entry);
    memcpy(loc, entry, sizeof(entry));
    u64 S = symbols[i]->get_addr(ctx);
    u64 P = output_section.shdr.sh_addr + offset + sizeof(hdr) + i * sizeof(entry);
    // Patch the .word at 2: the `add` at 1: (entry offset 8) reads PC
    // as its own address + 8, so the stored value is S - (P + 16).
    *(ul32 *)(loc + 16) = S - P - 16;
  }
}
// ARM executables use an .ARM.exidx section to look up an exception
// handling record for the current instruction pointer. The table needs
// to be sorted by address.
//
// Other targets use .eh_frame_hdr instead for the same purpose.
// I don't know why only ARM uses a different mechanism, but it's
// likely that it's due to some historical reason.
//
// This function sorts .ARM.exidx records.
void fixup_arm_exidx_section(Context<E> &ctx) {
  Timer t(ctx, "fixup_arm_exidx_section");
  OutputSection<E> *osec = find_section(ctx, SHT_ARM_EXIDX);
  if (!osec)
    return;
  // .ARM.exidx records consist of a signed 31-bit relative address
  // and a 32-bit value. The relative address indicates the start
  // address of a function that the record covers. The value is one of
  // the following:
  //
  // 1. CANTUNWIND indicating that there's no unwinding info for the function,
  // 2. a compact unwinding record encoded into a 32-bit value, or
  // 3. a 31-bit relative address which points to a larger record in
  // the .ARM.extab section.
  //
  // CANTUNWIND is value 1. The most significant bit is set in (2) but
  // not in (3). So we can distinguish them just by looking at a value.
  const u32 EXIDX_CANTUNWIND = 1;
  struct Entry {
    ul32 addr;
    ul32 val;
  };
  if (osec->shdr.sh_size % sizeof(Entry))
    Fatal(ctx) << "invalid .ARM.exidx section size";
  Entry *ent = (Entry *)(ctx.buf + osec->shdr.sh_offset);
  i64 num_entries = osec->shdr.sh_size / sizeof(Entry);
  // Entry's addresses are relative to themselves. In order to sort
  // records by addresses, we first translate them so that the addresses
  // are relative to the beginning of the section.
  auto is_relative = [](u32 val) {
    return val != EXIDX_CANTUNWIND && !(val & 0x8000'0000);
  };
  tbb::parallel_for((i64)0, num_entries, [&](i64 i) {
    i64 offset = sizeof(Entry) * i;
    ent[i].addr = sign_extend(ent[i].addr, 30) + offset;
    if (is_relative(ent[i].val))
      ent[i].val = 0x7fff'ffff & (ent[i].val + offset);
  });
  tbb::parallel_sort(ent, ent + num_entries, [](const Entry &a, const Entry &b) {
    return a.addr < b.addr;
  });
  // Make addresses relative to themselves.
  tbb::parallel_for((i64)0, num_entries, [&](i64 i) {
    i64 offset = sizeof(Entry) * i;
    ent[i].addr = 0x7fff'ffff & (ent[i].addr - offset);
    if (is_relative(ent[i].val))
      ent[i].val = 0x7fff'ffff & (ent[i].val - offset);
  });
  // .ARM.exidx's sh_link should be set to the .text section index.
  // Runtime doesn't care about it, but the binutils's strip command does.
  if (ctx.shdr) {
    if (Chunk<E> *text = find_section(ctx, ".text")) {
      osec->shdr.sh_link = text->shndx;
      ctx.shdr->copy_buf(ctx);
    }
  }
}
} // namespace mold::elf

595
third_party/mold/elf/arch-arm64.cc vendored Normal file
View file

@ -0,0 +1,595 @@
// clang-format off
// This file contains ARM64-specific code. Being new, the ARM64's ELF
// psABI doesn't have anything peculiar. ARM64 is a clean RISC
// instruction set that supports PC-relative load/store instructions.
//
// Unlike ARM32, instruction length doesn't vary. All ARM64
// instructions are 4 bytes long.
//
// Branch instructions used for function call can jump within ±128 MiB.
// We need to create range extension thunks to support binaries whose
// .text is larger than that.
//
// Unlike most other targets, the TLSDESC access model is used by default
// for -fPIC to access thread-local variables instead of the less
// efficient GD model. You can still enable GD but it needs the
// -mtls-dialect=trad flag. Since GD is used rarely, we don't need to
// implement GD → LE relaxation.
//
// https://github.com/ARM-software/abi-aa/blob/main/aaelf64/aaelf64.rst
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = ARM64;
// Sets the immlo (bits 30:29) and immhi (bits 23:5) fields of an ADRP
// instruction to the page displacement `val`.
static void write_adrp(u8 *buf, u64 val) {
  u32 immlo = bits(val, 13, 12);
  u32 immhi = bits(val, 32, 14);
  *(ul32 *)buf |= (immlo << 29) | (immhi << 5);
}
// Sets the immlo/immhi fields of an ADR instruction to the byte
// displacement `val`.
static void write_adr(u8 *buf, u64 val) {
  u32 immlo = bits(val, 1, 0);
  u32 immhi = bits(val, 20, 2);
  *(ul32 *)buf |= (immlo << 29) | (immhi << 5);
}
// Rewrites the MOV-family instruction at `buf` to MOVZ (for a
// non-negative value) or MOVN (for a negative one, which stores the
// bitwise complement), keeping only the shift and register fields.
static void write_movn_movz(u8 *buf, i64 val) {
  *(ul32 *)buf &= 0b0000'0000'0110'0000'0000'0000'0001'1111;
  bool negative = val < 0;
  u32 opcode = negative ? 0x9280'0000 : 0xd280'0000; // movn : movz
  i64 imm = negative ? ~val : val;
  *(ul32 *)buf |= opcode | (bits(imm, 15, 0) << 5);
}
// Rounds `val` down to the start of its 4 KiB page.
static u64 page(u64 val) {
  return val & ~(u64)0xfff;
}
// Writes the PLT header. It saves x16/x30, materializes the address of
// .got.plt[2] (the adrp/ldr/add immediates are patched below), loads
// the resolver address from that slot and branches to it.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
  static const ul32 insn[] = {
    0xa9bf'7bf0, // stp x16, x30, [sp,#-16]!
    0x9000'0010, // adrp x16, .got.plt[2]
    0xf940'0211, // ldr x17, [x16, .got.plt[2]]
    0x9100'0210, // add x16, x16, .got.plt[2]
    0xd61f'0220, // br x17
    0xd503'201f, // nop
    0xd503'201f, // nop
    0xd503'201f, // nop
  };
  u64 gotplt = ctx.gotplt->shdr.sh_addr + 16; // address of .got.plt[2]
  u64 plt = ctx.plt->shdr.sh_addr;
  memcpy(buf, insn, sizeof(insn));
  write_adrp(buf + 4, page(gotplt) - page(plt + 4));
  *(ul32 *)(buf + 8) |= bits(gotplt, 11, 3) << 10; // ldr's scaled imm12
  *(ul32 *)(buf + 12) |= (gotplt & 0xfff) << 10; // add's imm12
}
// Writes the PLT entry for `sym`: adrp/add compute the address of the
// symbol's .got.plt slot in x16, ldr loads its content into x17, and
// we branch there.
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  static const ul32 insn[] = {
    0x9000'0010, // adrp x16, .got.plt[n]
    0xf940'0211, // ldr x17, [x16, .got.plt[n]]
    0x9100'0210, // add x16, x16, .got.plt[n]
    0xd61f'0220, // br x17
  };
  u64 gotplt = sym.get_gotplt_addr(ctx);
  u64 plt = sym.get_plt_addr(ctx);
  memcpy(buf, insn, sizeof(insn));
  write_adrp(buf, page(gotplt) - page(plt));
  *(ul32 *)(buf + 4) |= bits(gotplt, 11, 3) << 10; // ldr's scaled imm12
  *(ul32 *)(buf + 8) |= (gotplt & 0xfff) << 10; // add's imm12
}
// Writes a PLT entry that jumps through the symbol's regular GOT slot
// (instead of a .got.plt slot): adrp/ldr load the target address into
// x17 and we branch to it.
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  static const ul32 insn[] = {
    0x9000'0010, // adrp x16, GOT[n]
    0xf940'0211, // ldr x17, [x16, GOT[n]]
    0xd61f'0220, // br x17
    0xd503'201f, // nop
  };
  u64 got = sym.get_got_addr(ctx);
  u64 plt = sym.get_plt_addr(ctx);
  memcpy(buf, insn, sizeof(insn));
  write_adrp(buf, page(got) - page(plt));
  *(ul32 *)(buf + 4) |= bits(got, 11, 3) << 10; // ldr's scaled imm12
}
// Applies a relocation within the .eh_frame section. `offset` is
// relative to the section start and `val` is the already-resolved
// target value; only the data relocation types that appear in
// .eh_frame are supported.
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
  u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
  switch (rel.r_type) {
  case R_NONE:
    break;
  case R_AARCH64_ABS64:
    *(ul64 *)loc = val;
    break;
  case R_AARCH64_PREL32:
    *(ul32 *)loc = val - this->shdr.sh_addr - offset;
    break;
  case R_AARCH64_PREL64:
    *(ul64 *)loc = val - this->shdr.sh_addr - offset;
    break;
  default:
    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
  }
}
// Returns true if the instruction at `loc` is an ADRP. The mask
// ignores the immlo bits (30:29), which vary with the immediate.
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADRP--Form-PC-relative-address-to-4KB-page-
static bool is_adrp(u8 *loc) {
  return (bits(*(ul32 *)loc, 31, 24) & 0b1001'1111) == 0b1001'0000;
}
// Returns true if the instruction at `loc` is an LDR (immediate).
// The mask ignores the instruction bits that belong to the immediate.
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
static bool is_ldr(u8 *loc) {
  return (bits(*(ul32 *)loc, 31, 20) & 0b1111'1111'1100) == 0b1111'1001'0100;
}
// Returns true if the instruction at `loc` is an ADD (immediate).
// The mask ignores the instruction bits that belong to the immediate.
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADD--immediate---Add--immediate--
static bool is_add(u8 *loc) {
  return (bits(*(ul32 *)loc, 31, 20) & 0b1111'1111'1100) == 0b1001'0001'0000;
}
// Applies this section's relocations to the output buffer (`base`
// points at this section's bytes in the output). Naming follows the
// AArch64 ELF convention: S = symbol address, A = addend, P = place
// (address being relocated), G = GOT slot offset, GOT = GOT base.
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  ElfRel<E> *dynrel = nullptr;
  if (ctx.reldyn)
    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE)
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // Reports an error if `val` does not fit in [lo, hi).
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
    };
    u64 S = sym.get_addr(ctx);
    u64 A = rel.r_addend;
    u64 P = get_addr() + rel.r_offset;
    u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
    u64 GOT = ctx.got->shdr.sh_addr;
    switch (rel.r_type) {
    case R_AARCH64_ABS64:
      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
      break;
    // LDSTn_ABS_LO12_NC: low 12 bits of the address, scaled by the
    // access size n/8, written to the load/store immediate field.
    case R_AARCH64_LDST8_ABS_LO12_NC:
    case R_AARCH64_ADD_ABS_LO12_NC:
      *(ul32 *)loc |= bits(S + A, 11, 0) << 10;
      break;
    case R_AARCH64_LDST16_ABS_LO12_NC:
      *(ul32 *)loc |= bits(S + A, 11, 1) << 10;
      break;
    case R_AARCH64_LDST32_ABS_LO12_NC:
      *(ul32 *)loc |= bits(S + A, 11, 2) << 10;
      break;
    case R_AARCH64_LDST64_ABS_LO12_NC:
      *(ul32 *)loc |= bits(S + A, 11, 3) << 10;
      break;
    case R_AARCH64_LDST128_ABS_LO12_NC:
      *(ul32 *)loc |= bits(S + A, 11, 4) << 10;
      break;
    // MOVW_UABS_Gn: 16-bit chunk n of the absolute address; the _NC
    // variants skip the overflow check.
    case R_AARCH64_MOVW_UABS_G0:
      check(S + A, 0, 1 << 16);
      *(ul32 *)loc |= bits(S + A, 15, 0) << 5;
      break;
    case R_AARCH64_MOVW_UABS_G0_NC:
      *(ul32 *)loc |= bits(S + A, 15, 0) << 5;
      break;
    case R_AARCH64_MOVW_UABS_G1:
      check(S + A, 0, 1LL << 32);
      *(ul32 *)loc |= bits(S + A, 31, 16) << 5;
      break;
    case R_AARCH64_MOVW_UABS_G1_NC:
      *(ul32 *)loc |= bits(S + A, 31, 16) << 5;
      break;
    case R_AARCH64_MOVW_UABS_G2:
      check(S + A, 0, 1LL << 48);
      *(ul32 *)loc |= bits(S + A, 47, 32) << 5;
      break;
    case R_AARCH64_MOVW_UABS_G2_NC:
      *(ul32 *)loc |= bits(S + A, 47, 32) << 5;
      break;
    case R_AARCH64_MOVW_UABS_G3:
      *(ul32 *)loc |= bits(S + A, 63, 48) << 5;
      break;
    case R_AARCH64_ADR_GOT_PAGE:
      if (sym.has_got(ctx)) {
        i64 val = page(G + GOT + A) - page(P);
        check(val, -(1LL << 32), 1LL << 32);
        write_adrp(loc, val);
      } else {
        // Relax GOT-loading ADRP+LDR to an immediate ADRP+ADD
        // (the matching LDR reloc is consumed here, hence i++).
        i64 val = page(S + A) - page(P);
        check(val, -(1LL << 32), 1LL << 32);
        write_adrp(loc, val);
        u32 reg = bits(*(ul32 *)loc, 4, 0);
        *(ul32 *)(loc + 4) = 0x9100'0000 | (reg << 5) | reg; // ADD
        *(ul32 *)(loc + 4) |= bits(S + A, 11, 0) << 10;
        i++;
      }
      break;
    case R_AARCH64_ADR_PREL_PG_HI21: {
      // The ARM64 psABI defines that an `ADRP x0, foo` and `ADD x0, x0,
      // :lo12: foo` instruction pair to materialize a PC-relative address
      // in a register can be relaxed to `NOP` followed by `ADR x0, foo`
      // if foo is in PC ± 1 MiB.
      if (ctx.arg.relax && i + 1 < rels.size() &&
          sign_extend(S + A - P - 4, 20) == S + A - P - 4) {
        const ElfRel<E> &rel2 = rels[i + 1];
        if (rel2.r_type == R_AARCH64_ADD_ABS_LO12_NC &&
            rel2.r_sym == rel.r_sym &&
            rel2.r_offset == rel.r_offset + 4 &&
            rel2.r_addend == rel.r_addend &&
            is_adrp(loc) &&
            is_add(loc + 4)) {
          u32 reg1 = bits(*(ul32 *)loc, 4, 0);
          u32 reg2 = bits(*(ul32 *)(loc + 4), 4, 0);
          if (reg1 == reg2) {
            *(ul32 *)loc = 0xd503'201f; // nop
            *(ul32 *)(loc + 4) = 0x1000'0000 | reg1; // adr
            write_adr(loc + 4, S + A - P - 4);
            i++;
            break;
          }
        }
      }
      i64 val = page(S + A) - page(P);
      check(val, -(1LL << 32), 1LL << 32);
      write_adrp(loc, val);
      break;
    }
    case R_AARCH64_ADR_PREL_LO21:
      check(S + A - P, -(1LL << 20), 1LL << 20);
      write_adr(loc, S + A - P);
      break;
    case R_AARCH64_CALL26:
    case R_AARCH64_JUMP26: {
      if (sym.is_remaining_undef_weak()) {
        // On ARM, calling a weak undefined symbol jumps to the
        // next instruction.
        *(ul32 *)loc = 0xd503'201f; // nop
        break;
      }
      // Branches reach ±128 MiB; otherwise go through a range
      // extension thunk.
      i64 val = S + A - P;
      if (val < -(1 << 27) || (1 << 27) <= val)
        val = get_thunk_addr(i) + A - P;
      *(ul32 *)loc |= bits(val, 27, 2);
      break;
    }
    case R_AARCH64_PLT32:
      check(S + A - P, -(1LL << 31), 1LL << 31);
      *(ul32 *)loc = S + A - P;
      break;
    case R_AARCH64_CONDBR19:
    case R_AARCH64_LD_PREL_LO19:
      check(S + A - P, -(1LL << 20), 1LL << 20);
      *(ul32 *)loc |= bits(S + A - P, 20, 2) << 5;
      break;
    case R_AARCH64_PREL16:
      check(S + A - P, -(1LL << 15), 1LL << 15);
      *(ul16 *)loc = S + A - P;
      break;
    case R_AARCH64_PREL32:
      check(S + A - P, -(1LL << 31), 1LL << 32);
      *(ul32 *)loc = S + A - P;
      break;
    case R_AARCH64_PREL64:
      *(ul64 *)loc = S + A - P;
      break;
    case R_AARCH64_LD64_GOT_LO12_NC:
      *(ul32 *)loc |= bits(G + GOT + A, 11, 3) << 10;
      break;
    case R_AARCH64_LD64_GOTPAGE_LO15: {
      i64 val = G + GOT + A - page(GOT);
      check(val, 0, 1 << 15);
      *(ul32 *)loc |= bits(val, 14, 3) << 10;
      break;
    }
    case R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: {
      i64 val = page(sym.get_gottp_addr(ctx) + A) - page(P);
      check(val, -(1LL << 32), 1LL << 32);
      write_adrp(loc, val);
      break;
    }
    case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC:
      *(ul32 *)loc |= bits(sym.get_gottp_addr(ctx) + A, 11, 3) << 10;
      break;
    case R_AARCH64_TLSLE_MOVW_TPREL_G0: {
      i64 val = S + A - ctx.tp_addr;
      check(val, -(1 << 15), 1 << 15);
      write_movn_movz(loc, val);
      break;
    }
    case R_AARCH64_TLSLE_MOVW_TPREL_G0_NC:
      *(ul32 *)loc |= bits(S + A - ctx.tp_addr, 15, 0) << 5;
      break;
    case R_AARCH64_TLSLE_MOVW_TPREL_G1: {
      i64 val = S + A - ctx.tp_addr;
      check(val, -(1LL << 31), 1LL << 31);
      write_movn_movz(loc, val >> 16);
      break;
    }
    case R_AARCH64_TLSLE_MOVW_TPREL_G1_NC:
      *(ul32 *)loc |= bits(S + A - ctx.tp_addr, 31, 16) << 5;
      break;
    case R_AARCH64_TLSLE_MOVW_TPREL_G2: {
      i64 val = S + A - ctx.tp_addr;
      check(val, -(1LL << 47), 1LL << 47);
      write_movn_movz(loc, val >> 32);
      break;
    }
    case R_AARCH64_TLSLE_ADD_TPREL_HI12: {
      i64 val = S + A - ctx.tp_addr;
      check(val, 0, 1LL << 24);
      *(ul32 *)loc |= bits(val, 23, 12) << 10;
      break;
    }
    case R_AARCH64_TLSLE_ADD_TPREL_LO12:
      check(S + A - ctx.tp_addr, 0, 1 << 12);
      *(ul32 *)loc |= bits(S + A - ctx.tp_addr, 11, 0) << 10;
      break;
    case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
      *(ul32 *)loc |= bits(S + A - ctx.tp_addr, 11, 0) << 10;
      break;
    case R_AARCH64_TLSGD_ADR_PAGE21: {
      i64 val = page(sym.get_tlsgd_addr(ctx) + A) - page(P);
      check(val, -(1LL << 32), 1LL << 32);
      write_adrp(loc, val);
      break;
    }
    case R_AARCH64_TLSGD_ADD_LO12_NC:
      *(ul32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A, 11, 0) << 10;
      break;
    // TLSDESC relocations: if the symbol has no TLSDESC entry, the
    // access has been relaxed to the LE model, so rewrite the TLSDESC
    // instruction sequence to compute the TP offset directly.
    case R_AARCH64_TLSDESC_ADR_PAGE21:
      if (sym.has_tlsdesc(ctx)) {
        i64 val = page(sym.get_tlsdesc_addr(ctx) + A) - page(P);
        check(val, -(1LL << 32), 1LL << 32);
        write_adrp(loc, val);
      } else {
        // adrp x0, 0 -> movz x0, #tls_offset_hi, lsl #16
        i64 val = (S + A - ctx.tp_addr);
        check(val, -(1LL << 32), 1LL << 32);
        *(ul32 *)loc = 0xd2a0'0000 | (bits(val, 32, 16) << 5);
      }
      break;
    case R_AARCH64_TLSDESC_LD64_LO12:
      if (sym.has_tlsdesc(ctx)) {
        *(ul32 *)loc |= bits(sym.get_tlsdesc_addr(ctx) + A, 11, 3) << 10;
      } else {
        // ldr x2, [x0] -> movk x0, #tls_offset_lo
        u32 offset_lo = (S + A - ctx.tp_addr) & 0xffff;
        *(ul32 *)loc = 0xf280'0000 | (offset_lo << 5);
      }
      break;
    case R_AARCH64_TLSDESC_ADD_LO12:
      if (sym.has_tlsdesc(ctx)) {
        *(ul32 *)loc |= bits(sym.get_tlsdesc_addr(ctx) + A, 11, 0) << 10;
      } else {
        // add x0, x0, #0 -> nop
        *(ul32 *)loc = 0xd503'201f;
      }
      break;
    case R_AARCH64_TLSDESC_CALL:
      if (!sym.has_tlsdesc(ctx)) {
        // blr x2 -> nop
        *(ul32 *)loc = 0xd503'201f;
      }
      break;
    default:
      unreachable();
    }
  }
}
// Applies relocations to a non-SHF_ALLOC section (e.g. debug info).
// Only plain data relocation types are meaningful here. References to
// dead section fragments are replaced with tombstone values.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // Reports an error if `val` does not fit in [lo, hi).
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
    };
    // If the relocation refers a mergeable section fragment, use the
    // fragment's address and addend instead of the symbol's.
    SectionFragment<E> *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);
    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : (i64)rel.r_addend;
    switch (rel.r_type) {
    case R_AARCH64_ABS64:
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ul64 *)loc = *val;
      else
        *(ul64 *)loc = S + A;
      break;
    case R_AARCH64_ABS32: {
      i64 val = S + A;
      check(val, 0, 1LL << 32);
      *(ul32 *)loc = val;
      break;
    }
    default:
      Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
      break;
    }
  }
}
// Scans this section's relocations to record, before layout is
// finalized, which auxiliary entries each referenced symbol needs
// (GOT/PLT slots, TLS GD/IE/DESC entries, dynamic relocations), and to
// detect instruction pairs eligible for relaxation.
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);
  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  // Scan relocations
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = (u8 *)(contents.data() + rel.r_offset);
    // An ifunc symbol is always materialized through GOT and PLT.
    if (sym.is_ifunc())
      sym.flags |= NEEDS_GOT | NEEDS_PLT;
    switch (rel.r_type) {
    case R_AARCH64_ABS64:
      scan_dyn_absrel(ctx, sym, rel);
      break;
    case R_AARCH64_ADR_GOT_PAGE:
      // An ADR_GOT_PAGE and GOT_LO12_NC relocation pair is used to load a
      // symbol's address from GOT. If the GOT value is a link-time
      // constant, we may be able to rewrite the ADRP+LDR instruction pair
      // with an ADRP+ADD, eliminating a GOT memory load.
      if (ctx.arg.relax && sym.is_relative() && !sym.is_imported &&
          !sym.is_ifunc() && i + 1 < rels.size()) {
        // ADRP+LDR must be consecutive and use the same register to relax.
        const ElfRel<E> &rel2 = rels[i + 1];
        if (rel2.r_type == R_AARCH64_LD64_GOT_LO12_NC &&
            rel2.r_offset == rel.r_offset + 4 &&
            rel2.r_sym == rel.r_sym &&
            rel.r_addend == 0 &&
            rel2.r_addend == 0 &&
            is_adrp(loc) &&
            is_ldr(loc + 4)) {
          u32 rd = bits(*(ul32 *)loc, 4, 0);
          u32 rn = bits(*(ul32 *)(loc + 4), 9, 5);
          u32 rt = bits(*(ul32 *)(loc + 4), 4, 0);
          if (rd == rn && rn == rt) {
            // Relaxable: no GOT entry needed; skip the paired reloc.
            i++;
            break;
          }
        }
      }
      sym.flags |= NEEDS_GOT;
      break;
    case R_AARCH64_LD64_GOT_LO12_NC:
    case R_AARCH64_LD64_GOTPAGE_LO15:
      sym.flags |= NEEDS_GOT;
      break;
    case R_AARCH64_CALL26:
    case R_AARCH64_JUMP26:
    case R_AARCH64_PLT32:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;
    case R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21:
    case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC:
      sym.flags |= NEEDS_GOTTP;
      break;
    case R_AARCH64_ADR_PREL_PG_HI21:
      scan_pcrel(ctx, sym, rel);
      break;
    case R_AARCH64_TLSGD_ADR_PAGE21:
      sym.flags |= NEEDS_TLSGD;
      break;
    case R_AARCH64_TLSDESC_ADR_PAGE21:
    case R_AARCH64_TLSDESC_LD64_LO12:
    case R_AARCH64_TLSDESC_ADD_LO12:
      if (!relax_tlsdesc(ctx, sym))
        sym.flags |= NEEDS_TLSDESC;
      break;
    case R_AARCH64_TLSLE_MOVW_TPREL_G0:
    case R_AARCH64_TLSLE_MOVW_TPREL_G0_NC:
    case R_AARCH64_TLSLE_MOVW_TPREL_G1:
    case R_AARCH64_TLSLE_MOVW_TPREL_G1_NC:
    case R_AARCH64_TLSLE_MOVW_TPREL_G2:
    case R_AARCH64_TLSLE_ADD_TPREL_HI12:
    case R_AARCH64_TLSLE_ADD_TPREL_LO12:
    case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
      check_tlsle(ctx, sym, rel);
      break;
    // The following relocation types need no extra data structures;
    // they are fully resolved in apply_reloc_alloc.
    case R_AARCH64_ADD_ABS_LO12_NC:
    case R_AARCH64_ADR_PREL_LO21:
    case R_AARCH64_CONDBR19:
    case R_AARCH64_LD_PREL_LO19:
    case R_AARCH64_LDST16_ABS_LO12_NC:
    case R_AARCH64_LDST32_ABS_LO12_NC:
    case R_AARCH64_LDST64_ABS_LO12_NC:
    case R_AARCH64_LDST128_ABS_LO12_NC:
    case R_AARCH64_LDST8_ABS_LO12_NC:
    case R_AARCH64_MOVW_UABS_G0:
    case R_AARCH64_MOVW_UABS_G0_NC:
    case R_AARCH64_MOVW_UABS_G1:
    case R_AARCH64_MOVW_UABS_G1_NC:
    case R_AARCH64_MOVW_UABS_G2:
    case R_AARCH64_MOVW_UABS_G2_NC:
    case R_AARCH64_MOVW_UABS_G3:
    case R_AARCH64_PREL16:
    case R_AARCH64_PREL32:
    case R_AARCH64_PREL64:
    case R_AARCH64_TLSGD_ADD_LO12_NC:
    case R_AARCH64_TLSDESC_CALL:
      break;
    default:
      Error(ctx) << *this << ": unknown relocation: " << rel;
    }
  }
}
// Writes one range-extension thunk entry per symbol. Each entry
// materializes the destination address in x16 with the adrp/add pair
// and branches to it, extending the reach of B/BL beyond ±128 MiB.
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
  static const ul32 data[] = {
    0x9000'0010, // adrp x16, 0   # R_AARCH64_ADR_PREL_PG_HI21
    0x9100'0210, // add x16, x16  # R_AARCH64_ADD_ABS_LO12_NC
    0xd61f'0200, // br x16
  };
  static_assert(E::thunk_size == sizeof(data));
  u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
  for (i64 idx = 0; idx < symbols.size(); idx++) {
    u8 *loc = buf + idx * E::thunk_size;
    memcpy(loc, data, sizeof(data));
    u64 S = symbols[idx]->get_addr(ctx);
    u64 P = output_section.shdr.sh_addr + offset + idx * E::thunk_size;
    write_adrp(loc, page(S) - page(P));
    *(ul32 *)(loc + 4) |= bits(S, 11, 0) << 10;
  }
}
} // namespace mold::elf

565
third_party/mold/elf/arch-i386.cc vendored Normal file
View file

@ -0,0 +1,565 @@
// clang-format off
// i386 is similar to x86-64 but lacks PC-relative memory access
// instructions. So it's not straightforward to support position-
// independent code (PIC) on that target.
//
// If an object file is compiled with -fPIC, a function that needs to load
// a value from memory first obtains its own address with the following
// code
//
// call __x86.get_pc_thunk.bx
//
// where __x86.get_pc_thunk.bx is defined as
//
// __x86.get_pc_thunk.bx:
// mov (%esp), %ebx # move the return address to %ebx
// ret
//
// . With the function's own address (or, more precisely, the address
// immediately after the call instruction), the function can compute an
// absolute address of a variable with its address + link-time constant.
//
// Executing call-mov-ret isn't very cheap, and allocating one register to
// store PC isn't cheap too, especially given that i386 has only 8
// general-purpose registers. But that's the cost of PIC on i386. You need
// to pay it when creating a .so and a position-independent executable.
//
// When a position-independent function calls another function, it sets
// %ebx to the address of .got. Position-independent PLT entries use that
// register to load values from .got.plt/.got.
//
// If we are creating a position-dependent executable (PDE), we can't
// assume that %ebx is set to .got. For PDE, we need to create position-
// dependent PLT entries which don't use %ebx.
//
// https://github.com/rui314/psabi/blob/main/i386.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = I386;
// Read the addend embedded at the relocated location. i386 uses REL-style
// relocations (no r_addend field), so the addend is stored inline in the
// section contents. The width we read depends on the relocation type.
template <>
i64 get_addend(u8 *loc, const ElfRel<E> &rel) {
  switch (rel.r_type) {
  // 8-bit relocations
  case R_386_8:
  case R_386_PC8:
    return *loc;
  // 16-bit little-endian relocations
  case R_386_16:
  case R_386_PC16:
    return *(ul16 *)loc;
  // 32-bit little-endian relocations
  case R_386_32:
  case R_386_PC32:
  case R_386_GOT32:
  case R_386_GOT32X:
  case R_386_PLT32:
  case R_386_GOTOFF:
  case R_386_GOTPC:
  case R_386_TLS_LDM:
  case R_386_TLS_GOTIE:
  case R_386_TLS_LE:
  case R_386_TLS_IE:
  case R_386_TLS_GD:
  case R_386_TLS_LDO_32:
  case R_386_SIZE32:
  case R_386_TLS_GOTDESC:
    return *(ul32 *)loc;
  default:
    // Relocation types with no in-place addend (e.g. R_386_NONE,
    // R_386_TLS_DESC_CALL, which only marks an instruction).
    return 0;
  }
}
// Write an addend back to the relocated location. This is the inverse of
// get_addend() above; the two switch statements must stay in sync so that
// a value read by one can be round-tripped by the other.
template <>
void write_addend(u8 *loc, i64 val, const ElfRel<E> &rel) {
  switch (rel.r_type) {
  case R_386_NONE:
    break;
  // 8-bit relocations
  case R_386_8:
  case R_386_PC8:
    *loc = val;
    break;
  // 16-bit little-endian relocations
  case R_386_16:
  case R_386_PC16:
    *(ul16 *)loc = val;
    break;
  // 32-bit little-endian relocations
  case R_386_32:
  case R_386_PC32:
  case R_386_GOT32:
  case R_386_GOT32X:
  case R_386_PLT32:
  case R_386_GOTOFF:
  case R_386_GOTPC:
  case R_386_TLS_LDM:
  case R_386_TLS_GOTIE:
  case R_386_TLS_LE:
  case R_386_TLS_IE:
  case R_386_TLS_GD:
  case R_386_TLS_LDO_32:
  case R_386_SIZE32:
  case R_386_TLS_GOTDESC:
    *(ul32 *)loc = val;
    break;
  default:
    unreachable();
  }
}
// Write the PLT header, which all PLT entries tail-call into for lazy
// symbol resolution. The PIC variant addresses GOTPLT relative to %ebx
// (callers of PIC PLT entries set %ebx to the GOT base); the non-PIC
// variant uses an absolute address.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
  if (ctx.arg.pic) {
    static const u8 insn[] = {
      0xf3, 0x0f, 0x1e, 0xfb, // endbr32
      0x51,                   // push %ecx
      0x8d, 0x8b, 0, 0, 0, 0, // lea GOTPLT+4(%ebx), %ecx
      0xff, 0x31,             // push (%ecx)
      0xff, 0x61, 0x04,       // jmp *0x4(%ecx)
    };
    memcpy(buf, insn, sizeof(insn));
    // Patch the lea's disp32 (byte offset 7) with GOTPLT+4 relative to GOT.
    *(ul32 *)(buf + 7) = ctx.gotplt->shdr.sh_addr - ctx.got->shdr.sh_addr + 4;
  } else {
    static const u8 insn[] = {
      0xf3, 0x0f, 0x1e, 0xfb, // endbr32
      0x51,                   // push %ecx
      0xb9, 0, 0, 0, 0,       // mov GOTPLT+4, %ecx
      0xff, 0x31,             // push (%ecx)
      0xff, 0x61, 0x04,       // jmp *0x4(%ecx)
      0xcc,                   // (padding)
    };
    memcpy(buf, insn, sizeof(insn));
    // Patch the mov's imm32 (byte offset 6) with the absolute GOTPLT+4.
    *(ul32 *)(buf + 6) = ctx.gotplt->shdr.sh_addr + 4;
  }
}
// Write one PLT entry. It loads the entry's relocation offset into %ecx
// (consumed by the lazy resolver reached via the GOTPLT slot) and jumps
// through the symbol's GOTPLT slot, %ebx-relative in the PIC case.
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  if (ctx.arg.pic) {
    static const u8 insn[] = {
      0xf3, 0x0f, 0x1e, 0xfb, // endbr32
      0xb9, 0, 0, 0, 0,       // mov $reloc_offset, %ecx
      0xff, 0xa3, 0, 0, 0, 0, // jmp *foo@GOT(%ebx)
      0xcc,                   // (padding)
    };
    memcpy(buf, insn, sizeof(insn));
    *(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>); // offset into .rel.plt
    *(ul32 *)(buf + 11) = sym.get_gotplt_addr(ctx) - ctx.got->shdr.sh_addr;
  } else {
    static const u8 insn[] = {
      0xf3, 0x0f, 0x1e, 0xfb, // endbr32
      0xb9, 0, 0, 0, 0,       // mov $reloc_offset, %ecx
      0xff, 0x25, 0, 0, 0, 0, // jmp *foo@GOT
      0xcc,                   // (padding)
    };
    memcpy(buf, insn, sizeof(insn));
    *(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>); // offset into .rel.plt
    *(ul32 *)(buf + 11) = sym.get_gotplt_addr(ctx);
  }
}
// Write a PLT entry for a symbol that already has a regular GOT slot
// (a "PLT-GOT" entry). No lazy resolution is needed, so it is just an
// indirect jump through the GOT, %ebx-relative in the PIC case.
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  if (ctx.arg.pic) {
    static const u8 insn[] = {
      0xf3, 0x0f, 0x1e, 0xfb,             // endbr32
      0xff, 0xa3, 0, 0, 0, 0,             // jmp *foo@GOT(%ebx)
      0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // (padding)
    };
    memcpy(buf, insn, sizeof(insn));
    *(ul32 *)(buf + 6) = sym.get_got_addr(ctx) - ctx.got->shdr.sh_addr;
  } else {
    static const u8 insn[] = {
      0xf3, 0x0f, 0x1e, 0xfb,             // endbr32
      0xff, 0x25, 0, 0, 0, 0,             // jmp *foo@GOT
      0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // (padding)
    };
    memcpy(buf, insn, sizeof(insn));
    *(ul32 *)(buf + 6) = sym.get_got_addr(ctx);
  }
}
// Apply a relocation against .eh_frame. Only absolute and PC-relative
// 32-bit relocations are expected there; anything else is a fatal error.
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
                                    u64 offset, u64 val) {
  u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
  switch (rel.r_type) {
  case R_NONE:
    break;
  case R_386_32:
    *(ul32 *)loc = val;
    break;
  case R_386_PC32:
    // PC-relative: subtract the runtime address of the relocated place.
    *(ul32 *)loc = val - this->shdr.sh_addr - offset;
    break;
  default:
    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
  }
}
// GOT32X relaxation helper: a `mov mem, reg` (opcode 0x8b) that loads a
// GOT slot can be rewritten into a `lea` (opcode 0x8d) that computes the
// symbol address directly, reusing the original ModR/M byte. Returns the
// two replacement bytes packed as (opcode << 8) | modrm, or 0 if the
// instruction is not a relaxable mov.
static u32 relax_got32x(u8 *loc) {
  // mov imm(%reg1), %reg2 -> lea imm(%reg1), %reg2
  const u8 opcode = loc[0];
  const u8 modrm = loc[1];
  if (opcode != 0x8b)
    return 0;
  return 0x8d00 | modrm;
}
// Relax TLS General Dynamic to Local Exec: replace the GD code sequence
// (address computation + __tls_get_addr call) with a direct computation
// of the thread-pointer-relative address. `loc` points at the operand of
// the __tls_get_addr call relocation; `rel` is that following relocation,
// whose type determines where the GD sequence begins relative to `loc`
// (per the psABI-defined code sequences), and `val` is the TP offset.
static void relax_gd_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
  static const u8 insn[] = {
    0x65, 0xa1, 0, 0, 0, 0, // mov %gs:0, %eax
    0x81, 0xc0, 0, 0, 0, 0, // add $tp_offset, %eax
  };
  switch (rel.r_type) {
  case R_386_PLT32:
  case R_386_PC32:
    // Direct call form: the sequence starts 3 bytes before the operand.
    memcpy(loc - 3, insn, sizeof(insn));
    *(ul32 *)(loc + 5) = val;
    break;
  case R_386_GOT32:
  case R_386_GOT32X:
    // Indirect (GOT) call form: the sequence starts 2 bytes earlier.
    memcpy(loc - 2, insn, sizeof(insn));
    *(ul32 *)(loc + 6) = val;
    break;
  default:
    unreachable();
  }
}
// Relax TLS Local Dynamic to Local Exec: replace the LD code sequence
// (module-base computation via __tls_get_addr) with a direct computation
// from the thread pointer. `val` is the distance from the TLS block start
// to the thread pointer (subtracted so that subsequent DTPOFF-relative
// accesses still land in the right place).
static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
  switch (rel.r_type) {
  case R_386_PLT32:
  case R_386_PC32: {
    // Direct call form of the __tls_get_addr call.
    static const u8 insn[] = {
      0x65, 0xa1, 0, 0, 0, 0, // mov %gs:0, %eax
      0x2d, 0, 0, 0, 0,       // sub $tls_size, %eax
    };
    memcpy(loc - 2, insn, sizeof(insn));
    *(ul32 *)(loc + 5) = val;
    break;
  }
  case R_386_GOT32:
  case R_386_GOT32X: {
    // Indirect (GOT) call form; one byte longer, so pad with a nop.
    static const u8 insn[] = {
      0x65, 0xa1, 0, 0, 0, 0, // mov %gs:0, %eax
      0x2d, 0, 0, 0, 0,       // sub $tls_size, %eax
      0x90,                   // nop
    };
    memcpy(loc - 2, insn, sizeof(insn));
    *(ul32 *)(loc + 5) = val;
    break;
  }
  default:
    unreachable();
  }
}
// Apply relocations to a SHF_ALLOC section being copied into the output.
// Decisions made earlier by scan_relocations() (GOT/PLT flags, relaxation
// choices) are assumed here; the two functions must agree on which relocs
// get relaxed.
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  // Cursor into this section's slice of .rel.dyn for dynamic relocations
  // emitted by apply_dyn_absrel().
  ElfRel<E> *dynrel = nullptr;
  if (ctx.reldyn)
    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
                           file.reldyn_offset + this->reldyn_offset);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE)
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // Report values that don't fit in the relocated field.
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
                   << sym << " out of range: " << val << " is not in ["
                   << lo << ", " << hi << ")";
    };
    // Standard psABI notation:
    //   S = symbol address, A = addend (inline; i386 uses REL),
    //   P = address of the relocated place, G = GOT slot offset,
    //   GOT = GOT base address.
    u64 S = sym.get_addr(ctx);
    u64 A = get_addend(*this, rel);
    u64 P = get_addr() + rel.r_offset;
    u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
    u64 GOT = ctx.got->shdr.sh_addr;
    switch (rel.r_type) {
    case R_386_8:
      check(S + A, 0, 1 << 8);
      *loc = S + A;
      break;
    case R_386_16:
      check(S + A, 0, 1 << 16);
      *(ul16 *)loc = S + A;
      break;
    case R_386_32:
      // May need a dynamic relocation if the value isn't link-time known.
      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
      break;
    case R_386_PC8:
      check(S + A - P, -(1 << 7), 1 << 7);
      *loc = S + A - P;
      break;
    case R_386_PC16:
      check(S + A - P, -(1 << 15), 1 << 15);
      *(ul16 *)loc = S + A - P;
      break;
    case R_386_PC32:
    case R_386_PLT32:
      *(ul32 *)loc = S + A - P;
      break;
    case R_386_GOT32:
      *(ul32 *)loc = G + A;
      break;
    case R_386_GOT32X:
      if (sym.has_got(ctx)) {
        *(ul32 *)loc = G + A;
      } else {
        // scan_relocations() decided to relax this mov into a lea;
        // rewrite the two opcode bytes and store a GOT-relative value.
        u32 insn = relax_got32x(loc - 2);
        assert(insn);
        loc[-2] = insn >> 8;
        loc[-1] = insn;
        *(ul32 *)loc = S + A - GOT;
      }
      break;
    case R_386_GOTOFF:
      *(ul32 *)loc = S + A - GOT;
      break;
    case R_386_GOTPC:
      *(ul32 *)loc = GOT + A - P;
      break;
    case R_386_TLS_GOTIE:
      *(ul32 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
      break;
    case R_386_TLS_LE:
      *(ul32 *)loc = S + A - ctx.tp_addr;
      break;
    case R_386_TLS_IE:
      *(ul32 *)loc = sym.get_gottp_addr(ctx) + A;
      break;
    case R_386_TLS_GD:
      if (sym.has_tlsgd(ctx)) {
        *(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
      } else {
        // Relaxed GD->LE; the following reloc (the __tls_get_addr call)
        // is consumed as part of the rewritten sequence.
        relax_gd_to_le(loc, rels[i + 1], S - ctx.tp_addr);
        i++;
      }
      break;
    case R_386_TLS_LDM:
      if (ctx.got->has_tlsld(ctx)) {
        *(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
      } else {
        // Relaxed LD->LE; likewise skips the paired call reloc.
        relax_ld_to_le(loc, rels[i + 1], ctx.tp_addr - ctx.tls_begin);
        i++;
      }
      break;
    case R_386_TLS_LDO_32:
      *(ul32 *)loc = S + A - ctx.dtp_addr;
      break;
    case R_386_SIZE32:
      *(ul32 *)loc = sym.esym().st_size + A;
      break;
    case R_386_TLS_GOTDESC:
      if (sym.has_tlsdesc(ctx)) {
        *(ul32 *)loc = sym.get_tlsdesc_addr(ctx) + A - GOT;
      } else {
        // TLSDESC relaxed away: materialize the TP offset with a lea.
        static const u8 insn[] = {
          0x8d, 0x05, 0, 0, 0, 0, // lea 0, %eax
        };
        memcpy(loc - 2, insn, sizeof(insn));
        *(ul32 *)loc = S + A - ctx.tp_addr;
      }
      break;
    case R_386_TLS_DESC_CALL:
      if (!sym.has_tlsdesc(ctx)) {
        // The descriptor call is no longer needed:
        // call *(%eax) -> nop (2-byte 66 90)
        loc[0] = 0x66;
        loc[1] = 0x90;
      }
      break;
    default:
      unreachable();
    }
  }
}
// Apply relocations to a non-SHF_ALLOC section (e.g. debug info). These
// bytes are never mapped at runtime, so only link-time-computable values
// are allowed; no dynamic relocations or relaxations happen here.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // Report values that don't fit in the relocated field.
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
                   << sym << " out of range: " << val << " is not in ["
                   << lo << ", " << hi << ")";
    };
    // A reference into a mergeable section resolves to a fragment; use
    // the fragment's address/addend instead of the symbol's in that case.
    SectionFragment<E> *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);
    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : get_addend(*this, rel);
    u64 GOT = ctx.got->shdr.sh_addr;
    switch (rel.r_type) {
    case R_386_8:
      check(S + A, 0, 1 << 8);
      *loc = S + A;
      break;
    case R_386_16:
      check(S + A, 0, 1 << 16);
      *(ul16 *)loc = S + A;
      break;
    case R_386_32:
      // get_tombstone() yields a replacement value for references to
      // sections discarded by e.g. garbage collection.
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ul32 *)loc = *val;
      else
        *(ul32 *)loc = S + A;
      break;
    case R_386_PC8:
      check(S + A, -(1 << 7), 1 << 7);
      *loc = S + A;
      break;
    case R_386_PC16:
      check(S + A, -(1 << 15), 1 << 15);
      *(ul16 *)loc = S + A;
      break;
    case R_386_PC32:
      *(ul32 *)loc = S + A;
      break;
    case R_386_GOTPC:
      *(ul32 *)loc = GOT + A;
      break;
    case R_386_GOTOFF:
      *(ul32 *)loc = S + A - GOT;
      break;
    case R_386_TLS_LDO_32:
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ul32 *)loc = *val;
      else
        *(ul32 *)loc = S + A - ctx.dtp_addr;
      break;
    case R_386_SIZE32:
      *(ul32 *)loc = sym.esym().st_size + A;
      break;
    default:
      unreachable();
    }
  }
}
// Scan relocations to decide what each referenced symbol needs (GOT slot,
// PLT entry, TLS structures, ...) and which TLS sequences can be relaxed.
// The decisions recorded here must match what apply_reloc_alloc() does.
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);
  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  // Scan relocations
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = (u8 *)(contents.data() + rel.r_offset);
    // An ifunc is resolved through its PLT/GOT regardless of reloc type.
    if (sym.is_ifunc())
      sym.flags |= NEEDS_GOT | NEEDS_PLT;
    switch (rel.r_type) {
    case R_386_8:
    case R_386_16:
      scan_absrel(ctx, sym, rel);
      break;
    case R_386_32:
      scan_dyn_absrel(ctx, sym, rel);
      break;
    case R_386_PC8:
    case R_386_PC16:
    case R_386_PC32:
      scan_pcrel(ctx, sym, rel);
      break;
    case R_386_GOT32:
    case R_386_GOTPC:
      sym.flags |= NEEDS_GOT;
      break;
    case R_386_GOT32X: {
      // We always want to relax GOT32X because static PIE doesn't
      // work without it.
      bool do_relax = !sym.is_imported && sym.is_relative() &&
                      relax_got32x(loc - 2);
      if (!do_relax)
        sym.flags |= NEEDS_GOT;
      break;
    }
    case R_386_PLT32:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;
    case R_386_TLS_GOTIE:
    case R_386_TLS_IE:
      sym.flags |= NEEDS_GOTTP;
      break;
    case R_386_TLS_GD:
      // The GD sequence must be paired with a reloc for the
      // __tls_get_addr call so GD->LE relaxation can rewrite both.
      if (i + 1 == rels.size())
        Fatal(ctx) << *this << ": TLS_GD reloc must be followed by PLT or GOT32";
      if (u32 ty = rels[i + 1].r_type;
          ty != R_386_PLT32 && ty != R_386_PC32 &&
          ty != R_386_GOT32 && ty != R_386_GOT32X)
        Fatal(ctx) << *this << ": TLS_GD reloc must be followed by PLT or GOT32";
      // We always relax if -static because libc.a doesn't contain
      // __tls_get_addr().
      if (ctx.arg.is_static ||
          (ctx.arg.relax && !ctx.arg.shared && !sym.is_imported))
        i++;
      else
        sym.flags |= NEEDS_TLSGD;
      break;
    case R_386_TLS_LDM:
      // Same pairing requirement as TLS_GD, for LD->LE relaxation.
      if (i + 1 == rels.size())
        Fatal(ctx) << *this << ": TLS_LDM reloc must be followed by PLT or GOT32";
      if (u32 ty = rels[i + 1].r_type;
          ty != R_386_PLT32 && ty != R_386_PC32 &&
          ty != R_386_GOT32 && ty != R_386_GOT32X)
        Fatal(ctx) << *this << ": TLS_LDM reloc must be followed by PLT or GOT32";
      // We always relax if -static because libc.a doesn't contain
      // __tls_get_addr().
      if (ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared))
        i++;
      else
        ctx.needs_tlsld = true;
      break;
    case R_386_TLS_GOTDESC:
      if (!relax_tlsdesc(ctx, sym))
        sym.flags |= NEEDS_TLSDESC;
      break;
    case R_386_TLS_LE:
      check_tlsle(ctx, sym, rel);
      break;
    // Relocation types that need no preparation at scan time.
    case R_386_GOTOFF:
    case R_386_TLS_LDO_32:
    case R_386_SIZE32:
    case R_386_TLS_DESC_CALL:
      break;
    default:
      Error(ctx) << *this << ": unknown relocation: " << rel;
    }
  }
}
} // namespace mold::elf

326
third_party/mold/elf/arch-m68k.cc vendored Normal file
View file

@ -0,0 +1,326 @@
// clang-format off
// This file contains code for the Motorola 68000 series microprocessors,
// which is often abbreviated as m68k. Running a Unix-like system on a
// m68k-based machine today is probably a retro-computing hobby activity,
// but the processor was a popular choice to build Unix computers during
// '80s. Early Sun workstations for example used m68k. Macintosh until
// 1994 were based on m68k as well until they switched to PowerPC (and
// then to x86 and to ARM.)
//
// From the linker's point of view, it is not hard to support m68k. It's
// just a 32-bit big-endian CISC ISA. Compared to comtemporary i386,
// m68k's psABI is actually simpler because m68k has PC-relative memory
// access instructions and therefore can support position-independent
// code without too much hassle.
//
// https://github.com/rui314/psabi/blob/main/m68k.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = M68K;
// Write the PLT header. m68k has PC-relative addressing, so the GOTPLT is
// addressed relative to the PLT itself; the header pushes the resolver
// argument and jumps through GOTPLT[2]/GOTPLT[1] for lazy binding.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
  static const u8 insn[] = {
    0x2f, 0x00,                         // move.l %d0, -(%sp)
    0x2f, 0x3b, 0x01, 0x70, 0, 0, 0, 0, // move.l (GOTPLT+4, %pc), -(%sp)
    0x4e, 0xfb, 0x01, 0x71, 0, 0, 0, 0, // jmp ([GOTPLT+8, %pc])
  };
  memcpy(buf, insn, sizeof(insn));
  // Patch the two PC-relative displacements (big-endian 32-bit fields at
  // byte offsets 6 and 14).
  *(ub32 *)(buf + 6) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr;
  *(ub32 *)(buf + 14) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 4;
}
// Write one PLT entry: load the entry's .rela.plt offset into %d0 (used
// by the lazy resolver) and jump through the symbol's GOTPLT slot,
// addressed PC-relatively.
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  static const u8 insn[] = {
    0x20, 0x3c, 0, 0, 0, 0,             // move.l PLT_OFFSET, %d0
    0x4e, 0xfb, 0x01, 0x71, 0, 0, 0, 0, // jmp ([GOTPLT_ENTRY, %pc])
  };
  memcpy(buf, insn, sizeof(insn));
  *(ub32 *)(buf + 2) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
  // PC-relative displacement; the extension word's base is 8 bytes in.
  *(ub32 *)(buf + 10) = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 8;
}
// Write a PLT entry for a symbol that already has a GOT slot: a single
// PC-relative indirect jump through that slot (no lazy resolution).
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  static const u8 insn[] = {
    0x4e, 0xfb, 0x01, 0x71, 0, 0, 0, 0, // jmp ([GOT_ENTRY, %pc])
  };
  memcpy(buf, insn, sizeof(insn));
  // PC-relative displacement measured from 2 bytes into the instruction.
  *(ub32 *)(buf + 4) = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 2;
}
// Apply a relocation against .eh_frame. Only 32-bit absolute and
// PC-relative relocations are expected; values are big-endian on m68k.
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
                                    u64 offset, u64 val) {
  u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
  switch (rel.r_type) {
  case R_NONE:
    break;
  case R_68K_32:
    *(ub32 *)loc = val;
    break;
  case R_68K_PC32:
    // PC-relative: subtract the runtime address of the relocated place.
    *(ub32 *)loc = val - this->shdr.sh_addr - offset;
    break;
  default:
    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
  }
}
// Apply relocations to a SHF_ALLOC section. m68k offers each relocation
// in 32/16/8-bit flavors; the 16/8-bit writers below range-check first.
// All multi-byte values are written big-endian (ub16/ub32).
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  // Cursor into this section's slice of .rela.dyn for dynamic relocations
  // emitted by apply_dyn_absrel().
  ElfRel<E> *dynrel = nullptr;
  if (ctx.reldyn)
    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
                           file.reldyn_offset + this->reldyn_offset);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE)
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // Report values that don't fit in the relocated field.
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
                   << sym << " out of range: " << val << " is not in ["
                   << lo << ", " << hi << ")";
    };
    // Checked writers: unsigned 16-bit, signed 16-bit, unsigned 8-bit,
    // signed 8-bit respectively.
    auto write16 = [&](u64 val) {
      check(val, 0, 1 << 16);
      *(ub16 *)loc = val;
    };
    auto write16s = [&](u64 val) {
      check(val, -(1 << 15), 1 << 15);
      *(ub16 *)loc = val;
    };
    auto write8 = [&](u64 val) {
      check(val, 0, 1 << 8);
      *loc = val;
    };
    auto write8s = [&](u64 val) {
      check(val, -(1 << 7), 1 << 7);
      *loc = val;
    };
    // Standard psABI notation:
    //   S = symbol address, A = addend (RELA), P = address of the
    //   relocated place, G = GOT slot offset, GOT = GOT base address.
    u64 S = sym.get_addr(ctx);
    u64 A = rel.r_addend;
    u64 P = get_addr() + rel.r_offset;
    u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
    u64 GOT = ctx.got->shdr.sh_addr;
    switch (rel.r_type) {
    case R_68K_32:
      // May need a dynamic relocation if the value isn't link-time known.
      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
      break;
    case R_68K_16:
      write16(S + A);
      break;
    case R_68K_8:
      write8(S + A);
      break;
    case R_68K_PC32:
    case R_68K_PLT32:
      *(ub32 *)loc = S + A - P;
      break;
    case R_68K_PC16:
    case R_68K_PLT16:
      write16s(S + A - P);
      break;
    case R_68K_PC8:
    case R_68K_PLT8:
      write8s(S + A - P);
      break;
    case R_68K_GOTPCREL32:
      *(ub32 *)loc = GOT + A - P;
      break;
    case R_68K_GOTPCREL16:
      write16s(GOT + A - P);
      break;
    case R_68K_GOTPCREL8:
      write8s(GOT + A - P);
      break;
    case R_68K_GOTOFF32:
      *(ub32 *)loc = G + A;
      break;
    case R_68K_GOTOFF16:
      write16(G + A);
      break;
    case R_68K_GOTOFF8:
      write8(G + A);
      break;
    case R_68K_TLS_GD32:
      *(ub32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
      break;
    case R_68K_TLS_GD16:
      write16(sym.get_tlsgd_addr(ctx) + A - GOT);
      break;
    case R_68K_TLS_GD8:
      write8(sym.get_tlsgd_addr(ctx) + A - GOT);
      break;
    case R_68K_TLS_LDM32:
      *(ub32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
      break;
    case R_68K_TLS_LDM16:
      write16(ctx.got->get_tlsld_addr(ctx) + A - GOT);
      break;
    case R_68K_TLS_LDM8:
      write8(ctx.got->get_tlsld_addr(ctx) + A - GOT);
      break;
    case R_68K_TLS_LDO32:
      *(ub32 *)loc = S + A - ctx.dtp_addr;
      break;
    case R_68K_TLS_LDO16:
      write16s(S + A - ctx.dtp_addr);
      break;
    case R_68K_TLS_LDO8:
      write8s(S + A - ctx.dtp_addr);
      break;
    case R_68K_TLS_IE32:
      *(ub32 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
      break;
    case R_68K_TLS_IE16:
      write16(sym.get_gottp_addr(ctx) + A - GOT);
      break;
    case R_68K_TLS_IE8:
      write8(sym.get_gottp_addr(ctx) + A - GOT);
      break;
    case R_68K_TLS_LE32:
      *(ub32 *)loc = S + A - ctx.tp_addr;
      break;
    case R_68K_TLS_LE16:
      write16(S + A - ctx.tp_addr);
      break;
    case R_68K_TLS_LE8:
      write8(S + A - ctx.tp_addr);
      break;
    default:
      unreachable();
    }
  }
}
// Apply relocations to a non-SHF_ALLOC section (e.g. debug info). Only
// plain 32-bit absolute references are supported here.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // References into mergeable sections resolve to section fragments.
    SectionFragment<E> *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);
    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : (i64)rel.r_addend;
    switch (rel.r_type) {
    case R_68K_32:
      // get_tombstone() yields a replacement value for references to
      // discarded sections.
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ub32 *)loc = *val;
      else
        *(ub32 *)loc = S + A;
      break;
    default:
      Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
                 << rel;
    }
  }
}
// Scan relocations to decide what each referenced symbol needs (GOT slot,
// PLT entry, TLS structures, ...). No TLS relaxations are performed for
// m68k; each TLS model maps directly to its flag.
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);
  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    if (sym.is_ifunc())
      Error(ctx) << sym << ": GNU ifunc symbol is not supported on m68k";
    switch (rel.r_type) {
    case R_68K_32:
      scan_dyn_absrel(ctx, sym, rel);
      break;
    case R_68K_16:
    case R_68K_8:
      scan_absrel(ctx, sym, rel);
      break;
    case R_68K_PC32:
    case R_68K_PC16:
    case R_68K_PC8:
      scan_pcrel(ctx, sym, rel);
      break;
    // GOTOFF also needs a GOT because values are relative to its base.
    case R_68K_GOTPCREL32:
    case R_68K_GOTPCREL16:
    case R_68K_GOTPCREL8:
    case R_68K_GOTOFF32:
    case R_68K_GOTOFF16:
    case R_68K_GOTOFF8:
      sym.flags |= NEEDS_GOT;
      break;
    case R_68K_PLT32:
    case R_68K_PLT16:
    case R_68K_PLT8:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;
    case R_68K_TLS_GD32:
    case R_68K_TLS_GD16:
    case R_68K_TLS_GD8:
      sym.flags |= NEEDS_TLSGD;
      break;
    case R_68K_TLS_LDM32:
    case R_68K_TLS_LDM16:
    case R_68K_TLS_LDM8:
      ctx.needs_tlsld = true;
      break;
    case R_68K_TLS_IE32:
    case R_68K_TLS_IE16:
    case R_68K_TLS_IE8:
      sym.flags |= NEEDS_GOTTP;
      break;
    case R_68K_TLS_LE32:
    case R_68K_TLS_LE16:
    case R_68K_TLS_LE8:
      check_tlsle(ctx, sym, rel);
      break;
    // DTPOFF-relative values need no preparation at scan time.
    case R_68K_TLS_LDO32:
    case R_68K_TLS_LDO16:
    case R_68K_TLS_LDO8:
      break;
    default:
      Error(ctx) << *this << ": unknown relocation: " << rel;
    }
  }
}
} // namespace mold::elf

452
third_party/mold/elf/arch-ppc32.cc vendored Normal file
View file

@ -0,0 +1,452 @@
// clang-format off
// This file implements the PowerPC 32-bit ISA. For 64-bit PowerPC, see
// arch-ppc64v1.cpp and arch-ppc64v2.cpp.
//
// PPC32 is a RISC ISA. It has 32 general-purpose registers (GPRs).
// r0, r11 and r12 are reserved for static linkers, so we can use these
// registers in PLTs and range extension thunks. In addition to that, it
// has a few special registers. Notable ones are LR which holds a return
// address and CTR which we can use to store a branch target address.
//
// It feels that the PPC32 psABI is unnecessarily complicated at first
// glance, but that is mainly stemmed from the fact that the ISA lacks
// PC-relative load/store instructions. Since machine instructions cannot
// load data relative to its own address, it is not straightforward to
// support position-independent code (PIC) on PPC32.
//
// A position-independent function typically contains the following code
// in the prologue to obtain its own address:
//
// mflr r0 // save the current return address to %r0
// bcl 20, 31, 4 // call the next instruction as if it were a function
// mtlr r12 // save the return address to %r12
// mtlr r0 // restore the original return address
//
// An object file compiled with -fPIC contains a data section named
// `.got2` to store addresses of locally-defined global variables and
// constants. A PIC function usually computes its .got2+0x8000 and set it
// to %r30. This scheme allows the function to access global objects
// defined in the same input file with a single %r30-relative load/store
// instruction with a 16-bit offset, given that .got2 is smaller than
// 0x10000 (or 65536) bytes.
//
// Since each object file has its own .got2, %r30 refers to different
// places in a merged .got2 for two functions that came from different
// input files. Therefore, %r30 makes sense only within a single function.
//
// Technically, we can reuse a %r30 value in our PLT if we create a PLT
// _for each input file_ (that's what GNU ld seems to be doing), but that
// doesn't seems to be worth its complexity. Our PLT simply doesn't rely
// on a %r30 value.
//
// https://github.com/rui314/psabi/blob/main/ppc32.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = PPC32;
// 16-bit slices of a value, as encoded by PPC32 @l/@h/@ha-style
// relocations. ha() adds 0x8000 before shifting so that the high half
// compensates for the sign extension of the low half: (ha(x) << 16) +
// sign_extend(lo(x)) reconstructs x. high/higha are the variants masked
// to 16 bits.
static u64 lo(u64 x) { return x & 0xffff; }
static u64 hi(u64 x) { return x >> 16; }
static u64 ha(u64 x) { return (x + 0x8000) >> 16; }
static u64 high(u64 x) { return (x >> 16) & 0xffff; }
static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; }
// Write the PLT header for lazy binding. PLT entries arrive here with the
// address of their GOTPLT slot in %r11; the header obtains its own
// address with the mflr/bcl/mflr idiom (PPC32 has no PC-relative loads),
// converts %r11 into a relocation offset, and jumps to the resolver whose
// address is stored in the GOTPLT.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
  static const ub32 insn[] = {
    // Get the address of this PLT section
    0x7c08'02a6, // mflr r0
    0x429f'0005, // bcl 20, 31, 4
    0x7d88'02a6, // 1: mflr r12
    0x7c08'03a6, // mtlr r0
    // Compute the runtime address of GOTPLT+12
    0x3d8c'0000, // addis r12, r12, (GOTPLT - 1b)@higha
    0x398c'0000, // addi r12, r12, (GOTPLT - 1b)@lo
    // Compute the PLT entry offset
    0x7d6c'5850, // sub r11, r11, r12
    0x1d6b'0003, // mulli r11, r11, 3
    // Load GOTPLT[2] and branch to GOTPLT[1]
    0x800c'fff8, // lwz r0, -8(r12)
    0x7c09'03a6, // mtctr r0
    0x818c'fffc, // lwz r12, -4(r12)
    0x4e80'0420, // bctr
    0x6000'0000, // nop
    0x6000'0000, // nop
    0x6000'0000, // nop
    0x6000'0000, // nop
  };
  static_assert(sizeof(insn) == E::plt_hdr_size);
  memcpy(buf, insn, sizeof(insn));
  // Patch the addis/addi immediates with the GOTPLT displacement from
  // label 1 (the bcl landing pad at PLT+8, hence the +4 adjustment).
  ub32 *loc = (ub32 *)buf;
  loc[4] |= higha(ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr + 4);
  loc[5] |= lo(ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr + 4);
}
// Instruction template shared by write_plt_entry, write_pltgot_entry and
// the PLT branch of RangeExtensionThunk::copy_buf: obtain the entry's own
// address, load a word from a nearby GOT/GOTPLT slot (offset patched into
// words 4 and 5), and branch to it. Does not rely on %r30 being set
// (see the file header comment).
static const ub32 plt_entry[] = {
  // Get the address of this PLT entry
  0x7c08'02a6, // mflr r0
  0x429f'0005, // bcl 20, 31, 4
  0x7d88'02a6, // mflr r12
  0x7c08'03a6, // mtlr r0
  // Load an address from the GOT/GOTPLT entry and jump to that address
  0x3d6c'0000, // addis r11, r12, OFFSET@higha
  0x396b'0000, // addi r11, r11, OFFSET@lo
  0x818b'0000, // lwz r12, 0(r11)
  0x7d89'03a6, // mtctr r12
  0x4e80'0420, // bctr
};
// Write one PLT entry: the shared template with the displacement to the
// symbol's GOTPLT slot patched in (measured from the bcl landing pad at
// entry+8).
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  static_assert(E::plt_size == sizeof(plt_entry));
  memcpy(buf, plt_entry, sizeof(plt_entry));
  ub32 *loc = (ub32 *)buf;
  i64 offset = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 8;
  loc[4] |= higha(offset);
  loc[5] |= lo(offset);
}
// Write a PLT entry for a symbol that already has a regular GOT slot:
// same template, but the displacement targets the GOT entry instead.
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  static_assert(E::pltgot_size == sizeof(plt_entry));
  memcpy(buf, plt_entry, sizeof(plt_entry));
  ub32 *loc = (ub32 *)buf;
  i64 offset = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 8;
  loc[4] |= higha(offset);
  loc[5] |= lo(offset);
}
// Apply a relocation against .eh_frame. Only 32-bit absolute and
// PC-relative relocations are expected; values are big-endian on PPC32.
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
                                    u64 offset, u64 val) {
  u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
  switch (rel.r_type) {
  case R_NONE:
    break;
  case R_PPC_ADDR32:
    *(ub32 *)loc = val;
    break;
  case R_PPC_REL32:
    // PC-relative: subtract the runtime address of the relocated place.
    *(ub32 *)loc = val - this->shdr.sh_addr - offset;
    break;
  default:
    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
  }
}
// Apply relocations to a SHF_ALLOC section. Branch-field relocations
// (ADDR14/ADDR24/REL14/REL24/...) are OR-ed into the instruction's
// displacement bits; 16-bit relocations select a half of the value with
// the lo/hi/ha helpers. Out-of-range branches are redirected to range
// extension thunks.
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  // Cursor into this section's slice of .rela.dyn for dynamic relocations
  // emitted by apply_dyn_absrel().
  ElfRel<E> *dynrel = nullptr;
  if (ctx.reldyn)
    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
                           file.reldyn_offset + this->reldyn_offset);
  // Base of this input file's .got2 contribution; PLT16/PLT32 values are
  // encoded relative to it. Zero if the file has no .got2.
  u64 GOT2 = file.ppc32_got2 ? file.ppc32_got2->get_addr() : 0;
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE)
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // Standard psABI notation:
    //   S = symbol address, A = addend (RELA), P = address of the
    //   relocated place, G = GOT slot offset, GOT = GOT base address.
    u64 S = sym.get_addr(ctx);
    u64 A = rel.r_addend;
    u64 P = get_addr() + rel.r_offset;
    u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
    u64 GOT = ctx.got->shdr.sh_addr;
    switch (rel.r_type) {
    case R_PPC_ADDR32:
    case R_PPC_UADDR32:
      // May need a dynamic relocation if the value isn't link-time known.
      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
      break;
    case R_PPC_ADDR14:
      *(ub32 *)loc |= bits(S + A, 15, 2) << 2;
      break;
    case R_PPC_ADDR16:
    case R_PPC_UADDR16:
    case R_PPC_ADDR16_LO:
      *(ub16 *)loc = lo(S + A);
      break;
    case R_PPC_ADDR16_HI:
      *(ub16 *)loc = hi(S + A);
      break;
    case R_PPC_ADDR16_HA:
      *(ub16 *)loc = ha(S + A);
      break;
    case R_PPC_ADDR24:
      *(ub32 *)loc |= bits(S + A, 25, 2) << 2;
      break;
    case R_PPC_ADDR30:
      *(ub32 *)loc |= bits(S + A, 31, 2) << 2;
      break;
    // PLT16/PLT32: GOT-entry address relative to the .got2 base recorded
    // in the addend.
    case R_PPC_PLT16_LO:
      *(ub16 *)loc = lo(G + GOT - A - GOT2);
      break;
    case R_PPC_PLT16_HI:
      *(ub16 *)loc = hi(G + GOT - A - GOT2);
      break;
    case R_PPC_PLT16_HA:
      *(ub16 *)loc = ha(G + GOT - A - GOT2);
      break;
    case R_PPC_PLT32:
      *(ub32 *)loc = G + GOT - A - GOT2;
      break;
    case R_PPC_REL14:
      *(ub32 *)loc |= bits(S + A - P, 15, 2) << 2;
      break;
    case R_PPC_REL16:
    case R_PPC_REL16_LO:
      *(ub16 *)loc = lo(S + A - P);
      break;
    case R_PPC_REL16_HI:
      *(ub16 *)loc = hi(S + A - P);
      break;
    case R_PPC_REL16_HA:
      *(ub16 *)loc = ha(S + A - P);
      break;
    case R_PPC_REL24:
    case R_PPC_LOCAL24PC: {
      // Branch displacement is a signed 26-bit value; if the destination
      // is out of reach, branch to a range extension thunk instead.
      i64 val = S + A - P;
      if (sign_extend(val, 25) != val)
        val = get_thunk_addr(i) - P;
      *(ub32 *)loc |= bits(val, 25, 2) << 2;
      break;
    }
    case R_PPC_PLTREL24: {
      // Calls through the PLT always go via the thunk (our PLT code
      // doesn't rely on %r30; see the file header comment).
      i64 val = S - P;
      if (sym.has_plt(ctx) || sign_extend(val, 25) != val)
        val = get_thunk_addr(i) - P;
      *(ub32 *)loc |= bits(val, 25, 2) << 2;
      break;
    }
    case R_PPC_REL32:
    case R_PPC_PLTREL32:
      *(ub32 *)loc = S + A - P;
      break;
    case R_PPC_GOT16:
    case R_PPC_GOT16_LO:
      *(ub16 *)loc = lo(G + A);
      break;
    case R_PPC_GOT16_HI:
      *(ub16 *)loc = hi(G + A);
      break;
    case R_PPC_GOT16_HA:
      *(ub16 *)loc = ha(G + A);
      break;
    case R_PPC_TPREL16_LO:
      *(ub16 *)loc = lo(S + A - ctx.tp_addr);
      break;
    case R_PPC_TPREL16_HI:
      *(ub16 *)loc = hi(S + A - ctx.tp_addr);
      break;
    case R_PPC_TPREL16_HA:
      *(ub16 *)loc = ha(S + A - ctx.tp_addr);
      break;
    case R_PPC_DTPREL16_LO:
      *(ub16 *)loc = lo(S + A - ctx.dtp_addr);
      break;
    case R_PPC_DTPREL16_HI:
      *(ub16 *)loc = hi(S + A - ctx.dtp_addr);
      break;
    case R_PPC_DTPREL16_HA:
      *(ub16 *)loc = ha(S + A - ctx.dtp_addr);
      break;
    case R_PPC_GOT_TLSGD16:
      *(ub16 *)loc = sym.get_tlsgd_addr(ctx) - GOT;
      break;
    case R_PPC_GOT_TLSLD16:
      *(ub16 *)loc = ctx.got->get_tlsld_addr(ctx) - GOT;
      break;
    case R_PPC_GOT_TPREL16:
      *(ub16 *)loc = sym.get_gottp_addr(ctx) - GOT;
      break;
    // Marker relocations; nothing to patch.
    case R_PPC_TLS:
    case R_PPC_TLSGD:
    case R_PPC_TLSLD:
    case R_PPC_PLTSEQ:
    case R_PPC_PLTCALL:
      break;
    default:
      unreachable();
    }
  }
}
// Apply relocations to a non-SHF_ALLOC section (e.g. debug info). Only
// plain 32-bit absolute references are supported here.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // References into mergeable sections resolve to section fragments.
    SectionFragment<E> *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);
    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : (i64)rel.r_addend;
    switch (rel.r_type) {
    case R_PPC_ADDR32:
      // get_tombstone() yields a replacement value for references to
      // discarded sections.
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ub32 *)loc = *val;
      else
        *(ub32 *)loc = S + A;
      break;
    default:
      Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
                 << rel;
    }
  }
}
// Scan relocations to decide what each referenced symbol needs (GOT slot,
// PLT entry, TLS structures, ...). The decisions recorded here must match
// what apply_reloc_alloc() later assumes.
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);
  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  // Scan relocations
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    // An ifunc is resolved through its PLT/GOT regardless of reloc type.
    if (sym.is_ifunc())
      sym.flags |= NEEDS_GOT | NEEDS_PLT;
    switch (rel.r_type) {
    case R_PPC_ADDR32:
    case R_PPC_UADDR32:
      scan_dyn_absrel(ctx, sym, rel);
      break;
    case R_PPC_ADDR14:
    case R_PPC_ADDR16:
    case R_PPC_UADDR16:
    case R_PPC_ADDR16_LO:
    case R_PPC_ADDR16_HI:
    case R_PPC_ADDR16_HA:
    case R_PPC_ADDR24:
    case R_PPC_ADDR30:
      scan_absrel(ctx, sym, rel);
      break;
    case R_PPC_REL14:
    case R_PPC_REL16:
    case R_PPC_REL16_LO:
    case R_PPC_REL16_HI:
    case R_PPC_REL16_HA:
    case R_PPC_REL32:
      scan_pcrel(ctx, sym, rel);
      break;
    // PLT16/PLT32 read a GOT entry, so they need a GOT slot too.
    case R_PPC_GOT16:
    case R_PPC_GOT16_LO:
    case R_PPC_GOT16_HI:
    case R_PPC_GOT16_HA:
    case R_PPC_PLT16_LO:
    case R_PPC_PLT16_HI:
    case R_PPC_PLT16_HA:
    case R_PPC_PLT32:
      sym.flags |= NEEDS_GOT;
      break;
    case R_PPC_REL24:
    case R_PPC_PLTREL24:
    case R_PPC_PLTREL32:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;
    case R_PPC_GOT_TLSGD16:
      sym.flags |= NEEDS_TLSGD;
      break;
    case R_PPC_GOT_TLSLD16:
      ctx.needs_tlsld = true;
      break;
    case R_PPC_GOT_TPREL16:
      sym.flags |= NEEDS_GOTTP;
      break;
    case R_PPC_TPREL16_LO:
    case R_PPC_TPREL16_HI:
    case R_PPC_TPREL16_HA:
      check_tlsle(ctx, sym, rel);
      break;
    // Local branches, marker relocs and DTPOFF values need no
    // preparation at scan time.
    case R_PPC_LOCAL24PC:
    case R_PPC_TLS:
    case R_PPC_TLSGD:
    case R_PPC_TLSLD:
    case R_PPC_DTPREL16_LO:
    case R_PPC_DTPREL16_HI:
    case R_PPC_DTPREL16_HA:
    case R_PPC_PLTSEQ:
    case R_PPC_PLTCALL:
      break;
    default:
      Error(ctx) << *this << ": unknown relocation: " << rel;
    }
  }
}
// Write out this thunk's machine code. Every entry is E::thunk_size
// bytes. Symbols with a PLT get `plt_entry` (defined earlier in this
// file); everything else gets `local_thunk`, which obtains its own
// address with mflr/bcl/mflr and adds a PC-relative offset to reach the
// destination.
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
  u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
  static const ub32 local_thunk[] = {
    // Get this thunk's address
    0x7c08'02a6, // mflr r0
    0x429f'0005, // bcl 20, 31, 4
    0x7d88'02a6, // mflr r12
    0x7c08'03a6, // mtlr r0
    // Materialize the destination's address in %r11 and jump to that address
    0x3d6c'0000, // addis r11, r12, OFFSET@higha
    0x396b'0000, // addi r11, r11, OFFSET@lo
    0x7d69'03a6, // mtctr r11
    0x4e80'0420, // bctr
    0x6000'0000, // nop
  };
  static_assert(E::thunk_size == sizeof(plt_entry));
  static_assert(E::thunk_size == sizeof(local_thunk));
  for (i64 i = 0; i < symbols.size(); i++) {
    ub32 *loc = (ub32 *)(buf + i * E::thunk_size);
    Symbol<E> &sym = *symbols[i];
    if (sym.has_plt(ctx)) {
      memcpy(loc, plt_entry, sizeof(plt_entry));
      u64 got = sym.has_got(ctx) ? sym.get_got_addr(ctx) : sym.get_gotplt_addr(ctx);
      // The offset is relative to the PC obtained by the bcl, which is
      // thunk start + 8. NOTE(review): assumes plt_entry's addis/addi
      // pair sits at words 4 and 5 like local_thunk's -- confirm against
      // its definition earlier in this file.
      i64 val = got - get_addr(i) - 8;
      loc[4] |= higha(val);
      loc[5] |= lo(val);
    } else {
      memcpy(loc, local_thunk, sizeof(local_thunk));
      // Words 4/5 are the addis/addi pair; PC from the bcl is thunk + 8.
      i64 val = sym.get_addr(ctx) - get_addr(i) - 8;
      loc[4] |= higha(val);
      loc[5] |= lo(val);
    }
  }
}
} // namespace mold::elf

687
third_party/mold/elf/arch-ppc64v1.cc vendored Normal file
View file

@ -0,0 +1,687 @@
// clang-format off
// This file contains code for the 64-bit PowerPC ELFv1 ABI that is
// commonly used for big-endian PPC systems. Modern PPC systems that use
// the processor in the little-endian mode use the ELFv2 ABI instead. For
// ELFv2, see arch-ppc64v2.cc.
//
// Even though they are similar, ELFv1 isn't only different from ELFv2 in
// endianness. The most notable difference is, in ELFv1, a function
// pointer doesn't directly refer to the entry point of a function but
// instead refers to a data structure so-called "function descriptor".
//
// The function descriptor is essentially a pair of a function entry point
// address and a value that should be set to %r2 before calling that
// function. There is also a third member for "the environment pointer for
// languages such as Pascal and PL/1" according to the psABI, but it looks
// like no one actually uses it. In total, the function descriptor is 24
// bytes long. Here is why we need it.
//
// PPC generally lacks PC-relative data access instructions. Position-
// independent code sets GOT + 0x8000 to %r2 and access global variables
// relative to %r2.
//
// Each ELF file has its own GOT. If a function calls another function in
// the same ELF file, it doesn't have to reset %r2. However, if it is in
// other file (e.g. other .so), it has to set a new value to %r2 so that
// the register contains the callee's GOT + 0x8000.
//
// In this way, you can't call a function just by knowing the function's
// entry point address. You also need to know a proper %r2 value for the
// function. This is why a function pointer refers to a tuple of an
// address and a %r2 value.
//
// If a function call is made through PLT, PLT takes care of restoring %r2.
// Therefore, the caller has to restore %r2 only for function calls
// through function pointers.
//
// .opd (short for "official procedure descriptors") contains function
// descriptors.
//
// You can think of OPD like this: even in other targets, a function can have a
// few different addresses for different purposes. It may not only have an
// entry point address but may also have PLT and/or GOT addresses.
// In PPCV1, it may have an OPD address in addition to these. OPD address
// is used for relocations that refer to the address of a function as a
// function pointer.
//
// https://github.com/rui314/psabi/blob/main/ppc64v1.pdf
#include "third_party/mold/elf/mold.h"
#include "third_party/libcxx/algorithm"
// MISSING #include <tbb/parallel_for_each.h>
namespace mold::elf {
using E = PPC64V1;
// Halfword extraction helpers for PPC @lo/@hi/@ha-style relocation math.
static u64 lo(u64 x) { return x & 0xffff; }
static u64 hi(u64 x) { return x >> 16; }
// ha() pre-biases by 0x8000 so that adding a sign-extended 16-bit low
// half afterwards reconstructs the original: (ha(x) << 16) + (i16)lo(x) == x.
static u64 ha(u64 x) { return (x + 0x8000) >> 16; }
// high/higha are the same but truncated to a 16-bit field.
static u64 high(u64 x) { return (x >> 16) & 0xffff; }
static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; }
// .plt is used only for lazy symbol resolution on PPC64. All PLT
// calls are made via range extension thunks even if they are within
// reach. Thunks read addresses from .got.plt and jump there.
// Therefore, once PLT symbols are resolved and final addresses are
// written to .got.plt, thunks just skip .plt and directly jump to the
// resolved addresses.
// Write the lazy-binding header of .plt (ELFv1). The code obtains its
// own address via mflr/bcl/mflr, adds the displacement stored in the
// trailing .quad to reach .got.plt, then loads the resolver's function
// descriptor words from there and branches to it.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
  static const ub32 insn[] = {
    0x7d88'02a6, // mflr r12
    0x429f'0005, // bcl 20, 31, 4 // obtain PC
    0x7d68'02a6, // mflr r11
    0xe84b'0024, // ld r2,36(r11)
    0x7d88'03a6, // mtlr r12
    0x7d62'5a14, // add r11,r2,r11
    0xe98b'0000, // ld r12,0(r11)
    0xe84b'0008, // ld r2,8(r11)
    0x7d89'03a6, // mtctr r12
    0xe96b'0010, // ld r11,16(r11)
    0x4e80'0420, // bctr
    // .quad .got.plt - .plt - 8
    0x0000'0000,
    0x0000'0000,
  };
  static_assert(sizeof(insn) == E::plt_hdr_size);
  memcpy(buf, insn, sizeof(insn));
  // 44 = offset of the trailing .quad (11 instructions * 4 bytes).
  *(ub64 *)(buf + 44) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 8;
}
// Write one lazy PLT entry: materialize the entry's own PLT index in %r0
// and branch back to the header (plt0).
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  ub32 *loc = (ub32 *)buf;
  i64 idx = sym.get_plt_idx(ctx);
  // The PPC64 ELFv1 ABI requires PLT entries to vary in size depending
  // on their indices. Unlike other targets, .got.plt is filled not by us
  // but by the loader, so we don't have control over where the initial
  // call to the PLT entry jumps to. So we need to strictly follow the PLT
  // section layout as the loader expects it to be.
  if (idx < 0x8000) {
    // A small index fits in a single `li` immediate: 2-word entry.
    static const ub32 insn[] = {
      0x3800'0000, // li r0, PLT_INDEX
      0x4b00'0000, // b plt0
    };
    memcpy(loc, insn, sizeof(insn));
    loc[0] |= idx;
    // Backward branch displacement; -4 is the offset of the `b` itself.
    loc[1] |= (ctx.plt->shdr.sh_addr - sym.get_plt_addr(ctx) - 4) & 0x00ff'ffff;
  } else {
    // A large index needs lis/ori to build it: 3-word entry.
    static const ub32 insn[] = {
      0x3c00'0000, // lis r0, PLT_INDEX@high
      0x6000'0000, // ori r0, r0, PLT_INDEX@lo
      0x4b00'0000, // b plt0
    };
    memcpy(loc, insn, sizeof(insn));
    loc[0] |= high(idx);
    loc[1] |= lo(idx);
    loc[2] |= (ctx.plt->shdr.sh_addr - sym.get_plt_addr(ctx) - 8) & 0x00ff'ffff;
  }
}
// .plt.got is not necessary on PPC64 because range extension thunks
// directly read GOT entries and jump there.
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {} // intentionally a no-op on PPC64 ELFv1
// Apply one relocation inside the output .eh_frame. Only the few types
// that appear in unwind information are supported.
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
                                    u64 offset, u64 val) {
  u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
  switch (rel.r_type) {
  case R_NONE:
    break;
  case R_PPC64_ADDR64:
    *(ub64 *)loc = val;
    break;
  case R_PPC64_REL32:
    // PC-relative to this spot in the output .eh_frame.
    *(ub32 *)loc = val - this->shdr.sh_addr - offset;
    break;
  case R_PPC64_REL64:
    *(ub64 *)loc = val - this->shdr.sh_addr - offset;
    break;
  default:
    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
  }
}
// Apply relocations to an allocated PPC64 ELFv1 section: compute each
// relocation's final value from the symbol and addend and patch the
// output buffer in place.
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  // Cursor into this section's slice of the dynamic relocation table,
  // handed to apply_toc_rel for any dynamic relocations it emits.
  ElfRel<E> *dynrel = nullptr;
  if (ctx.reldyn)
    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
                           file.reldyn_offset + this->reldyn_offset);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE)
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // Report a value that doesn't fit the relocation's bitfield.
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
                   << sym << " out of range: " << val << " is not in ["
                   << lo << ", " << hi << ")";
    };
    // Standard psABI relocation formula inputs: S = symbol value,
    // A = addend, P = place being relocated, G = GOT slot offset,
    // TOC = the TOC base pointer value.
    u64 S = sym.get_addr(ctx);
    u64 A = rel.r_addend;
    u64 P = get_addr() + rel.r_offset;
    u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
    u64 GOT = ctx.got->shdr.sh_addr;
    u64 TOC = ctx.extra.TOC->value;
    switch (rel.r_type) {
    case R_PPC64_ADDR64:
      apply_toc_rel(ctx, sym, rel, loc, S, A, P, dynrel);
      break;
    case R_PPC64_TOC:
      apply_toc_rel(ctx, *ctx.extra.TOC, rel, loc, TOC, A, P, dynrel);
      break;
    case R_PPC64_TOC16_HA:
      *(ub16 *)loc = ha(S + A - TOC);
      break;
    case R_PPC64_TOC16_LO:
      *(ub16 *)loc = lo(S + A - TOC);
      break;
    case R_PPC64_TOC16_DS:
      // DS-form: the two low bits belong to the instruction, hence 0xfffc.
      check(S + A - TOC, -(1 << 15), 1 << 15);
      *(ub16 *)loc |= (S + A - TOC) & 0xfffc;
      break;
    case R_PPC64_TOC16_LO_DS:
      *(ub16 *)loc |= (S + A - TOC) & 0xfffc;
      break;
    case R_PPC64_REL24: {
      // Branch to the function's entry point (NO_OPD), routing through a
      // range extension thunk if the target has a PLT or is out of the
      // +-32MB branch range.
      i64 val = sym.get_addr(ctx, NO_OPD) + A - P;
      if (sym.has_plt(ctx) || sign_extend(val, 25) != val)
        val = get_thunk_addr(i) + A - P;
      check(val, -(1 << 25), 1 << 25);
      *(ub32 *)loc |= bits(val, 25, 2) << 2;
      // If a callee is an external function, PLT saves %r2 to the
      // caller's r2 save slot. We need to restore it after function
      // return. To do so, there's usually a NOP as a placeholder
      // after a BL. 0x6000'0000 is a NOP.
      if (sym.has_plt(ctx) && *(ub32 *)(loc + 4) == 0x6000'0000)
        *(ub32 *)(loc + 4) = 0xe841'0028; // ld r2, 40(r1)
      break;
    }
    case R_PPC64_REL32:
      *(ub32 *)loc = S + A - P;
      break;
    case R_PPC64_REL64:
      *(ub64 *)loc = S + A - P;
      break;
    case R_PPC64_REL16_HA:
      *(ub16 *)loc = ha(S + A - P);
      break;
    case R_PPC64_REL16_LO:
      *(ub16 *)loc = lo(S + A - P);
      break;
    case R_PPC64_PLT16_HA:
      // PLT16 relocations address the symbol's GOT slot relative to TOC.
      *(ub16 *)loc = ha(G + GOT - TOC);
      break;
    case R_PPC64_PLT16_HI:
      *(ub16 *)loc = hi(G + GOT - TOC);
      break;
    case R_PPC64_PLT16_LO:
      *(ub16 *)loc = lo(G + GOT - TOC);
      break;
    case R_PPC64_PLT16_LO_DS:
      *(ub16 *)loc |= (G + GOT - TOC) & 0xfffc;
      break;
    case R_PPC64_GOT_TPREL16_HA:
      *(ub16 *)loc = ha(sym.get_gottp_addr(ctx) - TOC);
      break;
    case R_PPC64_GOT_TLSGD16_HA:
      *(ub16 *)loc = ha(sym.get_tlsgd_addr(ctx) - TOC);
      break;
    case R_PPC64_GOT_TLSGD16_LO:
      *(ub16 *)loc = lo(sym.get_tlsgd_addr(ctx) - TOC);
      break;
    case R_PPC64_GOT_TLSLD16_HA:
      *(ub16 *)loc = ha(ctx.got->get_tlsld_addr(ctx) - TOC);
      break;
    case R_PPC64_GOT_TLSLD16_LO:
      *(ub16 *)loc = lo(ctx.got->get_tlsld_addr(ctx) - TOC);
      break;
    case R_PPC64_DTPREL16_HA:
      *(ub16 *)loc = ha(S + A - ctx.dtp_addr);
      break;
    case R_PPC64_DTPREL16_LO:
      *(ub16 *)loc = lo(S + A - ctx.dtp_addr);
      break;
    case R_PPC64_TPREL16_HA:
      *(ub16 *)loc = ha(S + A - ctx.tp_addr);
      break;
    case R_PPC64_TPREL16_LO:
      *(ub16 *)loc = lo(S + A - ctx.tp_addr);
      break;
    case R_PPC64_GOT_TPREL16_LO_DS:
      *(ub16 *)loc |= (sym.get_gottp_addr(ctx) - TOC) & 0xfffc;
      break;
    case R_PPC64_PLTSEQ:
    case R_PPC64_PLTCALL:
    case R_PPC64_TLS:
    case R_PPC64_TLSGD:
    case R_PPC64_TLSLD:
      // Marker relocations; nothing to patch.
      break;
    default:
      // scan_relocations already rejected anything else.
      unreachable();
    }
  }
}
// Apply relocations to a non-SHF_ALLOC section (e.g. debug info). Only
// link-time-constant values are valid here; anything else is fatal.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // Report a value that doesn't fit the relocation's bitfield.
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
                   << sym << " out of range: " << val << " is not in ["
                   << lo << ", " << hi << ")";
    };
    // If the target was merged into a section fragment, relocate against
    // the fragment instead of the original symbol.
    SectionFragment<E> *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);
    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : (i64)rel.r_addend;
    switch (rel.r_type) {
    case R_PPC64_ADDR64:
      // A tombstone value replaces references to discarded data.
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ub64 *)loc = *val;
      else
        *(ub64 *)loc = S + A;
      break;
    case R_PPC64_ADDR32: {
      i64 val = S + A;
      check(val, 0, 1LL << 32);
      *(ub32 *)loc = val;
      break;
    }
    case R_PPC64_DTPREL64:
      *(ub64 *)loc = S + A - ctx.dtp_addr;
      break;
    default:
      Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
                 << rel;
    }
  }
}
// Scan pass for allocated PPC64 ELFv1 sections: record, per symbol, which
// linker-synthesized resources (GOT/PLT/TLS entries, .opd descriptors)
// the relocations in this section will require.
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);
  // Reserve this section's slice of the dynamic relocation table.
  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  // Scan relocations
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    // On ELFv1, an ifunc also needs a function descriptor in .opd.
    if (sym.is_ifunc())
      sym.flags |= NEEDS_GOT | NEEDS_PLT | NEEDS_PPC_OPD;
    // Any relocation except R_PPC64_REL24 is considered an
    // address-taking relocation, so the function needs an .opd entry
    // (function pointers on ELFv1 point at descriptors, not code).
    if (rel.r_type != R_PPC64_REL24 && sym.get_type() == STT_FUNC)
      sym.flags |= NEEDS_PPC_OPD;
    switch (rel.r_type) {
    case R_PPC64_ADDR64:
    case R_PPC64_TOC:
      scan_toc_rel(ctx, sym, rel);
      break;
    case R_PPC64_GOT_TPREL16_HA:
      sym.flags |= NEEDS_GOTTP;
      break;
    case R_PPC64_REL24:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;
    case R_PPC64_PLT16_HA:
      // Resolved through the symbol's GOT slot (see apply_reloc_alloc).
      sym.flags |= NEEDS_GOT;
      break;
    case R_PPC64_GOT_TLSGD16_HA:
      sym.flags |= NEEDS_TLSGD;
      break;
    case R_PPC64_GOT_TLSLD16_HA:
      // Local-dynamic TLS uses one module-wide GOT entry.
      ctx.needs_tlsld = true;
      break;
    case R_PPC64_TPREL16_HA:
    case R_PPC64_TPREL16_LO:
      // TLS local-exec model; check_tlsle verifies it is permissible here.
      check_tlsle(ctx, sym, rel);
      break;
    case R_PPC64_REL32:
    case R_PPC64_REL64:
    case R_PPC64_TOC16_HA:
    case R_PPC64_TOC16_LO:
    case R_PPC64_TOC16_LO_DS:
    case R_PPC64_TOC16_DS:
    case R_PPC64_REL16_HA:
    case R_PPC64_REL16_LO:
    case R_PPC64_PLT16_HI:
    case R_PPC64_PLT16_LO:
    case R_PPC64_PLT16_LO_DS:
    case R_PPC64_PLTSEQ:
    case R_PPC64_PLTCALL:
    case R_PPC64_GOT_TPREL16_LO_DS:
    case R_PPC64_GOT_TLSGD16_LO:
    case R_PPC64_GOT_TLSLD16_LO:
    case R_PPC64_TLS:
    case R_PPC64_TLSGD:
    case R_PPC64_TLSLD:
    case R_PPC64_DTPREL16_HA:
    case R_PPC64_DTPREL16_LO:
      // Nothing to prepare; resolved in place (or already covered by the
      // corresponding _HA case above).
      break;
    default:
      Error(ctx) << *this << ": unknown relocation: " << rel;
    }
  }
}
// Write out this thunk's machine code. Each entry is E::thunk_size bytes
// and picks one of three templates depending on how the destination must
// be reached.
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
  u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
  // If the destination is .plt.got, we save the current r2, read an
  // address of a function descriptor from .got, restore %r2 and jump
  // to the function.
  static const ub32 pltgot_thunk[] = {
    // Store the caller's %r2
    0xf841'0028, // std %r2, 40(%r1)
    // Load an address of a function descriptor
    0x3d82'0000, // addis %r12, %r2, foo@got@toc@ha
    0xe98c'0000, // ld %r12, foo@got@toc@lo(%r12)
    // Restore the callee's %r2
    0xe84c'0008, // ld %r2, 8(%r12)
    // Jump to the function
    0xe98c'0000, // ld %r12, 0(%r12)
    0x7d89'03a6, // mtctr %r12
    0x4e80'0420, // bctr
  };
  // If the destination is .plt, read a function descriptor from .got.plt.
  static const ub32 plt_thunk[] = {
    // Store the caller's %r2
    0xf841'0028, // std %r2, 40(%r1)
    // Materialize an address of a function descriptor
    0x3d82'0000, // addis %r12, %r2, foo@gotplt@toc@ha
    0x398c'0000, // addi %r12, %r12, foo@gotplt@toc@lo
    // Restore the callee's %r2
    0xe84c'0008, // ld %r2, 8(%r12)
    // Jump to the function
    0xe98c'0000, // ld %r12, 0(%r12)
    0x7d89'03a6, // mtctr %r12
    0x4e80'0420, // bctr
  };
  // If the destination is a non-imported function, we directly jump
  // to the function entry address.
  static const ub32 local_thunk[] = {
    0x3d82'0000, // addis r12, r2, foo@toc@ha
    0x398c'0000, // addi r12, r12, foo@toc@lo
    0x7d89'03a6, // mtctr r12
    0x4e80'0420, // bctr
    0x6000'0000, // nop
    0x6000'0000, // nop
    0x6000'0000, // nop
  };
  static_assert(E::thunk_size == sizeof(pltgot_thunk));
  static_assert(E::thunk_size == sizeof(plt_thunk));
  static_assert(E::thunk_size == sizeof(local_thunk));
  for (i64 i = 0; i < symbols.size(); i++) {
    Symbol<E> &sym = *symbols[i];
    ub32 *loc = (ub32 *)(buf + i * E::thunk_size);
    // Patch the addis/addi (or addis/ld) TOC-relative offset pair at the
    // template's word indices.
    if (sym.has_got(ctx)) {
      memcpy(loc, pltgot_thunk, sizeof(pltgot_thunk));
      i64 val = sym.get_got_addr(ctx) - ctx.extra.TOC->value;
      loc[1] |= higha(val);
      loc[2] |= lo(val);
    } else if(sym.has_plt(ctx)) {
      memcpy(loc, plt_thunk, sizeof(plt_thunk));
      i64 val = sym.get_gotplt_addr(ctx) - ctx.extra.TOC->value;
      loc[1] |= higha(val);
      loc[2] |= lo(val);
    } else {
      memcpy(loc, local_thunk, sizeof(local_thunk));
      // NO_OPD: branch to the real entry point, not the descriptor.
      i64 val = sym.get_addr(ctx, NO_OPD) - ctx.extra.TOC->value;
      loc[0] |= higha(val);
      loc[1] |= lo(val);
    }
  }
}
// Return `file`'s input ".opd" section, or null if the file has none.
static InputSection<E> *get_opd_section(ObjectFile<E> &file) {
  for (std::unique_ptr<InputSection<E>> &sec : file.sections) {
    InputSection<E> *isec = sec.get();
    if (isec && isec->name() == ".opd")
      return isec;
  }
  return nullptr;
}
// Binary-search `isec`'s relocations (sorted by r_offset) for one that
// applies exactly at `offset`. Returns null if there is no such reloc.
static ElfRel<E> *
get_relocation_at(Context<E> &ctx, InputSection<E> &isec, i64 offset) {
  std::span<ElfRel<E>> rels = isec.get_rels(ctx);
  auto pos = std::lower_bound(rels.begin(), rels.end(), offset,
                              [](const ElfRel<E> &r, i64 off) {
    return r.r_offset < off;
  });
  if (pos == rels.end() || pos->r_offset != offset)
    return nullptr;
  return &*pos;
}
// Maps an offset within an input .opd section to the function symbol
// whose descriptor lives at that offset. Ordered by offset so that a
// sorted vector of these supports binary search (see get_opd_sym_at).
struct OpdSymbol {
  bool operator<(const OpdSymbol &x) const { return r_offset < x.r_offset; }
  u64 r_offset = 0;
  Symbol<E> *sym = nullptr;
};
// Look up the function symbol whose .opd descriptor starts exactly at
// `offset`. `syms` must be sorted by r_offset; returns null on no match.
static Symbol<E> *
get_opd_sym_at(Context<E> &ctx, std::span<OpdSymbol> syms, u64 offset) {
  auto pos = std::lower_bound(syms.begin(), syms.end(), OpdSymbol{offset});
  if (pos == syms.end() || pos->r_offset != offset)
    return nullptr;
  return pos->sym;
}
// Compiler creates an .opd entry for each function symbol. The intention
// is to make it possible to create an output .opd section just by linking
// input .opd sections in the same manner as we do to other normal input
// sections.
//
// However, in reality, .opd isn't a normal input section. It needs many
// special treatments as follows:
//
// 1. A function symbol refers to not a .text but an .opd. Its address
// works fine for address-taking relocations such as R_PPC64_ADDR64.
// However, R_PPC64_REL24 (which is used for branch instruction) needs
// a function's real address instead of the function's .opd address.
// We need to read .opd contents to find out a function entry point
// address to apply R_PPC64_REL24.
//
// 2. Output .opd entries are needed only for functions whose addresses
// are taken. Just copying input .opd sections to an output would
// produce lots of dead .opd entries.
//
// 3. In this design, all function symbols refer to an .opd section, and
// that doesn't work well with graph traversal optimizations such as
// garbage collection or identical comdat folding. For example, garbage
// collector would mark an .opd section alive, which in turn marks all
// functions that are referenced by .opd as alive, effectively keeping all
// functions as alive.
//
// The problem is that the compiler creates a half-baked .opd section, and
// the linker has to figure out what all these .opd entries and
// relocations are trying to achieve. It's like the compiler would emit a
// half-baked .plt section in an object file and the linker has to deal
// with that. That's not a good design.
//
// So, in this function, we undo what the compiler did to .opd. We remove
// function symbols from .opd and reattach them to their function entry
// points. We also rewrite relocations that directly refer to an input
// .opd section so that they refer to function symbols instead. We then
// mark input .opd sections as dead.
//
// After this function, we mark symbols with the NEEDS_PPC_OPD flag if the
// symbol needs an .opd entry. We then create an output .opd just like we
// do for .plt or .got.
// Undo the compiler's .opd setup (see the comment above): detach each
// function symbol from .opd and reattach it to the section its
// descriptor points at, rewrite relocations that pointed into .opd so
// they target those symbols, and mark the input .opd sections dead.
// An output .opd is synthesized later for symbols with NEEDS_PPC_OPD.
void ppc64v1_rewrite_opd(Context<E> &ctx) {
  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
    InputSection<E> *opd = get_opd_section(*file);
    if (!opd)
      return;
    opd->is_alive = false;
    // Move function symbols from .opd to the sections their descriptors
    // reference, remembering the old descriptor offsets in opd_syms.
    std::vector<OpdSymbol> opd_syms;
    for (Symbol<E> *sym : file->symbols) {
      if (sym->file != file || sym->get_input_section() != opd)
        continue;
      if (u32 ty = sym->get_type(); ty != STT_FUNC && ty != STT_GNU_IFUNC)
        continue;
      // The descriptor's first word carries a relocation telling us
      // which section/offset the function entry point lives at.
      ElfRel<E> *rel = get_relocation_at(ctx, *opd, sym->value);
      if (!rel)
        Fatal(ctx) << *file << ": cannot find a relocation in .opd for "
                   << *sym << " at offset 0x" << std::hex << (u64)sym->value;
      Symbol<E> *sym2 = file->symbols[rel->r_sym];
      if (sym2->get_type() != STT_SECTION)
        Fatal(ctx) << *file << ": bad relocation in .opd referring " << *sym2;
      opd_syms.push_back({sym->value, sym});
      sym->set_input_section(sym2->get_input_section());
      sym->value = rel->r_addend;
    }
    // Sort symbols so that get_opd_sym_at() can do binary search.
    sort(opd_syms);
    // Rewrite relocations that refer into .opd so that they refer to the
    // corresponding function symbols instead.
    for (std::unique_ptr<InputSection<E>> &isec : file->sections) {
      if (!isec || !isec->is_alive || isec.get() == opd)
        continue;
      for (ElfRel<E> &r : isec->get_rels(ctx)) {
        Symbol<E> &sym = *file->symbols[r.r_sym];
        if (sym.get_input_section() != opd)
          continue;
        Symbol<E> *real_sym = get_opd_sym_at(ctx, opd_syms, r.r_addend);
        if (!real_sym)
          Fatal(ctx) << *isec << ": cannot find a symbol in .opd for " << r
                     << " at offset 0x" << std::hex << (u64)r.r_addend;
        r.r_sym = real_sym->sym_idx;
        r.r_addend = 0;
      }
    }
  });
}
// When a function is exported, the dynamic symbol for the function should
// refer to the function's .opd entry. This function marks such symbols
// with NEEDS_PPC_OPD.
void ppc64v1_scan_symbols(Context<E> &ctx) {
  // Every exported function defined in this link gets an .opd entry,
  // since its dynamic symbol must point at a descriptor.
  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
    for (Symbol<E> *sym : file->symbols)
      if (sym->file == file && sym->is_exported)
        if (u32 ty = sym->get_type(); ty == STT_FUNC || ty == STT_GNU_IFUNC)
          sym->flags |= NEEDS_PPC_OPD;
  });
  // Functions referenced by the ELF header also have to have .opd entries.
  auto mark = [&](std::string_view name) {
    if (!name.empty())
      if (Symbol<E> &sym = *get_symbol(ctx, name); !sym.is_imported)
        sym.flags |= NEEDS_PPC_OPD;
  };
  mark(ctx.arg.entry);
  mark(ctx.arg.init);
  mark(ctx.arg.fini);
}
// Register `sym` as needing an output .opd descriptor and grow the
// section by one entry. The descriptor index is the symbol's position
// in `symbols`, so the index must be recorded before the push_back.
void PPC64OpdSection::add_symbol(Context<E> &ctx, Symbol<E> *sym) {
  sym->set_opd_idx(ctx, symbols.size());
  symbols.push_back(sym);
  this->shdr.sh_size += ENTRY_SIZE;
}
// In a PIC link, each .opd descriptor needs two R_RELATIVE dynamic
// relocations (one for the entry address, one for the TOC value);
// a non-PIC link needs none.
i64 PPC64OpdSection::get_reldyn_size(Context<E> &ctx) const {
  return ctx.arg.pic ? symbols.size() * 2 : 0;
}
// Fill in the synthesized output .opd: one 24-byte descriptor per
// registered symbol, laid out as {function entry address, TOC value,
// environment pointer (always 0)}. For PIC output, also emit two
// R_RELATIVE dynamic relocations per entry so the loader can rebase
// the first two words.
void PPC64OpdSection::copy_buf(Context<E> &ctx) {
  ub64 *buf = (ub64 *)(ctx.buf + this->shdr.sh_offset);
  ElfRel<E> *rel = nullptr;
  if (ctx.arg.pic)
    rel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset + reldyn_offset);
  for (Symbol<E> *sym : symbols) {
    // NO_PLT | NO_OPD: the descriptor must record the real entry point,
    // not a PLT stub or an .opd address.
    u64 addr = sym->get_addr(ctx, NO_PLT | NO_OPD);
    *buf++ = addr;
    *buf++ = ctx.extra.TOC->value;
    *buf++ = 0;
    if (ctx.arg.pic) {
      u64 loc = sym->get_opd_addr(ctx);
      *rel++ = ElfRel<E>(loc, E::R_RELATIVE, 0, addr);
      *rel++ = ElfRel<E>(loc + 8, E::R_RELATIVE, 0, ctx.extra.TOC->value);
    }
  }
}
} // namespace mold::elf

555
third_party/mold/elf/arch-ppc64v2.cc vendored Normal file
View file

@ -0,0 +1,555 @@
// clang-format off
// This file implements the PowerPC ELFv2 ABI which was standardized in
// 2014. Modern little-endian PowerPC systems are based on this ABI.
// The ABI is often referred to as "ppc64le". This shouldn't be confused
// with "ppc64" which refers to the original, big-endian PowerPC systems.
//
// PPC64 is a bit tricky to support because PC-relative load/store
// instructions hadn't been available until Power10 which debuted in 2021.
// Prior to Power10, it wasn't trivial for position-independent code (PIC)
// to load a value from, for example, .got, as we can't do that with [PC +
// the offset to the .got entry].
//
// In the following, I'll explain how PIC is supported on pre-Power10
// systems first and then explain what has changed with Power10.
//
//
// Position-independent code on Power9 or earlier:
//
// We can get the program counter on older PPC64 systems with the
// following four instructions
//
// mflr r1 // save the current link register to r1
// bl .+4 // branch to the next instruction as if it were a function
// mflr r0 // copy the return address to r0
// mtlr r1 // restore the original link register value
//
// , but it's too expensive to do if we do this for each load/store.
//
// As a workaround, most functions are compiled in such a way that r2 is
// assumed to always contain the address of .got + 0x8000. With this, we
// can for example load the first entry of .got with a single instruction
// `lw r0, -0x8000(r2)`. r2 is called the TOC pointer.
//
// There's only one .got for each ELF module. Therefore, if a callee is in
// the same ELF module, r2 doesn't have to be recomputed. Most function
// calls are usually within the same ELF module, so this mechanism is
// efficient.
//
// A function compiled for pre-Power10 usually has two entry points,
// global and local. The global entry point usually precedes the local
// entry point by 8 bytes. In between are the following instructions:
//
// addis r2, r12, .TOC.@ha
// addi r2, r2, .TOC.@lo + 4;
//
// The global entry point assumes that the address of itself is in r12,
// and it computes its own TOC pointer from r12. It's easy to do so for
// the callee because the offset between its .got + 0x8000 and the
// function is known at link-time. The above code sequence then falls
// through to the local entry point that assumes r2 is .got + 0x8000.
//
// So, if a callee's TOC pointer is different from the current one
// (e.g. calling a function in another .so), we first load the callee's
// address to r12 (e.g. from .got.plt with a r2-relative load) and branch
// to that address. Then the callee computes its own TOC pointer using
// r12.
//
//
// Position-independent code on Power10:
//
// Power10 added 8-bytes-long instructions to the ISA. Some of them are
// PC-relative load/store instructions that take 34 bits offsets.
// Functions compiled with `-mcpu=power10` use these instructions for PIC.
// r2 does not have a special meaning in such functions.
//
// When a function compiled for Power10 calls a function that uses the TOC
// pointer, we need to compute a correct value for TOC and set it to r2
// before transferring the control to the callee. Thunks are responsible
// for doing it.
//
// `_NOTOC` relocations such as `R_PPC64_REL24_NOTOC` indicate that the
// callee does not use TOC (i.e. compiled with `-mcpu=power10`). If a
// function using TOC is referenced via a `_NOTOC` relocation, that call
// is made through a range extension thunk.
//
//
// Note on section names: the PPC64 psABI uses a weird naming convention
// which calls .got.plt .plt. We ignored that part because it's just
// confusing. Since the runtime only cares about segments, we should be
// able to name sections whatever we want.
//
// https://github.com/rui314/psabi/blob/main/ppc64v2.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = PPC64V2;
// Halfword extraction helpers for PPC @lo/@hi/@ha-style relocation math.
static u64 lo(u64 x) { return x & 0xffff; }
static u64 hi(u64 x) { return x >> 16; }
// ha()/higha() pre-bias by 0x8000 so that adding a sign-extended 16-bit
// low half afterwards reconstructs the original value.
static u64 ha(u64 x) { return (x + 0x8000) >> 16; }
static u64 high(u64 x) { return (x >> 16) & 0xffff; }
static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; }
// Split a 34-bit displacement into the immediate fields of an 8-byte
// prefixed instruction: bits 33..16 go into the prefix word (the low
// half of the ul64), bits 15..0 into the suffix word.
static u64 prefix34(u64 x) {
  return bits(x, 33, 16) | (bits(x, 15, 0) << 32);
}
// .plt is used only for lazy symbol resolution on PPC64. All PLT
// calls are made via range extension thunks even if they are within
// reach. Thunks read addresses from .got.plt and jump there.
// Therefore, once PLT symbols are resolved and final addresses are
// written to .got.plt, thunks just skip .plt and directly jump to the
// resolved addresses.
// Write the lazy-binding header of .plt (ELFv2). The code obtains its
// own address PC-relatively, computes the index of the PLT entry that
// branched here from the entry's address in %r12, then jumps through
// the first .got.plt words. The trailing 8-byte datum holds the
// .plt-to-.got.plt displacement read by "ld r0, 44(r11)".
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
  static const ul32 insn[] = {
    // Get PC
    0x7c08'02a6, // mflr r0
    0x429f'0005, // bcl 20, 31, 4 // obtain PC
    0x7d68'02a6, // mflr r11
    0x7c08'03a6, // mtlr r0
    // Compute the PLT entry index
    0xe80b'002c, // ld r0, 44(r11)
    0x7d8b'6050, // subf r12, r11, r12
    0x7d60'5a14, // add r11, r0, r11
    0x380c'ffcc, // addi r0, r12, -52
    0x7800'f082, // rldicl r0, r0, 62, 2
    // Load .got.plt[0] and .got.plt[1] and branch to .got.plt[0]
    0xe98b'0000, // ld r12, 0(r11)
    0x7d89'03a6, // mtctr r12
    0xe96b'0008, // ld r11, 8(r11)
    0x4e80'0420, // bctr
    // .quad .got.plt - .plt - 8
    0x0000'0000,
    0x0000'0000,
  };
  // Sanity check, matching the ELFv1 counterpart: the instruction
  // sequence must be exactly as large as the header size the rest of
  // the linker lays out.
  static_assert(sizeof(insn) == E::plt_hdr_size);
  memcpy(buf, insn, sizeof(insn));
  // 52 = offset of the trailing .quad (13 instructions * 4 bytes).
  *(ul64 *)(buf + 52) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 8;
}
// Write one PLT entry. When the control is transferred to a PLT entry,
// the entry's address is already in %r12 (set by the caller), so a
// single relative branch back to the resolver header (plt0) suffices.
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  i64 disp = ctx.plt->shdr.sh_addr - sym.get_plt_addr(ctx);
  ul32 insn = 0x4b00'0000 | (disp & 0x00ff'ffff); // b plt0
  *(ul32 *)buf = insn;
}
// .plt.got is not necessary on PPC64 because range extension thunks
// directly read GOT entries and jump there.
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {} // intentionally a no-op on PPC64 ELFv2
// Apply one relocation inside the output .eh_frame. Only the few types
// that appear in unwind information are supported.
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
                                    u64 offset, u64 val) {
  u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
  switch (rel.r_type) {
  case R_NONE:
    break;
  case R_PPC64_ADDR64:
    *(ul64 *)loc = val;
    break;
  case R_PPC64_REL32:
    // PC-relative to this spot in the output .eh_frame.
    *(ul32 *)loc = val - this->shdr.sh_addr - offset;
    break;
  case R_PPC64_REL64:
    *(ul64 *)loc = val - this->shdr.sh_addr - offset;
    break;
  default:
    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
  }
}
// Decode the symbol's ppc_local_entry field (from st_other) into a byte
// offset from the global entry point to the local entry point:
// 0 and 1 mean no separate local entry, 2..6 encode 1<<val bytes, and
// 7 is reserved.
static u64 get_local_entry_offset(Context<E> &ctx, Symbol<E> &sym) {
  i64 val = sym.esym().ppc_local_entry;
  assert(val <= 7);
  if (val == 7)
    Fatal(ctx) << sym << ": local entry offset 7 is reserved";
  if (val == 0 || val == 1)
    return 0;
  return 1 << val;
}
// Apply relocations to an allocated PPC64 ELFv2 section: compute each
// relocation's final value from the symbol and addend and patch the
// output buffer in place.
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  // Cursor into this section's slice of the dynamic relocation table,
  // handed to the apply_*_rel helpers for any dynamic relocs they emit.
  ElfRel<E> *dynrel = nullptr;
  if (ctx.reldyn)
    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
                           file.reldyn_offset + this->reldyn_offset);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE)
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // Standard psABI relocation formula inputs: S = symbol value,
    // A = addend, P = place being relocated, G = GOT slot offset,
    // TOC = the TOC base pointer value.
    u64 S = sym.get_addr(ctx);
    u64 A = rel.r_addend;
    u64 P = get_addr() + rel.r_offset;
    u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
    u64 GOT = ctx.got->shdr.sh_addr;
    u64 TOC = ctx.extra.TOC->value;
    // NOTE(review): the +4 presumably skips the thunk's leading %r2-save
    // instruction -- confirm against the thunk layout in this file.
    auto r2save_thunk_addr = [&] { return get_thunk_addr(i); };
    auto no_r2save_thunk_addr = [&] { return get_thunk_addr(i) + 4; };
    switch (rel.r_type) {
    case R_PPC64_ADDR64:
      // .toc entries get TOC-style handling; other 64-bit absolute
      // relocations may turn into dynamic relocations.
      if (name() == ".toc")
        apply_toc_rel(ctx, sym, rel, loc, S, A, P, dynrel);
      else
        apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
      break;
    case R_PPC64_TOC16_HA:
      *(ul16 *)loc = ha(S + A - TOC);
      break;
    case R_PPC64_TOC16_LO:
      *(ul16 *)loc = lo(S + A - TOC);
      break;
    case R_PPC64_TOC16_DS:
    case R_PPC64_TOC16_LO_DS:
      // DS-form: the two low bits belong to the instruction, hence 0xfffc.
      *(ul16 *)loc |= (S + A - TOC) & 0xfffc;
      break;
    case R_PPC64_REL24:
      if (sym.has_plt(ctx) || !sym.esym().preserves_r2()) {
        i64 val = r2save_thunk_addr() + A - P;
        *(ul32 *)loc |= bits(val, 25, 2) << 2;
        // The thunk saves %r2 to the caller's r2 save slot. We need to
        // restore it after function return. To do so, there's usually a
        // NOP as a placeholder after a BL. 0x6000'0000 is a NOP.
        if (*(ul32 *)(loc + 4) == 0x6000'0000)
          *(ul32 *)(loc + 4) = 0xe841'0018; // ld r2, 24(r1)
      } else {
        // Direct branch to the local entry point, falling back to a
        // thunk if the target is out of the +-32MB branch range.
        i64 val = S + get_local_entry_offset(ctx, sym) + A - P;
        if (sign_extend(val, 25) != val)
          val = no_r2save_thunk_addr() + A - P;
        *(ul32 *)loc |= bits(val, 25, 2) << 2;
      }
      break;
    case R_PPC64_REL24_NOTOC:
      // The caller doesn't use TOC; TOC-using callees are reached via a
      // thunk that sets up %r2 for them.
      if (sym.has_plt(ctx) || sym.esym().uses_toc()) {
        i64 val = no_r2save_thunk_addr() + A - P;
        *(ul32 *)loc |= bits(val, 25, 2) << 2;
      } else {
        i64 val = S + A - P;
        if (sign_extend(val, 25) != val)
          val = no_r2save_thunk_addr() + A - P;
        *(ul32 *)loc |= bits(val, 25, 2) << 2;
      }
      break;
    case R_PPC64_REL32:
      *(ul32 *)loc = S + A - P;
      break;
    case R_PPC64_REL64:
      *(ul64 *)loc = S + A - P;
      break;
    case R_PPC64_REL16_HA:
      *(ul16 *)loc = ha(S + A - P);
      break;
    case R_PPC64_REL16_LO:
      *(ul16 *)loc = lo(S + A - P);
      break;
    case R_PPC64_PLT16_HA:
      // PLT16 relocations address the symbol's GOT slot relative to TOC.
      *(ul16 *)loc = ha(G + GOT - TOC);
      break;
    case R_PPC64_PLT16_HI:
      *(ul16 *)loc = hi(G + GOT - TOC);
      break;
    case R_PPC64_PLT16_LO:
      *(ul16 *)loc = lo(G + GOT - TOC);
      break;
    case R_PPC64_PLT16_LO_DS:
      *(ul16 *)loc |= (G + GOT - TOC) & 0xfffc;
      break;
    case R_PPC64_PLT_PCREL34:
    case R_PPC64_PLT_PCREL34_NOTOC:
    case R_PPC64_GOT_PCREL34:
      // Power10 8-byte prefixed instruction with a 34-bit displacement.
      *(ul64 *)loc |= prefix34(G + GOT - P);
      break;
    case R_PPC64_PCREL34:
      *(ul64 *)loc |= prefix34(S + A - P);
      break;
    case R_PPC64_GOT_TPREL16_HA:
      *(ul16 *)loc = ha(sym.get_gottp_addr(ctx) - TOC);
      break;
    case R_PPC64_GOT_TPREL16_LO_DS:
      *(ul16 *)loc |= (sym.get_gottp_addr(ctx) - TOC) & 0xfffc;
      break;
    case R_PPC64_GOT_TPREL_PCREL34:
      *(ul64 *)loc |= prefix34(sym.get_gottp_addr(ctx) - P);
      break;
    case R_PPC64_GOT_TLSGD16_HA:
      *(ul16 *)loc = ha(sym.get_tlsgd_addr(ctx) - TOC);
      break;
    case R_PPC64_GOT_TLSGD16_LO:
      *(ul16 *)loc = lo(sym.get_tlsgd_addr(ctx) - TOC);
      break;
    case R_PPC64_GOT_TLSGD_PCREL34:
      *(ul64 *)loc |= prefix34(sym.get_tlsgd_addr(ctx) - P);
      break;
    case R_PPC64_GOT_TLSLD16_HA:
      *(ul16 *)loc = ha(ctx.got->get_tlsld_addr(ctx) - TOC);
      break;
    case R_PPC64_GOT_TLSLD16_LO:
      *(ul16 *)loc = lo(ctx.got->get_tlsld_addr(ctx) - TOC);
      break;
    case R_PPC64_GOT_TLSLD_PCREL34:
      *(ul64 *)loc |= prefix34(ctx.got->get_tlsld_addr(ctx) - P);
      break;
    case R_PPC64_DTPREL16_HA:
      *(ul16 *)loc = ha(S + A - ctx.dtp_addr);
      break;
    case R_PPC64_DTPREL16_LO:
      *(ul16 *)loc = lo(S + A - ctx.dtp_addr);
      break;
    case R_PPC64_DTPREL34:
      *(ul64 *)loc |= prefix34(S + A - ctx.dtp_addr);
      break;
    case R_PPC64_TPREL16_HA:
      *(ul16 *)loc = ha(S + A - ctx.tp_addr);
      break;
    case R_PPC64_TPREL16_LO:
      *(ul16 *)loc = lo(S + A - ctx.tp_addr);
      break;
    case R_PPC64_PLTSEQ:
    case R_PPC64_PLTSEQ_NOTOC:
    case R_PPC64_PLTCALL:
    case R_PPC64_PLTCALL_NOTOC:
    case R_PPC64_TLS:
    case R_PPC64_TLSGD:
    case R_PPC64_TLSLD:
      // Marker relocations; nothing to patch.
      break;
    default:
      // scan_relocations already rejected anything else.
      unreachable();
    }
  }
}
// Apply relocations to a non-SHF_ALLOC section (e.g. debug sections).
// Such sections are not mapped at runtime, so only link-time-constant
// relocation types are meaningful here.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;

    // Report an error if a computed value is not in [lo, hi).
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
                   << sym << " out of range: " << val << " is not in ["
                   << lo << ", " << hi << ")";
    };

    // If the relocation refers into a mergeable-string section, the
    // fragment (and its addend) supersedes the symbol value.
    SectionFragment<E> *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);

    // S = symbol/fragment address, A = addend, as in the psABI formulas.
    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : (i64)rel.r_addend;

    switch (rel.r_type) {
    case R_PPC64_ADDR64:
      // A reference to a dead section fragment may be replaced with a
      // tombstone value so stale debug info doesn't alias a reused address.
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ul64 *)loc = *val;
      else
        *(ul64 *)loc = S + A;
      break;
    case R_PPC64_ADDR32: {
      i64 val = S + A;
      check(val, 0, 1LL << 32);
      *(ul32 *)loc = val;
      break;
    }
    case R_PPC64_DTPREL64:
      *(ul64 *)loc = S + A - ctx.dtp_addr;
      break;
    default:
      Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
                 << rel;
    }
  }
}
// Scan relocations to decide which linker-synthesized resources
// (GOT/PLT/TLS entries, dynamic relocations) each referenced symbol
// needs. The values themselves are written later by apply_reloc_alloc.
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);

  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  // Scan relocations
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];

    // An ifunc is resolved at load time, so any reference to it
    // needs both a GOT and a PLT entry.
    if (sym.is_ifunc())
      sym.flags |= NEEDS_GOT | NEEDS_PLT;

    switch (rel.r_type) {
    case R_PPC64_ADDR64:
      // Absolute words in .toc get special treatment because the TOC
      // is the anchor for most PPC64 addressing.
      if (name() == ".toc")
        scan_toc_rel(ctx, sym, rel);
      else
        scan_dyn_absrel(ctx, sym, rel);
      break;
    case R_PPC64_GOT_TPREL16_HA:
    case R_PPC64_GOT_TPREL_PCREL34:
      sym.flags |= NEEDS_GOTTP;
      break;
    case R_PPC64_REL24:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;
    case R_PPC64_REL24_NOTOC:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      // A NOTOC call means the object uses PC-relative instructions;
      // remember it so thunks use the Power10 sequences (see
      // RangeExtensionThunk::copy_buf).
      ctx.extra.is_power10 = true;
      break;
    case R_PPC64_PLT16_HA:
    case R_PPC64_PLT_PCREL34:
    case R_PPC64_PLT_PCREL34_NOTOC:
    case R_PPC64_GOT_PCREL34:
      sym.flags |= NEEDS_GOT;
      break;
    case R_PPC64_GOT_TLSGD16_HA:
    case R_PPC64_GOT_TLSGD_PCREL34:
      sym.flags |= NEEDS_TLSGD;
      break;
    case R_PPC64_GOT_TLSLD16_HA:
    case R_PPC64_GOT_TLSLD_PCREL34:
      ctx.needs_tlsld = true;
      break;
    case R_PPC64_TPREL16_HA:
    case R_PPC64_TPREL16_LO:
      // Local-exec TLS is only valid in a main executable.
      check_tlsle(ctx, sym, rel);
      break;
    // The following relocation types need no extra resources;
    // they are fully handled in apply_reloc_alloc.
    case R_PPC64_REL32:
    case R_PPC64_REL64:
    case R_PPC64_TOC16_HA:
    case R_PPC64_TOC16_LO:
    case R_PPC64_TOC16_LO_DS:
    case R_PPC64_TOC16_DS:
    case R_PPC64_REL16_HA:
    case R_PPC64_REL16_LO:
    case R_PPC64_PLT16_HI:
    case R_PPC64_PLT16_LO:
    case R_PPC64_PLT16_LO_DS:
    case R_PPC64_PCREL34:
    case R_PPC64_PLTSEQ:
    case R_PPC64_PLTSEQ_NOTOC:
    case R_PPC64_PLTCALL:
    case R_PPC64_PLTCALL_NOTOC:
    case R_PPC64_GOT_TPREL16_LO_DS:
    case R_PPC64_GOT_TLSGD16_LO:
    case R_PPC64_GOT_TLSLD16_LO:
    case R_PPC64_TLS:
    case R_PPC64_TLSGD:
    case R_PPC64_TLSLD:
    case R_PPC64_DTPREL16_HA:
    case R_PPC64_DTPREL16_LO:
    case R_PPC64_DTPREL34:
      break;
    default:
      Error(ctx) << *this << ": unknown relocation: " << rel;
    }
  }
}
// Write range extension thunks. Each thunk materializes the
// destination address in r12 and branches through the count register;
// r2 (the TOC pointer) is first saved to the caller's stack frame so
// it can be restored after the call.
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
  u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;

  // If the destination is PLT, we read an address from .got.plt or .got
  // and jump there.
  static const ul32 plt_thunk[] = {
    0xf841'0018, // std r2, 24(r1)
    0x3d82'0000, // addis r12, r2, foo@gotplt@toc@ha
    0xe98c'0000, // ld r12, foo@gotplt@toc@lo(r12)
    0x7d89'03a6, // mtctr r12
    0x4e80'0420, // bctr
  };

  // Power10 variant: pld loads the GOT entry PC-relatively, so no
  // TOC-relative addressing is needed.
  static const ul32 plt_thunk_power10[] = {
    0xf841'0018, // std r2, 24(r1)
    0x0410'0000, // pld r12, foo@gotplt@pcrel
    0xe580'0000,
    0x7d89'03a6, // mtctr r12
    0x4e80'0420, // bctr
  };

  // If the destination is a non-imported function, we directly jump
  // to its local entry point.
  static const ul32 local_thunk[] = {
    0xf841'0018, // std r2, 24(r1)
    0x3d82'0000, // addis r12, r2, foo@toc@ha
    0x398c'0000, // addi r12, r12, foo@toc@lo
    0x7d89'03a6, // mtctr r12
    0x4e80'0420, // bctr
  };

  static const ul32 local_thunk_power10[] = {
    0xf841'0018, // std r2, 24(r1)
    0x0610'0000, // pla r12, foo@pcrel
    0x3980'0000,
    0x7d89'03a6, // mtctr r12
    0x4e80'0420, // bctr
  };

  // All templates must have the fixed per-entry size.
  static_assert(E::thunk_size == sizeof(plt_thunk));
  static_assert(E::thunk_size == sizeof(plt_thunk_power10));
  static_assert(E::thunk_size == sizeof(local_thunk));
  static_assert(E::thunk_size == sizeof(local_thunk_power10));

  for (i64 i = 0; i < symbols.size(); i++) {
    Symbol<E> &sym = *symbols[i];
    ul32 *loc = (ul32 *)(buf + i * E::thunk_size);

    if (sym.has_plt(ctx)) {
      // Prefer the canonical GOT slot if one exists; otherwise the
      // lazy-binding .got.plt slot.
      u64 got = sym.has_got(ctx) ? sym.get_got_addr(ctx) : sym.get_gotplt_addr(ctx);

      if (ctx.extra.is_power10) {
        memcpy(loc, plt_thunk_power10, E::thunk_size);
        // The prefixed pld is the second instruction of the thunk,
        // so its PC is get_addr(i) + 4.
        *(ul64 *)(loc + 1) |= prefix34(got - get_addr(i) - 4);
      } else {
        i64 val = got - ctx.extra.TOC->value;
        memcpy(loc, plt_thunk, E::thunk_size);
        loc[1] |= higha(val);
        loc[2] |= lo(val);
      }
    } else {
      if (ctx.extra.is_power10) {
        memcpy(loc, local_thunk_power10, E::thunk_size);
        *(ul64 *)(loc + 1) |= prefix34(sym.get_addr(ctx) - get_addr(i) - 4);
      } else {
        i64 val = sym.get_addr(ctx) - ctx.extra.TOC->value;
        memcpy(loc, local_thunk, E::thunk_size);
        loc[1] |= higha(val);
        loc[2] |= lo(val);
      }
    }
  }
}
} // namespace mold::elf

938
third_party/mold/elf/arch-riscv.cc vendored Normal file
View file

@ -0,0 +1,938 @@
// clang-format off
// RISC-V is a clean RISC ISA. It supports PC-relative load/store for
// position-independent code. Its 32-bit and 64-bit ISAs are almost
// identical. That is, you can think RV32 as a RV64 without 64-bit
// operations. In this file, we support both RV64 and RV32.
//
// RISC-V is essentially little-endian, but the big-endian version is
// available as an extension. GCC supports `-mbig-endian` to generate
// big-endian code. Even in big-endian mode, machine instructions are
// defined to be encoded in little-endian, though. Only the behavior of
// load/store instructions are different between LE RISC-V and BE RISC-V.
//
// From the linker's point of view, the RISC-V's psABI is unique because
// sections in input object files can be shrunk while being copied to the
// output file. That is contrary to other psABIs in which sections are an
// atomic unit of copying. Let me explain it in more details.
//
// Since RISC-V instructions are 16-bit or 32-bit long, there's no way to
// embed a very large immediate into a branch instruction. In fact, JAL
// (jump and link) instruction can jump to only within PC ± 1 MiB because
// its immediate is only 21 bits long. If the destination is out of its
// reach, we need to use two instructions instead; the first instruction
// being AUIPC which sets upper 20 bits to a register and the second being
// JALR with a 12-bit immediate and the register. Combined, they specify a
// 32 bits displacement.
//
// Other RISC ISAs have the same limitation, and they solved the problem by
// letting the linker create so-called "range extension thunks". It works as
// follows: the compiler optimistically emits single jump instructions for
// function calls. If the linker finds that a branch target is out of reach,
// it emits a small piece of machine code near the branch instruction and
// redirect the branch to the linker-synthesized code. The code constructs a
// full 32-bit address in a register and jump to the destination. That
// linker-synthesized code is called "range extension thunks" or just
// "thunks".
//
// The RISC-V psABI is unique that it works the other way around. That is,
// for RISC-V, the compiler always emits two instructions (AUIPC + JAL) for
// function calls. If the linker finds the destination is reachable with a
// single instruction, it replaces the two instructions with the one and
// shrink the section size by one instruction length, instead of filling the
// gap with a nop.
//
// With the presence of this relaxation, sections can no longer be
// considered as an atomic unit. If we delete 4 bytes from the middle of a
// section, all contents after that point needs to be shifted by 4. Symbol
// values and relocation offsets have to be adjusted accordingly if they
// refer to past the deleted bytes.
//
// In mold, we use `r_deltas` to memorize how many bytes have been adjusted
// for relocations. For symbols, we directly mutate their `value` member.
//
// RISC-V object files tend to have way more relocations than those for
// other targets. This is because all branches, including ones that jump
// within the same section, are explicitly expressed with relocations.
// Here is why we need them: all control-flow statements such as `if` or
// `for` are implemented using branch instructions. For other targets, the
// compiler doesn't emit relocations for such branches because they know
// at compile-time exactly how many bytes have to be skipped. That's not
// true for RISC-V because the linker may delete bytes between a branch and
// its destination. Therefore, all branches including in-section ones have
// to be explicitly expressed with relocations.
//
// Note that this mechanism only shrink sections and never enlarge, as
// the compiler always emits the longest instruction sequence. This
// makes the linker implementation a bit simpler because we don't need
// to worry about oscillation.
//
// https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc
#include "third_party/mold/elf/mold.h"
// MISSING #include <tbb/parallel_for.h>
// MISSING #include <tbb/parallel_for_each.h>
namespace mold::elf {
// Patch the 12-bit immediate (bits 31:20) of an I-type instruction.
static void write_itype(u8 *loc, u32 val) {
  ul32 &insn = *(ul32 *)loc;
  insn &= 0b000000'00000'11111'111'11111'1111111;
  insn |= bits(val, 11, 0) << 20;
}
// Patch the split 12-bit immediate of an S-type (store) instruction:
// imm[11:5] goes to bits 31:25 and imm[4:0] to bits 11:7.
static void write_stype(u8 *loc, u32 val) {
  ul32 &insn = *(ul32 *)loc;
  insn &= 0b000000'11111'11111'111'00000'1111111;
  insn |= bits(val, 11, 5) << 25 | bits(val, 4, 0) << 7;
}
// Patch the scrambled 13-bit branch offset of a B-type instruction.
// Bit 0 of the offset is implicitly zero and is not encoded.
static void write_btype(u8 *loc, u32 val) {
  ul32 &insn = *(ul32 *)loc;
  insn &= 0b000000'11111'11111'111'00000'1111111;
  insn |= bit(val, 12) << 31 | bits(val, 10, 5) << 25 |
          bits(val, 4, 1) << 8 | bit(val, 11) << 7;
}
// Patch the upper-20-bit immediate of a U-type instruction.
//
// U-type instructions are used in combination with I-type
// instructions. The U-type insn sets an immediate to the upper 20
// bits of a register; the I-type insn sign-extends a 12-bit immediate
// and adds it to the register to form the complete value. 0x800 is
// added here to compensate for that sign extension.
static void write_utype(u8 *loc, u32 val) {
  ul32 &insn = *(ul32 *)loc;
  insn &= 0b000000'00000'00000'000'11111'1111111;
  insn |= (val + 0x800) & 0xffff'f000;
}
// Patch the scrambled 21-bit jump offset of a J-type instruction.
// Bit 0 of the offset is implicitly zero and is not encoded.
static void write_jtype(u8 *loc, u32 val) {
  ul32 &insn = *(ul32 *)loc;
  insn &= 0b000000'00000'00000'000'11111'1111111;
  insn |= bit(val, 20) << 31 | bits(val, 10, 1) << 21 |
          bit(val, 11) << 20 | bits(val, 19, 12) << 12;
}
// Patch the 9-bit branch offset of a compressed CB-type instruction.
static void write_cbtype(u8 *loc, u32 val) {
  ul16 &insn = *(ul16 *)loc;
  insn &= 0b111'000'111'00000'11;
  insn |= bit(val, 8) << 12 | bit(val, 4) << 11 | bit(val, 3) << 10 |
          bit(val, 7) << 6 | bit(val, 6) << 5 | bit(val, 2) << 4 |
          bit(val, 1) << 3 | bit(val, 5) << 2;
}
// Patch the 12-bit jump offset of a compressed CJ-type instruction.
static void write_cjtype(u8 *loc, u32 val) {
  ul16 &insn = *(ul16 *)loc;
  insn &= 0b111'00000000000'11;
  insn |= bit(val, 11) << 12 | bit(val, 4) << 11 | bit(val, 9) << 10 |
          bit(val, 8) << 9 | bit(val, 10) << 8 | bit(val, 6) << 7 |
          bit(val, 7) << 6 | bit(val, 3) << 5 | bit(val, 2) << 4 |
          bit(val, 1) << 3 | bit(val, 5) << 2;
}
// Overwrite an existing ULEB128-encoded number in place with `val`,
// keeping the encoded length unchanged: each continuation byte (high
// bit set) is rewritten with 7 bits of the value, and the terminating
// byte receives the final 7 bits with its high bit kept clear.
//
// Without the final store, the last 7-bit group of `val` would be
// silently dropped and the terminator byte left stale.
static void overwrite_uleb(u8 *loc, u64 val) {
  while (*loc & 0b1000'0000) {
    *loc++ = 0b1000'0000 | (val & 0b0111'1111);
    val >>= 7;
  }
  *loc = val & 0b0111'1111;
}
// Returns the rd (destination register) field, bits 11:7, of an
// R/I/U/J-type instruction word.
static u32 get_rd(u32 val) {
  u32 rd = bits(val, 11, 7);
  return rd;
}
// Overwrite the rs1 (source register 1) field, bits 19:15, of an
// instruction with the given register number.
static void set_rs1(u8 *loc, u32 rs1) {
  assert(rs1 < 32);
  ul32 &insn = *(ul32 *)loc;
  insn &= 0b111111'11111'00000'111'11111'1111111;
  insn |= rs1 << 15;
}
// Write the PLT header. On first call through a lazy PLT entry,
// this stub computes the entry's index, loads the link map and the
// resolver address from .got.plt, and jumps to the resolver.
template <typename E>
void write_plt_header(Context<E> &ctx, u8 *buf) {
  static const ul32 insn_64[] = {
    0x0000'0397, // auipc t2, %pcrel_hi(.got.plt)
    0x41c3'0333, // sub t1, t1, t3 # .plt entry + hdr + 12
    0x0003'be03, // ld t3, %pcrel_lo(1b)(t2) # _dl_runtime_resolve
    0xfd43'0313, // addi t1, t1, -44 # .plt entry
    0x0003'8293, // addi t0, t2, %pcrel_lo(1b) # &.got.plt
    0x0013'5313, // srli t1, t1, 1 # .plt entry offset
    0x0082'b283, // ld t0, 8(t0) # link map
    0x000e'0067, // jr t3
  };

  static const ul32 insn_32[] = {
    0x0000'0397, // auipc t2, %pcrel_hi(.got.plt)
    0x41c3'0333, // sub t1, t1, t3 # .plt entry + hdr + 12
    0x0003'ae03, // lw t3, %pcrel_lo(1b)(t2) # _dl_runtime_resolve
    0xfd43'0313, // addi t1, t1, -44 # .plt entry
    0x0003'8293, // addi t0, t2, %pcrel_lo(1b) # &.got.plt
    0x0023'5313, // srli t1, t1, 2 # .plt entry offset
    0x0042'a283, // lw t0, 4(t0) # link map
    0x000e'0067, // jr t3
  };

  if constexpr (E::is_64)
    memcpy(buf, insn_64, sizeof(insn_64));
  else
    memcpy(buf, insn_32, sizeof(insn_32));

  // Patch the PC-relative references to .got.plt into the AUIPC at
  // offset 0 and the two dependent I-type instructions at offsets 8
  // and 16.
  u64 gotplt = ctx.gotplt->shdr.sh_addr;
  u64 plt = ctx.plt->shdr.sh_addr;
  write_utype(buf, gotplt - plt);
  write_itype(buf + 8, gotplt - plt);
  write_itype(buf + 16, gotplt - plt);
}
// Instruction templates for a single PLT entry. The AUIPC/load pair's
// immediates are zero here; write_plt_entry / write_pltgot_entry patch
// in the PC-relative offset to the symbol's GOT(PLT) slot.
static const ul32 plt_entry_64[] = {
  0x0000'0e17, // auipc t3, %pcrel_hi(function@.got.plt)
  0x000e'3e03, // ld t3, %pcrel_lo(1b)(t3)
  0x000e'0367, // jalr t1, t3
  0x0000'0013, // nop
};

static const ul32 plt_entry_32[] = {
  0x0000'0e17, // auipc t3, %pcrel_hi(function@.got.plt)
  0x000e'2e03, // lw t3, %pcrel_lo(1b)(t3)
  0x000e'0367, // jalr t1, t3
  0x0000'0013, // nop
};
// Write a lazy PLT entry for `sym`: an AUIPC/load pair that fetches an
// address from the symbol's .got.plt slot and jumps to it.
template <typename E>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  if constexpr (E::is_64)
    memcpy(buf, plt_entry_64, sizeof(plt_entry_64));
  else
    memcpy(buf, plt_entry_32, sizeof(plt_entry_32));

  // Patch in the PC-relative displacement from this PLT entry to its
  // .got.plt slot.
  u64 disp = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx);
  write_utype(buf, disp);
  write_itype(buf + 4, disp);
}
// Write a PLT entry that jumps through the symbol's regular GOT slot
// instead of .got.plt (used when the symbol already has a GOT entry).
template <typename E>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  if constexpr (E::is_64)
    memcpy(buf, plt_entry_64, sizeof(plt_entry_64));
  else
    memcpy(buf, plt_entry_32, sizeof(plt_entry_32));

  // Patch in the PC-relative displacement from this entry to the GOT slot.
  u64 disp = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx);
  write_utype(buf, disp);
  write_itype(buf + 4, disp);
}
// Apply a relocation inside .eh_frame. Only the relocation types that
// the compiler emits for CFI data are supported.
template <typename E>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
                                    u64 offset, u64 val) {
  u8 *loc = ctx.buf + this->shdr.sh_offset + offset;

  switch (rel.r_type) {
  case R_NONE:
    break;
  case R_RISCV_ADD32:
    *(U32<E> *)loc += val;
    break;
  case R_RISCV_SUB8:
    *loc -= val;
    break;
  case R_RISCV_SUB16:
    *(U16<E> *)loc -= val;
    break;
  case R_RISCV_SUB32:
    *(U32<E> *)loc -= val;
    break;
  case R_RISCV_SUB6:
    // Only the low 6 bits participate; the top 2 bits are preserved.
    *loc = (*loc & 0b1100'0000) | ((*loc - val) & 0b0011'1111);
    break;
  case R_RISCV_SET6:
    *loc = (*loc & 0b1100'0000) | (val & 0b0011'1111);
    break;
  case R_RISCV_SET8:
    *loc = val;
    break;
  case R_RISCV_SET16:
    *(U16<E> *)loc = val;
    break;
  case R_RISCV_SET32:
    *(U32<E> *)loc = val;
    break;
  case R_RISCV_32_PCREL:
    // PC-relative: subtract the runtime address of the relocated place.
    *(U32<E> *)loc = val - this->shdr.sh_addr - offset;
    break;
  default:
    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
  }
}
// Apply relocations to an SHF_ALLOC section, taking into account the
// bytes deleted by relaxation (`extra.r_deltas`). `base` points to the
// section's location in the output buffer.
//
// Fix: R_RISCV_ADD8 and R_RISCV_SUB8 previously did `loc += S + A` /
// `loc -= S + A`, which adjusted the local pointer instead of the byte
// it points to. They now modify `*loc`, matching the other ADD*/SUB*
// cases and the nonalloc code path.
template <typename E>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  ElfRel<E> *dynrel = nullptr;
  if (ctx.reldyn)
    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
                           file.reldyn_offset + this->reldyn_offset);

  // Number of bytes removed before relocation `idx` by relaxation.
  auto get_r_delta = [&](i64 idx) {
    return extra.r_deltas.empty() ? 0 : extra.r_deltas[idx];
  };

  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || rel.r_type == R_RISCV_RELAX)
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];
    i64 r_offset = rel.r_offset - get_r_delta(i);
    i64 removed_bytes = get_r_delta(i + 1) - get_r_delta(i);
    u8 *loc = base + r_offset;

    // Report an error if a computed value is not in [lo, hi).
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
                   << sym << " out of range: " << val << " is not in ["
                   << lo << ", " << hi << ")";
    };

    // A PCREL_LO12 relocation's symbol points at the *_HI20 relocation
    // it pairs with. Search nearby relocations for it.
    auto find_paired_reloc = [&] {
      Symbol<E> &sym = *file.symbols[rels[i].r_sym];
      assert(sym.get_input_section() == this);

      if (sym.value < r_offset) {
        for (i64 j = i - 1; j >= 0; j--)
          if (u32 ty = rels[j].r_type;
              ty == R_RISCV_GOT_HI20 || ty == R_RISCV_TLS_GOT_HI20 ||
              ty == R_RISCV_TLS_GD_HI20 || ty == R_RISCV_PCREL_HI20)
            if (sym.value == rels[j].r_offset - get_r_delta(j))
              return j;
      } else {
        for (i64 j = i + 1; j < rels.size(); j++)
          if (u32 ty = rels[j].r_type;
              ty == R_RISCV_GOT_HI20 || ty == R_RISCV_TLS_GOT_HI20 ||
              ty == R_RISCV_TLS_GD_HI20 || ty == R_RISCV_PCREL_HI20)
            if (sym.value == rels[j].r_offset - get_r_delta(j))
              return j;
      }

      Fatal(ctx) << *this << ": paired relocation is missing: " << i;
    };

    // Standard psABI names: S = symbol value, A = addend, P = place,
    // G = GOT slot offset, GOT = GOT base address.
    u64 S = sym.get_addr(ctx);
    u64 A = rel.r_addend;
    u64 P = get_addr() + r_offset;
    u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
    u64 GOT = ctx.got->shdr.sh_addr;

    switch (rel.r_type) {
    case R_RISCV_32:
      if constexpr (E::is_64)
        *(U32<E> *)loc = S + A;
      else
        apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
      break;
    case R_RISCV_64:
      assert(E::is_64);
      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
      break;
    case R_RISCV_BRANCH:
      check(S + A - P, -(1 << 12), 1 << 12);
      write_btype(loc, S + A - P);
      break;
    case R_RISCV_JAL:
      check(S + A - P, -(1 << 20), 1 << 20);
      write_jtype(loc, S + A - P);
      break;
    case R_RISCV_CALL:
    case R_RISCV_CALL_PLT: {
      u32 rd = get_rd(*(ul32 *)(contents.data() + rel.r_offset + 4));

      if (removed_bytes == 4) {
        // auipc + jalr -> jal
        *(ul32 *)loc = (rd << 7) | 0b1101111;
        write_jtype(loc, S + A - P);
      } else if (removed_bytes == 6 && rd == 0) {
        // auipc + jalr -> c.j
        *(ul16 *)loc = 0b101'00000000000'01;
        write_cjtype(loc, S + A - P);
      } else if (removed_bytes == 6 && rd == 1) {
        // auipc + jalr -> c.jal
        assert(!E::is_64);
        *(ul16 *)loc = 0b001'00000000000'01;
        write_cjtype(loc, S + A - P);
      } else {
        assert(removed_bytes == 0);

        // Calling an undefined weak symbol does not make sense.
        // We make such call into an infinite loop. This should
        // help debugging of a faulty program.
        u64 val = sym.esym().is_undef_weak() ? 0 : S + A - P;
        check(val, -(1LL << 31), 1LL << 31);
        write_utype(loc, val);
        write_itype(loc + 4, val);
      }
      break;
    }
    case R_RISCV_GOT_HI20:
      write_utype(loc, G + GOT + A - P);
      break;
    case R_RISCV_TLS_GOT_HI20:
      write_utype(loc, sym.get_gottp_addr(ctx) + A - P);
      break;
    case R_RISCV_TLS_GD_HI20:
      write_utype(loc, sym.get_tlsgd_addr(ctx) + A - P);
      break;
    case R_RISCV_PCREL_HI20:
      write_utype(loc, S + A - P);
      break;
    case R_RISCV_PCREL_LO12_I:
    case R_RISCV_PCREL_LO12_S: {
      // The low 12 bits come from the value computed at the paired
      // *_HI20 relocation's place, not at this relocation's place.
      i64 idx2 = find_paired_reloc();
      const ElfRel<E> &rel2 = rels[idx2];
      Symbol<E> &sym2 = *file.symbols[rel2.r_sym];

      u64 S = sym2.get_addr(ctx);
      u64 A = rel2.r_addend;
      u64 P = get_addr() + rel2.r_offset - get_r_delta(idx2);
      u64 G = sym2.get_got_idx(ctx) * sizeof(Word<E>);
      u64 val;

      switch (rel2.r_type) {
      case R_RISCV_GOT_HI20:
        val = G + GOT + A - P;
        break;
      case R_RISCV_TLS_GOT_HI20:
        val = sym2.get_gottp_addr(ctx) + A - P;
        break;
      case R_RISCV_TLS_GD_HI20:
        val = sym2.get_tlsgd_addr(ctx) + A - P;
        break;
      case R_RISCV_PCREL_HI20:
        val = S + A - P;
        break;
      default:
        unreachable();
      }

      if (rel.r_type == R_RISCV_PCREL_LO12_I)
        write_itype(loc, val);
      else
        write_stype(loc, val);
      break;
    }
    case R_RISCV_HI20:
      // If relaxation removed the LUI, there is nothing to write.
      assert(removed_bytes == 0 || removed_bytes == 4);
      if (removed_bytes == 0) {
        check(S + A, -(1LL << 31), 1LL << 31);
        write_utype(loc, S + A);
      }
      break;
    case R_RISCV_LO12_I:
    case R_RISCV_LO12_S:
      if (rel.r_type == R_RISCV_LO12_I)
        write_itype(loc, S + A);
      else
        write_stype(loc, S + A);

      // Rewrite `lw t1, 0(t0)` with `lw t1, 0(x0)` if the address is
      // accessible relative to the zero register. If the upper 20 bits
      // are all zero, the corresponding LUI might have been removed.
      if (bits(S + A, 31, 12) == 0)
        set_rs1(loc, 0);
      break;
    case R_RISCV_TPREL_HI20:
      assert(removed_bytes == 0 || removed_bytes == 4);
      if (removed_bytes == 0)
        write_utype(loc, S + A - ctx.tp_addr);
      break;
    case R_RISCV_TPREL_ADD:
      // This relocation just annotates an ADD instruction that can be
      // removed when a TPREL is relaxed. No value is needed to be
      // written.
      assert(removed_bytes == 0 || removed_bytes == 4);
      break;
    case R_RISCV_TPREL_LO12_I:
    case R_RISCV_TPREL_LO12_S: {
      i64 val = S + A - ctx.tp_addr;
      if (rel.r_type == R_RISCV_TPREL_LO12_I)
        write_itype(loc, val);
      else
        write_stype(loc, val);

      // Rewrite `lw t1, 0(t0)` with `lw t1, 0(tp)` if the address is
      // directly accessible using tp. tp is x4.
      if (sign_extend(val, 11) == val)
        set_rs1(loc, 4);
      break;
    }
    case R_RISCV_ADD8:
      *loc += S + A;
      break;
    case R_RISCV_ADD16:
      *(U16<E> *)loc += S + A;
      break;
    case R_RISCV_ADD32:
      *(U32<E> *)loc += S + A;
      break;
    case R_RISCV_ADD64:
      *(U64<E> *)loc += S + A;
      break;
    case R_RISCV_SUB8:
      *loc -= S + A;
      break;
    case R_RISCV_SUB16:
      *(U16<E> *)loc -= S + A;
      break;
    case R_RISCV_SUB32:
      *(U32<E> *)loc -= S + A;
      break;
    case R_RISCV_SUB64:
      *(U64<E> *)loc -= S + A;
      break;
    case R_RISCV_ALIGN: {
      // A R_RISCV_ALIGN is followed by a NOP sequence. We need to remove
      // zero or more bytes so that the instruction after R_RISCV_ALIGN is
      // aligned to a given alignment boundary.
      //
      // We need to guarantee that the NOP sequence is valid after byte
      // removal (e.g. we can't remove the first 2 bytes of a 4-byte NOP).
      // For the sake of simplicity, we always rewrite the entire NOP sequence.
      i64 padding_bytes = rel.r_addend - removed_bytes;
      assert((padding_bytes & 1) == 0);

      i64 i = 0;
      for (; i <= padding_bytes - 4; i += 4)
        *(ul32 *)(loc + i) = 0x0000'0013; // nop
      if (i < padding_bytes)
        *(ul16 *)(loc + i) = 0x0001; // c.nop
      break;
    }
    case R_RISCV_RVC_BRANCH:
      check(S + A - P, -(1 << 8), 1 << 8);
      write_cbtype(loc, S + A - P);
      break;
    case R_RISCV_RVC_JUMP:
      check(S + A - P, -(1 << 11), 1 << 11);
      write_cjtype(loc, S + A - P);
      break;
    case R_RISCV_SUB6:
      *loc = (*loc & 0b1100'0000) | ((*loc - (S + A)) & 0b0011'1111);
      break;
    case R_RISCV_SET6:
      *loc = (*loc & 0b1100'0000) | ((S + A) & 0b0011'1111);
      break;
    case R_RISCV_SET8:
      *loc = S + A;
      break;
    case R_RISCV_SET16:
      *(U16<E> *)loc = S + A;
      break;
    case R_RISCV_SET32:
      *(U32<E> *)loc = S + A;
      break;
    case R_RISCV_PLT32:
    case R_RISCV_32_PCREL:
      *(U32<E> *)loc = S + A - P;
      break;
    default:
      unreachable();
    }
  }
}
// Apply relocations to a non-SHF_ALLOC section (e.g. debug sections).
// Only link-time-constant relocation types are valid here.
template <typename E>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;

    // If the relocation refers into a mergeable-string section, the
    // fragment (and its addend) supersedes the symbol value.
    SectionFragment<E> *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);

    // S = symbol/fragment address, A = addend, as in the psABI formulas.
    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : (i64)rel.r_addend;

    switch (rel.r_type) {
    case R_RISCV_32:
      *(U32<E> *)loc = S + A;
      break;
    case R_RISCV_64:
      // A reference to a dead section fragment may be replaced with a
      // tombstone value so stale debug info doesn't alias a reused address.
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(U64<E> *)loc = *val;
      else
        *(U64<E> *)loc = S + A;
      break;
    case R_RISCV_ADD8:
      *loc += S + A;
      break;
    case R_RISCV_ADD16:
      *(U16<E> *)loc += S + A;
      break;
    case R_RISCV_ADD32:
      *(U32<E> *)loc += S + A;
      break;
    case R_RISCV_ADD64:
      *(U64<E> *)loc += S + A;
      break;
    case R_RISCV_SUB8:
      *loc -= S + A;
      break;
    case R_RISCV_SUB16:
      *(U16<E> *)loc -= S + A;
      break;
    case R_RISCV_SUB32:
      *(U32<E> *)loc -= S + A;
      break;
    case R_RISCV_SUB64:
      *(U64<E> *)loc -= S + A;
      break;
    case R_RISCV_SUB6:
      // Only the low 6 bits participate; the top 2 bits are preserved.
      *loc = (*loc & 0b1100'0000) | ((*loc - (S + A)) & 0b0011'1111);
      break;
    case R_RISCV_SET6:
      *loc = (*loc & 0b1100'0000) | ((S + A) & 0b0011'1111);
      break;
    case R_RISCV_SET8:
      *loc = S + A;
      break;
    case R_RISCV_SET16:
      *(U16<E> *)loc = S + A;
      break;
    case R_RISCV_SET32:
      *(U32<E> *)loc = S + A;
      break;
    case R_RISCV_SET_ULEB128:
      overwrite_uleb(loc, S + A);
      break;
    case R_RISCV_SUB_ULEB128: {
      // Read the current ULEB128 value, then rewrite it in place with
      // the subtraction applied.
      u8 *p = loc;
      u64 val = read_uleb(p);
      overwrite_uleb(loc, val - S - A);
      break;
    }
    default:
      Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
                 << rel;
      break;
    }
  }
}
// Copy this section's contents to the output buffer, skipping the
// bytes that relaxation deleted (as recorded in `extra.r_deltas`).
template <typename E>
void InputSection<E>::copy_contents_riscv(Context<E> &ctx, u8 *buf) {
  // If a section is not relaxed, we can copy it as a one big chunk.
  if (extra.r_deltas.empty()) {
    uncompress_to(ctx, buf);
    return;
  }

  // A relaxed section is copied piece-wise.
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  i64 pos = 0;

  for (i64 i = 0; i < rels.size(); i++) {
    // Bytes removed at relocation i.
    i64 delta = extra.r_deltas[i + 1] - extra.r_deltas[i];
    if (delta == 0)
      continue;
    assert(delta > 0);

    // Copy everything up to the relocated place, then skip the
    // `delta` deleted bytes in the input.
    const ElfRel<E> &r = rels[i];
    memcpy(buf, contents.data() + pos, r.r_offset - pos);
    buf += r.r_offset - pos;
    pos = r.r_offset + delta;
  }

  // Copy the tail after the last deletion.
  memcpy(buf, contents.data() + pos, contents.size() - pos);
}
// Scan relocations to decide which linker-synthesized resources
// (GOT/PLT/TLS entries, dynamic relocations) each referenced symbol
// needs. The values themselves are written later by apply_reloc_alloc.
template <typename E>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);

  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  // Scan relocations
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];

    // An ifunc is resolved at load time, so any reference to it
    // needs both a GOT and a PLT entry.
    if (sym.is_ifunc())
      sym.flags |= NEEDS_GOT | NEEDS_PLT;

    switch (rel.r_type) {
    case R_RISCV_32:
      // On RV64 a 32-bit absolute word can't hold a dynamic relocation
      // result, so it must be a link-time constant.
      if constexpr (E::is_64)
        scan_absrel(ctx, sym, rel);
      else
        scan_dyn_absrel(ctx, sym, rel);
      break;
    case R_RISCV_HI20:
      scan_absrel(ctx, sym, rel);
      break;
    case R_RISCV_64:
      if constexpr (!E::is_64)
        Fatal(ctx) << *this << ": R_RISCV_64 cannot be used on RV32";
      scan_dyn_absrel(ctx, sym, rel);
      break;
    case R_RISCV_CALL:
    case R_RISCV_CALL_PLT:
    case R_RISCV_PLT32:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;
    case R_RISCV_GOT_HI20:
      sym.flags |= NEEDS_GOT;
      break;
    case R_RISCV_TLS_GOT_HI20:
      sym.flags |= NEEDS_GOTTP;
      break;
    case R_RISCV_TLS_GD_HI20:
      sym.flags |= NEEDS_TLSGD;
      break;
    case R_RISCV_32_PCREL:
      scan_pcrel(ctx, sym, rel);
      break;
    case R_RISCV_TPREL_HI20:
    case R_RISCV_TPREL_LO12_I:
    case R_RISCV_TPREL_LO12_S:
    case R_RISCV_TPREL_ADD:
      // Local-exec TLS is only valid in a main executable.
      check_tlsle(ctx, sym, rel);
      break;
    // The following relocation types need no extra resources;
    // they are fully handled in apply_reloc_alloc.
    case R_RISCV_BRANCH:
    case R_RISCV_JAL:
    case R_RISCV_PCREL_HI20:
    case R_RISCV_PCREL_LO12_I:
    case R_RISCV_PCREL_LO12_S:
    case R_RISCV_LO12_I:
    case R_RISCV_LO12_S:
    case R_RISCV_ADD8:
    case R_RISCV_ADD16:
    case R_RISCV_ADD32:
    case R_RISCV_ADD64:
    case R_RISCV_SUB8:
    case R_RISCV_SUB16:
    case R_RISCV_SUB32:
    case R_RISCV_SUB64:
    case R_RISCV_ALIGN:
    case R_RISCV_RVC_BRANCH:
    case R_RISCV_RVC_JUMP:
    case R_RISCV_RELAX:
    case R_RISCV_SUB6:
    case R_RISCV_SET6:
    case R_RISCV_SET8:
    case R_RISCV_SET16:
    case R_RISCV_SET32:
      break;
    default:
      Error(ctx) << *this << ": unknown relocation: " << rel;
    }
  }
}
// Returns true if `isec` is a live, loadable, executable section —
// i.e. one whose contents relaxation is allowed to shrink.
template <typename E>
static bool is_resizable(Context<E> &ctx, InputSection<E> *isec) {
  if (!isec || !isec->is_alive)
    return false;
  u64 flags = isec->shdr().sh_flags;
  return (flags & SHF_ALLOC) && (flags & SHF_EXECINSTR);
}
// Returns the distance between a relocated place and a symbol.
template <typename E>
static i64 compute_distance(Context<E> &ctx, Symbol<E> &sym,
                            InputSection<E> &isec, const ElfRel<E> &rel) {
  // Absolute symbols are treated as infinitely far away because
  // `shrink_section` may widen the gap between a branch instruction
  // and an absolute location. Branching to an absolute location is
  // extremely rare in real code, though.
  if (sym.is_absolute())
    return INT32_MAX;

  // Likewise, relocations against weak undefined symbols won't be relaxed.
  if (sym.esym().is_undef_weak())
    return INT32_MAX;

  // Distance = S + A - P.
  i64 place = isec.get_addr() + rel.r_offset;
  return sym.get_addr(ctx) + rel.r_addend - place;
}
// Scan relocations to shrink sections. Fills `isec.extra.r_deltas`
// with the cumulative number of bytes deleted before each relocation
// and reduces `isec.sh_size` accordingly; the bytes themselves are
// dropped later by copy_contents_riscv.
template <typename E>
static void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
  std::span<const ElfRel<E>> rels = isec.get_rels(ctx);
  isec.extra.r_deltas.resize(rels.size() + 1);

  // Running total of deleted bytes.
  i64 delta = 0;

  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &r = rels[i];
    Symbol<E> &sym = *isec.file.symbols[r.r_sym];
    isec.extra.r_deltas[i] = delta;

    // Handling R_RISCV_ALIGN is mandatory.
    //
    // R_RISCV_ALIGN refers to NOP instructions. We need to eliminate some
    // or all of the instructions so that the instruction that immediately
    // follows the NOPs is aligned to a specified alignment boundary.
    if (r.r_type == R_RISCV_ALIGN) {
      // The total bytes of NOPs is stored to r_addend, so the next
      // instruction is r_addend away.
      u64 loc = isec.get_addr() + r.r_offset - delta;
      u64 next_loc = loc + r.r_addend;
      u64 alignment = bit_ceil(r.r_addend + 1);
      assert(alignment <= (1 << isec.p2align));
      delta += next_loc - align_to(loc, alignment);
      continue;
    }

    // Handling other relocations is optional.
    if (!ctx.arg.relax || i == rels.size() - 1 ||
        rels[i + 1].r_type != R_RISCV_RELAX)
      continue;

    // Linker-synthesized symbols haven't been assigned their final
    // values when we are shrinking sections because actual values can
    // be computed only after we fix the file layout. Therefore, we
    // assume that relocations against such symbols are always
    // non-relaxable.
    if (sym.file == ctx.internal_obj)
      continue;

    switch (r.r_type) {
    case R_RISCV_CALL:
    case R_RISCV_CALL_PLT: {
      // These relocations refer to an AUIPC + JALR instruction pair to
      // allow to jump to anywhere in PC ± 2 GiB. If the jump target is
      // close enough to PC, we can use C.J, C.JAL or JAL instead.
      i64 dist = compute_distance(ctx, sym, isec, r);
      if (dist & 1)
        break;

      i64 rd = get_rd(*(ul32 *)(isec.contents.data() + r.r_offset + 4));

      if (rd == 0 && sign_extend(dist, 11) == dist && use_rvc) {
        // If rd is x0 and the jump target is within ±2 KiB, we can use
        // C.J, saving 6 bytes.
        delta += 6;
      } else if (rd == 1 && sign_extend(dist, 11) == dist && use_rvc && !E::is_64) {
        // If rd is x1 and the jump target is within ±2 KiB, we can use
        // C.JAL. This is RV32 only because C.JAL is RV32-only instruction.
        delta += 6;
      } else if (sign_extend(dist, 20) == dist) {
        // If the jump target is within ±1 MiB, we can use JAL.
        delta += 4;
      }
      break;
    }
    case R_RISCV_HI20:
      // If the upper 20 bits are all zero, we can remove LUI.
      // The corresponding instructions referred to by LO12_I/LO12_S
      // relocations will use the zero register instead.
      if (bits(sym.get_addr(ctx), 31, 12) == 0)
        delta += 4;
      break;
    case R_RISCV_TPREL_HI20:
    case R_RISCV_TPREL_ADD:
      // These relocations are used to add a high 20-bit value to the
      // thread pointer. The following two instructions materializes
      // TP + HI20(foo) in %r5, for example.
      //
      //  lui  a5,%tprel_hi(foo)         # R_RISCV_TPREL_HI20 (symbol)
      //  add  a5,a5,tp,%tprel_add(foo)  # R_RISCV_TPREL_ADD (symbol)
      //
      // Then thread-local variable `foo` is accessed with a low 12-bit
      // offset like this:
      //
      //  sw   t0,%tprel_lo(foo)(a5)     # R_RISCV_TPREL_LO12_S (symbol)
      //
      // However, if the variable is at TP ±2 KiB, TP + HI20(foo) is the
      // same as TP, so we can instead access the thread-local variable
      // directly using TP like this:
      //
      //  sw   t0,%tprel_lo(foo)(tp)
      //
      // Here, we remove `lui` and `add` if the offset is within ±2 KiB.
      if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr;
          sign_extend(val, 11) == val)
        delta += 4;
      break;
    }
  }

  // Record the final total and shrink the section size.
  isec.extra.r_deltas[rels.size()] = delta;
  isec.sh_size -= delta;
}
// Shrink sections by interpreting relocations.
//
// This operation seems to be optional, because by default longest
// instructions are being used. However, calling this function is actually
// mandatory because of R_RISCV_ALIGN. R_RISCV_ALIGN is a directive to the
// linker to align the location referred to by the relocation to a
// specified byte boundary. We at least have to interpret them to satisfy
// the alignment constraints.
//
// Returns the new end offset of the output sections (the value of
// set_osec_offsets after resizing).
template <typename E>
i64 riscv_resize_sections(Context<E> &ctx) {
  Timer t(ctx, "riscv_resize_sections");

  // True if we can use the 2-byte instructions. This is usually true on
  // Unix because RV64GC is generally considered the baseline hardware.
  bool use_rvc = get_eflags(ctx) & EF_RISCV_RVC;

  // Find all the relocations that can be relaxed.
  // This step should only shrink sections.
  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
    for (std::unique_ptr<InputSection<E>> &isec : file->sections)
      if (is_resizable(ctx, isec.get()))
        shrink_section(ctx, *isec, use_rvc);
  });

  // Fix symbol values: a symbol inside a shrunk section must move back
  // by the number of bytes deleted before it.
  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
    for (Symbol<E> *sym : file->symbols) {
      if (sym->file != file)
        continue;

      InputSection<E> *isec = sym->get_input_section();
      if (!isec || isec->extra.r_deltas.empty())
        continue;

      // Find the first relocation at or after the symbol's offset;
      // the delta recorded there is the shift to apply.
      std::span<const ElfRel<E>> rels = isec->get_rels(ctx);
      auto it = std::lower_bound(rels.begin(), rels.end(), sym->value,
                                 [&](const ElfRel<E> &r, u64 val) {
        return r.r_offset < val;
      });
      sym->value -= isec->extra.r_deltas[it - rels.begin()];
    }
  });

  // Re-compute section offset again to finalize them.
  compute_section_sizes(ctx);
  return set_osec_offsets(ctx);
}
// Explicit template instantiations: emit the RISC-V-specific functions
// for all four supported RISC-V targets (RV32/RV64, little/big-endian)
// in this translation unit.
#define INSTANTIATE(E)                                                       \
  template void write_plt_header(Context<E> &, u8 *);                        \
  template void write_plt_entry(Context<E> &, u8 *, Symbol<E> &);            \
  template void write_pltgot_entry(Context<E> &, u8 *, Symbol<E> &);         \
  template void                                                              \
  EhFrameSection<E>::apply_reloc(Context<E> &, const ElfRel<E> &, u64, u64); \
  template void InputSection<E>::apply_reloc_alloc(Context<E> &, u8 *);      \
  template void InputSection<E>::apply_reloc_nonalloc(Context<E> &, u8 *);   \
  template void InputSection<E>::copy_contents_riscv(Context<E> &, u8 *);    \
  template void InputSection<E>::scan_relocations(Context<E> &);             \
  template i64 riscv_resize_sections(Context<E> &);
INSTANTIATE(RV64LE);
INSTANTIATE(RV64BE);
INSTANTIATE(RV32LE);
INSTANTIATE(RV32BE);
} // namespace mold::elf

491
third_party/mold/elf/arch-s390x.cc vendored Normal file
View file

@ -0,0 +1,491 @@
// clang-format off
// This file contains code for the IBM z/Architecture 64-bit ISA, which is
// commonly referred to as "s390x" on Linux.
//
// z/Architecture is a 64-bit CISC ISA developed by IBM around 2000 for
// IBM's "big iron" mainframe computers. The computers are direct
// descendants of IBM System/360 all the way back in 1966. I've never
// actually seen a mainframe, and you probably haven't either, but it looks
// like the mainframe market is still large enough to sustain its ecosystem.
// Ubuntu for example provides the official support for s390x as of 2022.
// Since they are being actively maintained, we need to support them.
//
// As an instruction set, s390x isn't particularly odd. It has 16 general-
// purpose registers. Instructions are 2, 4 or 6 bytes long and always
// aligned to 2 bytes boundaries. Despite unfamiliarity, I found that it
// just feels like an x86-64 in a parallel universe.
//
// Here is the register usage in this ABI:
//
// r0-r1: reserved as scratch registers so we can use them in our PLT
// r2: parameter passing and return values
// r3-r6: parameter passing
// r12: address of GOT if position-independent code
// r14: return address
// r15: stack pointer
// a1: upper 32 bits of TP (thread pointer)
// a2: lower 32 bits of TP (thread pointer)
//
// Thread-local storage (TLS) is supported on s390x in the same way as it
// is on other targets with one exception. On other targets, __tls_get_addr
// is used to get an address of a thread-local variable. On s390x,
// __tls_get_offset is used instead. The difference is __tls_get_offset
// returns an address of a thread-local variable as an offset from TP. So
// we need to add TP to a return value before use. I don't know why it is
// different, but that is the way it is.
//
// https://github.com/rui314/psabi/blob/main/s390x.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = S390X;
// OR a 20-bit value into a 4-byte big-endian instruction word: the low
// 12 bits of `val` go to the field at bit shift 16 and the high 8 bits
// to the field at bit shift 8.
static void write_mid20(u8 *loc, u64 val) {
  u64 lo12 = bits(val, 11, 0);
  u64 hi8 = bits(val, 19, 12);
  *(ub32 *)loc |= (lo12 << 16) | (hi8 << 8);
}
// Write the s390x PLT header. It spills %r0, copies the second .got.plt
// slot (by convention the link map) to the stack, and branches to the
// address stored in the third .got.plt slot (the lazy resolver).
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
  static u8 insn[] = {
    0xe3, 0x00, 0xf0, 0x38, 0x00, 0x24, // stg %r0, 56(%r15)
    0xc0, 0x10, 0, 0, 0, 0,             // larl %r1, GOTPLT_OFFSET
    0xd2, 0x07, 0xf0, 0x30, 0x10, 0x08, // mvc 48(8, %r15), 8(%r1)
    0xe3, 0x10, 0x10, 0x10, 0x00, 0x04, // lg %r1, 16(%r1)
    0x07, 0xf1,                         // br %r1
    0x07, 0x00, 0x07, 0x00, 0x07, 0x00, // nopr; nopr; nopr
  };
  memcpy(buf, insn, sizeof(insn));
  // LARL takes a halfword-scaled PC-relative immediate (hence >> 1),
  // relative to the LARL instruction itself at offset 6 (hence - 6).
  *(ub32 *)(buf + 8) = (ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 6) >> 1;
}
// Write a lazy PLT entry: load the symbol's .got.plt slot and branch to
// it, with %r0 carrying the symbol's byte offset in .rela.plt so the
// resolver can identify which symbol to resolve.
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  static u8 insn[] = {
    0xc0, 0x10, 0, 0, 0, 0,             // larl %r1, GOTPLT_ENTRY_OFFSET
    0xe3, 0x10, 0x10, 0x00, 0x00, 0x04, // lg %r1, (%r1)
    0xc0, 0x01, 0, 0, 0, 0,             // lgfi %r0, PLT_INDEX
    0x07, 0xf1,                         // br %r1
    0x07, 0x00, 0x07, 0x00, 0x07, 0x00, // nopr; nopr; nopr
    0x07, 0x00, 0x07, 0x00, 0x07, 0x00, // nopr; nopr; nopr
  };
  memcpy(buf, insn, sizeof(insn));
  // Halfword-scaled PC-relative offset from this entry to its .got.plt slot.
  *(ub32 *)(buf + 2) = (sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx)) >> 1;
  // Byte offset of the symbol's dynamic relocation within .rela.plt.
  *(ub32 *)(buf + 14) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
}
// Write a non-lazy PLT entry for a symbol that already has a regular GOT
// entry: load the GOT slot and jump to the address stored there.
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  static u8 insn[] = {
    0xc0, 0x10, 0, 0, 0, 0,             // larl %r1, GOT_ENTRY_OFFSET
    0xe3, 0x10, 0x10, 0x00, 0x00, 0x04, // lg %r1, (%r1)
    0x07, 0xf1,                         // br %r1
    0x07, 0x00,                         // nopr
  };
  memcpy(buf, insn, sizeof(insn));
  // Halfword-scaled PC-relative offset from this entry to the GOT slot.
  *(ub32 *)(buf + 2) = (sym.get_got_addr(ctx) - sym.get_plt_addr(ctx)) >> 1;
}
// Apply a relocation to .eh_frame contents. Only the relocation types
// that compilers actually emit for .eh_frame are handled; anything else
// is a fatal error.
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
                                    u64 offset, u64 val) {
  u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
  switch (rel.r_type) {
  case R_NONE:
    break;
  case R_390_PC32:
    // PC-relative: subtract the runtime address of the relocated field.
    *(ub32 *)loc = val - this->shdr.sh_addr - offset;
    break;
  case R_390_64:
    *(ub64 *)loc = val;
    break;
  default:
    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
  }
}
// Apply relocations to the contents of an allocated (SHF_ALLOC) section.
// The one-letter variables follow the usual psABI notation:
// S = symbol value, A = addend, P = place (address of the relocated
// field), G = the symbol's offset within .got, GOT = address of .got.
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  // Dynamic relocations for this section, if any, are appended here.
  ElfRel<E> *dynrel = nullptr;
  if (ctx.reldyn)
    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
                           file.reldyn_offset + this->reldyn_offset);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE)
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // Report an error if a computed value doesn't fit in [lo, hi).
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
                   << sym << " out of range: " << val << " is not in ["
                   << lo << ", " << hi << ")";
    };
    // Same as `check`, but for the *DBL relocations whose value is
    // halfword-scaled and therefore must be even.
    auto check_dbl = [&](i64 val, i64 lo, i64 hi) {
      check(val, lo, hi);
      // R_390_*DBL relocs should never refer to a symbol at an odd address
      if (val & 1)
        Error(ctx) << *this << ": misaligned symbol " << sym
                   << " for relocation " << rel;
    };
    u64 S = sym.get_addr(ctx);
    u64 A = rel.r_addend;
    u64 P = get_addr() + rel.r_offset;
    u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
    u64 GOT = ctx.got->shdr.sh_addr;
    switch (rel.r_type) {
    case R_390_64:
      // May be converted to a dynamic relocation for PIC output.
      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
      break;
    case R_390_8:
      check(S + A, 0, 1 << 8);
      *loc = S + A;
      break;
    case R_390_12:
      check(S + A, 0, 1 << 12);
      *(ul16 *)loc |= bits(S + A, 11, 0);
      break;
    case R_390_16:
      check(S + A, 0, 1 << 16);
      *(ub16 *)loc = S + A;
      break;
    case R_390_20:
      check(S + A, 0, 1 << 20);
      write_mid20(loc, S + A);
      break;
    case R_390_32:
    case R_390_PLT32:
      check(S + A, 0, 1LL << 32);
      *(ub32 *)loc = S + A;
      break;
    case R_390_PLT64:
      *(ub64 *)loc = S + A;
      break;
    case R_390_PC12DBL:
    case R_390_PLT12DBL:
      // *DBL relocations store a halfword-scaled PC-relative value.
      check_dbl(S + A - P, -(1 << 12), 1 << 12);
      *(ul16 *)loc |= bits(S + A - P, 12, 1);
      break;
    case R_390_PC16:
      check(S + A - P, -(1 << 15), 1 << 15);
      *(ub16 *)loc = S + A - P;
      break;
    case R_390_PC32:
      check(S + A - P, -(1LL << 31), 1LL << 31);
      *(ub32 *)loc = S + A - P;
      break;
    case R_390_PC64:
      *(ub64 *)loc = S + A - P;
      break;
    case R_390_PC16DBL:
    case R_390_PLT16DBL:
      check_dbl(S + A - P, -(1 << 16), 1 << 16);
      *(ub16 *)loc = (S + A - P) >> 1;
      break;
    case R_390_PC24DBL:
    case R_390_PLT24DBL:
      check_dbl(S + A - P, -(1 << 24), 1 << 24);
      *(ub32 *)loc |= bits(S + A - P, 24, 1);
      break;
    case R_390_PC32DBL:
    case R_390_PLT32DBL:
      check_dbl(S + A - P, -(1LL << 32), 1LL << 32);
      *(ub32 *)loc = (S + A - P) >> 1;
      break;
    case R_390_GOT12:
    case R_390_GOTPLT12:
      check(G + A, 0, 1 << 12);
      *(ul16 *)loc |= bits(G + A, 11, 0);
      break;
    case R_390_GOT16:
    case R_390_GOTPLT16:
      check(G + A, 0, 1 << 16);
      *(ub16 *)loc = G + A;
      break;
    case R_390_GOT20:
    case R_390_GOTPLT20:
      check(G + A, 0, 1 << 20);
      write_mid20(loc, G + A);
      break;
    case R_390_GOT32:
    case R_390_GOTPLT32:
      check(G + A, 0, 1LL << 32);
      *(ub32 *)loc = G + A;
      break;
    case R_390_GOT64:
    case R_390_GOTPLT64:
      *(ub64 *)loc = G + A;
      break;
    case R_390_GOTOFF16:
    case R_390_PLTOFF16:
      check(S + A - GOT, -(1 << 15), 1 << 15);
      *(ub16 *)loc = S + A - GOT;
      break;
    case R_390_GOTOFF32:
    case R_390_PLTOFF32:
      check(S + A - GOT, -(1LL << 31), 1LL << 31);
      *(ub32 *)loc = S + A - GOT;
      break;
    case R_390_GOTOFF64:
    case R_390_PLTOFF64:
      *(ub64 *)loc = S + A - GOT;
      break;
    case R_390_GOTPC:
      *(ub64 *)loc = GOT + A - P;
      break;
    case R_390_GOTPCDBL:
      check_dbl(GOT + A - P, -(1LL << 32), 1LL << 32);
      *(ub32 *)loc = (GOT + A - P) >> 1;
      break;
    case R_390_GOTENT:
      check(GOT + G + A - P, -(1LL << 32), 1LL << 32);
      *(ub32 *)loc = (GOT + G + A - P) >> 1;
      break;
    case R_390_TLS_LE32:
      // Local-exec: the value is a fixed offset from the thread pointer.
      *(ub32 *)loc = S + A - ctx.tp_addr;
      break;
    case R_390_TLS_LE64:
      *(ub64 *)loc = S + A - ctx.tp_addr;
      break;
    case R_390_TLS_GOTIE20:
      write_mid20(loc, sym.get_gottp_addr(ctx) + A - GOT);
      break;
    case R_390_TLS_IEENT:
      *(ub32 *)loc = (sym.get_gottp_addr(ctx) + A - P) >> 1;
      break;
    case R_390_TLS_GD32:
      // General-dynamic may have been relaxed to initial-exec or
      // local-exec in scan_relocations; pick the matching value.
      if (sym.has_tlsgd(ctx))
        *(ub32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
      else if (sym.has_gottp(ctx))
        *(ub32 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
      else
        *(ub32 *)loc = S + A - ctx.tp_addr;
      break;
    case R_390_TLS_GD64:
      if (sym.has_tlsgd(ctx))
        *(ub64 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
      else if (sym.has_gottp(ctx))
        *(ub64 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
      else
        *(ub64 *)loc = S + A - ctx.tp_addr;
      break;
    case R_390_TLS_GDCALL:
      // Rewrite the __tls_get_offset call site to match the relaxation
      // chosen for the corresponding TLS_GD relocation.
      if (sym.has_tlsgd(ctx)) {
        // do nothing
      } else if (sym.has_gottp(ctx)) {
        // lg %r2, 0(%r2, %r12)
        static u8 insn[] = { 0xe3, 0x22, 0xc0, 0x00, 0x00, 0x04 };
        memcpy(loc, insn, sizeof(insn));
      } else {
        // nop
        static u8 insn[] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 };
        memcpy(loc, insn, sizeof(insn));
      }
      break;
    case R_390_TLS_LDM32:
      if (ctx.got->has_tlsld(ctx))
        *(ub32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
      break;
    case R_390_TLS_LDM64:
      if (ctx.got->has_tlsld(ctx))
        *(ub64 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
      break;
    case R_390_TLS_LDO32:
      if (ctx.got->has_tlsld(ctx))
        *(ub32 *)loc = S + A - ctx.dtp_addr;
      else
        *(ub32 *)loc = S + A - ctx.tp_addr;
      break;
    case R_390_TLS_LDO64:
      if (ctx.got->has_tlsld(ctx))
        *(ub64 *)loc = S + A - ctx.dtp_addr;
      else
        *(ub64 *)loc = S + A - ctx.tp_addr;
      break;
    case R_390_TLS_LDCALL:
      // If local-dynamic was relaxed away, the call is not needed.
      if (!ctx.got->has_tlsld(ctx)) {
        // nop
        static u8 insn[] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 };
        memcpy(loc, insn, sizeof(insn));
      }
      break;
    default:
      unreachable();
    }
  }
}
// Apply relocations to a non-allocated section (e.g. debug sections).
// Only the simple absolute relocation types are expected here; values
// may be replaced by a tombstone if the referenced section was dropped.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // Report an error if a computed value doesn't fit in [lo, hi).
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
                   << sym << " out of range: " << val << " is not in ["
                   << lo << ", " << hi << ")";
    };
    // If the relocation refers to a mergeable section fragment, use the
    // fragment's address and addend instead of the symbol's.
    SectionFragment<E> *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);
    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : (i64)rel.r_addend;
    switch (rel.r_type) {
    case R_390_32: {
      i64 val = S + A;
      check(val, 0, 1LL << 32);
      *(ub32 *)loc = val;
      break;
    }
    case R_390_64:
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ub64 *)loc = *val;
      else
        *(ub64 *)loc = S + A;
      break;
    case R_390_TLS_LDO64:
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ub64 *)loc = *val;
      else
        *(ub64 *)loc = S + A - ctx.dtp_addr;
      break;
    default:
      Fatal(ctx) << *this << ": apply_reloc_nonalloc: " << rel;
    }
  }
}
// Scan this section's relocations and record, via symbol flags and
// context state, which auxiliary data structures (GOT, PLT, TLS slots,
// dynamic relocations) must be created before relocations are applied.
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);
  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  // Scan relocations
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    // An ifunc symbol needs both a GOT entry and a PLT entry.
    if (sym.is_ifunc())
      sym.flags |= NEEDS_GOT | NEEDS_PLT;
    switch (rel.r_type) {
    case R_390_64:
      scan_dyn_absrel(ctx, sym, rel);
      break;
    case R_390_8:
    case R_390_12:
    case R_390_16:
    case R_390_20:
    case R_390_32:
      scan_absrel(ctx, sym, rel);
      break;
    case R_390_PC16:
    case R_390_PC16DBL:
    case R_390_PC32:
    case R_390_PC32DBL:
    case R_390_PC64:
      scan_pcrel(ctx, sym, rel);
      break;
    case R_390_GOT12:
    case R_390_GOT16:
    case R_390_GOT20:
    case R_390_GOT32:
    case R_390_GOT64:
    case R_390_GOTOFF16:
    case R_390_GOTOFF32:
    case R_390_GOTOFF64:
    case R_390_GOTPLT12:
    case R_390_GOTPLT16:
    case R_390_GOTPLT20:
    case R_390_GOTPLT32:
    case R_390_GOTPLT64:
    case R_390_GOTPC:
    case R_390_GOTPCDBL:
    case R_390_GOTENT:
      sym.flags |= NEEDS_GOT;
      break;
    case R_390_PLT12DBL:
    case R_390_PLT16DBL:
    case R_390_PLT24DBL:
    case R_390_PLT32:
    case R_390_PLT32DBL:
    case R_390_PLT64:
    case R_390_PLTOFF16:
    case R_390_PLTOFF32:
    case R_390_PLTOFF64:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;
    case R_390_TLS_GOTIE20:
    case R_390_TLS_IEENT:
      sym.flags |= NEEDS_GOTTP;
      break;
    case R_390_TLS_GD32:
    case R_390_TLS_GD64:
      // We always want to relax calls to __tls_get_offset() in statically-
      // linked executables because __tls_get_offset() in libc.a just calls
      // abort().
      if (ctx.arg.is_static ||
          (ctx.arg.relax && !sym.is_imported && !ctx.arg.shared)) {
        // do nothing
      } else if (ctx.arg.relax && !sym.is_imported && ctx.arg.shared &&
                 !ctx.arg.z_dlopen) {
        // Relax general-dynamic to initial-exec.
        sym.flags |= NEEDS_GOTTP;
      } else {
        sym.flags |= NEEDS_TLSGD;
      }
      break;
    case R_390_TLS_LDM32:
    case R_390_TLS_LDM64: {
      bool do_relax = ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared);
      if (!do_relax)
        ctx.needs_tlsld = true;
      break;
    }
    case R_390_TLS_LE32:
    case R_390_TLS_LE64:
      check_tlsle(ctx, sym, rel);
      break;
    case R_390_TLS_LDO32:
    case R_390_TLS_LDO64:
    case R_390_TLS_GDCALL:
    case R_390_TLS_LDCALL:
      // Handled at apply time; nothing to record here.
      break;
    default:
      Fatal(ctx) << *this << ": scan_relocations: " << rel;
    }
  }
}
} // namespace mold::elf

355
third_party/mold/elf/arch-sh4.cc vendored Normal file
View file

@ -0,0 +1,355 @@
// clang-format off
// SH-4 (SuperH 4) is a 32-bit RISC ISA developed by Hitachi in the early
// '90s. Some relatively powerful systems were developed with SH-4.
// A notable example is Sega's Dreamcast game console which debuted in 1998.
// Hitachi later spun off its semiconductor division as an independent
// company, Renesas, and Renesas is still selling SH-4 processors for the
// embedded market. It has never been as popular as ARM is, and its
// popularity continues to decline though.
//
// SH-4's most distinctive feature compared to other RISC ISAs is that its
// instructions are 16 bits in length instead of more common 32 bits for
// better code density. This difference affects various aspects of its
// instruction set as shown below:
//
// - SH-4 has 16 general-purpose registers (GPRs) instead of the most
//    common 32 GPR configuration to save one bit to specify a register.
//
//  - Binary instructions such as ADD normally take three registers in
// RISC ISAs (e.g. x ← y ⊕ z where x, y and z are registers), but
// SH-4's instructions take only two registers. The result of an
// operation is written to one of the source registers (e.g. x ← x ⊕ y).
//
// - Usual RISC ISAs have "load high" and "load low" instructions to set
// an immediate to most significant and least significant bits in a
// register to construct a full 32-bit value in a register. This
// technique is hard to use in SH-4, as 16 bit instructions are too
// small to contain large immediates. On SH-4, large immediates are
// loaded from memory using `mov.l` PC-relative load instruction.
//
// - Many RISC ISAs are, despite their name, actually fairly complex.
// They tend to have hundreds if not thousands of different instructions.
// SH-4 doesn't really have that many instructions because its 16-bit
// machine code simply can't encode many different opcodes. As a
// result, the number of relocations the linker has to support is also
// small.
//
// Beside these, SH-4 has a delay branch slot just like contemporary MIPS
// and SPARC. That is, one instruction after a branch instruction will
// always be executed even if the branch is taken. Delay branch slot allows
// a pipelined CPU to start and finish executing an instruction after a
// branch regardless of the branch's condition, simplifying the processor's
// implementation. It's considered a bad premature optimization nowadays,
// though. Modern RISC processors don't have it.
//
// Here are notes about the SH-4 psABI:
//
// - If a source file is compiled with -fPIC, each function starts
// with a piece of code to store the address of .got to %r12.
// We can use the register in our PLT for position-independent output.
//
// - Even though it uses the RELA-type relocations, relocation addends
// are stored not to the r_addend field but to the relocated section
// contents for some reason. Therefore, it's effectively REL.
//
// - It looks like the ecosystem has bit-rotted. Some tests, especially
// one using C++ exceptions, don't pass even with GNU ld.
//
// - GCC/SH4 tends to write dynamically-relocated data into .text, so the
// output from the linker contains lots of text relocations. That's not
// a problem with embedded programming, I guess.
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = SH4;
// Even though SH-4 uses RELA-type relocations, addends are stored to
// relocated places for some reason.
// Read the addend for a relocation. On SH-4 the addend lives in the
// relocated place itself (despite the RELA relocation format), so we
// read a 32-bit little-endian value for the types that embed one.
template <>
i64 get_addend(u8 *loc, const ElfRel<E> &rel) {
  switch (rel.r_type) {
  case R_SH_DIR32:
  case R_SH_REL32:
  case R_SH_TLS_GD_32:
  case R_SH_TLS_LD_32:
  case R_SH_TLS_LDO_32:
  case R_SH_TLS_IE_32:
  case R_SH_TLS_LE_32:
  case R_SH_TLS_DTPMOD32:
  case R_SH_TLS_DTPOFF32:
  case R_SH_TLS_TPOFF32:
  case R_SH_GOT32:
  case R_SH_PLT32:
  case R_SH_GOTOFF:
  case R_SH_GOTPC:
  case R_SH_GOTPLT32:
    return *(ul32 *)loc;
  default:
    // Other relocation types carry no embedded addend.
    return 0;
  }
}
// Write the SH-4 PLT header. The PIC variant computes the .got.plt
// address relative to %r12 (which holds the GOT address); the non-PIC
// variant embeds the absolute .got.plt address.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
  if (ctx.arg.pic) {
    static const u8 insn[] = {
      0x02, 0xd2, // mov.l 1f, r2
      0xcc, 0x32, // add r12, r2
      0x22, 0x50, // mov.l @(8, r2), r0
      0x21, 0x52, // mov.l @(4, r2), r2
      0x2b, 0x40, // jmp @r0
      0x00, 0xe0, // mov #0, r0
      0, 0, 0, 0, // 1: .long GOTPLT
    };
    static_assert(sizeof(insn) == E::plt_hdr_size);
    memcpy(buf, insn, sizeof(insn));
    // GOT-relative offset of .got.plt; added to %r12 at runtime.
    *(ul32 *)(buf + 12) = ctx.gotplt->shdr.sh_addr - ctx.got->shdr.sh_addr;
  } else {
    static const u8 insn[] = {
      0x02, 0xd2, // mov.l 1f, r2
      0x22, 0x50, // mov.l @(8, r2), r0
      0x21, 0x52, // mov.l @(4, r2), r2
      0x2b, 0x40, // jmp @r0
      0x00, 0xe0, // mov #0, r0
      0x09, 0x00, // nop
      0, 0, 0, 0, // 1: .long GOTPLT
    };
    static_assert(sizeof(insn) == E::plt_hdr_size);
    memcpy(buf, insn, sizeof(insn));
    // Absolute address of .got.plt.
    *(ul32 *)(buf + 12) = ctx.gotplt->shdr.sh_addr;
  }
}
// Write a lazy PLT entry. The entry loads the symbol's .got.plt slot
// (GOT-relative via %r12 for PIC, absolute otherwise), jumps to it, and
// puts the symbol's .rela.plt byte offset in %r1 for the resolver.
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  if (ctx.arg.pic) {
    static const u8 insn[] = {
      0x01, 0xd0, // mov.l 1f, r0
      0xce, 0x00, // mov.l @(r0, r12), r0
      0x2b, 0x40, // jmp @r0
      0x01, 0xd1, // mov.l 2f, r1
      0, 0, 0, 0, // 1: .long GOTPLT_ENTRY
      0, 0, 0, 0, // 2: .long INDEX_IN_RELPLT
    };
    static_assert(sizeof(insn) == E::plt_size);
    memcpy(buf, insn, sizeof(insn));
    *(ul32 *)(buf + 8) = sym.get_gotplt_addr(ctx) - ctx.got->shdr.sh_addr;
    *(ul32 *)(buf + 12) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
  } else {
    static const u8 insn[] = {
      0x01, 0xd0, // mov.l 1f, r0
      0x02, 0x60, // mov.l @r0, r0
      0x2b, 0x40, // jmp @r0
      0x01, 0xd1, // mov.l 2f, r1
      0, 0, 0, 0, // 1: .long GOTPLT_ENTRY
      0, 0, 0, 0, // 2: .long INDEX_IN_RELPLT
    };
    static_assert(sizeof(insn) == E::plt_size);
    memcpy(buf, insn, sizeof(insn));
    *(ul32 *)(buf + 8) = sym.get_gotplt_addr(ctx);
    *(ul32 *)(buf + 12) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
  }
}
// Write a non-lazy PLT entry: load the symbol's GOT slot (GOT-relative
// via %r12 for PIC, absolute otherwise) and jump to the address in it.
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  if (ctx.arg.pic) {
    static const u8 insn[] = {
      0x01, 0xd0, // mov.l 1f, r0
      0xce, 0x00, // mov.l @(r0, r12), r0
      0x2b, 0x40, // jmp @r0
      0x09, 0x00, // nop
      0, 0, 0, 0, // 1: .long GOT_ENTRY
    };
    static_assert(sizeof(insn) == E::pltgot_size);
    memcpy(buf, insn, sizeof(insn));
    *(ul32 *)(buf + 8) = sym.get_got_addr(ctx) - ctx.got->shdr.sh_addr;
  } else {
    static const u8 insn[] = {
      0x01, 0xd0, // mov.l 1f, r0
      0x02, 0x60, // mov.l @r0, r0
      0x2b, 0x40, // jmp @r0
      0x09, 0x00, // nop
      0, 0, 0, 0, // 1: .long GOT_ENTRY
    };
    static_assert(sizeof(insn) == E::pltgot_size);
    memcpy(buf, insn, sizeof(insn));
    *(ul32 *)(buf + 8) = sym.get_got_addr(ctx);
  }
}
// Apply a relocation to .eh_frame contents. Only the relocation types
// that compilers actually emit for .eh_frame are handled.
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
                                    u64 offset, u64 val) {
  u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
  switch (rel.r_type) {
  case R_NONE:
    break;
  case R_SH_DIR32:
    *(ul32 *)loc = val;
    break;
  case R_SH_REL32:
    // PC-relative: subtract the runtime address of the relocated field.
    *(ul32 *)loc = val - this->shdr.sh_addr - offset;
    break;
  default:
    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
  }
}
// Apply relocations to the contents of an allocated (SHF_ALLOC) section.
// psABI notation: S = symbol value, A = addend (read from the relocated
// place on SH-4), P = place, G = GOT slot offset, GOT = address of .got.
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  // Dynamic relocations for this section, if any, are appended here.
  ElfRel<E> *dynrel = nullptr;
  if (ctx.reldyn)
    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
                           file.reldyn_offset + this->reldyn_offset);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE)
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    u64 S = sym.get_addr(ctx);
    u64 A = get_addend(loc, rel);
    u64 P = get_addr() + rel.r_offset;
    u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
    u64 GOT = ctx.got->shdr.sh_addr;
    switch (rel.r_type) {
    case R_SH_DIR32:
      // May be converted to a dynamic relocation for PIC output.
      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
      break;
    case R_SH_REL32:
    case R_SH_PLT32:
      *(ul32 *)loc = S + A - P;
      break;
    case R_SH_GOT32:
      *(ul32 *)loc = G;
      break;
    case R_SH_GOTPC:
      *(ul32 *)loc = GOT + A - P;
      break;
    case R_SH_GOTOFF:
      *(ul32 *)loc = S + A - GOT;
      break;
    case R_SH_TLS_GD_32:
      *(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
      break;
    case R_SH_TLS_LD_32:
      *(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
      break;
    case R_SH_TLS_LDO_32:
      *(ul32 *)loc = S + A - ctx.dtp_addr;
      break;
    case R_SH_TLS_IE_32:
      *(ul32 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
      break;
    case R_SH_TLS_LE_32:
      *(ul32 *)loc = S + A - ctx.tp_addr;
      break;
    default:
      unreachable();
    }
  }
}
// Apply relocations to a non-allocated section (e.g. debug sections).
// Only absolute 32-bit relocations are expected; the value may be
// replaced by a tombstone if the referenced section was dropped.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;
    // If the relocation refers to a mergeable section fragment, use the
    // fragment's address and addend instead of the symbol's.
    SectionFragment<E> *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);
    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : get_addend(loc, rel);
    switch (rel.r_type) {
    case R_SH_DIR32:
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ul32 *)loc = *val;
      else
        *(ul32 *)loc = S + A;
      break;
    default:
      Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
                 << rel;
    }
  }
}
// Scan this section's relocations and record which auxiliary data
// structures (GOT, PLT, TLS slots, dynamic relocations) each referenced
// symbol needs.
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);
  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;
    Symbol<E> &sym = *file.symbols[rel.r_sym];
    if (sym.is_ifunc())
      Error(ctx) << sym << ": GNU ifunc symbol is not supported on sh4";
    switch (rel.r_type) {
    case R_SH_DIR32:
      scan_dyn_absrel(ctx, sym, rel);
      break;
    case R_SH_REL32:
      scan_pcrel(ctx, sym, rel);
      break;
    case R_SH_GOT32:
      sym.flags |= NEEDS_GOT;
      break;
    case R_SH_PLT32:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;
    case R_SH_TLS_GD_32:
      sym.flags |= NEEDS_TLSGD;
      break;
    case R_SH_TLS_LD_32:
      ctx.needs_tlsld = true;
      break;
    case R_SH_TLS_IE_32:
      sym.flags |= NEEDS_GOTTP;
      break;
    case R_SH_TLS_LE_32:
      check_tlsle(ctx, sym, rel);
      break;
    case R_SH_GOTPC:
    case R_SH_GOTOFF:
    case R_SH_TLS_LDO_32:
      // Resolved purely at apply time; nothing to record here.
      break;
    default:
      Fatal(ctx) << *this << ": unknown relocation: " << rel;
    }
  }
}
} // namespace mold::elf

622
third_party/mold/elf/arch-sparc64.cc vendored Normal file
View file

@ -0,0 +1,622 @@
// clang-format off
// SPARC is a RISC ISA developed by Sun Microsystems.
//
// The byte order of the processor is big-endian. Anything larger than a
// byte is stored in the "reverse" order compared to little-endian
// processors such as x86-64.
//
// All instructions are 4 bytes long and aligned to 4 bytes boundaries.
//
// A notable feature of SPARC is that, unlike other RISC ISAs, it doesn't
// need range extension thunks. It is because the SPARC's CALL instruction
// contains a whopping 30 bits immediate. The processor scales it by 4 to
// extend it to 32 bits (this is doable because all instructions are
// aligned to 4 bytes boundaries, so the least significant two bits are
// always zero). That means CALL's reach is PC ± 2 GiB, eliminating the
// need of range extension thunks. It comes with the cost that the CALL
// instruction alone takes 1/4th of the instruction encoding space,
// though.
//
// SPARC has 32 general purpose registers. CALL instruction saves a return
// address to %o7, which is an alias for %r15. Thread pointer is stored to
// %g7 which is %r7.
//
// SPARC does not have PC-relative load/store instructions. To access data
// in the position-independent manner, we usually first set the address of
// .got to, for example, %l7, with the following piece of code
//
// sethi %hi(. - _GLOBAL_OFFSET_TABLE_), %l7
// add %l7, %lo(. - _GLOBAL_OFFSET_TABLE_), %l7
// call __sparc_get_pc_thunk.l7
// nop
//
// where __sparc_get_pc_thunk.l7 is defined as
//
// retl
// add %o7, %l7, %l7
//
// . SETHI and the following ADD materialize a 32 bits offset to .got.
// CALL instruction sets a return address to %o7, and the subsequent ADD
// adds it to the GOT offset to materialize the absolute address of .got.
//
// Note that we have a NOP after CALL and an ADD after RETL because of
// SPARC's delay branch slots. That is, the SPARC processor always
// executes one instruction after a branch even if the branch is taken.
// This may seem like an odd behavior, and indeed it is considered as such
// (that's a premature optimization for the early pipelined SPARC
// processors), but that's been a part of the ISA's spec so that's what it
// is.
//
// Note also that the .got address obtained this way is not shared between
// functions, so functions can use an arbitrary register to hold the .got
// address. That also means each function needs to execute the above piece
// of code to become position-independent.
//
// This scheme is very similar to i386. That may not be a coincidence
// because the i386 ELF psABI is created by Sun Microsystems too.
//
// https://github.com/rui314/psabi/blob/main/sparc.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = SPARC64;
// SPARC's PLT section is writable despite containing executable code.
// We don't need to write the PLT header entry because the dynamic loader
// will do that for us.
//
// We also don't need a .got.plt section to store the result of lazy PLT
// symbol resolution because the dynamic symbol resolver directly mutates
// instructions in PLT so that they jump to the right places next time.
// That's why each PLT entry contains lots of NOPs; they are a placeholder
// for the runtime to add more instructions.
//
// Self-modifying code is nowadays considered really bad from the security
// point of view, though.
// The SPARC dynamic loader writes the actual PLT header code at load
// time, so the linker only has to reserve zero-filled space for it.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
  for (i64 i = 0; i < E::plt_hdr_size; i++)
    buf[i] = 0;
}
// Write a lazy PLT entry. We emit only a SETHI carrying the entry's
// offset from the start of .plt and a branch back to the second PLT
// header entry; the trailing NOPs are placeholder space that the
// runtime overwrites when it resolves the symbol (SPARC's PLT is
// self-modifying; see the comment above).
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  static ub32 insn[] = {
    0x0300'0000, // sethi   (. - .PLT0), %g1
    0x3068'0000, // ba,a    %xcc, .PLT1
    0x0100'0000, // nop
    0x0100'0000, // nop
    0x0100'0000, // nop
    0x0100'0000, // nop
    0x0100'0000, // nop
    0x0100'0000, // nop
  };
  u64 plt0 = ctx.plt->shdr.sh_addr;
  u64 plt1 = ctx.plt->shdr.sh_addr + E::plt_size;
  u64 entry = sym.get_plt_addr(ctx);
  memcpy(buf, insn, sizeof(insn));
  // SETHI immediate: this entry's byte offset from .PLT0.
  *(ub32 *)buf |= bits(entry - plt0, 21, 0);
  // BA,A displacement (word-scaled) from the instruction to .PLT1.
  *(ub32 *)(buf + 4) |= bits(plt1 - entry - 4, 20, 2);
}
// Write a non-lazy PLT entry. The CALL materializes its own address in
// %o7; adding the 8-byte offset embedded at the end of the entry yields
// the symbol's GOT slot, whose contents we then jump to. %o7 is saved
// in %g5 and restored in the branch delay slot.
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  static ub32 entry[] = {
    0x8a10'000f, // mov  %o7, %g5
    0x4000'0002, // call . + 8
    0xc25b'e014, // ldx  [ %o7 + 20 ], %g1
    0xc25b'c001, // ldx  [ %o7 + %g1 ], %g1
    0x81c0'4000, // jmp  %g1
    0x9e10'0005, // mov  %g5, %o7
    0x0000'0000, // .quad $plt_entry - $got_entry
    0x0000'0000,
  };
  memcpy(buf, entry, sizeof(entry));
  // Offset from the CALL instruction (at entry + 4) to the GOT slot.
  *(ub64 *)(buf + 24) = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 4;
}
// Apply a relocation to .eh_frame contents. Only the relocation types
// that compilers actually emit for .eh_frame are handled.
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
                                    u64 offset, u64 val) {
  u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
  switch (rel.r_type) {
  case R_NONE:
    break;
  case R_SPARC_64:
  case R_SPARC_UA64:
    *(ub64 *)loc = val;
    break;
  case R_SPARC_DISP32:
    // PC-relative: subtract the runtime address of the relocated field.
    *(ub32 *)loc = val - this->shdr.sh_addr - offset;
    break;
  default:
    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
  }
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = (get_addr() + rel.r_offset);
u64 G = (sym.get_got_idx(ctx) * sizeof(Word<E>));
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_SPARC_64:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_SPARC_5:
check(S + A, 0, 1 << 5);
*(ub32 *)loc |= bits(S + A, 4, 0);
break;
case R_SPARC_6:
check(S + A, 0, 1 << 6);
*(ub32 *)loc |= bits(S + A, 5, 0);
break;
case R_SPARC_7:
check(S + A, 0, 1 << 7);
*(ub32 *)loc |= bits(S + A, 6, 0);
break;
case R_SPARC_8:
check(S + A, 0, 1 << 8);
*(u8 *)loc = S + A;
break;
case R_SPARC_10:
check(S + A, 0, 1 << 10);
*(ub32 *)loc |= bits(S + A, 9, 0);
break;
case R_SPARC_LO10:
case R_SPARC_LOPLT10:
*(ub32 *)loc |= bits(S + A, 9, 0);
break;
case R_SPARC_11:
check(S + A, 0, 1 << 11);
*(ub32 *)loc |= bits(S + A, 10, 0);
break;
case R_SPARC_13:
check(S + A, 0, 1 << 13);
*(ub32 *)loc |= bits(S + A, 12, 0);
break;
case R_SPARC_16:
case R_SPARC_UA16:
check(S + A, 0, 1 << 16);
*(ub16 *)loc = S + A;
break;
case R_SPARC_22:
check(S + A, 0, 1 << 22);
*(ub32 *)loc |= bits(S + A, 21, 0);
break;
case R_SPARC_32:
case R_SPARC_UA32:
case R_SPARC_PLT32:
check(S + A, 0, 1LL << 32);
*(ub32 *)loc = S + A;
break;
case R_SPARC_PLT64:
case R_SPARC_UA64:
case R_SPARC_REGISTER:
*(ub64 *)loc = S + A;
break;
case R_SPARC_DISP8:
check(S + A - P, -(1 << 7), 1 << 7);
*(u8 *)loc = S + A - P;
break;
case R_SPARC_DISP16:
check(S + A - P, -(1 << 15), 1 << 15);
*(ub16 *)loc = S + A - P;
break;
case R_SPARC_DISP32:
case R_SPARC_PCPLT32:
check(S + A - P, -(1LL << 31), 1LL << 31);
*(ub32 *)loc = S + A - P;
break;
case R_SPARC_DISP64:
*(ub64 *)loc = S + A - P;
break;
case R_SPARC_WDISP16: {
i64 val = S + A - P;
check(val, -(1 << 16), 1 << 16);
*(ub16 *)loc |= (bit(val, 16) << 21) | bits(val, 15, 2);
break;
}
case R_SPARC_WDISP19:
check(S + A - P, -(1 << 20), 1 << 20);
*(ub32 *)loc |= bits(S + A - P, 20, 2);
break;
case R_SPARC_WDISP22:
check(S + A - P, -(1 << 23), 1 << 23);
*(ub32 *)loc |= bits(S + A - P, 23, 2);
break;
case R_SPARC_WDISP30:
case R_SPARC_WPLT30:
check(S + A - P, -(1LL << 31), 1LL << 31);
*(ub32 *)loc |= bits(S + A - P, 31, 2);
break;
case R_SPARC_HI22:
case R_SPARC_HIPLT22:
case R_SPARC_LM22:
*(ub32 *)loc |= bits(S + A, 31, 10);
break;
case R_SPARC_GOT10:
*(ub32 *)loc |= bits(G, 9, 0);
break;
case R_SPARC_GOT13:
check(G, 0, 1 << 12);
*(ub32 *)loc |= bits(G, 12, 0);
break;
case R_SPARC_GOT22:
*(ub32 *)loc |= bits(G, 31, 10);
break;
case R_SPARC_GOTDATA_HIX22: {
i64 val = S + A - GOT;
*(ub32 *)loc |= bits(val < 0 ? ~val : val, 31, 10);
break;
}
case R_SPARC_GOTDATA_LOX10: {
i64 val = S + A - GOT;
*(ub32 *)loc |= bits(val, 9, 0) | (val < 0 ? 0b1'1100'0000'0000 : 0);
break;
}
case R_SPARC_GOTDATA_OP_HIX22:
// We always have to relax a GOT load to a load immediate if a
// symbol is local, because R_SPARC_GOTDATA_OP cannot represent
// an addend for a local symbol.
if (sym.is_imported || sym.is_ifunc()) {
*(ub32 *)loc |= bits(G, 31, 10);
} else if (sym.is_absolute()) {
i64 val = S + A;
*(ub32 *)loc |= bits(val < 0 ? ~val : val, 31, 10);
} else {
i64 val = S + A - GOT;
*(ub32 *)loc |= bits(val < 0 ? ~val : val, 31, 10);
}
break;
case R_SPARC_GOTDATA_OP_LOX10: {
if (sym.is_imported || sym.is_ifunc()) {
*(ub32 *)loc |= bits(G, 9, 0);
} else if (sym.is_absolute()) {
i64 val = S + A;
*(ub32 *)loc |= bits(val, 9, 0) | (val < 0 ? 0b1'1100'0000'0000 : 0);
} else {
i64 val = S + A - GOT;
*(ub32 *)loc |= bits(val, 9, 0) | (val < 0 ? 0b1'1100'0000'0000 : 0);
}
break;
}
case R_SPARC_GOTDATA_OP:
if (sym.is_imported || sym.is_ifunc())
break;
if (sym.is_absolute()) {
// ldx [ %g2 + %g1 ], %g1 → nop
*(ub32 *)loc = 0x0100'0000;
} else {
// ldx [ %g2 + %g1 ], %g1 → add %g2, %g1, %g1
*(ub32 *)loc &= 0b00'11111'000000'11111'1'11111111'11111;
*(ub32 *)loc |= 0b10'00000'000000'00000'0'00000000'00000;
}
break;
case R_SPARC_PC10:
case R_SPARC_PCPLT10:
*(ub32 *)loc |= bits(S + A - P, 9, 0);
break;
case R_SPARC_PC22:
case R_SPARC_PCPLT22:
case R_SPARC_PC_LM22:
*(ub32 *)loc |= bits(S + A - P, 31, 10);
break;
case R_SPARC_OLO10:
*(ub32 *)loc |= bits(bits(S + A, 9, 0) + rel.r_type_data, 12, 0);
break;
case R_SPARC_HH22:
*(ub32 *)loc |= bits(S + A, 63, 42);
break;
case R_SPARC_HM10:
*(ub32 *)loc |= bits(S + A, 41, 32);
break;
case R_SPARC_PC_HH22:
*(ub32 *)loc |= bits(S + A - P, 63, 42);
break;
case R_SPARC_PC_HM10:
*(ub32 *)loc |= bits(S + A - P, 41, 32);
break;
case R_SPARC_HIX22:
*(ub32 *)loc |= bits(~(S + A), 31, 10);
break;
case R_SPARC_LOX10:
*(ub32 *)loc |= bits(S + A, 9, 0) | 0b1'1100'0000'0000;
break;
case R_SPARC_H44:
*(ub32 *)loc |= bits(S + A, 43, 22);
break;
case R_SPARC_M44:
*(ub32 *)loc |= bits(S + A, 21, 12);
break;
case R_SPARC_L44:
*(ub32 *)loc |= bits(S + A, 11, 0);
break;
case R_SPARC_TLS_GD_HI22:
*(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 31, 10);
break;
case R_SPARC_TLS_GD_LO10:
*(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 9, 0);
break;
case R_SPARC_TLS_GD_CALL:
case R_SPARC_TLS_LDM_CALL: {
u64 addr;
if (ctx.arg.is_static)
addr = ctx.extra.tls_get_addr_sec->shdr.sh_addr;
else
addr = ctx.extra.tls_get_addr_sym->get_addr(ctx);
*(ub32 *)loc |= bits(addr + A - P, 31, 2);
break;
}
case R_SPARC_TLS_LDM_HI22:
*(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 31, 10);
break;
case R_SPARC_TLS_LDM_LO10:
*(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 9, 0);
break;
case R_SPARC_TLS_LDO_HIX22:
*(ub32 *)loc |= bits(S + A - ctx.dtp_addr, 31, 10);
break;
case R_SPARC_TLS_LDO_LOX10:
*(ub32 *)loc |= bits(S + A - ctx.dtp_addr, 9, 0);
break;
case R_SPARC_TLS_IE_HI22:
*(ub32 *)loc |= bits(sym.get_gottp_addr(ctx) + A - GOT, 31, 10);
break;
case R_SPARC_TLS_IE_LO10:
*(ub32 *)loc |= bits(sym.get_gottp_addr(ctx) + A - GOT, 9, 0);
break;
case R_SPARC_TLS_LE_HIX22:
*(ub32 *)loc |= bits(~(S + A - ctx.tp_addr), 31, 10);
break;
case R_SPARC_TLS_LE_LOX10:
*(ub32 *)loc |= bits(S + A - ctx.tp_addr, 9, 0) | 0b1'1100'0000'0000;
break;
case R_SPARC_SIZE32:
*(ub32 *)loc = sym.esym().st_size + A;
break;
case R_SPARC_TLS_GD_ADD:
case R_SPARC_TLS_LDM_ADD:
case R_SPARC_TLS_LDO_ADD:
case R_SPARC_TLS_IE_LD:
case R_SPARC_TLS_IE_LDX:
case R_SPARC_TLS_IE_ADD:
break;
default:
unreachable();
}
}
}
// Apply relocations to a non-SHF_ALLOC section (typically debug info).
// Such sections are not mapped to memory at runtime, so no GOT, PLT or
// dynamic-relocation machinery is involved; we just patch values in place.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  for (const ElfRel<E> &rel : rels) {
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;

    // Report a relocated value that does not fit in [lo, hi).
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
                   << sym << " out of range: " << val << " is not in ["
                   << lo << ", " << hi << ")";
    };

    // If the relocation refers to a mergeable section fragment, resolve
    // against the fragment's address and addend instead of the symbol's.
    auto [frag, frag_addend] = get_fragment(ctx, rel);
    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : (i64)rel.r_addend;

    switch (rel.r_type) {
    case R_SPARC_64:
    case R_SPARC_UA64:
      // A dead (garbage-collected) fragment may yield a tombstone value.
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ub64 *)loc = *val;
      else
        *(ub64 *)loc = S + A;
      break;
    case R_SPARC_32:
    case R_SPARC_UA32: {
      i64 val = S + A;
      check(val, 0, 1LL << 32);
      *(ub32 *)loc = val;
      break;
    }
    case R_SPARC_TLS_DTPOFF32:
      *(ub32 *)loc = S + A - ctx.dtp_addr;
      break;
    case R_SPARC_TLS_DTPOFF64:
      *(ub64 *)loc = S + A - ctx.dtp_addr;
      break;
    default:
      Fatal(ctx) << *this << ": apply_reloc_nonalloc: " << rel;
    }
  }
}
// Scan this section's relocations and record, on each referenced symbol,
// which linker-synthesized structures (GOT slot, PLT entry, TLS GOT
// entries, dynamic relocations) it will need. Runs before file layout
// is fixed so that those structures can be sized.
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);

  // Reserve space in .rela.dyn for dynamic relocations this section
  // has requested so far.
  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  // Scan relocations
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];

    // An ifunc is resolved at runtime, so any reference to it needs
    // both a GOT slot and a PLT entry regardless of relocation type.
    if (sym.is_ifunc())
      sym.flags |= NEEDS_GOT | NEEDS_PLT;

    switch (rel.r_type) {
    case R_SPARC_64:
      // Word-size absolute value: may become a dynamic relocation.
      scan_dyn_absrel(ctx, sym, rel);
      break;
    // Sub-word absolute values: cannot be expressed as dynamic
    // relocations, so they constrain what symbols are allowed here.
    case R_SPARC_8:
    case R_SPARC_5:
    case R_SPARC_6:
    case R_SPARC_7:
    case R_SPARC_10:
    case R_SPARC_11:
    case R_SPARC_13:
    case R_SPARC_16:
    case R_SPARC_22:
    case R_SPARC_32:
    case R_SPARC_REGISTER:
    case R_SPARC_UA16:
    case R_SPARC_UA32:
    case R_SPARC_UA64:
    case R_SPARC_PC_HM10:
    case R_SPARC_OLO10:
    case R_SPARC_LOX10:
    case R_SPARC_HM10:
    case R_SPARC_M44:
    case R_SPARC_HIX22:
    case R_SPARC_LO10:
    case R_SPARC_L44:
    case R_SPARC_LM22:
    case R_SPARC_HI22:
    case R_SPARC_H44:
    case R_SPARC_HH22:
      scan_absrel(ctx, sym, rel);
      break;
    // PLT-bound references: only imported symbols actually need a PLT
    // entry; locally-resolved calls go straight to the definition.
    case R_SPARC_PLT32:
    case R_SPARC_WPLT30:
    case R_SPARC_WDISP30:
    case R_SPARC_HIPLT22:
    case R_SPARC_LOPLT10:
    case R_SPARC_PCPLT32:
    case R_SPARC_PCPLT22:
    case R_SPARC_PCPLT10:
    case R_SPARC_PLT64:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;
    case R_SPARC_GOT13:
    case R_SPARC_GOT10:
    case R_SPARC_GOT22:
    case R_SPARC_GOTDATA_HIX22:
      sym.flags |= NEEDS_GOT;
      break;
    case R_SPARC_GOTDATA_OP_HIX22:
      // For local symbols this access is relaxed to a load-immediate at
      // apply time, so a GOT slot is needed only for imports.
      if (sym.is_imported)
        sym.flags |= NEEDS_GOT;
      break;
    // PC-relative references.
    case R_SPARC_DISP16:
    case R_SPARC_DISP32:
    case R_SPARC_DISP64:
    case R_SPARC_DISP8:
    case R_SPARC_PC10:
    case R_SPARC_PC22:
    case R_SPARC_PC_LM22:
    case R_SPARC_WDISP16:
    case R_SPARC_WDISP19:
    case R_SPARC_WDISP22:
    case R_SPARC_PC_HH22:
      scan_pcrel(ctx, sym, rel);
      break;
    case R_SPARC_TLS_GD_HI22:
      sym.flags |= NEEDS_TLSGD;
      break;
    case R_SPARC_TLS_LDM_HI22:
      ctx.needs_tlsld = true;
      break;
    case R_SPARC_TLS_IE_HI22:
      sym.flags |= NEEDS_GOTTP;
      break;
    case R_SPARC_TLS_GD_CALL:
    case R_SPARC_TLS_LDM_CALL:
      // These are calls to __tls_get_addr; in a static link it is
      // provided by our own replacement section instead of a PLT.
      if (!ctx.arg.is_static && ctx.extra.tls_get_addr_sym->is_imported)
        ctx.extra.tls_get_addr_sym->flags |= NEEDS_PLT;
      break;
    case R_SPARC_TLS_LE_HIX22:
    case R_SPARC_TLS_LE_LOX10:
      check_tlsle(ctx, sym, rel);
      break;
    // These relocation types need no preparation at scan time; they are
    // handled entirely at apply time.
    case R_SPARC_GOTDATA_OP_LOX10:
    case R_SPARC_GOTDATA_OP:
    case R_SPARC_GOTDATA_LOX10:
    case R_SPARC_TLS_GD_LO10:
    case R_SPARC_TLS_GD_ADD:
    case R_SPARC_TLS_LDM_LO10:
    case R_SPARC_TLS_LDM_ADD:
    case R_SPARC_TLS_LDO_HIX22:
    case R_SPARC_TLS_LDO_LOX10:
    case R_SPARC_TLS_LDO_ADD:
    case R_SPARC_TLS_IE_ADD:
    case R_SPARC_TLS_IE_LD:
    case R_SPARC_TLS_IE_LDX:
    case R_SPARC_TLS_IE_LO10:
    case R_SPARC_SIZE32:
      break;
    default:
      Fatal(ctx) << *this << ": scan_relocations: " << rel;
    }
  }
}
// __tls_get_addr is not defined by libc.a, so we can't use that function
// in statically-linked executables. This section provides a replacement.
void SparcTlsGetAddrSection::copy_buf(Context<E> &ctx) {
  // Template of the replacement routine. The first two instructions are
  // patched below with the size of the static TLS block.
  static const ub32 code[] = {
    0x0300'0000, // sethi %hi(TP_SIZE), %g1
    0x8210'6000, // or %g1, %lo(TP_SIZE), %g1
    0x8221'c001, // sub %g7, %g1, %g1
    0xd05a'2008, // ldx [ %o0 + 8 ], %o0
    0x81c3'e008, // retl
    0x9000'4008, // add %g1, %o0, %o0
  };

  assert(this->shdr.sh_size == sizeof(code));

  ub32 *out = (ub32 *)(ctx.buf + this->shdr.sh_offset);
  memcpy(out, code, sizeof(code));

  // Patch the sethi/or pair with the distance from the start of the TLS
  // image to the thread pointer.
  u64 tp_size = ctx.tp_addr - ctx.tls_begin;
  out[0] |= bits(tp_size, 31, 10);
  out[1] |= bits(tp_size, 9, 0);
}
} // namespace mold::elf

773
third_party/mold/elf/arch-x86-64.cc vendored Normal file
View file

@ -0,0 +1,773 @@
// clang-format off
// Supporting x86-64 is straightforward. Unlike its predecessor, i386,
// x86-64 supports PC-relative addressing for position-independent code.
// Being CISC, its instructions are variable in size. Branch instructions
// take 4-byte offsets, so we don't need range extension thunks.
//
// The psABI specifies %r11 as neither caller- nor callee-saved. It's
// intentionally left out so that we can use it as a scratch register in
// PLT.
//
// Thread Pointer (TP) is stored not to a general-purpose register but to
// FS segment register. A segment register is a 64-bit register which can
// be used as a base address for memory access. Each thread has a unique
// FS value, and they access their thread-local variables relative to FS
// as %fs:offset_from_tp.
//
// The value of a segment register itself is not generally readable from
// the user space. As a workaround, libc initializes %fs:0 (the first word
// referenced by FS) to the value of %fs itself. So we can obtain TP just
// by `mov %fs:0, %rax` if we need it.
//
// For historical reasons, TP points past the end of the TLS block on x86.
// This is contrary to other psABIs which usually use the beginning of the
// TLS block as TP (with some addend). As a result, offsets from TP to
// thread-local variables (TLVs) in the main executable are all negative.
//
// https://github.com/rui314/psabi/blob/main/x86-64.pdf
// https://github.com/rui314/psabi/blob/main/i386.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = X86_64;
// This is a security-enhanced version of the regular PLT. The PLT
// header and each PLT entry starts with endbr64 for the Intel's
// control-flow enforcement security mechanism.
//
// Note that our IBT-enabled PLT instruction sequence is different
// from the one used in GNU ld. GNU's IBTPLT implementation uses two
// separate sections (.plt and .plt.sec) in which one PLT entry takes
// 32 bytes in total. Our IBTPLT consists of just .plt and each entry
// is 16 bytes long.
//
// Our PLT entry clobbers %r11, but that's fine because the resolver
// function (_dl_runtime_resolve) clobbers %r11 anyway.
// Write the 32-byte PLT header. It pushes the second GOTPLT word (the
// link map) and jumps through the third (the lazy resolver).
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
  static const u8 entry[] = {
    0xf3, 0x0f, 0x1e, 0xfa, // endbr64
    0x41, 0x53,             // push %r11
    0xff, 0x35, 0, 0, 0, 0, // push GOTPLT+8(%rip)
    0xff, 0x25, 0, 0, 0, 0, // jmp *GOTPLT+16(%rip)
    0xcc, 0xcc, 0xcc, 0xcc, // (padding)
    0xcc, 0xcc, 0xcc, 0xcc, // (padding)
    0xcc, 0xcc, 0xcc, 0xcc, // (padding)
    0xcc, 0xcc,             // (padding)
  };

  memcpy(buf, entry, sizeof(entry));

  // Fill in the two RIP-relative displacements. Each displacement is
  // measured from the end of its own instruction: the push ends at
  // offset 12 and targets GOTPLT+8, the jmp ends at offset 18 and
  // targets GOTPLT+16.
  u64 gotplt = ctx.gotplt->shdr.sh_addr;
  u64 plt = ctx.plt->shdr.sh_addr;
  *(ul32 *)(buf + 8) = gotplt - plt - 4;
  *(ul32 *)(buf + 14) = gotplt - plt - 2;
}
// Write one 16-byte PLT entry for `sym`.
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  static const u8 entry[] = {
    0xf3, 0x0f, 0x1e, 0xfa, // endbr64
    0x41, 0xbb, 0, 0, 0, 0, // mov $index_in_relplt, %r11d
    0xff, 0x25, 0, 0, 0, 0, // jmp *foo@GOTPLT
  };

  memcpy(buf, entry, sizeof(entry));

  // %r11d carries the index of this entry's .rela.plt record so the
  // resolver knows which symbol to bind.
  *(ul32 *)(buf + 6) = sym.get_plt_idx(ctx);

  // RIP-relative displacement to the symbol's GOTPLT slot, measured
  // from the end of this 16-byte entry.
  *(ul32 *)(buf + 12) = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 16;
}
// Write one 16-byte .plt.got entry: a direct jump through the symbol's
// regular GOT slot (used when the symbol already has a GOT entry).
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
  static const u8 entry[] = {
    0xf3, 0x0f, 0x1e, 0xfa, // endbr64
    0xff, 0x25, 0, 0, 0, 0, // jmp *foo@GOT
    0xcc, 0xcc, 0xcc, 0xcc, // (padding)
    0xcc, 0xcc,             // (padding)
  };

  memcpy(buf, entry, sizeof(entry));

  // The jmp ends at offset 10, so bias the GOT displacement by -10.
  *(ul32 *)(buf + 6) = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 10;
}
// Apply one relocation within the output .eh_frame section. .eh_frame
// relocations are applied here rather than in apply_reloc_alloc because
// the section's records are re-laid-out by the linker; `offset` is the
// record's position in the output section and `val` the resolved value.
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
                                    u64 offset, u64 val) {
  u8 *loc = ctx.buf + this->shdr.sh_offset + offset;

  switch (rel.r_type) {
  case R_NONE:
    break;
  case R_X86_64_32:
    *(ul32 *)loc = val;
    break;
  case R_X86_64_64:
    *(ul64 *)loc = val;
    break;
  case R_X86_64_PC32:
    // PC-relative: subtract the address of the relocated place.
    *(ul32 *)loc = val - this->shdr.sh_addr - offset;
    break;
  case R_X86_64_PC64:
    *(ul64 *)loc = val - this->shdr.sh_addr - offset;
    break;
  default:
    Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
  }
}
// Given the two opcode bytes of `call/jmp *foo@GOT(%rip)`, return the
// opcode bytes of the equivalent direct `call/jmp foo` (prefixed with a
// nop to keep the instruction length), or 0 if not rewritable.
static u32 relax_gotpcrelx(u8 *loc) {
  u32 op = (loc[0] << 8) | loc[1];
  if (op == 0xff15)
    return 0x90e8; // call *0(%rip) -> call 0
  if (op == 0xff25)
    return 0x90e9; // jmp *0(%rip) -> jmp 0
  return 0;
}
// Given the first three bytes of `mov foo@GOT(%rip), %reg`, return the
// first three bytes of the equivalent `lea foo(%rip), %reg`, or 0 if
// the instruction is not of that form.
//
// A rewritable instruction is a REX.W mov (prefix 0x48 for %rax-%rdi or
// 0x4c for %r8-%r15, opcode 0x8b) with a RIP-relative operand, i.e. a
// ModRM byte with mod=00, r/m=101. The destination register lives in
// ModRM.reg, so swapping the opcode for lea (0x8d) and keeping the REX
// prefix and ModRM byte preserves the register.
static u32 relax_rex_gotpcrelx(u8 *loc) {
  bool is_mov = (loc[0] == 0x48 || loc[0] == 0x4c) && loc[1] == 0x8b;
  bool is_riprel = (loc[2] & 0b1100'0111) == 0b0000'0101;
  if (is_mov && is_riprel)
    return (loc[0] << 16) | (0x8d << 8) | loc[2];
  return 0;
}
// Given the first three bytes of `mov foo@gottpoff(%rip), %reg`, return
// the first three bytes of the equivalent `mov $foo@tpoff, %reg`, or 0
// if the instruction is not of that form.
//
// A rewritable instruction is a REX.W mov (prefix 0x48 or 0x4c, opcode
// 0x8b) with a RIP-relative operand (ModRM mod=00, r/m=101).
static u32 relax_gottpoff(u8 *loc) {
  bool is_mov = (loc[0] == 0x48 || loc[0] == 0x4c) && loc[1] == 0x8b;
  bool is_riprel = (loc[2] & 0b1100'0111) == 0b0000'0101;
  if (!is_mov || !is_riprel)
    return 0;

  // `mov $imm32, %reg` (0xc7 /0) encodes the destination in ModRM.rm,
  // so the register number moves from ModRM.reg to ModRM.rm and the
  // REX.R bit becomes REX.B (0x4c -> 0x49).
  u32 rex = (loc[0] == 0x48) ? 0x48 : 0x49;
  u32 reg = (loc[2] >> 3) & 0b111;
  return (rex << 16) | (0xc7 << 8) | (0xc0 | reg);
}
// Given the first three bytes of `lea foo@tlsdesc(%rip), %reg`, return
// the first three bytes of the equivalent `mov $foo@tpoff, %reg`, or 0
// if the instruction is not of that form.
//
// A rewritable instruction is a REX.W lea (prefix 0x48 or 0x4c, opcode
// 0x8d) with a RIP-relative operand (ModRM mod=00, r/m=101).
static u32 relax_gotpc32_tlsdesc(u8 *loc) {
  bool is_lea = (loc[0] == 0x48 || loc[0] == 0x4c) && loc[1] == 0x8d;
  bool is_riprel = (loc[2] & 0b1100'0111) == 0b0000'0101;
  if (!is_lea || !is_riprel)
    return 0;

  // `mov $imm32, %reg` (0xc7 /0) holds the destination in ModRM.rm, so
  // the register number moves from ModRM.reg to ModRM.rm and the REX.R
  // bit becomes REX.B (0x4c -> 0x49).
  u32 rex = (loc[0] == 0x48) ? 0x48 : 0x49;
  u32 reg = (loc[2] >> 3) & 0b111;
  return (rex << 16) | (0xc7 << 8) | (0xc0 | reg);
}
// Rewrite a function call to __tls_get_addr to a cheaper instruction
// sequence. We can do this when we know the thread-local variable's TP-
// relative address at link-time.
//
// `loc` points at the 4-byte displacement field of the leading lea (so
// the instruction itself starts at loc - 4 or loc - 3 depending on the
// form), `rel` is the relocation on the following call, and `val` is
// the variable's TP-relative address.
static void relax_gd_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
  switch (rel.r_type) {
  case R_X86_64_PLT32:
  case R_X86_64_PC32:
  case R_X86_64_GOTPCREL:
  case R_X86_64_GOTPCRELX: {
    // The original instructions are the following:
    //
    //  66 48 8d 3d 00 00 00 00    lea foo@tlsgd(%rip), %rdi
    //  66 66 48 e8 00 00 00 00    call __tls_get_addr
    //
    // or
    //
    //  66 48 8d 3d 00 00 00 00    lea foo@tlsgd(%rip), %rdi
    //  66 48 ff 15 00 00 00 00    call *__tls_get_addr@GOT(%rip)
    static const u8 insn[] = {
      0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax
      0x48, 0x81, 0xc0, 0, 0, 0, 0,             // add $tp_offset, %rax
    };
    // The replacement overwrites both instructions starting at the lea
    // (4 bytes before loc); the add's immediate lands at loc + 8.
    memcpy(loc - 4, insn, sizeof(insn));
    *(ul32 *)(loc + 8) = val;
    break;
  }
  case R_X86_64_PLTOFF64: {
    // The original instructions are the following:
    //
    //  48 8d 3d 00 00 00 00          lea foo@tlsgd(%rip), %rdi
    //  48 b8 00 00 00 00 00 00 00 00 movabs __tls_get_addr, %rax
    //  48 01 d8                      add %rbx, %rax
    //  ff d0                         call *%rax
    static const u8 insn[] = {
      0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax
      0x48, 0x81, 0xc0, 0, 0, 0, 0,             // add $tp_offset, %rax
      0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00,       // nop
    };
    // Here the lea is 7 bytes, so the sequence starts at loc - 3 and
    // the add's immediate lands at loc + 9; a 6-byte nop pads the rest.
    memcpy(loc - 3, insn, sizeof(insn));
    *(ul32 *)(loc + 9) = val;
    break;
  }
  default:
    unreachable();
  }
}
// Rewrite a General-Dynamic __tls_get_addr call to an Initial-Exec
// access: read TP from %fs:0, then add the TP offset loaded from the
// symbol's GOT entry. `val` is the GOT entry's address minus P (the
// address of loc); the stored displacement is further biased by the
// distance from the rewritten sequence's start to the end of the add.
static void relax_gd_to_ie(u8 *loc, ElfRel<E> rel, u64 val) {
  switch (rel.r_type) {
  case R_X86_64_PLT32:
  case R_X86_64_PC32:
  case R_X86_64_GOTPCREL:
  case R_X86_64_GOTPCRELX: {
    static const u8 insn[] = {
      0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax
      0x48, 0x03, 0x05, 0, 0, 0, 0,             // add foo@gottpoff(%rip), %rax
    };
    // Sequence starts at loc - 4; the add's displacement field is at
    // loc + 8, and the add ends at loc + 12, hence the -12 bias.
    memcpy(loc - 4, insn, sizeof(insn));
    *(ul32 *)(loc + 8) = val - 12;
    break;
  }
  case R_X86_64_PLTOFF64: {
    static const u8 insn[] = {
      0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax
      0x48, 0x03, 0x05, 0, 0, 0, 0,             // add foo@gottpoff(%rip), %rax
      0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00,       // nop
    };
    // Sequence starts at loc - 3; the add's displacement field is at
    // loc + 9 and the add ends at loc + 13.
    memcpy(loc - 3, insn, sizeof(insn));
    *(ul32 *)(loc + 9) = val - 13;
    break;
  }
  default:
    unreachable();
  }
}
// Rewrite a function call to __tls_get_addr to a cheaper instruction
// sequence. The difference from relax_gd_to_le is that we are
// materializing a Dynamic Thread Pointer for the current ELF module
// instead of an address for a particular thread-local variable.
//
// `loc` points at the displacement field of the 7-byte lea (so the
// sequence starts at loc - 3), `rel` is the relocation on the following
// call, and `val` is the size of the static TLS block (TP minus the
// start of the TLS image), which is subtracted from TP to recover the
// module's TLS base.
static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
  switch (rel.r_type) {
  case R_X86_64_PLT32:
  case R_X86_64_PC32: {
    // The original instructions are the following:
    //
    //  48 8d 3d 00 00 00 00   lea foo@tlsld(%rip), %rdi
    //  e8 00 00 00 00         call __tls_get_addr
    static const u8 insn[] = {
      0x31, 0xc0,                   // xor %eax, %eax
      0x64, 0x48, 0x8b, 0x00,       // mov %fs:(%rax), %rax
      0x48, 0x2d, 0, 0, 0, 0,       // sub $tls_size, %rax
    };
    // The sub's immediate lands at loc + 5 (offset 8 from loc - 3).
    memcpy(loc - 3, insn, sizeof(insn));
    *(ul32 *)(loc + 5) = val;
    break;
  }
  case R_X86_64_GOTPCREL:
  case R_X86_64_GOTPCRELX: {
    // The original instructions are the following:
    //
    //  48 8d 3d 00 00 00 00   lea foo@tlsld(%rip), %rdi
    //  ff 15 00 00 00 00      call *__tls_get_addr@GOT(%rip)
    static const u8 insn[] = {
      0x31, 0xc0,                   // xor %eax, %eax
      0x64, 0x48, 0x8b, 0x00,       // mov %fs:(%rax), %rax
      0x48, 0x2d, 0, 0, 0, 0,       // sub $tls_size, %rax
      0x90,                         // nop
    };
    memcpy(loc - 3, insn, sizeof(insn));
    *(ul32 *)(loc + 5) = val;
    break;
  }
  case R_X86_64_PLTOFF64: {
    // The original instructions are the following:
    //
    //  48 8d 3d 00 00 00 00           lea foo@tlsld(%rip), %rdi
    //  48 b8 00 00 00 00 00 00 00 00  movabs __tls_get_addr@GOTOFF, %rax
    //  48 01 d8                       add %rbx, %rax
    //  ff d0                          call *%rax
    static const u8 insn[] = {
      0x31, 0xc0,                   // xor %eax, %eax
      0x64, 0x48, 0x8b, 0x00,       // mov %fs:(%rax), %rax
      0x48, 0x2d, 0, 0, 0, 0,       // sub $tls_size, %rax
      0x0f, 0x1f, 0x44, 0x00, 0x00, // nop
      0x0f, 0x1f, 0x44, 0x00, 0x00, // nop
    };
    memcpy(loc - 3, insn, sizeof(insn));
    *(ul32 *)(loc + 5) = val;
    break;
  }
  default:
    unreachable();
  }
}
// Apply relocations to SHF_ALLOC sections (i.e. sections that are
// mapped to memory at runtime) based on the result of
// scan_relocations().
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  // Pointer into .rela.dyn where this section's dynamic relocations
  // (if any) are to be emitted.
  ElfRel<E> *dynrel = nullptr;
  if (ctx.reldyn)
    dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
                           file.reldyn_offset + this->reldyn_offset);

  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE)
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;

    // Report a relocated value that does not fit in [lo, hi).
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
                   << sym << " out of range: " << val << " is not in ["
                   << lo << ", " << hi << ")";
    };

    // Store a 32-bit value with a zero-extension range check.
    auto write32 = [&](u64 val) {
      check(val, 0, 1LL << 32);
      *(ul32 *)loc = val;
    };

    // Store a 32-bit value with a sign-extension range check.
    auto write32s = [&](u64 val) {
      check(val, -(1LL << 31), 1LL << 31);
      *(ul32 *)loc = val;
    };

    // Standard psABI relocation inputs: S = symbol value, A = addend,
    // P = place being relocated, G = GOT slot offset from GOTPLT,
    // GOTPLT = address of .got.plt.
    u64 S = sym.get_addr(ctx);
    u64 A = rel.r_addend;
    u64 P = get_addr() + rel.r_offset;
    u64 G = sym.get_got_addr(ctx) - ctx.gotplt->shdr.sh_addr;
    u64 GOTPLT = ctx.gotplt->shdr.sh_addr;

    switch (rel.r_type) {
    case R_X86_64_8:
      check(S + A, 0, 1 << 8);
      *loc = S + A;
      break;
    case R_X86_64_16:
      check(S + A, 0, 1 << 16);
      *(ul16 *)loc = S + A;
      break;
    case R_X86_64_32:
      write32(S + A);
      break;
    case R_X86_64_32S:
      write32s(S + A);
      break;
    case R_X86_64_64:
      // Word-size absolute value; may be emitted as a dynamic
      // relocation through `dynrel` instead of resolved statically.
      apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
      break;
    case R_X86_64_PC8:
      check(S + A - P, -(1 << 7), 1 << 7);
      *loc = S + A - P;
      break;
    case R_X86_64_PC16:
      check(S + A - P, -(1 << 15), 1 << 15);
      *(ul16 *)loc = S + A - P;
      break;
    case R_X86_64_PC32:
    case R_X86_64_PLT32:
      write32s(S + A - P);
      break;
    case R_X86_64_PC64:
      *(ul64 *)loc = S + A - P;
      break;
    case R_X86_64_GOT32:
      write32s(G + A);
      break;
    case R_X86_64_GOT64:
      *(ul64 *)loc = G + A;
      break;
    case R_X86_64_GOTOFF64:
    case R_X86_64_PLTOFF64:
      *(ul64 *)loc = S + A - GOTPLT;
      break;
    case R_X86_64_GOTPC32:
      write32s(GOTPLT + A - P);
      break;
    case R_X86_64_GOTPC64:
      *(ul64 *)loc = GOTPLT + A - P;
      break;
    case R_X86_64_GOTPCREL:
      // G + GOTPLT is the absolute address of the symbol's GOT slot.
      write32s(G + GOTPLT + A - P);
      break;
    case R_X86_64_GOTPCREL64:
      *(ul64 *)loc = G + GOTPLT + A - P;
      break;
    case R_X86_64_GOTPCRELX:
      // We always want to relax GOTPCRELX relocs even if --no-relax
      // was given because some static PIE runtime code depends on these
      // relaxations.
      if (!sym.is_imported && !sym.is_ifunc() && sym.is_relative()) {
        u32 insn = relax_gotpcrelx(loc - 2);
        i64 val = S + A - P;
        // Relax only if the instruction is rewritable and the direct
        // displacement fits in a signed 32-bit field.
        if (insn && (i32)val == val) {
          loc[-2] = insn >> 8;
          loc[-1] = insn;
          *(ul32 *)loc = val;
          break;
        }
      }
      write32s(G + GOTPLT + A - P);
      break;
    case R_X86_64_REX_GOTPCRELX:
      if (!sym.is_imported && !sym.is_ifunc() && sym.is_relative()) {
        u32 insn = relax_rex_gotpcrelx(loc - 3);
        i64 val = S + A - P;
        if (insn && (i32)val == val) {
          loc[-3] = insn >> 16;
          loc[-2] = insn >> 8;
          loc[-1] = insn;
          *(ul32 *)loc = val;
          break;
        }
      }
      write32s(G + GOTPLT + A - P);
      break;
    case R_X86_64_TLSGD:
      // The relaxed forms also consume the next relocation (the one on
      // the following __tls_get_addr call), hence the extra i++.
      if (sym.has_tlsgd(ctx)) {
        write32s(sym.get_tlsgd_addr(ctx) + A - P);
      } else if (sym.has_gottp(ctx)) {
        relax_gd_to_ie(loc, rels[i + 1], sym.get_gottp_addr(ctx) - P);
        i++;
      } else {
        relax_gd_to_le(loc, rels[i + 1], S - ctx.tp_addr);
        i++;
      }
      break;
    case R_X86_64_TLSLD:
      if (ctx.got->has_tlsld(ctx)) {
        write32s(ctx.got->get_tlsld_addr(ctx) + A - P);
      } else {
        relax_ld_to_le(loc, rels[i + 1], ctx.tp_addr - ctx.tls_begin);
        i++;
      }
      break;
    case R_X86_64_DTPOFF32:
      write32s(S + A - ctx.dtp_addr);
      break;
    case R_X86_64_DTPOFF64:
      *(ul64 *)loc = S + A - ctx.dtp_addr;
      break;
    case R_X86_64_TPOFF32:
      write32s(S + A - ctx.tp_addr);
      break;
    case R_X86_64_TPOFF64:
      *(ul64 *)loc = S + A - ctx.tp_addr;
      break;
    case R_X86_64_GOTTPOFF:
      if (sym.has_gottp(ctx)) {
        write32s(sym.get_gottp_addr(ctx) + A - P);
      } else {
        // scan_relocations() relaxed this access, so relax_gottpoff()
        // is known to succeed here; rewrite the load to a mov-immediate
        // of the TP offset. A == -4 because the displacement field sits
        // 4 bytes before the end of the instruction.
        u32 insn = relax_gottpoff(loc - 3);
        loc[-3] = insn >> 16;
        loc[-2] = insn >> 8;
        loc[-1] = insn;
        write32s(S - ctx.tp_addr);
        assert(A == -4);
      }
      break;
    case R_X86_64_GOTPC32_TLSDESC:
      if (sym.has_tlsdesc(ctx)) {
        write32s(sym.get_tlsdesc_addr(ctx) + A - P);
      } else {
        u32 insn = relax_gotpc32_tlsdesc(loc - 3);
        loc[-3] = insn >> 16;
        loc[-2] = insn >> 8;
        loc[-1] = insn;
        write32s(S - ctx.tp_addr);
        assert(A == -4);
      }
      break;
    case R_X86_64_SIZE32:
      write32(sym.esym().st_size + A);
      break;
    case R_X86_64_SIZE64:
      *(ul64 *)loc = sym.esym().st_size + A;
      break;
    case R_X86_64_TLSDESC_CALL:
      if (!sym.has_tlsdesc(ctx)) {
        // call *(%rax) -> nop
        loc[0] = 0x66;
        loc[1] = 0x90;
      }
      break;
    default:
      unreachable();
    }
  }
}
// This function is responsible for applying relocations against
// non-SHF_ALLOC sections (i.e. sections that are not mapped to memory
// at runtime).
//
// Relocations against non-SHF_ALLOC sections are much easier to
// handle than that against SHF_ALLOC sections. It is because, since
// they are not mapped to memory, they don't contain any variable or
// function and never need PLT or GOT. Non-SHF_ALLOC sections are
// mostly debug info sections.
//
// Relocations against non-SHF_ALLOC sections are not scanned by
// scan_relocations.
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;

    // Report a relocated value that does not fit in [lo, hi).
    auto check = [&](i64 val, i64 lo, i64 hi) {
      if (val < lo || hi <= val)
        Error(ctx) << *this << ": relocation " << rel << " against "
                   << sym << " out of range: " << val << " is not in ["
                   << lo << ", " << hi << ")";
    };

    // Store a 32-bit value with a zero-extension range check.
    auto write32 = [&](u64 val) {
      check(val, 0, 1LL << 32);
      *(ul32 *)loc = val;
    };

    // Store a 32-bit value with a sign-extension range check.
    auto write32s = [&](u64 val) {
      check(val, -(1LL << 31), 1LL << 31);
      *(ul32 *)loc = val;
    };

    // If the relocation refers to a mergeable section fragment, resolve
    // against the fragment's address and addend instead of the symbol's.
    SectionFragment<E> *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);

    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : (i64)rel.r_addend;

    switch (rel.r_type) {
    case R_X86_64_8:
      check(S + A, 0, 1 << 8);
      *loc = S + A;
      break;
    case R_X86_64_16:
      check(S + A, 0, 1 << 16);
      *(ul16 *)loc = S + A;
      break;
    case R_X86_64_32:
      write32(S + A);
      break;
    case R_X86_64_32S:
      write32s(S + A);
      break;
    case R_X86_64_64:
      // A dead (garbage-collected) fragment may yield a tombstone value.
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ul64 *)loc = *val;
      else
        *(ul64 *)loc = S + A;
      break;
    case R_X86_64_DTPOFF32:
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ul32 *)loc = *val;
      else
        write32s(S + A - ctx.dtp_addr);
      break;
    case R_X86_64_DTPOFF64:
      if (std::optional<u64> val = get_tombstone(sym, frag))
        *(ul64 *)loc = *val;
      else
        *(ul64 *)loc = S + A - ctx.dtp_addr;
      break;
    case R_X86_64_GOTOFF64:
      *(ul64 *)loc = S + A - ctx.gotplt->shdr.sh_addr;
      break;
    case R_X86_64_GOTPC64:
      // PC-relative relocation doesn't make sense for non-memory-allocated
      // section, but GCC 6.3.0 seems to create this reloc for
      // _GLOBAL_OFFSET_TABLE_.
      *(ul64 *)loc = ctx.gotplt->shdr.sh_addr + A;
      break;
    case R_X86_64_SIZE32:
      write32(sym.esym().st_size + A);
      break;
    case R_X86_64_SIZE64:
      *(ul64 *)loc = sym.esym().st_size + A;
      break;
    default:
      Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
                 << rel;
      break;
    }
  }
}
// Linker has to create data structures in an output file to apply
// some type of relocations. For example, if a relocation refers a GOT
// or a PLT entry of a symbol, linker has to create an entry in .got
// or in .plt for that symbol. In order to fix the file layout, we
// need to scan relocations.
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
  assert(shdr().sh_flags & SHF_ALLOC);

  // Reserve space in .rela.dyn for dynamic relocations requested so far.
  this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
  std::span<const ElfRel<E>> rels = get_rels(ctx);

  // Scan relocations
  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol<E> &sym = *file.symbols[rel.r_sym];
    // `loc` points into the input section's bytes so the relaxation
    // helpers can inspect the instruction being relocated.
    u8 *loc = (u8 *)(contents.data() + rel.r_offset);

    // An ifunc is resolved at runtime, so any reference to it needs
    // both a GOT slot and a PLT entry regardless of relocation type.
    if (sym.is_ifunc())
      sym.flags |= NEEDS_GOT | NEEDS_PLT;

    switch (rel.r_type) {
    case R_X86_64_8:
    case R_X86_64_16:
    case R_X86_64_32:
    case R_X86_64_32S:
      scan_absrel(ctx, sym, rel);
      break;
    case R_X86_64_64:
      scan_dyn_absrel(ctx, sym, rel);
      break;
    case R_X86_64_PC8:
    case R_X86_64_PC16:
    case R_X86_64_PC32:
    case R_X86_64_PC64:
      scan_pcrel(ctx, sym, rel);
      break;
    case R_X86_64_GOT32:
    case R_X86_64_GOT64:
    case R_X86_64_GOTPC32:
    case R_X86_64_GOTPC64:
    case R_X86_64_GOTPCREL:
    case R_X86_64_GOTPCREL64:
    case R_X86_64_GOTPCRELX:
    case R_X86_64_REX_GOTPCRELX:
      sym.flags |= NEEDS_GOT;
      break;
    case R_X86_64_PLT32:
    case R_X86_64_PLTOFF64:
      if (sym.is_imported)
        sym.flags |= NEEDS_PLT;
      break;
    case R_X86_64_TLSGD:
      // A TLSGD relocation must be paired with the relocation on the
      // following __tls_get_addr call; when we relax at apply time we
      // consume that pair, so skip the next relocation here too (i++).
      if (rel.r_addend != -4)
        Fatal(ctx) << *this << ": bad r_addend for R_X86_64_TLSGD";

      if (i + 1 == rels.size())
        Fatal(ctx) << *this << ": TLSGD reloc must be followed by PLT or GOTPCREL";

      if (u32 ty = rels[i + 1].r_type;
          ty != R_X86_64_PLT32 && ty != R_X86_64_PC32 &&
          ty != R_X86_64_PLTOFF64 && ty != R_X86_64_GOTPCREL &&
          ty != R_X86_64_GOTPCRELX)
        Fatal(ctx) << *this << ": TLSGD reloc must be followed by PLT or GOTPCREL";

      if (ctx.arg.is_static ||
          (ctx.arg.relax && !sym.is_imported && !ctx.arg.shared)) {
        // We always relax if -static because libc.a doesn't contain
        // __tls_get_addr().
        i++;
      } else if (ctx.arg.relax && !sym.is_imported && ctx.arg.shared &&
                 !ctx.arg.z_dlopen) {
        // In a shared object we can only relax GD to IE (not LE),
        // which still needs a GOT entry holding the TP offset.
        sym.flags |= NEEDS_GOTTP;
        i++;
      } else {
        sym.flags |= NEEDS_TLSGD;
      }
      break;
    case R_X86_64_TLSLD:
      if (rel.r_addend != -4)
        Fatal(ctx) << *this << ": bad r_addend for R_X86_64_TLSLD";

      if (i + 1 == rels.size())
        Fatal(ctx) << *this << ": TLSLD reloc must be followed by PLT or GOTPCREL";

      if (u32 ty = rels[i + 1].r_type;
          ty != R_X86_64_PLT32 && ty != R_X86_64_PC32 &&
          ty != R_X86_64_PLTOFF64 && ty != R_X86_64_GOTPCREL &&
          ty != R_X86_64_GOTPCRELX)
        Fatal(ctx) << *this << ": TLSLD reloc must be followed by PLT or GOTPCREL";

      // We always relax if -static because libc.a doesn't contain
      // __tls_get_addr().
      if (ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared))
        i++;
      else
        ctx.needs_tlsld = true;
      break;
    case R_X86_64_GOTTPOFF: {
      if (rel.r_addend != -4)
        Fatal(ctx) << *this << ": bad r_addend for R_X86_64_GOTTPOFF";

      // Relax IE to LE only when the instruction form allows it;
      // otherwise keep the GOT entry holding the TP offset.
      bool do_relax = ctx.arg.relax && !ctx.arg.shared &&
                      !sym.is_imported && relax_gottpoff(loc - 3);
      if (!do_relax)
        sym.flags |= NEEDS_GOTTP;
      break;
    }
    case R_X86_64_GOTPC32_TLSDESC: {
      if (rel.r_addend != -4)
        Fatal(ctx) << *this << ": bad r_addend for R_X86_64_GOTPC32_TLSDESC";

      if (relax_gotpc32_tlsdesc(loc - 3) == 0)
        Fatal(ctx) << *this << ": GOTPC32_TLSDESC relocation is used"
                   << " against an invalid code sequence";

      if (!relax_tlsdesc(ctx, sym))
        sym.flags |= NEEDS_TLSDESC;
      break;
    }
    case R_X86_64_TPOFF32:
    case R_X86_64_TPOFF64:
      check_tlsle(ctx, sym, rel);
      break;
    // These relocation types need no preparation at scan time.
    case R_X86_64_GOTOFF64:
    case R_X86_64_DTPOFF32:
    case R_X86_64_DTPOFF64:
    case R_X86_64_SIZE32:
    case R_X86_64_SIZE64:
    case R_X86_64_TLSDESC_CALL:
      break;
    default:
      Error(ctx) << *this << ": unknown relocation: " << rel;
    }
  }
}
} // namespace mold::elf

1278
third_party/mold/elf/cmdline.cc vendored Normal file

File diff suppressed because it is too large Load diff

555
third_party/mold/elf/dwarf.cc vendored Normal file
View file

@ -0,0 +1,555 @@
// clang-format off
// This file contains code to read DWARF debug info to create .gdb_index.
//
// .gdb_index is an optional section to speed up GNU debugger. It contains
// two maps: 1) a map from function/variable/type names to compunits, and
// 2) a map from function address ranges to compunits. gdb uses these
// maps to quickly find a compunit given a name or an instruction pointer.
//
// (Terminology: a compilation unit, which often abbreviated as compunit
// or cu, is a unit of debug info. An input .debug_info section usually
// contains one compunit, and thus an output .debug_info contains as
// many compunits as the number of input files.)
//
// .gdb_index is not mandatory. All the information in .gdb_index is
// also in other debug info sections. You can actually create an
// executable without .gdb_index and later add it using `gdb-add-index`
// post-processing tool that comes with gdb.
//
// The mapping from names to compunits is 1:n while the mapping from
// address ranges to compunits is 1:1. That is, two object files may
// define the same type name (with the same definition), while there
// should be no two functions that overlap with each other in memory.
//
// .gdb_index contains an on-disk hash table for names, so gdb can
// lookup names without loading all strings into memory and construct an
// in-memory hash table.
//
// Names are in .debug_gnu_pubnames and .debug_gnu_pubtypes input
// sections. These sections are created if `-ggnu-pubnames` is given.
// Besides names, these sections contain attributes for each name so
// that gdb can distinguish type names from function names, for example.
//
// A compunit contains one or more function address ranges. If an
// object file is compiled without -ffunction-sections, it contains
// only one .text section and therefore contains a single address range.
// Such range is typically stored directly to the compunit.
//
// If an object file is compiled with -ffunction-sections, it contains
// more than one .text section, and it has as many address ranges as
// the number of .text sections. Such discontiguous address ranges are
// stored to .debug_ranges in DWARF 2/3/4/5 and
// .debug_rnglists/.debug_addr in DWARF 5.
//
// .debug_info section contains DWARF debug info. Although we don't need
// to parse the whole .debug_info section to read address ranges, we
// have to do a little bit. DWARF is complicated and often handled using
// a library such as libdwarf. But we don't use any library because we
// don't want to add an extra run-time dependency just for --gdb-index.
//
// This page explains the format of .gdb_index:
// https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
// The hash function for .gdb_index.
static u32 gdb_hash(std::string_view name) {
  // Fold each character into the accumulator as h = h * 67 + c - 113,
  // lower-casing ASCII uppercase letters first so the hash is
  // case-insensitive for A-Z.
  u32 h = 0;
  for (u8 c : name) {
    u8 folded = ('A' <= c && c <= 'Z') ? ('a' + c - 'A') : c;
    h = h * 67 + folded - 113;
  }
  return h;
}
// Split .debug_info into so-called "compilation units". A .debug_info
// section usually contains one compunit unless it was created by `ld -r`.
// This is for --gdb-index.
template <typename E>
std::vector<std::string_view>
read_compunits(Context<E> &ctx, ObjectFile<E> &file) {
  // Slice this file's .debug_info contents into per-compunit chunks.
  // Each chunk starts with a 4-byte unit length that excludes itself,
  // so a chunk occupies (length + 4) bytes.
  file.debug_info->uncompress(ctx);
  std::string_view rest = file.debug_info->contents;
  std::vector<std::string_view> cus;

  while (!rest.empty()) {
    if (rest.size() < 4)
      Fatal(ctx) << *file.debug_info << ": corrupted .debug_info";
    // An all-ones initial length marks the 64-bit DWARF format.
    if (*(U32<E> *)rest.data() == 0xffff'ffff)
      Fatal(ctx) << *file.debug_info << ": --gdb-index: DWARF64 not supported";

    i64 size = *(U32<E> *)rest.data() + 4;
    cus.push_back(rest.substr(0, size));
    rest = rest.substr(size);
  }
  return cus;
}
// Parses .debug_gnu_pubnames and .debug_gnu_pubtypes. These sections
// start with a 14 bytes header followed by (4-byte offset, 1-byte type,
// null-terminated string) tuples.
//
// The 4-byte offset is an offset into .debug_info that contains details
// about the name. The 1-byte type is a type of the corresponding name
// (e.g. function, variable or datatype). The string is a name of a
// function, a variable or a type.
template <typename E>
std::vector<GdbIndexName> read_pubnames(Context<E> &ctx, ObjectFile<E> &file) {
  std::vector<GdbIndexName> vec;

  // Translate an offset into this file's .debug_info contribution into a
  // global compunit index. The offset must land exactly on a compunit
  // boundary; anything else is treated as corruption.
  auto get_cu_idx = [&](InputSection<E> &isec, i64 offset) {
    i64 off = 0;
    for (i64 i = 0; i < file.compunits.size(); i++) {
      if (offset == off)
        return file.compunits_idx + i;
      off += file.compunits[i].size();
    }
    Fatal(ctx) << isec << ": corrupted debug_info_offset";
  };

  // Parse one pubnames/pubtypes section: a sequence of sets, each with a
  // 14-byte header followed by (offset, type, name) tuples.
  auto read = [&](InputSection<E> &isec) {
    isec.uncompress(ctx);
    std::string_view contents = isec.contents;

    while (!contents.empty()) {
      if (contents.size() < 14)
        Fatal(ctx) << isec << ": corrupted header";

      // Bytes 0-3: set length excluding the length field itself.
      u32 len = *(U32<E> *)contents.data() + 4;
      // Bytes 6-9: offset of the corresponding CU in .debug_info.
      u32 debug_info_offset = *(U32<E> *)(contents.data() + 6);
      u32 cu_idx = get_cu_idx(isec, debug_info_offset);

      std::string_view data = contents.substr(14, len - 14);
      contents = contents.substr(len);

      while (!data.empty()) {
        u32 offset = *(U32<E> *)data.data();
        data = data.substr(4);
        if (offset == 0) // a zero offset terminates the tuple list
          break;

        u8 type = data[0];
        data = data.substr(1);

        // The name is a null-terminated string following the type byte
        // (string_view construction from a char* stops at the NUL).
        std::string_view name = data.data();
        data = data.substr(name.size() + 1);
        // Pack the 1-byte attribute and CU index into one word; this is
        // the encoding later emitted into .gdb_index.
        vec.push_back({name, gdb_hash(name), (type << 24) | cu_idx});
      }
    }
  };

  if (file.debug_pubnames)
    read(*file.debug_pubnames);
  if (file.debug_pubtypes)
    read(*file.debug_pubtypes);

  // Uniquify elements because GCC 11 seems to emit one record for each
  // comdat group which results in having a lot of duplicate records.
  auto less = [](const GdbIndexName &a, const GdbIndexName &b) {
    return std::tuple{a.hash, a.attr, a.name} <
           std::tuple{b.hash, b.attr, b.name};
  };

  auto equal = [](const GdbIndexName &a, const GdbIndexName &b) {
    return std::tuple{a.hash, a.attr, a.name} ==
           std::tuple{b.hash, b.attr, b.name};
  };

  std::sort(vec.begin(), vec.end(), less);
  vec.erase(std::unique(vec.begin(), vec.end(), equal), vec.end());
  return vec;
}
// Return a readable copy of a chunk's bytes: the chunk's own
// uncompressed buffer if it has one, otherwise its slice of the
// output file image.
template <typename E>
static u8 *get_buffer(Context<E> &ctx, Chunk<E> *chunk) {
  u8 *buf = chunk->get_uncompressed_data();
  return buf ? buf : ctx.buf + chunk->shdr.sh_offset;
}
// Try to find a compilation unit from .debug_info and its
// corresponding record from .debug_abbrev and returns them.
// Returns {pointer past the CU header into .debug_info, pointer past the
// matching abbrev entry's tag/has_children bytes, DWARF version}.
template <typename E>
static std::tuple<u8 *, u8 *, u32>
find_compunit(Context<E> &ctx, ObjectFile<E> &file, i64 offset) {
  // Read .debug_info to find the record at a given offset.
  u8 *cu = get_buffer(ctx, ctx.debug_info) + offset;
  u32 dwarf_version = *(U16<E> *)(cu + 4); // version lives at offset 4
  u32 abbrev_offset;

  // Skip a header. Header layout (and thus the abbrev-offset position)
  // differs between DWARF <=4 and DWARF 5.
  switch (dwarf_version) {
  case 2:
  case 3:
  case 4:
    abbrev_offset = *(U32<E> *)(cu + 6);
    if (u32 address_size = cu[10]; address_size != sizeof(Word<E>))
      Fatal(ctx) << file << ": --gdb-index: unsupported address size "
                 << address_size;
    cu += 11;
    break;
  case 5: {
    abbrev_offset = *(U32<E> *)(cu + 8);
    if (u32 address_size = cu[7]; address_size != sizeof(Word<E>))
      Fatal(ctx) << file << ": --gdb-index: unsupported address size "
                 << address_size;
    // DWARF 5 adds a unit-type byte; skeleton/split units carry an
    // extra 8-byte DWO id, hence the larger skip.
    switch (u32 unit_type = cu[6]; unit_type) {
    case DW_UT_compile:
    case DW_UT_partial:
      cu += 12;
      break;
    case DW_UT_skeleton:
    case DW_UT_split_compile:
      cu += 20;
      break;
    default:
      Fatal(ctx) << file << ": --gdb-index: unknown DW_UT_* value: 0x"
                 << std::hex << unit_type;
    }
    break;
  }
  default:
    Fatal(ctx) << file << ": --gdb-index: unknown DWARF version: "
               << dwarf_version;
  }

  // The first ULEB after the header is the abbrev code of the root DIE.
  u32 abbrev_code = read_uleb(cu);

  // Find a .debug_abbrev record corresponding to the .debug_info record.
  // We assume the .debug_info record at a given offset is of
  // DW_TAG_compile_unit which describes a compunit.
  u8 *abbrev = get_buffer(ctx, ctx.debug_abbrev) + abbrev_offset;

  for (;;) {
    u32 code = read_uleb(abbrev);
    if (code == 0)
      Fatal(ctx) << file << ": --gdb-index: .debug_abbrev does not contain"
                 << " a record for the first .debug_info record";

    if (code == abbrev_code) {
      // Found a record
      u64 abbrev_tag = read_uleb(abbrev);
      if (abbrev_tag != DW_TAG_compile_unit && abbrev_tag != DW_TAG_skeleton_unit)
        Fatal(ctx) << file << ": --gdb-index: the first entry's tag is not"
                   << " DW_TAG_compile_unit/DW_TAG_skeleton_unit but 0x"
                   << std::hex << abbrev_tag;
      break;
    }

    // Skip an uninteresting record: its tag, has_children byte, and
    // (name, form) attribute pairs up to the (0, 0) terminator.
    read_uleb(abbrev); // tag
    abbrev++; // has_children byte
    for (;;) {
      u64 name = read_uleb(abbrev);
      u64 form = read_uleb(abbrev);
      if (name == 0 && form == 0)
        break;
      if (form == DW_FORM_implicit_const)
        read_uleb(abbrev); // the constant value is stored in the abbrev
    }
  }

  abbrev++; // skip has_children byte
  return {cu, abbrev, dwarf_version};
}
// Estimate the number of address ranges contained in a given file.
// It may over-estimate but never under-estimate.
template <typename E>
i64 estimate_address_areas(Context<E> &ctx, ObjectFile<E> &file) {
  // Upper-bound the number of address ranges this file can contribute.
  // Over-estimating is acceptable; under-estimating is not.

  // Each CU contributes at most one directly-stored address range.
  i64 count = file.compunits.size();

  // DWARF 2-4: .debug_ranges is a flat array of [begin, end) word
  // pairs, of which one pair is a null terminator we don't count.
  if (file.debug_ranges)
    count += file.debug_ranges->sh_size / sizeof(Word<E>) / 2 - 1;

  // DWARF 5: .debug_rnglists holds variable-length entries; the
  // smallest possible entry is 3 bytes (a 1-byte code plus two
  // 1-byte ULEB128 operands).
  if (file.debug_rnglists)
    count += file.debug_rnglists->sh_size / 3;

  return count;
}
// .debug_info contains variable-length fields. This class reads them.
template <typename E>
class DebugInfoReader {
public:
  DebugInfoReader(Context<E> &ctx, ObjectFile<E> &file, u8 *cu)
    : ctx(ctx), file(file), cu(cu) {}

  // Decode one attribute value of the given DW_FORM_* code and advance
  // `cu` past it. Non-scalar forms yield a dummy 0.
  u64 read(u64 form);

  Context<E> &ctx;     // used for fatal-error reporting
  ObjectFile<E> &file; // named in error messages
  u8 *cu;              // read cursor into .debug_info
};
// Read value of the given DW_FORM_* form. If a value is not scalar,
// returns a dummy value 0.
template <typename E>
u64 DebugInfoReader<E>::read(u64 form) {
  switch (form) {
  case DW_FORM_flag_present:
    // Stored implicitly; occupies no bytes in .debug_info.
    return 0;
  // 1-byte forms
  case DW_FORM_data1:
  case DW_FORM_flag:
  case DW_FORM_strx1:
  case DW_FORM_addrx1:
  case DW_FORM_ref1:
    return *cu++;
  // 2-byte forms
  case DW_FORM_data2:
  case DW_FORM_strx2:
  case DW_FORM_addrx2:
  case DW_FORM_ref2: {
    u64 val = *(U16<E> *)cu;
    cu += 2;
    return val;
  }
  // 3-byte forms
  case DW_FORM_strx3:
  case DW_FORM_addrx3: {
    u64 val = *(U24<E> *)cu;
    cu += 3;
    return val;
  }
  // 4-byte forms
  case DW_FORM_data4:
  case DW_FORM_strp:
  case DW_FORM_sec_offset:
  case DW_FORM_line_strp:
  case DW_FORM_strx4:
  case DW_FORM_addrx4:
  case DW_FORM_ref4: {
    u64 val = *(U32<E> *)cu;
    cu += 4;
    return val;
  }
  // 8-byte forms
  case DW_FORM_data8:
  case DW_FORM_ref8: {
    u64 val = *(U64<E> *)cu;
    cu += 8;
    return val;
  }
  // Target-address-sized forms
  case DW_FORM_addr:
  case DW_FORM_ref_addr: {
    u64 val = *(Word<E> *)cu;
    cu += sizeof(Word<E>);
    return val;
  }
  // ULEB128-encoded forms (read_uleb advances the cursor)
  case DW_FORM_strx:
  case DW_FORM_addrx:
  case DW_FORM_udata:
  case DW_FORM_ref_udata:
  case DW_FORM_loclistx:
  case DW_FORM_rnglistx:
    return read_uleb(cu);
  // Inline null-terminated string; its contents are not needed here,
  // so just skip past it.
  case DW_FORM_string:
    cu += strlen((char *)cu) + 1;
    return 0;
  default:
    Fatal(ctx) << file << ": --gdb-index: unhandled debug info form: 0x"
               << std::hex << form;
    return 0;
  }
}
// Read a range list from .debug_ranges starting at the given offset.
template <typename E>
static std::vector<u64>
read_debug_range(Context<E> &ctx, ObjectFile<E> &file, Word<E> *range) {
  std::vector<u64> vec;
  u64 base = 0;

  // The list is a sequence of (begin, end) word pairs terminated by a
  // (0, 0) pair. A pair whose first word is all-ones (so that +1 wraps
  // to 0 at the target's word width) selects a new base address that is
  // added to all subsequent pairs.
  for (i64 i = 0; range[i] || range[i + 1]; i += 2) {
    if (range[i] + 1 == 0) {
      // base address selection entry
      base = range[i + 1];
    } else {
      vec.push_back(range[i] + base);
      vec.push_back(range[i + 1] + base);
    }
  }
  return vec;
}
// Read a range list from .debug_rnglists starting at the given offset.
template <typename E>
static std::vector<u64>
read_rnglist_range(Context<E> &ctx, ObjectFile<E> &file, u8 *rnglist,
                   Word<E> *addrx) {
  std::vector<u64> vec;
  u64 base = 0;

  // Each entry is a 1-byte DW_RLE_* code followed by code-specific
  // operands. The *x ("indexed") kinds take ULEB128 indices into
  // .debug_addr via `addrx`; the others carry values inline.
  // NOTE(review): an unrecognized code byte falls through the switch
  // and the next byte is interpreted as a new code — confirm whether a
  // Fatal() would be preferable for corrupt input.
  for (;;) {
    switch (*rnglist++) {
    case DW_RLE_end_of_list:
      return vec;
    case DW_RLE_base_addressx:
      base = addrx[read_uleb(rnglist)];
      break;
    case DW_RLE_startx_endx:
      vec.push_back(addrx[read_uleb(rnglist)]);
      vec.push_back(addrx[read_uleb(rnglist)]);
      break;
    case DW_RLE_startx_length:
      vec.push_back(addrx[read_uleb(rnglist)]);
      vec.push_back(vec.back() + read_uleb(rnglist)); // start + length
      break;
    case DW_RLE_offset_pair:
      vec.push_back(base + read_uleb(rnglist));
      vec.push_back(base + read_uleb(rnglist));
      break;
    case DW_RLE_base_address:
      base = *(Word<E> *)rnglist;
      rnglist += sizeof(Word<E>);
      break;
    case DW_RLE_start_end:
      vec.push_back(*(Word<E> *)rnglist);
      rnglist += sizeof(Word<E>);
      vec.push_back(*(Word<E> *)rnglist);
      rnglist += sizeof(Word<E>);
      break;
    case DW_RLE_start_length:
      vec.push_back(*(Word<E> *)rnglist);
      rnglist += sizeof(Word<E>);
      vec.push_back(vec.back() + read_uleb(rnglist)); // start + length
      break;
    }
  }
}
// Returns a list of address ranges explained by a compunit at the
// `offset` in an output .debug_info section.
//
// .debug_info contains DWARF debug info records, so this function
// parses DWARF. If a designated compunit contains multiple ranges, the
// ranges are read from .debug_ranges (or .debug_rnglists for DWARF5).
// Otherwise, a range is read directly from .debug_info (or possibly
// from .debug_addr for DWARF5).
template <typename E>
std::vector<u64>
read_address_areas(Context<E> &ctx, ObjectFile<E> &file, i64 offset) {
  u8 *cu;
  u8 *abbrev;
  u32 dwarf_version;
  std::tie(cu, abbrev, dwarf_version) = find_compunit(ctx, file, offset);

  DebugInfoReader<E> reader{ctx, file, cu};

  // An attribute's DW_FORM_* code and its raw value as decoded from
  // .debug_info. form == 0 means "attribute not seen".
  struct Record {
    u64 form = 0;
    u64 value = 0;
  };

  Record low_pc;
  Record high_pc;
  Record ranges;
  std::optional<u64> rnglists_base;
  Word<E> *addrx = nullptr;

  // Read all interesting debug records of the root DIE. The abbrev
  // entry dictates the (name, form) pairs; the reader consumes the
  // corresponding bytes from .debug_info even for attributes we ignore.
  for (;;) {
    u64 name = read_uleb(abbrev);
    u64 form = read_uleb(abbrev);
    if (name == 0 && form == 0)
      break;

    u64 val = reader.read(form);

    switch (name) {
    case DW_AT_low_pc:
      low_pc = {form, val};
      break;
    case DW_AT_high_pc:
      high_pc = {form, val};
      break;
    case DW_AT_rnglists_base:
      rnglists_base = val;
      break;
    case DW_AT_addr_base:
      addrx = (Word<E> *)(get_buffer(ctx, ctx.debug_addr) + val);
      break;
    case DW_AT_ranges:
      ranges = {form, val};
      break;
    }
  }

  // Handle non-contiguous address ranges.
  if (ranges.form) {
    if (dwarf_version <= 4) {
      Word<E> *range_begin =
        (Word<E> *)(get_buffer(ctx, ctx.debug_ranges) + ranges.value);
      return read_debug_range(ctx, file, range_begin);
    }

    assert(dwarf_version == 5);
    u8 *buf = get_buffer(ctx, ctx.debug_rnglists);
    if (ranges.form == DW_FORM_sec_offset)
      return read_rnglist_range(ctx, file, buf + ranges.value, addrx);

    // DW_FORM_rnglistx: the value is an index into an offset table that
    // starts at DW_AT_rnglists_base.
    if (!rnglists_base)
      Fatal(ctx) << file << ": --gdb-index: missing DW_AT_rnglists_base";
    u8 *base = buf + *rnglists_base;
    return read_rnglist_range(ctx, file, base + *(U32<E> *)base, addrx);
  }

  // Handle a contiguous address range.
  if (low_pc.form && high_pc.form) {
    u64 lo;

    switch (low_pc.form) {
    case DW_FORM_addr:
      lo = low_pc.value;
      break;
    case DW_FORM_addrx:
    case DW_FORM_addrx1:
    case DW_FORM_addrx2:
    case DW_FORM_addrx3: // decoded by DebugInfoReader; was missing here
    case DW_FORM_addrx4:
      lo = addrx[low_pc.value];
      break;
    default:
      // Fixed: this message previously printed high_pc.form.
      Fatal(ctx) << file << ": --gdb-index: unhandled form for DW_AT_low_pc: 0x"
                 << std::hex << low_pc.form;
    }

    switch (high_pc.form) {
    case DW_FORM_addr:
      return {lo, high_pc.value};
    case DW_FORM_addrx:
    case DW_FORM_addrx1:
    case DW_FORM_addrx2:
    case DW_FORM_addrx3: // decoded by DebugInfoReader; was missing here
    case DW_FORM_addrx4:
      return {lo, addrx[high_pc.value]};
    // Constant forms give the size of the range, not its end address.
    case DW_FORM_udata:
    case DW_FORM_data1:
    case DW_FORM_data2:
    case DW_FORM_data4:
    case DW_FORM_data8:
      return {lo, lo + high_pc.value};
    default:
      Fatal(ctx) << file << ": --gdb-index: unhandled form for DW_AT_high_pc: 0x"
                 << std::hex << high_pc.form;
    }
  }

  return {};
}
using E = MOLD_TARGET;
template std::vector<std::string_view> read_compunits(Context<E> &, ObjectFile<E> &);
template std::vector<GdbIndexName> read_pubnames(Context<E> &, ObjectFile<E> &);
template i64 estimate_address_areas(Context<E> &, ObjectFile<E> &);
template std::vector<u64> read_address_areas(Context<E> &, ObjectFile<E> &, i64);
} // namespace mold::elf

922
third_party/mold/elf/elf.cc vendored Normal file
View file

@ -0,0 +1,922 @@
// clang-format off
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
template <>
std::string rel_to_string<X86_64>(u32 r_type) {
  // Map an x86-64 relocation type to its symbolic name for diagnostics.
  // The stringized token is always identical to the case label.
#define CASE(x) case x: return #x
  switch (r_type) {
  CASE(R_X86_64_NONE);
  CASE(R_X86_64_64);
  CASE(R_X86_64_PC32);
  CASE(R_X86_64_GOT32);
  CASE(R_X86_64_PLT32);
  CASE(R_X86_64_COPY);
  CASE(R_X86_64_GLOB_DAT);
  CASE(R_X86_64_JUMP_SLOT);
  CASE(R_X86_64_RELATIVE);
  CASE(R_X86_64_GOTPCREL);
  CASE(R_X86_64_32);
  CASE(R_X86_64_32S);
  CASE(R_X86_64_16);
  CASE(R_X86_64_PC16);
  CASE(R_X86_64_8);
  CASE(R_X86_64_PC8);
  CASE(R_X86_64_DTPMOD64);
  CASE(R_X86_64_DTPOFF64);
  CASE(R_X86_64_TPOFF64);
  CASE(R_X86_64_TLSGD);
  CASE(R_X86_64_TLSLD);
  CASE(R_X86_64_DTPOFF32);
  CASE(R_X86_64_GOTTPOFF);
  CASE(R_X86_64_TPOFF32);
  CASE(R_X86_64_PC64);
  CASE(R_X86_64_GOTOFF64);
  CASE(R_X86_64_GOTPC32);
  CASE(R_X86_64_GOT64);
  CASE(R_X86_64_GOTPCREL64);
  CASE(R_X86_64_GOTPC64);
  CASE(R_X86_64_GOTPLT64);
  CASE(R_X86_64_PLTOFF64);
  CASE(R_X86_64_SIZE32);
  CASE(R_X86_64_SIZE64);
  CASE(R_X86_64_GOTPC32_TLSDESC);
  CASE(R_X86_64_TLSDESC_CALL);
  CASE(R_X86_64_TLSDESC);
  CASE(R_X86_64_IRELATIVE);
  CASE(R_X86_64_GOTPCRELX);
  CASE(R_X86_64_REX_GOTPCRELX);
  }
#undef CASE
  return "unknown (" + std::to_string(r_type) + ")";
}
template <>
std::string rel_to_string<I386>(u32 r_type) {
  // Map an i386 relocation type to its symbolic name for diagnostics.
  // The stringized token is always identical to the case label.
#define CASE(x) case x: return #x
  switch (r_type) {
  CASE(R_386_NONE);
  CASE(R_386_32);
  CASE(R_386_PC32);
  CASE(R_386_GOT32);
  CASE(R_386_PLT32);
  CASE(R_386_COPY);
  CASE(R_386_GLOB_DAT);
  CASE(R_386_JUMP_SLOT);
  CASE(R_386_RELATIVE);
  CASE(R_386_GOTOFF);
  CASE(R_386_GOTPC);
  CASE(R_386_32PLT);
  CASE(R_386_TLS_TPOFF);
  CASE(R_386_TLS_IE);
  CASE(R_386_TLS_GOTIE);
  CASE(R_386_TLS_LE);
  CASE(R_386_TLS_GD);
  CASE(R_386_TLS_LDM);
  CASE(R_386_16);
  CASE(R_386_PC16);
  CASE(R_386_8);
  CASE(R_386_PC8);
  CASE(R_386_TLS_GD_32);
  CASE(R_386_TLS_GD_PUSH);
  CASE(R_386_TLS_GD_CALL);
  CASE(R_386_TLS_GD_POP);
  CASE(R_386_TLS_LDM_32);
  CASE(R_386_TLS_LDM_PUSH);
  CASE(R_386_TLS_LDM_CALL);
  CASE(R_386_TLS_LDM_POP);
  CASE(R_386_TLS_LDO_32);
  CASE(R_386_TLS_IE_32);
  CASE(R_386_TLS_LE_32);
  CASE(R_386_TLS_DTPMOD32);
  CASE(R_386_TLS_DTPOFF32);
  CASE(R_386_TLS_TPOFF32);
  CASE(R_386_SIZE32);
  CASE(R_386_TLS_GOTDESC);
  CASE(R_386_TLS_DESC_CALL);
  CASE(R_386_TLS_DESC);
  CASE(R_386_IRELATIVE);
  CASE(R_386_GOT32X);
  }
#undef CASE
  return "unknown (" + std::to_string(r_type) + ")";
}
template <>
std::string rel_to_string<ARM64>(u32 r_type) {
  // Map an AArch64 relocation type to its symbolic name for diagnostics.
  // The stringized token is always identical to the case label.
#define CASE(x) case x: return #x
  switch (r_type) {
  CASE(R_AARCH64_NONE);
  CASE(R_AARCH64_ABS64);
  CASE(R_AARCH64_ABS32);
  CASE(R_AARCH64_ABS16);
  CASE(R_AARCH64_PREL64);
  CASE(R_AARCH64_PREL32);
  CASE(R_AARCH64_PREL16);
  CASE(R_AARCH64_MOVW_UABS_G0);
  CASE(R_AARCH64_MOVW_UABS_G0_NC);
  CASE(R_AARCH64_MOVW_UABS_G1);
  CASE(R_AARCH64_MOVW_UABS_G1_NC);
  CASE(R_AARCH64_MOVW_UABS_G2);
  CASE(R_AARCH64_MOVW_UABS_G2_NC);
  CASE(R_AARCH64_MOVW_UABS_G3);
  CASE(R_AARCH64_MOVW_SABS_G0);
  CASE(R_AARCH64_MOVW_SABS_G1);
  CASE(R_AARCH64_MOVW_SABS_G2);
  CASE(R_AARCH64_LD_PREL_LO19);
  CASE(R_AARCH64_ADR_PREL_LO21);
  CASE(R_AARCH64_ADR_PREL_PG_HI21);
  CASE(R_AARCH64_ADR_PREL_PG_HI21_NC);
  CASE(R_AARCH64_ADD_ABS_LO12_NC);
  CASE(R_AARCH64_LDST8_ABS_LO12_NC);
  CASE(R_AARCH64_TSTBR14);
  CASE(R_AARCH64_CONDBR19);
  CASE(R_AARCH64_JUMP26);
  CASE(R_AARCH64_CALL26);
  CASE(R_AARCH64_LDST16_ABS_LO12_NC);
  CASE(R_AARCH64_LDST32_ABS_LO12_NC);
  CASE(R_AARCH64_LDST64_ABS_LO12_NC);
  CASE(R_AARCH64_MOVW_PREL_G0);
  CASE(R_AARCH64_MOVW_PREL_G0_NC);
  CASE(R_AARCH64_MOVW_PREL_G1);
  CASE(R_AARCH64_MOVW_PREL_G1_NC);
  CASE(R_AARCH64_MOVW_PREL_G2);
  CASE(R_AARCH64_MOVW_PREL_G2_NC);
  CASE(R_AARCH64_MOVW_PREL_G3);
  CASE(R_AARCH64_LDST128_ABS_LO12_NC);
  CASE(R_AARCH64_ADR_GOT_PAGE);
  CASE(R_AARCH64_LD64_GOT_LO12_NC);
  CASE(R_AARCH64_LD64_GOTPAGE_LO15);
  CASE(R_AARCH64_PLT32);
  CASE(R_AARCH64_TLSGD_ADR_PREL21);
  CASE(R_AARCH64_TLSGD_ADR_PAGE21);
  CASE(R_AARCH64_TLSGD_ADD_LO12_NC);
  CASE(R_AARCH64_TLSGD_MOVW_G1);
  CASE(R_AARCH64_TLSGD_MOVW_G0_NC);
  CASE(R_AARCH64_TLSLD_ADR_PREL21);
  CASE(R_AARCH64_TLSLD_ADR_PAGE21);
  CASE(R_AARCH64_TLSLD_ADD_LO12_NC);
  CASE(R_AARCH64_TLSLD_MOVW_G1);
  CASE(R_AARCH64_TLSLD_MOVW_G0_NC);
  CASE(R_AARCH64_TLSLD_LD_PREL19);
  CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G2);
  CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G1);
  CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC);
  CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G0);
  CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC);
  CASE(R_AARCH64_TLSLD_ADD_DTPREL_HI12);
  CASE(R_AARCH64_TLSLD_ADD_DTPREL_LO12);
  CASE(R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC);
  CASE(R_AARCH64_TLSLD_LDST8_DTPREL_LO12);
  CASE(R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC);
  CASE(R_AARCH64_TLSLD_LDST16_DTPREL_LO12);
  CASE(R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC);
  CASE(R_AARCH64_TLSLD_LDST32_DTPREL_LO12);
  CASE(R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC);
  CASE(R_AARCH64_TLSLD_LDST64_DTPREL_LO12);
  CASE(R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC);
  CASE(R_AARCH64_TLSIE_MOVW_GOTTPREL_G1);
  CASE(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC);
  CASE(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21);
  CASE(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC);
  CASE(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19);
  CASE(R_AARCH64_TLSLE_MOVW_TPREL_G2);
  CASE(R_AARCH64_TLSLE_MOVW_TPREL_G1);
  CASE(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC);
  CASE(R_AARCH64_TLSLE_MOVW_TPREL_G0);
  CASE(R_AARCH64_TLSLE_MOVW_TPREL_G0_NC);
  CASE(R_AARCH64_TLSLE_ADD_TPREL_HI12);
  CASE(R_AARCH64_TLSLE_ADD_TPREL_LO12);
  CASE(R_AARCH64_TLSLE_ADD_TPREL_LO12_NC);
  CASE(R_AARCH64_TLSLE_LDST8_TPREL_LO12);
  CASE(R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC);
  CASE(R_AARCH64_TLSLE_LDST16_TPREL_LO12);
  CASE(R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC);
  CASE(R_AARCH64_TLSLE_LDST32_TPREL_LO12);
  CASE(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC);
  CASE(R_AARCH64_TLSLE_LDST64_TPREL_LO12);
  CASE(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC);
  CASE(R_AARCH64_TLSDESC_ADR_PAGE21);
  CASE(R_AARCH64_TLSDESC_LD64_LO12);
  CASE(R_AARCH64_TLSDESC_ADD_LO12);
  CASE(R_AARCH64_TLSDESC_CALL);
  CASE(R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC);
  CASE(R_AARCH64_COPY);
  CASE(R_AARCH64_GLOB_DAT);
  CASE(R_AARCH64_JUMP_SLOT);
  CASE(R_AARCH64_RELATIVE);
  CASE(R_AARCH64_TLS_DTPMOD64);
  CASE(R_AARCH64_TLS_DTPREL64);
  CASE(R_AARCH64_TLS_TPREL64);
  CASE(R_AARCH64_TLSDESC);
  CASE(R_AARCH64_IRELATIVE);
  }
#undef CASE
  return "unknown (" + std::to_string(r_type) + ")";
}
template <>
std::string rel_to_string<ARM32>(u32 r_type) {
  // Map an ARM32 relocation type to its symbolic name for diagnostics.
  // The stringized token is always identical to the case label.
#define CASE(x) case x: return #x
  switch (r_type) {
  CASE(R_ARM_NONE);
  CASE(R_ARM_PC24);
  CASE(R_ARM_ABS32);
  CASE(R_ARM_REL32);
  CASE(R_ARM_LDR_PC_G0);
  CASE(R_ARM_ABS16);
  CASE(R_ARM_ABS12);
  CASE(R_ARM_THM_ABS5);
  CASE(R_ARM_ABS8);
  CASE(R_ARM_SBREL32);
  CASE(R_ARM_THM_CALL);
  CASE(R_ARM_THM_PC8);
  CASE(R_ARM_BREL_ADJ);
  CASE(R_ARM_TLS_DESC);
  CASE(R_ARM_THM_SWI8);
  CASE(R_ARM_XPC25);
  CASE(R_ARM_THM_XPC22);
  CASE(R_ARM_TLS_DTPMOD32);
  CASE(R_ARM_TLS_DTPOFF32);
  CASE(R_ARM_TLS_TPOFF32);
  CASE(R_ARM_COPY);
  CASE(R_ARM_GLOB_DAT);
  CASE(R_ARM_JUMP_SLOT);
  CASE(R_ARM_RELATIVE);
  CASE(R_ARM_GOTOFF32);
  CASE(R_ARM_BASE_PREL);
  CASE(R_ARM_GOT_BREL);
  CASE(R_ARM_PLT32);
  CASE(R_ARM_CALL);
  CASE(R_ARM_JUMP24);
  CASE(R_ARM_THM_JUMP24);
  CASE(R_ARM_BASE_ABS);
  CASE(R_ARM_ALU_PCREL_7_0);
  CASE(R_ARM_ALU_PCREL_15_8);
  CASE(R_ARM_ALU_PCREL_23_15);
  CASE(R_ARM_LDR_SBREL_11_0_NC);
  CASE(R_ARM_ALU_SBREL_19_12_NC);
  CASE(R_ARM_ALU_SBREL_27_20_CK);
  CASE(R_ARM_TARGET1);
  CASE(R_ARM_SBREL31);
  CASE(R_ARM_V4BX);
  CASE(R_ARM_TARGET2);
  CASE(R_ARM_PREL31);
  CASE(R_ARM_MOVW_ABS_NC);
  CASE(R_ARM_MOVT_ABS);
  CASE(R_ARM_MOVW_PREL_NC);
  CASE(R_ARM_MOVT_PREL);
  CASE(R_ARM_THM_MOVW_ABS_NC);
  CASE(R_ARM_THM_MOVT_ABS);
  CASE(R_ARM_THM_MOVW_PREL_NC);
  CASE(R_ARM_THM_MOVT_PREL);
  CASE(R_ARM_THM_JUMP19);
  CASE(R_ARM_THM_JUMP6);
  CASE(R_ARM_THM_ALU_PREL_11_0);
  CASE(R_ARM_THM_PC12);
  CASE(R_ARM_ABS32_NOI);
  CASE(R_ARM_REL32_NOI);
  CASE(R_ARM_ALU_PC_G0_NC);
  CASE(R_ARM_ALU_PC_G0);
  CASE(R_ARM_ALU_PC_G1_NC);
  CASE(R_ARM_ALU_PC_G1);
  CASE(R_ARM_ALU_PC_G2);
  CASE(R_ARM_LDR_PC_G1);
  CASE(R_ARM_LDR_PC_G2);
  CASE(R_ARM_LDRS_PC_G0);
  CASE(R_ARM_LDRS_PC_G1);
  CASE(R_ARM_LDRS_PC_G2);
  CASE(R_ARM_LDC_PC_G0);
  CASE(R_ARM_LDC_PC_G1);
  CASE(R_ARM_LDC_PC_G2);
  CASE(R_ARM_ALU_SB_G0_NC);
  CASE(R_ARM_ALU_SB_G0);
  CASE(R_ARM_ALU_SB_G1_NC);
  CASE(R_ARM_ALU_SB_G1);
  CASE(R_ARM_ALU_SB_G2);
  CASE(R_ARM_LDR_SB_G0);
  CASE(R_ARM_LDR_SB_G1);
  CASE(R_ARM_LDR_SB_G2);
  CASE(R_ARM_LDRS_SB_G0);
  CASE(R_ARM_LDRS_SB_G1);
  CASE(R_ARM_LDRS_SB_G2);
  CASE(R_ARM_LDC_SB_G0);
  CASE(R_ARM_LDC_SB_G1);
  CASE(R_ARM_LDC_SB_G2);
  CASE(R_ARM_MOVW_BREL_NC);
  CASE(R_ARM_MOVT_BREL);
  CASE(R_ARM_MOVW_BREL);
  CASE(R_ARM_THM_MOVW_BREL_NC);
  CASE(R_ARM_THM_MOVT_BREL);
  CASE(R_ARM_THM_MOVW_BREL);
  CASE(R_ARM_TLS_GOTDESC);
  CASE(R_ARM_TLS_CALL);
  CASE(R_ARM_TLS_DESCSEQ);
  CASE(R_ARM_THM_TLS_CALL);
  CASE(R_ARM_PLT32_ABS);
  CASE(R_ARM_GOT_ABS);
  CASE(R_ARM_GOT_PREL);
  CASE(R_ARM_GOT_BREL12);
  CASE(R_ARM_GOTOFF12);
  CASE(R_ARM_GOTRELAX);
  CASE(R_ARM_GNU_VTENTRY);
  CASE(R_ARM_GNU_VTINHERIT);
  CASE(R_ARM_THM_JUMP11);
  CASE(R_ARM_THM_JUMP8);
  CASE(R_ARM_TLS_GD32);
  CASE(R_ARM_TLS_LDM32);
  CASE(R_ARM_TLS_LDO32);
  CASE(R_ARM_TLS_IE32);
  CASE(R_ARM_TLS_LE32);
  CASE(R_ARM_TLS_LDO12);
  CASE(R_ARM_TLS_LE12);
  CASE(R_ARM_TLS_IE12GP);
  CASE(R_ARM_PRIVATE_0);
  CASE(R_ARM_PRIVATE_1);
  CASE(R_ARM_PRIVATE_2);
  CASE(R_ARM_PRIVATE_3);
  CASE(R_ARM_PRIVATE_4);
  CASE(R_ARM_PRIVATE_5);
  CASE(R_ARM_PRIVATE_6);
  CASE(R_ARM_PRIVATE_7);
  CASE(R_ARM_PRIVATE_8);
  CASE(R_ARM_PRIVATE_9);
  CASE(R_ARM_PRIVATE_10);
  CASE(R_ARM_PRIVATE_11);
  CASE(R_ARM_PRIVATE_12);
  CASE(R_ARM_PRIVATE_13);
  CASE(R_ARM_PRIVATE_14);
  CASE(R_ARM_PRIVATE_15);
  CASE(R_ARM_ME_TOO);
  CASE(R_ARM_THM_TLS_DESCSEQ16);
  CASE(R_ARM_THM_TLS_DESCSEQ32);
  CASE(R_ARM_THM_BF16);
  CASE(R_ARM_THM_BF12);
  CASE(R_ARM_THM_BF18);
  CASE(R_ARM_IRELATIVE);
  }
#undef CASE
  return "unknown (" + std::to_string(r_type) + ")";
}
// Returns the symbolic name of a RISC-V relocation type for use in
// diagnostics. Unrecognized values are rendered as "unknown (<n>)".
template <>
std::string rel_to_string<RV64LE>(u32 r_type) {
#define REL_NAME(x) case x: return #x;
  switch (r_type) {
  REL_NAME(R_RISCV_NONE)
  REL_NAME(R_RISCV_32)
  REL_NAME(R_RISCV_64)
  REL_NAME(R_RISCV_RELATIVE)
  REL_NAME(R_RISCV_COPY)
  REL_NAME(R_RISCV_JUMP_SLOT)
  REL_NAME(R_RISCV_TLS_DTPMOD32)
  REL_NAME(R_RISCV_TLS_DTPMOD64)
  REL_NAME(R_RISCV_TLS_DTPREL32)
  REL_NAME(R_RISCV_TLS_DTPREL64)
  REL_NAME(R_RISCV_TLS_TPREL32)
  REL_NAME(R_RISCV_TLS_TPREL64)
  REL_NAME(R_RISCV_BRANCH)
  REL_NAME(R_RISCV_JAL)
  REL_NAME(R_RISCV_CALL)
  REL_NAME(R_RISCV_CALL_PLT)
  REL_NAME(R_RISCV_GOT_HI20)
  REL_NAME(R_RISCV_TLS_GOT_HI20)
  REL_NAME(R_RISCV_TLS_GD_HI20)
  REL_NAME(R_RISCV_PCREL_HI20)
  REL_NAME(R_RISCV_PCREL_LO12_I)
  REL_NAME(R_RISCV_PCREL_LO12_S)
  REL_NAME(R_RISCV_HI20)
  REL_NAME(R_RISCV_LO12_I)
  REL_NAME(R_RISCV_LO12_S)
  REL_NAME(R_RISCV_TPREL_HI20)
  REL_NAME(R_RISCV_TPREL_LO12_I)
  REL_NAME(R_RISCV_TPREL_LO12_S)
  REL_NAME(R_RISCV_TPREL_ADD)
  REL_NAME(R_RISCV_ADD8)
  REL_NAME(R_RISCV_ADD16)
  REL_NAME(R_RISCV_ADD32)
  REL_NAME(R_RISCV_ADD64)
  REL_NAME(R_RISCV_SUB8)
  REL_NAME(R_RISCV_SUB16)
  REL_NAME(R_RISCV_SUB32)
  REL_NAME(R_RISCV_SUB64)
  REL_NAME(R_RISCV_ALIGN)
  REL_NAME(R_RISCV_RVC_BRANCH)
  REL_NAME(R_RISCV_RVC_JUMP)
  REL_NAME(R_RISCV_RVC_LUI)
  REL_NAME(R_RISCV_RELAX)
  REL_NAME(R_RISCV_SUB6)
  REL_NAME(R_RISCV_SET6)
  REL_NAME(R_RISCV_SET8)
  REL_NAME(R_RISCV_SET16)
  REL_NAME(R_RISCV_SET32)
  REL_NAME(R_RISCV_32_PCREL)
  REL_NAME(R_RISCV_IRELATIVE)
  REL_NAME(R_RISCV_PLT32)
  REL_NAME(R_RISCV_SET_ULEB128)
  REL_NAME(R_RISCV_SUB_ULEB128)
  }
#undef REL_NAME
  return "unknown (" + std::to_string(r_type) + ")";
}
// All RISC-V targets (32/64-bit, either endianness) share the same
// relocation numbering, so the RV64LE table is reused here.
template <>
std::string rel_to_string<RV64BE>(u32 r_type) {
return rel_to_string<RV64LE>(r_type);
}
// RV32 uses the same relocation numbering as RV64; reuse its table.
template <>
std::string rel_to_string<RV32LE>(u32 r_type) {
return rel_to_string<RV64LE>(r_type);
}
// Big-endian RV32 also shares the common RISC-V relocation numbering.
template <>
std::string rel_to_string<RV32BE>(u32 r_type) {
return rel_to_string<RV64LE>(r_type);
}
// Returns the symbolic name of a 32-bit PowerPC relocation type.
// Unrecognized values are rendered as "unknown (<n>)".
template <>
std::string rel_to_string<PPC32>(u32 r_type) {
#define REL_NAME(x) case x: return #x;
  switch (r_type) {
  REL_NAME(R_PPC_NONE)
  REL_NAME(R_PPC_ADDR32)
  REL_NAME(R_PPC_ADDR24)
  REL_NAME(R_PPC_ADDR16)
  REL_NAME(R_PPC_ADDR16_LO)
  REL_NAME(R_PPC_ADDR16_HI)
  REL_NAME(R_PPC_ADDR16_HA)
  REL_NAME(R_PPC_ADDR14)
  REL_NAME(R_PPC_ADDR14_BRTAKEN)
  REL_NAME(R_PPC_ADDR14_BRNTAKEN)
  REL_NAME(R_PPC_REL24)
  REL_NAME(R_PPC_REL14)
  REL_NAME(R_PPC_REL14_BRTAKEN)
  REL_NAME(R_PPC_REL14_BRNTAKEN)
  REL_NAME(R_PPC_GOT16)
  REL_NAME(R_PPC_GOT16_LO)
  REL_NAME(R_PPC_GOT16_HI)
  REL_NAME(R_PPC_GOT16_HA)
  REL_NAME(R_PPC_PLTREL24)
  REL_NAME(R_PPC_COPY)
  REL_NAME(R_PPC_GLOB_DAT)
  REL_NAME(R_PPC_JMP_SLOT)
  REL_NAME(R_PPC_RELATIVE)
  REL_NAME(R_PPC_LOCAL24PC)
  REL_NAME(R_PPC_UADDR32)
  REL_NAME(R_PPC_UADDR16)
  REL_NAME(R_PPC_REL32)
  REL_NAME(R_PPC_PLT32)
  REL_NAME(R_PPC_PLTREL32)
  REL_NAME(R_PPC_PLT16_LO)
  REL_NAME(R_PPC_PLT16_HI)
  REL_NAME(R_PPC_PLT16_HA)
  REL_NAME(R_PPC_SDAREL16)
  REL_NAME(R_PPC_SECTOFF)
  REL_NAME(R_PPC_SECTOFF_LO)
  REL_NAME(R_PPC_SECTOFF_HI)
  REL_NAME(R_PPC_SECTOFF_HA)
  REL_NAME(R_PPC_ADDR30)
  REL_NAME(R_PPC_TLS)
  REL_NAME(R_PPC_DTPMOD32)
  REL_NAME(R_PPC_TPREL16)
  REL_NAME(R_PPC_TPREL16_LO)
  REL_NAME(R_PPC_TPREL16_HI)
  REL_NAME(R_PPC_TPREL16_HA)
  REL_NAME(R_PPC_TPREL32)
  REL_NAME(R_PPC_DTPREL16)
  REL_NAME(R_PPC_DTPREL16_LO)
  REL_NAME(R_PPC_DTPREL16_HI)
  REL_NAME(R_PPC_DTPREL16_HA)
  REL_NAME(R_PPC_DTPREL32)
  REL_NAME(R_PPC_GOT_TLSGD16)
  REL_NAME(R_PPC_GOT_TLSGD16_LO)
  REL_NAME(R_PPC_GOT_TLSGD16_HI)
  REL_NAME(R_PPC_GOT_TLSGD16_HA)
  REL_NAME(R_PPC_GOT_TLSLD16)
  REL_NAME(R_PPC_GOT_TLSLD16_LO)
  REL_NAME(R_PPC_GOT_TLSLD16_HI)
  REL_NAME(R_PPC_GOT_TLSLD16_HA)
  REL_NAME(R_PPC_GOT_TPREL16)
  REL_NAME(R_PPC_GOT_TPREL16_LO)
  REL_NAME(R_PPC_GOT_TPREL16_HI)
  REL_NAME(R_PPC_GOT_TPREL16_HA)
  REL_NAME(R_PPC_GOT_DTPREL16)
  REL_NAME(R_PPC_GOT_DTPREL16_LO)
  REL_NAME(R_PPC_GOT_DTPREL16_HI)
  REL_NAME(R_PPC_GOT_DTPREL16_HA)
  REL_NAME(R_PPC_TLSGD)
  REL_NAME(R_PPC_TLSLD)
  REL_NAME(R_PPC_PLTSEQ)
  REL_NAME(R_PPC_PLTCALL)
  REL_NAME(R_PPC_IRELATIVE)
  REL_NAME(R_PPC_REL16)
  REL_NAME(R_PPC_REL16_LO)
  REL_NAME(R_PPC_REL16_HI)
  REL_NAME(R_PPC_REL16_HA)
  }
#undef REL_NAME
  return "unknown (" + std::to_string(r_type) + ")";
}
// Returns the symbolic name of a 64-bit PowerPC relocation type.
// Unrecognized values are rendered as "unknown (<n>)".
template <>
std::string rel_to_string<PPC64V1>(u32 r_type) {
#define REL_NAME(x) case x: return #x;
  switch (r_type) {
  REL_NAME(R_PPC64_NONE)
  REL_NAME(R_PPC64_ADDR32)
  REL_NAME(R_PPC64_ADDR24)
  REL_NAME(R_PPC64_ADDR16)
  REL_NAME(R_PPC64_ADDR16_LO)
  REL_NAME(R_PPC64_ADDR16_HI)
  REL_NAME(R_PPC64_ADDR16_HA)
  REL_NAME(R_PPC64_ADDR14)
  REL_NAME(R_PPC64_ADDR14_BRTAKEN)
  REL_NAME(R_PPC64_ADDR14_BRNTAKEN)
  REL_NAME(R_PPC64_REL24)
  REL_NAME(R_PPC64_REL14)
  REL_NAME(R_PPC64_REL14_BRTAKEN)
  REL_NAME(R_PPC64_REL14_BRNTAKEN)
  REL_NAME(R_PPC64_GOT16)
  REL_NAME(R_PPC64_GOT16_LO)
  REL_NAME(R_PPC64_GOT16_HI)
  REL_NAME(R_PPC64_GOT16_HA)
  REL_NAME(R_PPC64_COPY)
  REL_NAME(R_PPC64_GLOB_DAT)
  REL_NAME(R_PPC64_JMP_SLOT)
  REL_NAME(R_PPC64_RELATIVE)
  REL_NAME(R_PPC64_REL32)
  REL_NAME(R_PPC64_PLT16_LO)
  REL_NAME(R_PPC64_PLT16_HI)
  REL_NAME(R_PPC64_PLT16_HA)
  REL_NAME(R_PPC64_ADDR64)
  REL_NAME(R_PPC64_ADDR16_HIGHER)
  REL_NAME(R_PPC64_ADDR16_HIGHERA)
  REL_NAME(R_PPC64_ADDR16_HIGHEST)
  REL_NAME(R_PPC64_ADDR16_HIGHESTA)
  REL_NAME(R_PPC64_REL64)
  REL_NAME(R_PPC64_TOC16)
  REL_NAME(R_PPC64_TOC16_LO)
  REL_NAME(R_PPC64_TOC16_HI)
  REL_NAME(R_PPC64_TOC16_HA)
  REL_NAME(R_PPC64_TOC)
  REL_NAME(R_PPC64_ADDR16_DS)
  REL_NAME(R_PPC64_ADDR16_LO_DS)
  REL_NAME(R_PPC64_GOT16_DS)
  REL_NAME(R_PPC64_GOT16_LO_DS)
  REL_NAME(R_PPC64_PLT16_LO_DS)
  REL_NAME(R_PPC64_TOC16_DS)
  REL_NAME(R_PPC64_TOC16_LO_DS)
  REL_NAME(R_PPC64_TLS)
  REL_NAME(R_PPC64_DTPMOD64)
  REL_NAME(R_PPC64_TPREL16)
  REL_NAME(R_PPC64_TPREL16_LO)
  REL_NAME(R_PPC64_TPREL16_HI)
  REL_NAME(R_PPC64_TPREL16_HA)
  REL_NAME(R_PPC64_TPREL64)
  REL_NAME(R_PPC64_DTPREL16)
  REL_NAME(R_PPC64_DTPREL16_LO)
  REL_NAME(R_PPC64_DTPREL16_HI)
  REL_NAME(R_PPC64_DTPREL16_HA)
  REL_NAME(R_PPC64_DTPREL64)
  REL_NAME(R_PPC64_GOT_TLSGD16)
  REL_NAME(R_PPC64_GOT_TLSGD16_LO)
  REL_NAME(R_PPC64_GOT_TLSGD16_HI)
  REL_NAME(R_PPC64_GOT_TLSGD16_HA)
  REL_NAME(R_PPC64_GOT_TLSLD16)
  REL_NAME(R_PPC64_GOT_TLSLD16_LO)
  REL_NAME(R_PPC64_GOT_TLSLD16_HI)
  REL_NAME(R_PPC64_GOT_TLSLD16_HA)
  REL_NAME(R_PPC64_GOT_TPREL16_DS)
  REL_NAME(R_PPC64_GOT_TPREL16_LO_DS)
  REL_NAME(R_PPC64_GOT_TPREL16_HI)
  REL_NAME(R_PPC64_GOT_TPREL16_HA)
  REL_NAME(R_PPC64_GOT_DTPREL16_DS)
  REL_NAME(R_PPC64_GOT_DTPREL16_LO_DS)
  REL_NAME(R_PPC64_GOT_DTPREL16_HI)
  REL_NAME(R_PPC64_GOT_DTPREL16_HA)
  REL_NAME(R_PPC64_TPREL16_DS)
  REL_NAME(R_PPC64_TPREL16_LO_DS)
  REL_NAME(R_PPC64_TPREL16_HIGHER)
  REL_NAME(R_PPC64_TPREL16_HIGHERA)
  REL_NAME(R_PPC64_TPREL16_HIGHEST)
  REL_NAME(R_PPC64_TPREL16_HIGHESTA)
  REL_NAME(R_PPC64_DTPREL16_DS)
  REL_NAME(R_PPC64_DTPREL16_LO_DS)
  REL_NAME(R_PPC64_DTPREL16_HIGHER)
  REL_NAME(R_PPC64_DTPREL16_HIGHERA)
  REL_NAME(R_PPC64_DTPREL16_HIGHEST)
  REL_NAME(R_PPC64_DTPREL16_HIGHESTA)
  REL_NAME(R_PPC64_TLSGD)
  REL_NAME(R_PPC64_TLSLD)
  REL_NAME(R_PPC64_ADDR16_HIGH)
  REL_NAME(R_PPC64_ADDR16_HIGHA)
  REL_NAME(R_PPC64_TPREL16_HIGH)
  REL_NAME(R_PPC64_TPREL16_HIGHA)
  REL_NAME(R_PPC64_DTPREL16_HIGH)
  REL_NAME(R_PPC64_DTPREL16_HIGHA)
  REL_NAME(R_PPC64_REL24_NOTOC)
  REL_NAME(R_PPC64_PLTSEQ)
  REL_NAME(R_PPC64_PLTCALL)
  REL_NAME(R_PPC64_PLTSEQ_NOTOC)
  REL_NAME(R_PPC64_PLTCALL_NOTOC)
  REL_NAME(R_PPC64_PCREL_OPT)
  REL_NAME(R_PPC64_PCREL34)
  REL_NAME(R_PPC64_GOT_PCREL34)
  REL_NAME(R_PPC64_PLT_PCREL34)
  REL_NAME(R_PPC64_PLT_PCREL34_NOTOC)
  REL_NAME(R_PPC64_TPREL34)
  REL_NAME(R_PPC64_DTPREL34)
  REL_NAME(R_PPC64_GOT_TLSGD_PCREL34)
  REL_NAME(R_PPC64_GOT_TLSLD_PCREL34)
  REL_NAME(R_PPC64_GOT_TPREL_PCREL34)
  REL_NAME(R_PPC64_IRELATIVE)
  REL_NAME(R_PPC64_REL16)
  REL_NAME(R_PPC64_REL16_LO)
  REL_NAME(R_PPC64_REL16_HI)
  REL_NAME(R_PPC64_REL16_HA)
  }
#undef REL_NAME
  return "unknown (" + std::to_string(r_type) + ")";
}
// ELFv2 (little-endian ppc64) uses the same relocation numbering as
// ELFv1, so the PPC64V1 table is reused.
template <>
std::string rel_to_string<PPC64V2>(u32 r_type) {
return rel_to_string<PPC64V1>(r_type);
}
// Returns the symbolic name of a SPARC relocation type.
// Unrecognized values are rendered as "unknown (<n>)".
template <>
std::string rel_to_string<SPARC64>(u32 r_type) {
#define REL_NAME(x) case x: return #x;
  switch (r_type) {
  REL_NAME(R_SPARC_NONE)
  REL_NAME(R_SPARC_8)
  REL_NAME(R_SPARC_16)
  REL_NAME(R_SPARC_32)
  REL_NAME(R_SPARC_DISP8)
  REL_NAME(R_SPARC_DISP16)
  REL_NAME(R_SPARC_DISP32)
  REL_NAME(R_SPARC_WDISP30)
  REL_NAME(R_SPARC_WDISP22)
  REL_NAME(R_SPARC_HI22)
  REL_NAME(R_SPARC_22)
  REL_NAME(R_SPARC_13)
  REL_NAME(R_SPARC_LO10)
  REL_NAME(R_SPARC_GOT10)
  REL_NAME(R_SPARC_GOT13)
  REL_NAME(R_SPARC_GOT22)
  REL_NAME(R_SPARC_PC10)
  REL_NAME(R_SPARC_PC22)
  REL_NAME(R_SPARC_WPLT30)
  REL_NAME(R_SPARC_COPY)
  REL_NAME(R_SPARC_GLOB_DAT)
  REL_NAME(R_SPARC_JMP_SLOT)
  REL_NAME(R_SPARC_RELATIVE)
  REL_NAME(R_SPARC_UA32)
  REL_NAME(R_SPARC_PLT32)
  REL_NAME(R_SPARC_HIPLT22)
  REL_NAME(R_SPARC_LOPLT10)
  REL_NAME(R_SPARC_PCPLT32)
  REL_NAME(R_SPARC_PCPLT22)
  REL_NAME(R_SPARC_PCPLT10)
  REL_NAME(R_SPARC_10)
  REL_NAME(R_SPARC_11)
  REL_NAME(R_SPARC_64)
  REL_NAME(R_SPARC_OLO10)
  REL_NAME(R_SPARC_HH22)
  REL_NAME(R_SPARC_HM10)
  REL_NAME(R_SPARC_LM22)
  REL_NAME(R_SPARC_PC_HH22)
  REL_NAME(R_SPARC_PC_HM10)
  REL_NAME(R_SPARC_PC_LM22)
  REL_NAME(R_SPARC_WDISP16)
  REL_NAME(R_SPARC_WDISP19)
  REL_NAME(R_SPARC_7)
  REL_NAME(R_SPARC_5)
  REL_NAME(R_SPARC_6)
  REL_NAME(R_SPARC_DISP64)
  REL_NAME(R_SPARC_PLT64)
  REL_NAME(R_SPARC_HIX22)
  REL_NAME(R_SPARC_LOX10)
  REL_NAME(R_SPARC_H44)
  REL_NAME(R_SPARC_M44)
  REL_NAME(R_SPARC_L44)
  REL_NAME(R_SPARC_REGISTER)
  REL_NAME(R_SPARC_UA64)
  REL_NAME(R_SPARC_UA16)
  REL_NAME(R_SPARC_TLS_GD_HI22)
  REL_NAME(R_SPARC_TLS_GD_LO10)
  REL_NAME(R_SPARC_TLS_GD_ADD)
  REL_NAME(R_SPARC_TLS_GD_CALL)
  REL_NAME(R_SPARC_TLS_LDM_HI22)
  REL_NAME(R_SPARC_TLS_LDM_LO10)
  REL_NAME(R_SPARC_TLS_LDM_ADD)
  REL_NAME(R_SPARC_TLS_LDM_CALL)
  REL_NAME(R_SPARC_TLS_LDO_HIX22)
  REL_NAME(R_SPARC_TLS_LDO_LOX10)
  REL_NAME(R_SPARC_TLS_LDO_ADD)
  REL_NAME(R_SPARC_TLS_IE_HI22)
  REL_NAME(R_SPARC_TLS_IE_LO10)
  REL_NAME(R_SPARC_TLS_IE_LD)
  REL_NAME(R_SPARC_TLS_IE_LDX)
  REL_NAME(R_SPARC_TLS_IE_ADD)
  REL_NAME(R_SPARC_TLS_LE_HIX22)
  REL_NAME(R_SPARC_TLS_LE_LOX10)
  REL_NAME(R_SPARC_TLS_DTPMOD32)
  REL_NAME(R_SPARC_TLS_DTPMOD64)
  REL_NAME(R_SPARC_TLS_DTPOFF32)
  REL_NAME(R_SPARC_TLS_DTPOFF64)
  REL_NAME(R_SPARC_TLS_TPOFF32)
  REL_NAME(R_SPARC_TLS_TPOFF64)
  REL_NAME(R_SPARC_GOTDATA_HIX22)
  REL_NAME(R_SPARC_GOTDATA_LOX10)
  REL_NAME(R_SPARC_GOTDATA_OP_HIX22)
  REL_NAME(R_SPARC_GOTDATA_OP_LOX10)
  REL_NAME(R_SPARC_GOTDATA_OP)
  REL_NAME(R_SPARC_IRELATIVE)
  }
#undef REL_NAME
  return "unknown (" + std::to_string(r_type) + ")";
}
// Returns the symbolic name of an s390x relocation type.
// Unrecognized values are rendered as "unknown (<n>)".
template <>
std::string rel_to_string<S390X>(u32 r_type) {
#define REL_NAME(x) case x: return #x;
  switch (r_type) {
  REL_NAME(R_390_NONE)
  REL_NAME(R_390_8)
  REL_NAME(R_390_12)
  REL_NAME(R_390_16)
  REL_NAME(R_390_32)
  REL_NAME(R_390_PC32)
  REL_NAME(R_390_GOT12)
  REL_NAME(R_390_GOT32)
  REL_NAME(R_390_PLT32)
  REL_NAME(R_390_COPY)
  REL_NAME(R_390_GLOB_DAT)
  REL_NAME(R_390_JMP_SLOT)
  REL_NAME(R_390_RELATIVE)
  REL_NAME(R_390_GOTOFF32)
  REL_NAME(R_390_GOTPC)
  REL_NAME(R_390_GOT16)
  REL_NAME(R_390_PC16)
  REL_NAME(R_390_PC16DBL)
  REL_NAME(R_390_PLT16DBL)
  REL_NAME(R_390_PC32DBL)
  REL_NAME(R_390_PLT32DBL)
  REL_NAME(R_390_GOTPCDBL)
  REL_NAME(R_390_64)
  REL_NAME(R_390_PC64)
  REL_NAME(R_390_GOT64)
  REL_NAME(R_390_PLT64)
  REL_NAME(R_390_GOTENT)
  REL_NAME(R_390_GOTOFF16)
  REL_NAME(R_390_GOTOFF64)
  REL_NAME(R_390_GOTPLT12)
  REL_NAME(R_390_GOTPLT16)
  REL_NAME(R_390_GOTPLT32)
  REL_NAME(R_390_GOTPLT64)
  REL_NAME(R_390_GOTPLTENT)
  REL_NAME(R_390_PLTOFF16)
  REL_NAME(R_390_PLTOFF32)
  REL_NAME(R_390_PLTOFF64)
  REL_NAME(R_390_TLS_LOAD)
  REL_NAME(R_390_TLS_GDCALL)
  REL_NAME(R_390_TLS_LDCALL)
  REL_NAME(R_390_TLS_GD32)
  REL_NAME(R_390_TLS_GD64)
  REL_NAME(R_390_TLS_GOTIE12)
  REL_NAME(R_390_TLS_GOTIE32)
  REL_NAME(R_390_TLS_GOTIE64)
  REL_NAME(R_390_TLS_LDM32)
  REL_NAME(R_390_TLS_LDM64)
  REL_NAME(R_390_TLS_IE32)
  REL_NAME(R_390_TLS_IE64)
  REL_NAME(R_390_TLS_IEENT)
  REL_NAME(R_390_TLS_LE32)
  REL_NAME(R_390_TLS_LE64)
  REL_NAME(R_390_TLS_LDO32)
  REL_NAME(R_390_TLS_LDO64)
  REL_NAME(R_390_TLS_DTPMOD)
  REL_NAME(R_390_TLS_DTPOFF)
  REL_NAME(R_390_TLS_TPOFF)
  REL_NAME(R_390_20)
  REL_NAME(R_390_GOT20)
  REL_NAME(R_390_GOTPLT20)
  REL_NAME(R_390_TLS_GOTIE20)
  REL_NAME(R_390_IRELATIVE)
  REL_NAME(R_390_PC12DBL)
  REL_NAME(R_390_PLT12DBL)
  REL_NAME(R_390_PC24DBL)
  REL_NAME(R_390_PLT24DBL)
  }
#undef REL_NAME
  return "unknown (" + std::to_string(r_type) + ")";
}
// Returns the symbolic name of an m68k relocation type.
// Unrecognized values are rendered as "unknown (<n>)".
template <>
std::string rel_to_string<M68K>(u32 r_type) {
#define REL_NAME(x) case x: return #x;
  switch (r_type) {
  REL_NAME(R_68K_NONE)
  REL_NAME(R_68K_32)
  REL_NAME(R_68K_16)
  REL_NAME(R_68K_8)
  REL_NAME(R_68K_PC32)
  REL_NAME(R_68K_PC16)
  REL_NAME(R_68K_PC8)
  REL_NAME(R_68K_GOTPCREL32)
  REL_NAME(R_68K_GOTPCREL16)
  REL_NAME(R_68K_GOTPCREL8)
  REL_NAME(R_68K_GOTOFF32)
  REL_NAME(R_68K_GOTOFF16)
  REL_NAME(R_68K_GOTOFF8)
  REL_NAME(R_68K_PLT32)
  REL_NAME(R_68K_PLT16)
  REL_NAME(R_68K_PLT8)
  REL_NAME(R_68K_PLTOFF32)
  REL_NAME(R_68K_PLTOFF16)
  REL_NAME(R_68K_PLTOFF8)
  REL_NAME(R_68K_COPY)
  REL_NAME(R_68K_GLOB_DAT)
  REL_NAME(R_68K_JMP_SLOT)
  REL_NAME(R_68K_RELATIVE)
  REL_NAME(R_68K_TLS_GD32)
  REL_NAME(R_68K_TLS_GD16)
  REL_NAME(R_68K_TLS_GD8)
  REL_NAME(R_68K_TLS_LDM32)
  REL_NAME(R_68K_TLS_LDM16)
  REL_NAME(R_68K_TLS_LDM8)
  REL_NAME(R_68K_TLS_LDO32)
  REL_NAME(R_68K_TLS_LDO16)
  REL_NAME(R_68K_TLS_LDO8)
  REL_NAME(R_68K_TLS_IE32)
  REL_NAME(R_68K_TLS_IE16)
  REL_NAME(R_68K_TLS_IE8)
  REL_NAME(R_68K_TLS_LE32)
  REL_NAME(R_68K_TLS_LE16)
  REL_NAME(R_68K_TLS_LE8)
  REL_NAME(R_68K_TLS_DTPMOD32)
  REL_NAME(R_68K_TLS_DTPREL32)
  REL_NAME(R_68K_TLS_TPREL32)
  }
#undef REL_NAME
  return "unknown (" + std::to_string(r_type) + ")";
}
// Returns the symbolic name of a SuperH (SH-4) relocation type.
// Unrecognized values are rendered as "unknown (<n>)".
template <>
std::string rel_to_string<SH4>(u32 r_type) {
#define REL_NAME(x) case x: return #x;
  switch (r_type) {
  REL_NAME(R_SH_NONE)
  REL_NAME(R_SH_DIR32)
  REL_NAME(R_SH_REL32)
  REL_NAME(R_SH_DIR8WPN)
  REL_NAME(R_SH_IND12W)
  REL_NAME(R_SH_DIR8WPL)
  REL_NAME(R_SH_DIR8WPZ)
  REL_NAME(R_SH_DIR8BP)
  REL_NAME(R_SH_DIR8W)
  REL_NAME(R_SH_DIR8L)
  REL_NAME(R_SH_TLS_GD_32)
  REL_NAME(R_SH_TLS_LD_32)
  REL_NAME(R_SH_TLS_LDO_32)
  REL_NAME(R_SH_TLS_IE_32)
  REL_NAME(R_SH_TLS_LE_32)
  REL_NAME(R_SH_TLS_DTPMOD32)
  REL_NAME(R_SH_TLS_DTPOFF32)
  REL_NAME(R_SH_TLS_TPOFF32)
  REL_NAME(R_SH_GOT32)
  REL_NAME(R_SH_PLT32)
  REL_NAME(R_SH_COPY)
  REL_NAME(R_SH_GLOB_DAT)
  REL_NAME(R_SH_JMP_SLOT)
  REL_NAME(R_SH_RELATIVE)
  REL_NAME(R_SH_GOTOFF)
  REL_NAME(R_SH_GOTPC)
  REL_NAME(R_SH_GOTPLT32)
  }
#undef REL_NAME
  return "unknown (" + std::to_string(r_type) + ")";
}
// Returns the symbolic name of an Alpha relocation type.
// Unrecognized values are rendered as "unknown (<n>)".
template <>
std::string rel_to_string<ALPHA>(u32 r_type) {
#define REL_NAME(x) case x: return #x;
  switch (r_type) {
  REL_NAME(R_ALPHA_NONE)
  REL_NAME(R_ALPHA_REFLONG)
  REL_NAME(R_ALPHA_REFQUAD)
  REL_NAME(R_ALPHA_GPREL32)
  REL_NAME(R_ALPHA_LITERAL)
  REL_NAME(R_ALPHA_LITUSE)
  REL_NAME(R_ALPHA_GPDISP)
  REL_NAME(R_ALPHA_BRADDR)
  REL_NAME(R_ALPHA_HINT)
  REL_NAME(R_ALPHA_SREL16)
  REL_NAME(R_ALPHA_SREL32)
  REL_NAME(R_ALPHA_SREL64)
  REL_NAME(R_ALPHA_GPRELHIGH)
  REL_NAME(R_ALPHA_GPRELLOW)
  REL_NAME(R_ALPHA_GPREL16)
  REL_NAME(R_ALPHA_COPY)
  REL_NAME(R_ALPHA_GLOB_DAT)
  REL_NAME(R_ALPHA_JMP_SLOT)
  REL_NAME(R_ALPHA_RELATIVE)
  REL_NAME(R_ALPHA_BRSGP)
  REL_NAME(R_ALPHA_TLSGD)
  REL_NAME(R_ALPHA_TLSLDM)
  REL_NAME(R_ALPHA_DTPMOD64)
  REL_NAME(R_ALPHA_GOTDTPREL)
  REL_NAME(R_ALPHA_DTPREL64)
  REL_NAME(R_ALPHA_DTPRELHI)
  REL_NAME(R_ALPHA_DTPRELLO)
  REL_NAME(R_ALPHA_DTPREL16)
  REL_NAME(R_ALPHA_GOTTPREL)
  REL_NAME(R_ALPHA_TPREL64)
  REL_NAME(R_ALPHA_TPRELHI)
  REL_NAME(R_ALPHA_TPRELLO)
  REL_NAME(R_ALPHA_TPREL16)
  }
#undef REL_NAME
  return "unknown (" + std::to_string(r_type) + ")";
}
} // namespace mold::elf

2053
third_party/mold/elf/elf.h vendored Normal file

File diff suppressed because it is too large Load diff

180
third_party/mold/elf/gc-sections.cc vendored Normal file
View file

@ -0,0 +1,180 @@
// clang-format off
// This file implements a mark-sweep garbage collector for -gc-sections.
// In this algorithm, vertices are sections and edges are relocations.
// Any section that is reachable from a root section is considered alive.
#include "third_party/mold/elf/mold.h"
// MISSING #include <tbb/concurrent_vector.h>
// MISSING #include <tbb/parallel_for_each.h>
namespace mold::elf {
template <typename E>
static bool should_keep(const InputSection<E> &isec) {
u32 type = isec.shdr().sh_type;
u32 flags = isec.shdr().sh_flags;
std::string_view name = isec.name();
return (flags & SHF_GNU_RETAIN) ||
type == SHT_NOTE ||
type == SHT_INIT_ARRAY ||
type == SHT_FINI_ARRAY ||
type == SHT_PREINIT_ARRAY ||
(is_arm32<E> && type == SHT_ARM_EXIDX) ||
name.starts_with(".ctors") ||
name.starts_with(".dtors") ||
name.starts_with(".init") ||
name.starts_with(".fini") ||
is_c_identifier(name);
}
// Atomically claims a live section for visiting. Returns true exactly
// once per section: the first time a live, not-yet-visited section is
// passed in. Null or dead sections are never claimed.
template <typename E>
static bool mark_section(InputSection<E> *isec) {
  if (!isec || !isec->is_alive)
    return false;
  return !isec->is_visited.test_and_set();
}
// Traverses the relocation graph from `isec`, which must already be
// marked as visited. Newly discovered sections are either visited
// recursively (only up to a small depth, which also bounds stack
// usage) or handed to the TBB feeder for parallel processing.
template <typename E>
static void visit(Context<E> &ctx, InputSection<E> *isec,
tbb::feeder<InputSection<E> *> &feeder, i64 depth) {
assert(isec->is_visited);
// If this is a text section, .eh_frame may contain records
// describing how to handle exceptions for that function.
// We want to keep associated .eh_frame records.
for (FdeRecord<E> &fde : isec->get_fdes())
for (const ElfRel<E> &rel : fde.get_rels(isec->file).subspan(1))
if (Symbol<E> *sym = isec->file.symbols[rel.r_sym])
if (mark_section(sym->get_input_section()))
feeder.add(sym->get_input_section())
for (const ElfRel<E> &rel : isec->get_rels(ctx)) {
Symbol<E> &sym = *isec->file.symbols[rel.r_sym];
// A symbol can refer either to a section fragment or to an input
// section. Mark a fragment as alive.
if (SectionFragment<E> *frag = sym.get_frag()) {
frag->is_alive = true;
continue;
}
// Mark a section alive. For better performance, we don't call
// `feeder.add` too often.
if (mark_section(sym.get_input_section())) {
if (depth < 3)
visit(ctx, sym.get_input_section(), feeder, depth + 1);
else
feeder.add(sym.get_input_section());
}
}
}
// Collects the initial set of GC roots into `rootset`: sections that
// must be kept regardless of reachability (see should_keep), sections
// containing exported symbols, sections referenced by root symbols
// such as the entry point and --undefined/--require-defined symbols,
// and everything referenced from .eh_frame CIE records.
template <typename E>
static void collect_root_set(Context<E> &ctx,
tbb::concurrent_vector<InputSection<E> *> &rootset) {
Timer t(ctx, "collect_root_set");
// Adds a section to the root set the first time it is seen.
auto enqueue_section = [&](InputSection<E> *isec) {
if (mark_section(isec))
rootset.push_back(isec);
};
// Marks the fragment or section a symbol refers to as a root.
auto enqueue_symbol = [&](Symbol<E> *sym) {
if (sym) {
if (SectionFragment<E> *frag = sym->get_frag())
frag->is_alive = true;
else
enqueue_section(sym->get_input_section());
}
};
// Add sections that are not subject to garbage collection.
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (std::unique_ptr<InputSection<E>> &isec : file->sections) {
if (!isec || !isec->is_alive)
continue;
// --gc-sections discards only SHF_ALLOC sections. If you want to
// reduce the amount of non-memory-mapped segments, you should
// use `strip` command, compile without debug info or use
// --strip-all linker option.
u32 flags = isec->shdr().sh_flags;
if (!(flags & SHF_ALLOC))
isec->is_visited = true;
if (should_keep(*isec))
enqueue_section(isec.get());
}
});
// Add sections containing exported symbols.
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (Symbol<E> *sym : file->symbols)
if (sym->file == file && sym->is_exported)
enqueue_symbol(sym);
});
// Add sections referenced by root symbols.
enqueue_symbol(get_symbol(ctx, ctx.arg.entry));
for (std::string_view name : ctx.arg.undefined)
enqueue_symbol(get_symbol(ctx, name));
for (std::string_view name : ctx.arg.require_defined)
enqueue_symbol(get_symbol(ctx, name));
// .eh_frame consists of variable-length records called CIE and FDE
// records, and they are a unit of inclusion or exclusion.
// We just keep all CIEs and everything that are referenced by them.
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (CieRecord<E> &cie : file->cies)
for (const ElfRel<E> &rel : cie.get_rels())
enqueue_symbol(file->symbols[rel.r_sym]);
});
}
// Mark all reachable sections
template <typename E>
static void mark(Context<E> &ctx,
tbb::concurrent_vector<InputSection<E> *> &rootset) {
Timer t(ctx, "mark");
tbb::parallel_for_each(rootset, [&](InputSection<E> *isec,
tbb::feeder<InputSection<E> *> &feeder) {
visit(ctx, isec, feeder, 0);
});
}
// Remove unreachable sections
template <typename E>
static void sweep(Context<E> &ctx) {
Timer t(ctx, "sweep");
static Counter counter("garbage_sections");
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (std::unique_ptr<InputSection<E>> &isec : file->sections) {
if (isec && isec->is_alive && !isec->is_visited) {
if (ctx.arg.print_gc_sections)
SyncOut(ctx) << "removing unused section " << *isec;
isec->kill();
counter++;
}
}
});
}
// Entry point for --gc-sections: a mark-sweep pass over input sections,
// where relocations are the edges of the reachability graph.
template <typename E>
void gc_sections(Context<E> &ctx) {
  Timer t(ctx, "gc");

  tbb::concurrent_vector<InputSection<E> *> roots;
  collect_root_set(ctx, roots);
  mark(ctx, roots);
  sweep(ctx);
}
// Explicit instantiation for the target selected via the MOLD_TARGET
// macro — presumably this file is compiled once per target; confirm
// against the build setup.
using E = MOLD_TARGET;
template void gc_sections(Context<E> &ctx);
} // namespace mold::elf

615
third_party/mold/elf/icf.cc vendored Normal file
View file

@ -0,0 +1,615 @@
// clang-format off
// This file implements the Identical Code Folding feature which can
// reduce the output file size of a typical program by a few percent.
// ICF identifies read-only input sections that happen to be identical
// and thus can be used interchangeably. ICF leaves one of them and discards
// the others.
//
// ICF is usually used in combination with -ffunction-sections and
// -fdata-sections compiler options, so that object files have one section
// for each function or variable instead of having one large .text or .data.
// The unit of ICF merging is section.
//
// Two sections are considered identical by ICF if they have the exact
// same contents, metadata such as section flags, exception handling
// records, and relocations. The last one is interesting because two
// relocations are considered identical if they point to the _same_
// section in terms of ICF.
//
// To see what that means, consider two sections, A and B, which are
// identical except for one pair of relocations. Say, A has a relocation to
// section C, and B has a relocation to D. In this case, A and B are
// considered identical if C and D are considered identical. C and D can be
// either really the same section or two different sections that are
// considered identical by ICF. Below is an example of such inputs, A, B, C
// and D:
//
// void A() { C(); }
// void B() { D(); }
// void C() { A(); }
// void D() { B(); }
//
// If we assume A and B are mergeable, we can merge C and D, which makes A
// and B mergeable. There's no contradiction in our assumption, so we can
// conclude that A and B as well as C and D are mergeable.
//
// This problem boils down to one in graph theory. Input to ICF can be
// considered as a directed graph in which vertices are sections and edges
// are relocations. Vertices have labels (section contents, etc.), and so
// are edges (relocation offsets, etc.). Two vertices are considered
// identical if and only if their (possibly infinite) unfoldings into
// regular trees are equal. Given this formulation, we want to find as
// many identical vertices as possible.
//
// Just like a lot of problems with graphs, this problem doesn't have a
// straightforward "optimal" solution, and we need to resort to heuristics.
//
// mold approaches this problem by hashing program trees with increasing depth
// on each iteration.
// For example, when we start, we only hash individual functions with
// their call into other functions omitted. From the second iteration, we
// put the function they call into the hash by appending the hash of those
// functions from the previous iteration. This means that the nth iteration
// hashes call chain up to (n-1) levels deep.
// We use a cryptographic hash function, so the unique number of hashes will
// only monotonically increase as we take deeper trees into account with
// iterations (otherwise, that means we have found a hash collision). We stop
// when the unique number of hashes stop increasing; this is based on the fact
// that once we observe an iteration with the same amount of unique hashes as
// the previous iteration, it will remain unchanged for further iterations.
// This is provable, but here we omit the proof for brevity.
//
// When compared to other approaches, mold's approach has a relatively cheaper
// cost per iteration, and as a bonus, is highly parallelizable.
// For Chromium, mold's ICF finishes in less than 1 second with 20 threads,
// whereas lld takes 5 seconds and gold takes 50 seconds under the same
// conditions.
#include "third_party/mold/elf/mold.h"
// MISSING #include "../common/sha.h"
#include "third_party/libcxx/array"
#include "third_party/libcxx/cstdio"
// MISSING #include <tbb/concurrent_unordered_map.h>
// MISSING #include <tbb/concurrent_vector.h>
// MISSING #include <tbb/enumerable_thread_specific.h>
// MISSING #include <tbb/parallel_for.h>
// MISSING #include <tbb/parallel_for_each.h>
// MISSING #include <tbb/parallel_sort.h>
// ICF digests are truncated SHA-256 hashes: 16 bytes are plenty to
// make accidental collisions practically impossible.
static constexpr int64_t HASH_SIZE = 16;

typedef std::array<uint8_t, HASH_SIZE> Digest;

namespace std {
template<> struct hash<Digest> {
  size_t operator()(const Digest &k) const {
    // The digest bytes are already uniformly distributed, so the
    // first 8 bytes serve directly as the hash value. memcpy avoids
    // the misaligned, strict-aliasing-violating `*(int64_t *)&k[0]`
    // pointer cast (std::array<uint8_t, N> has alignment 1).
    int64_t h;
    memcpy(&h, k.data(), sizeof(h));
    return h;
  }
};
}
namespace mold::elf {
// Assigns an index (`icf_idx`) to each distinct CIE so that FDEs can
// later be compared via small integer indices instead of full CIE
// comparisons. CIEs that compare equal share an index. The quadratic
// scan is fine in practice because there are only a handful of
// distinct CIEs.
template <typename E>
static void uniquify_cies(Context<E> &ctx) {
  Timer t(ctx, "uniquify_cies");
  std::vector<CieRecord<E> *> cies;

  for (ObjectFile<E> *file : ctx.objs) {
    for (CieRecord<E> &cie : file->cies) {
      // Reuse the index of an existing identical CIE, if any.
      for (i64 i = 0; i < cies.size(); i++) {
        if (cie.equals(*cies[i])) {
          cie.icf_idx = i;
          goto found;
        }
      }
      // First occurrence of this CIE; give it a fresh index.
      cie.icf_idx = cies.size();
      cies.push_back(&cie);
    found:;
    }
  }
}
// Returns true if the given section is a candidate for ICF merging:
// allocated, executable (or data with --ignore-data-address-equality),
// read-only, non-empty, not init/fini machinery, and not
// address-significant.
template <typename E>
static bool is_eligible(Context<E> &ctx, InputSection<E> &isec) {
  const ElfShdr<E> &shdr = isec.shdr();
  std::string_view name = isec.name();

  bool is_alloc = (shdr.sh_flags & SHF_ALLOC);
  // Data sections are foldable only when the user opted in.
  bool is_exec = (shdr.sh_flags & SHF_EXECINSTR) ||
                 ctx.arg.ignore_data_address_equality;
  bool is_relro = (name == ".data.rel.ro" ||
                   name.starts_with(".data.rel.ro."));
  // RELRO sections become read-only after startup, so treat them as
  // read-only for ICF purposes.
  bool is_readonly = !(shdr.sh_flags & SHF_WRITE) || is_relro;
  bool is_bss = (shdr.sh_type == SHT_NOBITS);
  bool is_empty = (shdr.sh_size == 0);
  bool is_init = (shdr.sh_type == SHT_INIT_ARRAY || name == ".init");
  bool is_fini = (shdr.sh_type == SHT_FINI_ARRAY || name == ".fini");
  // Sections with C-identifier names get __start_/__stop_ symbols,
  // so folding them could change observable behavior.
  bool is_enumerable = is_c_identifier(name);
  // Unless --icf=all, keep sections whose address was taken.
  bool is_addr_taken = !ctx.arg.icf_all && isec.address_significant;

  return is_alloc && is_exec && is_readonly && !is_bss && !is_empty &&
         !is_init && !is_fini && !is_enumerable && !is_addr_taken;
}
// Finalizes a SHA-256 computation and truncates the 32-byte result
// to the first HASH_SIZE bytes, which is all ICF needs.
static Digest digest_final(SHA256Hash &sha) {
  u8 full[SHA256_SIZE];
  sha.finish(full);

  Digest out;
  memcpy(out.data(), full, HASH_SIZE);
  return out;
}
// A section is a "leaf" of the ICF graph if it has no outgoing
// references: no relocations, and at most the initial (CIE-pointing)
// relocation in each of its FDEs.
template <typename E>
static bool is_leaf(Context<E> &ctx, InputSection<E> &isec) {
  if (!isec.get_rels(ctx).empty())
    return false;

  for (FdeRecord<E> &fde : isec.get_fdes())
    if (fde.get_rels(isec.file).size() > 1)
      return false;

  return true;
}
// Hash functor for leaf sections: combines the hash of the section
// contents with the hashes of its FDE bodies. The first 8 bytes of
// each FDE (record length and CIE offset) are skipped because they
// differ even between otherwise identical records.
template <typename E>
struct LeafHasher {
  size_t operator()(InputSection<E> *isec) const {
    u64 h = hash_string(isec->contents);
    for (FdeRecord<E> &fde : isec->get_fdes()) {
      u64 h2 = hash_string(fde.get_contents(isec->file).substr(8));
      h = combine_hash(h, h2);
    }
    return h;
  }
};
// Equality functor for leaf sections: identical contents and pairwise
// identical FDE bodies (again ignoring each FDE's first 8 bytes).
// Must be consistent with LeafHasher above.
template <typename E>
struct LeafEq {
  bool operator()(InputSection<E> *a, InputSection<E> *b) const {
    if (a->contents != b->contents)
      return false;

    std::span<FdeRecord<E>> x = a->get_fdes();
    std::span<FdeRecord<E>> y = b->get_fdes();
    if (x.size() != y.size())
      return false;

    for (i64 i = 0; i < x.size(); i++)
      if (x[i].get_contents(a->file).substr(8) !=
          y[i].get_contents(b->file).substr(8))
        return false;
    return true;
  }
};
// Early merge of leaf nodes, which can be processed without constructing the
// entire graph. This reduces the vertex count and improves memory efficiency.
// Merges identical leaf sections up front, before the iterative
// propagation. Each group of identical leaves is pointed at a single
// leader (the member with the lowest priority).
template <typename E>
static void merge_leaf_nodes(Context<E> &ctx) {
  Timer t(ctx, "merge_leaf_nodes");

  static Counter eligible("icf_eligibles");
  static Counter non_eligible("icf_non_eligibles");
  static Counter leaf("icf_leaf_nodes");

  // Maps each leaf section to the chosen representative of its
  // equivalence class.
  tbb::concurrent_unordered_map<InputSection<E> *, InputSection<E> *,
                                LeafHasher<E>, LeafEq<E>> map;

  // First pass: classify every live section as leaf, eligible inner
  // node, or ineligible, and register leaves in `map`.
  tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) {
    for (std::unique_ptr<InputSection<E>> &isec : ctx.objs[i]->sections) {
      if (!isec || !isec->is_alive)
        continue;

      if (!is_eligible(ctx, *isec)) {
        non_eligible++;
        continue;
      }

      if (is_leaf(ctx, *isec)) {
        leaf++;
        isec->icf_leaf = true;
        auto [it, inserted] = map.insert({isec.get(), isec.get()});
        // Keep the lowest-priority member as the group leader.
        if (!inserted && isec->get_priority() < it->second->get_priority())
          it->second = isec.get();
      } else {
        eligible++;
        isec->icf_eligible = true;
      }
    }
  });

  // Second pass: point every leaf at its group leader.
  tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) {
    for (std::unique_ptr<InputSection<E>> &isec : ctx.objs[i]->sections) {
      if (isec && isec->is_alive && isec->icf_leaf) {
        auto it = map.find(isec.get());
        assert(it != map.end());
        isec->leader = it->second;
      }
    }
  });
}
// Computes the initial (depth-0) digest of a section: its contents,
// flags, FDEs and relocations, with referenced symbols folded in by
// category. Symbols in not-yet-merged eligible sections contribute a
// placeholder; the propagation rounds later disambiguate them.
template <typename E>
static Digest compute_digest(Context<E> &ctx, InputSection<E> &isec) {
  SHA256Hash sha;

  auto hash = [&](auto val) {
    sha.update((u8 *)&val, sizeof(val));
  };

  // Hash the length as well so concatenations cannot collide.
  auto hash_string = [&](std::string_view str) {
    hash(str.size());
    sha.update((u8 *)str.data(), str.size());
  };

  // Each symbol kind is prefixed with a distinct tag byte so the
  // kinds cannot collide with one another.
  auto hash_symbol = [&](Symbol<E> &sym) {
    InputSection<E> *isec = sym.get_input_section();

    if (!sym.file) {
      // No owning file: identify by the Symbol object itself.
      hash('1');
      hash((u64)&sym);
    } else if (SectionFragment<E> *frag = sym.get_frag()) {
      hash('2');
      hash((u64)frag);
    } else if (!isec) {
      // Absolute-valued symbol; sym.value below distinguishes it.
      hash('3');
    } else if (isec->leader) {
      // Already merged: hash the leader so members of one group agree.
      hash('4');
      hash((u64)isec->leader);
    } else if (isec->icf_eligible) {
      // Eligible but unresolved: placeholder; the edge hashes added
      // by propagate() will tell such targets apart.
      hash('5');
    } else {
      hash('6');
      hash((u64)isec);
    }
    hash(sym.value);
  };

  hash_string(isec.contents);
  hash(isec.shdr().sh_flags);
  hash(isec.get_fdes().size());
  hash(isec.get_rels(ctx).size());

  for (FdeRecord<E> &fde : isec.get_fdes()) {
    hash(isec.file.cies[fde.cie_idx].icf_idx);

    // Bytes 0 to 4 contain the length of this record, and
    // bytes 4 to 8 contain an offset to CIE.
    hash_string(fde.get_contents(isec.file).substr(8));

    hash(fde.get_rels(isec.file).size());

    // subspan(1) skips the first relocation, which points to the CIE.
    for (const ElfRel<E> &rel : fde.get_rels(isec.file).subspan(1)) {
      hash_symbol(*isec.file.symbols[rel.r_sym]);
      hash(rel.r_type);
      hash(rel.r_offset - fde.input_offset);
      hash(get_addend(isec.file.cies[fde.cie_idx].input_section, rel));
    }
  }

  for (i64 i = 0; i < isec.get_rels(ctx).size(); i++) {
    const ElfRel<E> &rel = isec.get_rels(ctx)[i];
    hash(rel.r_offset);
    hash(rel.r_type);
    hash(get_addend(isec, rel));
    hash_symbol(*isec.file.symbols[rel.r_sym]);
  }

  return digest_final(sha);
}
// Collects all ICF-eligible sections into one flat vector and assigns
// each its index (`icf_idx`) into that vector. A counting pass plus a
// prefix sum lets the fill pass run in parallel without locks.
// Note: assumes ctx.objs is non-empty (guaranteed by icf_sections).
template <typename E>
static std::vector<InputSection<E> *> gather_sections(Context<E> &ctx) {
  Timer t(ctx, "gather_sections");

  // Count the number of input sections for each input file.
  std::vector<i64> num_sections(ctx.objs.size());

  tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) {
    for (std::unique_ptr<InputSection<E>> &isec : ctx.objs[i]->sections)
      if (isec && isec->is_alive && isec->icf_eligible)
        num_sections[i]++;
  });

  // Exclusive prefix sum: starting slot for each file's sections.
  std::vector<i64> section_indices(ctx.objs.size());
  for (i64 i = 0; i < ctx.objs.size() - 1; i++)
    section_indices[i + 1] = section_indices[i] + num_sections[i];

  std::vector<InputSection<E> *> sections(
    section_indices.back() + num_sections.back());

  // Fill `sections` contents.
  tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) {
    i64 idx = section_indices[i];
    for (std::unique_ptr<InputSection<E>> &isec : ctx.objs[i]->sections)
      if (isec && isec->is_alive && isec->icf_eligible)
        sections[idx++] = isec.get();
  });

  // Record each section's position in the flat array.
  tbb::parallel_for((i64)0, (i64)sections.size(), [&](i64 i) {
    sections[i]->icf_idx = i;
  });

  return sections;
}
// Computes the depth-0 digest of every section in `sections`, in
// parallel. result[i] is the digest of sections[i].
template <typename E>
static std::vector<Digest>
compute_digests(Context<E> &ctx, std::span<InputSection<E> *> sections) {
  Timer t(ctx, "compute_digests");

  i64 count = (i64)sections.size();
  std::vector<Digest> result(count);
  tbb::parallel_for((i64)0, count, [&](i64 idx) {
    result[idx] = compute_digest(ctx, *sections[idx]);
  });
  return result;
}
// Build a graph, treating every function as a vertex and every function call
// as an edge. See the description at the top for a more detailed formulation.
// We use u32 indices here to improve cache locality.
// Builds the flattened edge array of the ICF graph: `edges` holds the
// icf_idx of every eligible section referenced by each section's
// relocations, and `edge_indices[i]` is where section i's edges start.
template <typename E>
static void gather_edges(Context<E> &ctx,
                         std::span<InputSection<E> *> sections,
                         std::vector<u32> &edges,
                         std::vector<u32> &edge_indices) {
  Timer t(ctx, "gather_edges");

  if (sections.empty())
    return;

  // Counting pass: how many outgoing edges each section has, so that
  // the fill pass below can write into a preallocated flat array.
  std::vector<i64> num_edges(sections.size());
  edge_indices.resize(sections.size());

  tbb::parallel_for((i64)0, (i64)sections.size(), [&](i64 i) {
    InputSection<E> &isec = *sections[i];
    assert(isec.icf_eligible);

    for (i64 j = 0; j < isec.get_rels(ctx).size(); j++) {
      const ElfRel<E> &rel = isec.get_rels(ctx)[j];
      Symbol<E> &sym = *isec.file.symbols[rel.r_sym];
      if (!sym.get_frag())
        if (InputSection<E> *isec = sym.get_input_section())
          if (isec->icf_eligible)
            num_edges[i]++;
    }
  });

  // Exclusive prefix sum over the per-section edge counts.
  for (i64 i = 0; i < num_edges.size() - 1; i++)
    edge_indices[i + 1] = edge_indices[i] + num_edges[i];

  edges.resize(edge_indices.back() + num_edges.back());

  // Fill pass.
  // NOTE(review): unlike the counting pass above, this pass does not
  // skip symbols with `sym.get_frag()`. If a symbol could report both
  // a fragment and an input section, the counts would diverge and
  // overflow `edges` -- confirm that get_frag() and
  // get_input_section() are mutually exclusive on Symbol.
  tbb::parallel_for((i64)0, (i64)num_edges.size(), [&](i64 i) {
    InputSection<E> &isec = *sections[i];
    i64 idx = edge_indices[i];
    for (ElfRel<E> &rel : isec.get_rels(ctx)) {
      Symbol<E> &sym = *isec.file.symbols[rel.r_sym];
      if (InputSection<E> *isec = sym.get_input_section())
        if (isec->icf_eligible)
          edges[idx++] = isec->icf_idx;
    }
  });
}
// Runs one round of hash propagation: each vertex's new hash is the
// hash of its initial digest concatenated with the current hashes of
// every vertex it points to. Returns how many vertices changed in
// this round.
template <typename E>
static i64 propagate(std::span<std::vector<Digest>> digests,
                     std::span<u32> edges, std::span<u32> edge_indices,
                     bool &slot, BitVector &converged,
                     tbb::affinity_partitioner &ap) {
  static Counter round("icf_round");
  round++;

  i64 num_digests = digests[0].size();
  tbb::enumerable_thread_specific<i64> changed;

  tbb::parallel_for((i64)0, num_digests, [&](i64 i) {
    if (converged.get(i))
      return;

    SHA256Hash sha;
    // digests[2] holds the immutable single-vertex (depth-0) hash.
    sha.update(digests[2][i].data(), HASH_SIZE);

    i64 begin = edge_indices[i];
    i64 end = (i + 1 == num_digests) ? edges.size() : edge_indices[i + 1];

    // Fold in the neighbors' hashes from the previous round (`slot`).
    for (i64 j : edges.subspan(begin, end - begin))
      sha.update(digests[slot][j].data(), HASH_SIZE);

    digests[!slot][i] = digest_final(sha);

    if (digests[slot][i] == digests[!slot][i]) {
      // This node has converged. Skip further iterations as it will
      // yield the same hash.
      converged.set(i);
    } else {
      changed.local()++;
    }
  }, ap);

  // Flip which of digests[0]/digests[1] holds the current hashes.
  slot = !slot;
  return changed.combine(std::plus());
}
// Counts distinct hash values in `digests` by sorting a copy and
// counting boundaries between adjacent distinct values. (The result
// is classes - 1 for non-empty input, but only its change between
// rounds matters to the caller.)
template <typename E>
static i64 count_num_classes(std::span<Digest> digests,
                             tbb::affinity_partitioner &ap) {
  std::vector<Digest> vec(digests.begin(), digests.end());
  tbb::parallel_sort(vec);

  tbb::enumerable_thread_specific<i64> num_classes;
  tbb::parallel_for((i64)0, (i64)vec.size() - 1, [&](i64 i) {
    if (vec[i] != vec[i + 1])
      num_classes.local()++;
  }, ap);
  return num_classes.combine(std::plus());
}
// Implements --print-icf-sections: reports, for each ICF group, the
// section that was kept and the identical sections folded into it,
// plus the total number of bytes saved.
template <typename E>
static void print_icf_sections(Context<E> &ctx) {
  tbb::concurrent_vector<InputSection<E> *> leaders;
  tbb::concurrent_unordered_multimap<InputSection<E> *, InputSection<E> *> map;

  // Partition live merged sections into leaders and followers.
  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
    for (std::unique_ptr<InputSection<E>> &isec : file->sections) {
      if (isec && isec->is_alive && isec->leader) {
        if (isec.get() == isec->leader)
          leaders.push_back(isec.get());
        else
          map.insert({isec->leader, isec.get()});
      }
    }
  });

  // Report groups in a deterministic (priority) order.
  tbb::parallel_sort(leaders.begin(), leaders.end(),
                     [&](InputSection<E> *a, InputSection<E> *b) {
                       return a->get_priority() < b->get_priority();
                     });

  i64 saved_bytes = 0;

  for (InputSection<E> *leader : leaders) {
    auto [begin, end] = map.equal_range(leader);
    // Singleton groups had nothing folded into them; skip.
    if (begin == end)
      continue;

    SyncOut(ctx) << "selected section " << *leader;

    i64 n = 0;
    for (auto it = begin; it != end; it++) {
      SyncOut(ctx) << " removing identical section " << *it->second;
      n++;
    }
    saved_bytes += leader->contents.size() * n;
  }

  SyncOut(ctx) << "ICF saved " << saved_bytes << " bytes";
}
// Entry point of Identical Code Folding. See the comment at the top
// of this file for the overall algorithm.
template <typename E>
void icf_sections(Context<E> &ctx) {
  Timer t(ctx, "icf");
  if (ctx.objs.empty())
    return;

  uniquify_cies(ctx);
  merge_leaf_nodes(ctx);

  // Prepare for the propagation rounds.
  std::vector<InputSection<E> *> sections = gather_sections(ctx);

  // We allocate 3 arrays to store hashes for each vertex.
  //
  // Index 0 and 1 are used for tree hashes from the previous
  // iteration and the current iteration. They switch roles every
  // iteration. See `slot` below.
  //
  // Index 2 stores the initial, single-vertex hash. This is combined
  // with hashes from the connected vertices to form the tree hash
  // described above.
  std::vector<std::vector<Digest>> digests(3);
  digests[0] = compute_digests<E>(ctx, sections);
  digests[1].resize(digests[0].size());
  digests[2] = digests[0];

  std::vector<u32> edges;
  std::vector<u32> edge_indices;
  gather_edges<E>(ctx, sections, edges, edge_indices);

  BitVector converged(digests[0].size());
  bool slot = 0;

  // Execute the propagation rounds until convergence is obtained.
  {
    Timer t(ctx, "propagate");
    tbb::affinity_partitioner ap;

    // A cheap test that the graph hasn't converged yet.
    // The loop after this one uses a strict condition, but it's expensive
    // as it requires sorting the entire hash collection.
    //
    // Nodes with a cycle downstream (i.e. recursive functions and
    // functions that call recursive functions) will keep changing
    // with the iterations. Nodes that don't (i.e. non-recursive
    // functions) will stop changing as soon as the propagation depth
    // reaches the call tree depth.
    // Here, we test whether we have reached sufficient depth for the latter,
    // which is a necessary (but not sufficient) condition for convergence.
    i64 num_changed = -1;
    for (;;) {
      i64 n = propagate<E>(digests, edges, edge_indices, slot, converged, ap);
      if (n == num_changed)
        break;
      num_changed = n;
    }

    // Run the pass until the unique number of hashes stop increasing, at which
    // point we have achieved convergence (proof omitted for brevity).
    i64 num_classes = -1;
    for (;;) {
      // count_num_classes requires sorting which is O(n log n), so do a little
      // more work beforehand to amortize that log factor.
      for (i64 i = 0; i < 10; i++)
        propagate<E>(digests, edges, edge_indices, slot, converged, ap);

      i64 n = count_num_classes<E>(digests[slot], ap);
      if (n == num_classes)
        break;
      num_classes = n;
    }
  }

  // Group sections by SHA digest.
  {
    Timer t(ctx, "group");
    auto *map = new tbb::concurrent_unordered_map<Digest, InputSection<E> *>;
    std::span<Digest> digest = digests[slot];

    // Choose the lowest-priority member of each group as its leader.
    tbb::parallel_for((i64)0, (i64)sections.size(), [&](i64 i) {
      InputSection<E> *isec = sections[i];
      auto [it, inserted] = map->insert({digest[i], isec});
      if (!inserted && isec->get_priority() < it->second->get_priority())
        it->second = isec;
    });

    tbb::parallel_for((i64)0, (i64)sections.size(), [&](i64 i) {
      auto it = map->find(digest[i]);
      assert(it != map->end());
      sections[i]->leader = it->second;
    });

    // Since free'ing the map is slow, postpone it.
    ctx.on_exit.push_back([=] { delete map; });
  }

  if (ctx.arg.print_icf_sections)
    print_icf_sections(ctx);

  // Eliminate duplicate sections.
  // Symbols pointing to eliminated sections will be redirected on the fly when
  // exporting to the symtab.
  {
    Timer t(ctx, "sweep");
    static Counter eliminated("icf_eliminated");
    tbb::parallel_for_each(ctx.objs, [](ObjectFile<E> *file) {
      for (std::unique_ptr<InputSection<E>> &isec : file->sections) {
        if (isec && isec->is_alive && isec->is_killed_by_icf()) {
          isec->kill();
          eliminated++;
        }
      }
    });
  }
}
using E = MOLD_TARGET;
template void icf_sections(Context<E> &ctx);
} // namespace mold::elf

1497
third_party/mold/elf/input-files.cc vendored Normal file

File diff suppressed because it is too large Load diff

498
third_party/mold/elf/input-sections.cc vendored Normal file
View file

@ -0,0 +1,498 @@
// clang-format off
#include "third_party/mold/elf/mold.h"
#include "third_party/libcxx/limits"
// MISSING #include <zlib.h>
// MISSING #include <zstd.h>
namespace mold::elf {
// How a relocation against a symbol should be handled, as chosen by
// the decision tables below and consumed by scan_rel / apply_absrel.
typedef enum {
  NONE, ERROR, COPYREL, DYN_COPYREL, PLT, CPLT, DYN_CPLT, DYNREL, BASEREL, IFUNC,
} Action;
// Returns true if this CIE is identical to `other`: same contents and
// pairwise identical relocations (same record-relative offset, type,
// target symbol, and addend).
template <typename E>
bool CieRecord<E>::equals(const CieRecord<E> &other) const {
  if (get_contents() != other.get_contents())
    return false;

  std::span<const ElfRel<E>> x = get_rels();
  std::span<const ElfRel<E>> y = other.get_rels();
  if (x.size() != y.size())
    return false;

  for (i64 i = 0; i < x.size(); i++) {
    // Compare offsets relative to each record's own start.
    if (x[i].r_offset - input_offset != y[i].r_offset - other.input_offset ||
        x[i].r_type != y[i].r_type ||
        file.symbols[x[i].r_sym] != other.file.symbols[y[i].r_sym] ||
        get_addend(input_section, x[i]) != get_addend(other.input_section, y[i]))
      return false;
  }
  return true;
}
// Converts an alignment value to its power-of-two exponent.
// An alignment of 0 maps to 0 (i.e. treated like an alignment of 1).
static i64 to_p2align(u64 alignment) {
  return alignment ? std::countr_zero(alignment) : 0;
}
// Constructs an InputSection for section `shndx` of `file`, recording
// its (possibly still compressed) contents and its size and alignment
// (as a power-of-two exponent).
template <typename E>
InputSection<E>::InputSection(Context<E> &ctx, ObjectFile<E> &file,
                              std::string_view name, i64 shndx)
  : file(file), shndx(shndx) {
  if (shndx < file.elf_sections.size())
    contents = {(char *)file.mf->data + shdr().sh_offset, (size_t)shdr().sh_size};

  if (shdr().sh_flags & SHF_COMPRESSED) {
    // For compressed sections, the real size and alignment come from
    // the compression header, not the section header.
    ElfChdr<E> &chdr = *(ElfChdr<E> *)&contents[0];
    sh_size = chdr.ch_size;
    p2align = to_p2align(chdr.ch_addralign);
  } else {
    sh_size = shdr().sh_size;
    p2align = to_p2align(shdr().sh_addralign);
  }

  // Sections may have been compressed. We usually uncompress them
  // directly into the mmap'ed output file, but we want to uncompress
  // early for REL-type ELF types to read relocation addends from
  // section contents. For RELA-type, we don't need to do this because
  // addends are in relocations.
  //
  // SH-4 stores addends to sections despite being RELA, which is a
  // special (and buggy) case.
  if constexpr (!E::is_rela || is_sh4<E>)
    uncompress(ctx);
}
// Decompresses this section into a freshly allocated buffer and
// repoints `contents` at it. No-op for uncompressed sections or if
// decompression already happened.
template <typename E>
void InputSection<E>::uncompress(Context<E> &ctx) {
  if (!(shdr().sh_flags & SHF_COMPRESSED) || uncompressed)
    return;

  u8 *buf = new u8[sh_size];
  uncompress_to(ctx, buf);
  contents = std::string_view((char *)buf, sh_size);
  // Hand ownership of the buffer to ctx so it outlives this section.
  ctx.string_pool.emplace_back(buf);
  uncompressed = true;
}
// Writes this section's uncompressed contents into `buf`, which must
// hold at least `sh_size` bytes. Supports zlib- and zstd-compressed
// sections; plain sections are just copied.
template <typename E>
void InputSection<E>::uncompress_to(Context<E> &ctx, u8 *buf) {
  if (!(shdr().sh_flags & SHF_COMPRESSED) || uncompressed) {
    memcpy(buf, contents.data(), contents.size());
    return;
  }

  if (contents.size() < sizeof(ElfChdr<E>))
    Fatal(ctx) << *this << ": corrupted compressed section";

  // The payload follows the compression header.
  ElfChdr<E> &hdr = *(ElfChdr<E> *)&contents[0];
  std::string_view data = contents.substr(sizeof(ElfChdr<E>));

  switch (hdr.ch_type) {
  case ELFCOMPRESS_ZLIB: {
    unsigned long size = sh_size;
    if (::uncompress(buf, &size, (u8 *)data.data(), data.size()) != Z_OK)
      Fatal(ctx) << *this << ": uncompress failed";
    assert(size == sh_size);
    break;
  }
  case ELFCOMPRESS_ZSTD:
    if (ZSTD_decompress(buf, sh_size, (u8 *)data.data(), data.size()) != sh_size)
      Fatal(ctx) << *this << ": ZSTD_decompress failed";
    break;
  default:
    Fatal(ctx) << *this << ": unsupported compression type: 0x"
               << std::hex << hdr.ch_type;
  }
}
// Looks up the action for a relocation against `sym` in a 3x4
// decision table indexed by [output kind][symbol kind].
template <typename E>
static Action get_rel_action(Context<E> &ctx, Symbol<E> &sym,
                             const Action table[3][4]) {
  // Row: 0 = shared object, 1 = PIE, 2 = position-dependent exec.
  auto get_output_type = [&] {
    if (ctx.arg.shared)
      return 0;
    if (ctx.arg.pie)
      return 1;
    return 2;
  };

  // Column: 0 = absolute, 1 = local, 2 = imported data,
  // 3 = imported code.
  auto get_sym_type = [&] {
    if (sym.is_absolute())
      return 0;
    if (!sym.is_imported)
      return 1;
    if (sym.get_type() != STT_FUNC)
      return 2;
    return 3;
  };

  return table[get_output_type()][get_sym_type()];
}
// Carries out the given Action for one relocation during the scan
// pass: reports errors, requests PLT entries / copy relocations via
// symbol flags, and counts the dynamic relocations to be emitted.
template <typename E>
static void scan_rel(Context<E> &ctx, InputSection<E> &isec, Symbol<E> &sym,
                     const ElfRel<E> &rel, Action action) {
  bool writable = (isec.shdr().sh_flags & SHF_WRITE);

  auto error = [&] {
    std::string msg = sym.is_absolute() ? "-fno-PIC" : "-fPIC";
    Error(ctx) << isec << ": " << rel << " relocation at offset 0x"
               << std::hex << rel.r_offset << " against symbol `"
               << sym << "' can not be used; recompile with " << msg;
  };

  // A dynamic relocation in a read-only section makes the output a
  // TEXTREL: an error under -z text, a warning with --warn-textrel.
  auto check_textrel = [&] {
    if (!writable) {
      if (ctx.arg.z_text) {
        error();
      } else if (ctx.arg.warn_textrel) {
        Warn(ctx) << isec << ": relocation against symbol `" << sym
                  << "' in read-only section";
      }
      ctx.has_textrel = true;
    }
  };

  auto copyrel = [&] {
    assert(sym.is_imported);
    if (sym.esym().st_visibility == STV_PROTECTED) {
      Error(ctx) << isec
                 << ": cannot make copy relocation for protected symbol '" << sym
                 << "', defined in " << *sym.file << "; recompile with -fPIC";
    }
    sym.flags |= NEEDS_COPYREL;
  };

  auto dynrel = [&] {
    check_textrel();
    isec.file.num_dynrel++;
  };

  switch (action) {
  case NONE:
    break;
  case ERROR:
    // Such relocations are always invalid in this output type.
    error();
    break;
  case COPYREL:
    // Copy relocations can be disabled with -z nocopyreloc.
    if (!ctx.arg.z_copyreloc)
      error();
    copyrel();
    break;
  case DYN_COPYREL:
    // Prefer a dynamic relocation; fall back to a copy relocation
    // only for read-only locations when copyreloc is allowed.
    if (writable || !ctx.arg.z_copyreloc)
      dynrel();
    else
      copyrel();
    break;
  case PLT:
    sym.flags |= NEEDS_PLT;
    break;
  case CPLT:
    // Canonical PLT: the PLT entry's address becomes the symbol's
    // address program-wide.
    sym.flags |= NEEDS_CPLT;
    break;
  case DYN_CPLT:
    if (writable)
      dynrel();
    else
      sym.flags |= NEEDS_CPLT;
    break;
  case DYNREL:
    dynrel();
    break;
  case BASEREL:
    check_textrel();
    // RELR-packable relocations don't consume a .rel[a].dyn slot.
    if (!isec.is_relr_reloc(ctx, rel))
      isec.file.num_dynrel++;
    break;
  case IFUNC:
    dynrel();
    ctx.num_ifunc_dynrels++;
    break;
  default:
    unreachable();
  }
}
// Decision table for PC-relative relocations.
template <typename E>
static Action get_pcrel_action(Context<E> &ctx, Symbol<E> &sym) {
  // This is for PC-relative relocations (e.g. R_X86_64_PC32).
  // We cannot promote them to dynamic relocations because the dynamic
  // linker generally does not support PC-relative relocations.
  constexpr static Action table[3][4] = {
    // Absolute  Local  Imported data  Imported code
    {  ERROR,    NONE,  ERROR,         PLT  },   // Shared object
    {  ERROR,    NONE,  COPYREL,       PLT  },   // Position-independent exec
    {  NONE,     NONE,  COPYREL,       CPLT },   // Position-dependent exec
  };
  return get_rel_action(ctx, sym, table);
}
// Decision table for sub-word-size absolute relocations.
template <typename E>
static Action get_absrel_action(Context<E> &ctx, Symbol<E> &sym) {
  // This is a decision table for absolute relocations that is smaller
  // than the word size (e.g. R_X86_64_32). Since the dynamic linker
  // generally does not support dynamic relocations smaller than the
  // word size, we need to report an error if a relocation cannot be
  // resolved at link-time.
  constexpr static Action table[3][4] = {
    // Absolute  Local  Imported data  Imported code
    {  NONE,     ERROR, ERROR,         ERROR },  // Shared object
    {  NONE,     ERROR, ERROR,         ERROR },  // Position-independent exec
    {  NONE,     NONE,  COPYREL,       CPLT  },  // Position-dependent exec
  };
  return get_rel_action(ctx, sym, table);
}
// Decision table for word-size absolute relocations.
template <typename E>
static Action get_dyn_absrel_action(Context<E> &ctx, Symbol<E> &sym) {
  // IFUNC targets always need an irelative relocation.
  if (sym.is_ifunc())
    return IFUNC;

  // This is a decision table for absolute relocations for the word
  // size data (e.g. R_X86_64_64). Unlike the absrel_table, we can emit
  // a dynamic relocation if we cannot resolve an address at link-time.
  constexpr static Action table[3][4] = {
    // Absolute  Local    Imported data  Imported code
    {  NONE,     BASEREL, DYNREL,        DYNREL   },  // Shared object
    {  NONE,     BASEREL, DYNREL,        DYNREL   },  // Position-independent exec
    {  NONE,     NONE,    DYN_COPYREL,   DYN_CPLT },  // Position-dependent exec
  };
  return get_rel_action(ctx, sym, table);
}
// Decision table for relocations against PPC64 .toc entries.
template <typename E>
static Action get_ppc64_toc_action(Context<E> &ctx, Symbol<E> &sym) {
  if (sym.is_ifunc())
    return IFUNC;

  // As a special case, we do not create copy relocations nor canonical
  // PLTs for .toc sections. PPC64's .toc is a compiler-generated
  // GOT-like section, and no user-generated code directly uses values
  // in it.
  constexpr static Action table[3][4] = {
    // Absolute  Local    Imported data  Imported code
    {  NONE,     BASEREL, DYNREL,        DYNREL },  // Shared object
    {  NONE,     BASEREL, DYNREL,        DYNREL },  // Position-independent exec
    {  NONE,     NONE,    DYNREL,        DYNREL },  // Position-dependent exec
  };
  return get_rel_action(ctx, sym, table);
}
// Scan-pass handler for PC-relative relocations (e.g. R_X86_64_PC32).
template <typename E>
void InputSection<E>::scan_pcrel(Context<E> &ctx, Symbol<E> &sym,
                                 const ElfRel<E> &rel) {
  scan_rel(ctx, *this, sym, rel, get_pcrel_action(ctx, sym));
}
// Scan-pass handler for sub-word absolute relocations (e.g. R_X86_64_32).
template <typename E>
void InputSection<E>::scan_absrel(Context<E> &ctx, Symbol<E> &sym,
                                  const ElfRel<E> &rel) {
  scan_rel(ctx, *this, sym, rel, get_absrel_action(ctx, sym));
}
// Scan-pass handler for word-size absolute relocations (e.g. R_X86_64_64).
template <typename E>
void InputSection<E>::scan_dyn_absrel(Context<E> &ctx, Symbol<E> &sym,
                                      const ElfRel<E> &rel) {
  scan_rel(ctx, *this, sym, rel, get_dyn_absrel_action(ctx, sym));
}
// Scan-pass handler for relocations against PPC64 .toc entries.
template <typename E>
void InputSection<E>::scan_toc_rel(Context<E> &ctx, Symbol<E> &sym,
                                   const ElfRel<E> &rel) {
  scan_rel(ctx, *this, sym, rel, get_ppc64_toc_action(ctx, sym));
}
// TLS local-exec relocations assume the definition lives in the main
// executable, so they cannot be used when building a shared object.
template <typename E>
void InputSection<E>::check_tlsle(Context<E> &ctx, Symbol<E> &sym,
                                  const ElfRel<E> &rel) {
  if (ctx.arg.shared)
    Error(ctx) << *this << ": relocation " << rel << " against `" << sym
               << "` can not be used when making a shared object;"
               << " recompile with -fPIC";
}
// Applies a word-size absolute relocation at apply time, emitting a
// dynamic relocation into `dynrel` whenever the final address cannot
// be fixed at link time. S = symbol value, A = addend, P = place
// (address being relocated).
template <typename E>
static void apply_absrel(Context<E> &ctx, InputSection<E> &isec,
                         Symbol<E> &sym, const ElfRel<E> &rel, u8 *loc,
                         u64 S, i64 A, u64 P, ElfRel<E> *&dynrel,
                         Action action) {
  bool writable = (isec.shdr().sh_flags & SHF_WRITE);

  auto apply_dynrel = [&] {
    *dynrel++ = ElfRel<E>(P, E::R_ABS, sym.get_dynsym_idx(ctx), A);
    // Optionally pre-store the addend at the relocated location.
    if (ctx.arg.apply_dynamic_relocs)
      *(Word<E> *)loc = A;
  };

  switch (action) {
  case COPYREL:
  case CPLT:
  case NONE:
    // Address is fully known at link time.
    *(Word<E> *)loc = S + A;
    break;
  case BASEREL:
    // Base-relative: either packed into .relr or emitted as an
    // ordinary R_RELATIVE.
    if (isec.is_relr_reloc(ctx, rel)) {
      *(Word<E> *)loc = S + A;
    } else {
      *dynrel++ = ElfRel<E>(P, E::R_RELATIVE, 0, S + A);
      if (ctx.arg.apply_dynamic_relocs)
        *(Word<E> *)loc = S + A;
    }
    break;
  case DYN_COPYREL:
    // Must mirror the decision made by scan_rel's DYN_COPYREL case.
    if (writable || !ctx.arg.z_copyreloc)
      apply_dynrel();
    else
      *(Word<E> *)loc = S + A;
    break;
  case DYN_CPLT:
    // Must mirror the decision made by scan_rel's DYN_CPLT case.
    if (writable)
      apply_dynrel();
    else
      *(Word<E> *)loc = S + A;
    break;
  case DYNREL:
    apply_dynrel();
    break;
  case IFUNC:
    if constexpr (supports_ifunc<E>) {
      u64 addr = sym.get_addr(ctx, NO_PLT) + A;
      *dynrel++ = ElfRel<E>(P, E::R_IRELATIVE, 0, addr);
      if (ctx.arg.apply_dynamic_relocs)
        *(Word<E> *)loc = addr;
    } else {
      unreachable();
    }
    break;
  default:
    unreachable();
  }
}
// Apply-pass handler for word-size absolute relocations.
template <typename E>
void InputSection<E>::apply_dyn_absrel(Context<E> &ctx, Symbol<E> &sym,
                                       const ElfRel<E> &rel, u8 *loc,
                                       u64 S, i64 A, u64 P,
                                       ElfRel<E> *&dynrel) {
  apply_absrel(ctx, *this, sym, rel, loc, S, A, P, dynrel,
               get_dyn_absrel_action(ctx, sym));
}
// Apply-pass handler for relocations against PPC64 .toc entries.
template <typename E>
void InputSection<E>::apply_toc_rel(Context<E> &ctx, Symbol<E> &sym,
                                    const ElfRel<E> &rel, u8 *loc,
                                    u64 S, i64 A, u64 P,
                                    ElfRel<E> *&dynrel) {
  apply_absrel(ctx, *this, sym, rel, loc, S, A, P, dynrel,
               get_ppc64_toc_action(ctx, sym));
}
// Copies this section to the output buffer (uncompressing if needed),
// then applies relocations unless producing a relocatable output.
template <typename E>
void InputSection<E>::write_to(Context<E> &ctx, u8 *buf) {
  if (shdr().sh_type == SHT_NOBITS || sh_size == 0)
    return;

  // Copy data
  if constexpr (is_riscv<E>)
    // RISC-V uses a dedicated copy path (see copy_contents_riscv).
    copy_contents_riscv(ctx, buf);
  else
    uncompress_to(ctx, buf);

  // Apply relocations
  if (!ctx.arg.relocatable) {
    if (shdr().sh_flags & SHF_ALLOC)
      apply_reloc_alloc(ctx, buf);
    else
      apply_reloc_nonalloc(ctx, buf);
  }
}
// Get the name of a function containing a given offset, by scanning
// the file's symbol table for an STT_FUNC symbol whose [value,
// value + size) range covers `offset`. Returns "" if none matches.
template <typename E>
std::string_view InputSection<E>::get_func_name(Context<E> &ctx, i64 offset) const {
  for (const ElfSym<E> &esym : file.elf_syms) {
    if (esym.st_shndx == shndx && esym.st_type == STT_FUNC &&
        esym.st_value <= offset && offset < esym.st_value + esym.st_size) {
      std::string_view name = file.symbol_strtab.data() + esym.st_name;
      if (ctx.arg.demangle)
        return demangle(name);
      return name;
    }
  }
  return "";
}
// Test if the symbol a given relocation refers to has already been
// resolved. If not, record that error and return true.
template <typename E>
bool InputSection<E>::record_undef_error(Context<E> &ctx, const ElfRel<E> &rel) {
  // If a relocation refers to a linker-synthesized symbol for a
  // section fragment, it's always been resolved.
  if (file.elf_syms.size() <= rel.r_sym)
    return false;

  Symbol<E> &sym = *file.symbols[rel.r_sym];
  const ElfSym<E> &esym = file.elf_syms[rel.r_sym];

  // If a symbol is defined in a comdat group, and the comdat group is
  // discarded, the symbol may not have an owner. It is technically a
  // violation of the One Definition Rule, so it is a programmer's fault.
  if (!sym.file) {
    Error(ctx) << *this << ": " << sym << " refers to a discarded COMDAT section"
               << " probably due to an ODR violation";
    return true;
  }

  // Records an error message for this reference, keyed by symbol name
  // so messages for the same symbol are aggregated.
  auto record = [&] {
    std::stringstream ss;
    if (std::string_view source = file.get_source_name(); !source.empty())
      ss << ">>> referenced by " << source << "\n";
    else
      ss << ">>> referenced by " << *this << "\n";
    ss << ">>> " << file;
    if (std::string_view func = get_func_name(ctx, rel.r_offset); !func.empty())
      ss << ":(" << func << ")";

    typename decltype(ctx.undef_errors)::accessor acc;
    ctx.undef_errors.insert(acc, {sym.name(), {}});
    acc->second.push_back(ss.str());
  };

  // A non-weak undefined symbol must be promoted to an imported
  // symbol or resolved to a defined symbol. Otherwise, it's an
  // undefined symbol error.
  //
  // Every ELF file has an absolute local symbol as its first symbol.
  // Referring to that symbol is always valid.
  bool is_undef = esym.is_undef() && !esym.is_weak() && sym.sym_idx;
  if (!sym.is_imported && is_undef && sym.esym().is_undef()) {
    record();
    return true;
  }

  // If a protected/hidden undefined symbol is resolved to other .so,
  // it's handled as if no symbols were found.
  if (sym.file->is_dso &&
      (sym.visibility == STV_PROTECTED || sym.visibility == STV_HIDDEN)) {
    record();
    return true;
  }

  return false;
}
using E = MOLD_TARGET;
template struct CieRecord<E>;
template class InputSection<E>;
} // namespace mold::elf

85
third_party/mold/elf/jobs.cc vendored Normal file
View file

@ -0,0 +1,85 @@
// clang-format off
#include "third_party/mold/elf/mold.h"
#ifndef _WIN32
#include "libc/calls/calls.h"
#include "libc/calls/struct/flock.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/at.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fd.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/posix.h"
#include "libc/sysv/consts/s.h"
#include "libc/sysv/consts/splice.h"
#include "third_party/musl/passwd.h"
#include "libc/calls/calls.h"
#include "libc/calls/struct/stat.h"
#include "libc/calls/struct/stat.macros.h"
#include "libc/calls/struct/timespec.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/s.h"
#include "libc/sysv/consts/utime.h"
#include "libc/time/time.h"
#include "libc/calls/makedev.h"
#include "libc/calls/weirdtypes.h"
#include "libc/thread/thread.h"
#include "libc/calls/typedef/u.h"
#include "libc/calls/weirdtypes.h"
#include "libc/intrin/newbie.h"
#include "libc/sock/select.h"
#include "libc/sysv/consts/endian.h"
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/runtime/pathconf.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/sysconf.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fileno.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/ok.h"
#include "libc/time/time.h"
#include "third_party/getopt/getopt.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/lockf.h"
#endif
namespace mold::elf {
// If MOLD_JOBS=1, serialize concurrent mold invocations machine-wide
// by taking an exclusive POSIX lock on ~/.mold-lock. Failures to
// create or lock the file are silently ignored (best effort); on
// success the descriptor is stashed in ctx so release_global_lock can
// unlock it.
template <typename E>
void acquire_global_lock(Context<E> &ctx) {
#ifndef _WIN32
  char *jobs = getenv("MOLD_JOBS");
  if (!jobs || std::string(jobs) != "1")
    return;

  char *home = getenv("HOME");
  if (!home)
    home = getpwuid(getuid())->pw_dir;

  std::string path = std::string(home) + "/.mold-lock";
  int fd = open(path.c_str(), O_WRONLY | O_CREAT | O_CLOEXEC, 0600);
  if (fd == -1)
    return;

  if (lockf(fd, F_LOCK, 0) == -1) {
    // Don't leak the descriptor when locking fails.
    close(fd);
    return;
  }

  ctx.global_lock_fd = fd;
#endif
}
// Releases the advisory lock taken by acquire_global_lock(), if any.
// Closing the descriptor is what drops the lockf() lock.
template <typename E>
void release_global_lock(Context<E> &ctx) {
#ifndef _WIN32
  if (!ctx.global_lock_fd)
    return;
  close(*ctx.global_lock_fd);
#endif
}
// Explicit instantiations for the build-time-selected target.
using E = MOLD_TARGET;
template void acquire_global_lock(Context<E> &);
template void release_global_lock(Context<E> &);
} // namespace mold::elf

425
third_party/mold/elf/linker-script.cc vendored Normal file
View file

@ -0,0 +1,425 @@
// clang-format off
// On Linux, /usr/lib/x86_64-linux-gnu/libc.so is not actually
// a shared object file but an ASCII text file containing a linker
// script to include a "real" libc.so file. Therefore, we need to
// support a (very limited) subset of the linker script language.
#include "third_party/mold/elf/mold.h"
#include "third_party/libcxx/cctype"
#include "third_party/libcxx/iomanip"
namespace mold::elf {
// The script file currently being parsed. Used only for diagnostics
// (file name, line and column in SyntaxError). thread_local so scripts
// can be parsed from multiple threads independently.
template <typename E>
static thread_local MappedFile<Context<E>> *current_file;

// Defined later in this file; forward-declared because
// parse_linker_script() must handle the VERSION command.
template <typename E>
void read_version_script(Context<E> &ctx, std::span<std::string_view> &tok);
// Returns the entire line of `input` that contains the byte at `pos`,
// without the trailing newline. `pos` must point into `input`.
static std::string_view get_line(std::string_view input, const char *pos) {
  assert(input.data() <= pos);
  assert(pos < input.data() + input.size());

  size_t off = pos - input.data();

  // The line begins just after the previous '\n', or at offset 0.
  size_t begin = input.rfind('\n', off);
  begin = (begin == std::string_view::npos) ? 0 : begin + 1;

  // The line ends at the next '\n', or at end-of-input.
  size_t end = input.find('\n', off);
  if (end == std::string_view::npos)
    end = input.size();

  return input.substr(begin, end - begin);
}
// Reports a linker-script syntax error and terminates the process.
// The constructor prints a "file:lineno: " header, the offending source
// line, and a caret pointing at the error column; callers may stream
// additional detail via operator<<. `errpos` must be a view into
// current_file<E>'s contents.
template <typename E>
class SyntaxError {
public:
  SyntaxError(Context<E> &ctx, std::string_view errpos) : out(ctx) {
    std::string_view contents = current_file<E>->get_contents();
    std::string_view line = get_line(contents, errpos.data());
    // Count the newlines preceding `line` to get a 1-based line number.
    i64 lineno = 1;
    for (i64 i = 0; contents.data() + i < line.data(); i++)
      if (contents[i] == '\n')
        lineno++;
    i64 column = errpos.data() - line.data();
    std::stringstream ss;
    ss << current_file<E>->name << ":" << lineno << ": ";
    // Indent the caret so it lines up under the error column, accounting
    // for the "mold: " prefix that the Fatal stream prepends.
    i64 indent = (i64)ss.tellp() + strlen("mold: ");
    ss << line << "\n" << std::setw(indent + column) << " " << "^ ";
    out << ss.str();
  }
  template <class T> SyntaxError &operator<<(T &&val) {
    out << std::forward<T>(val);
    return *this;
  }
  // Destroying the Fatal member terminates the process, so a SyntaxError
  // never returns to its creator once it goes out of scope.
  [[noreturn]] ~SyntaxError() = default;
  Fatal<Context<E>> out;
};
// Splits a linker script into tokens. Whitespace, "/*...*/" comments and
// "#" line comments are skipped; a double-quoted string becomes a single
// token (quotes included — unquote() strips them later); maximal runs of
// identifier/glob characters form one token; any other byte is a
// one-character token. All tokens are views into `input`, which must
// therefore outlive the returned vector.
template <typename E>
static std::vector<std::string_view>
tokenize(Context<E> &ctx, std::string_view input) {
  std::vector<std::string_view> vec;
  while (!input.empty()) {
    // Cast to unsigned char: passing a plain (possibly negative) char to
    // isspace() is undefined behavior for bytes >= 0x80.
    if (isspace((unsigned char)input[0])) {
      input = input.substr(1);
      continue;
    }
    if (input.starts_with("/*")) {
      i64 pos = input.find("*/", 2);
      if (pos == std::string_view::npos)
        SyntaxError(ctx, input) << "unclosed comment";
      input = input.substr(pos + 2);
      continue;
    }
    if (input[0] == '#') {
      i64 pos = input.find("\n", 1);
      if (pos == std::string_view::npos)
        break;
      input = input.substr(pos + 1);
      continue;
    }
    if (input[0] == '"') {
      i64 pos = input.find('"', 1);
      if (pos == std::string_view::npos)
        SyntaxError(ctx, input) << "unclosed string literal";
      // Keep the surrounding quotes as part of the token.
      vec.push_back(input.substr(0, pos + 1));
      input = input.substr(pos + 1);
      continue;
    }
    i64 pos = input.find_first_not_of(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
        "0123456789_.$/\\~=+[]*?-!^:");
    if (pos == 0)
      pos = 1;                 // a lone punctuation char such as "(" or "{"
    else if (pos == input.npos)
      pos = input.size();      // the rest of the input is one token
    vec.push_back(input.substr(0, pos));
    input = input.substr(pos);
  }
  return vec;
}
// Consumes the expected token `str` from the front of `tok` and returns
// the remaining tokens. Terminates with a diagnostic if the next token is
// missing (Fatal) or different (SyntaxError) — both error paths do not
// return, which is why falling through to subspan(1) is safe.
template <typename E>
static std::span<std::string_view>
skip(Context<E> &ctx, std::span<std::string_view> tok, std::string_view str) {
  if (tok.empty())
    Fatal(ctx) << current_file<E>->name << ": expected '" << str
               << "', but got EOF";
  if (tok[0] != str)
    SyntaxError(ctx, tok[0]) << "expected '" << str << "'";
  return tok.subspan(1);
}
// Strips a surrounding pair of double quotes from a token, if present;
// unquoted tokens are returned unchanged.
static std::string_view unquote(std::string_view s) {
  if (s.empty() || s.front() != '"')
    return s;
  assert(s.back() == '"');
  return s.substr(1, s.size() - 2);
}
// Handles OUTPUT_FORMAT(...): the contents are irrelevant to us, so we
// simply consume every token up to and including the matching ")".
template <typename E>
static std::span<std::string_view>
read_output_format(Context<E> &ctx, std::span<std::string_view> tok) {
  tok = skip(ctx, tok, "(");
  while (!tok.empty()) {
    if (tok[0] == ")")
      return tok.subspan(1);
    tok = tok.subspan(1);
  }
  Fatal(ctx) << current_file<E>->name << ": expected ')', but got EOF";
  return tok; // unreachable — Fatal terminates
}
// Returns true if `path` is located inside ctx.arg.sysroot.
template <typename E>
static bool is_in_sysroot(Context<E> &ctx, std::string path) {
  std::string relative = to_abs_path(path)
                             .lexically_relative(to_abs_path(ctx.arg.sysroot))
                             .string();
  // "." would denote the sysroot directory itself and "../..." escapes
  // it; everything else is inside.
  return !(relative == "." || relative.starts_with("../"));
}
// Resolves a file path appearing in a linker script to an opened file,
// mimicking GNU ld's lookup rules: sysroot re-prefixing, "=" prefix,
// "-l" shorthand, then the -L library search path. On failure the final
// SyntaxError terminates the process, so the missing return is fine.
template <typename E>
static MappedFile<Context<E>> *resolve_path(Context<E> &ctx, std::string_view tok) {
  std::string str(unquote(tok));
  // GNU ld prepends the sysroot if a pathname starts with '/' and the
  // script being processed is in the sysroot. We do the same.
  if (str.starts_with('/') && is_in_sysroot(ctx, current_file<E>->name))
    return MappedFile<Context<E>>::must_open(ctx, ctx.arg.sysroot + str);
  // A "=" prefix explicitly requests a sysroot-relative path.
  if (str.starts_with('=')) {
    std::string path;
    if (ctx.arg.sysroot.empty())
      path = str.substr(1);
    else
      path = ctx.arg.sysroot + str.substr(1);
    return MappedFile<Context<E>>::must_open(ctx, path);
  }
  // "-lfoo" searches for the library just like the command-line option.
  if (str.starts_with("-l"))
    return find_library(ctx, str.substr(2));
  // Try the path as-is first, then relative to each -L directory.
  if (MappedFile<Context<E>> *mf = open_library(ctx, str))
    return mf;
  for (std::string_view dir : ctx.arg.library_paths) {
    std::string path = std::string(dir) + "/" + str;
    if (MappedFile<Context<E>> *mf = open_library(ctx, path))
      return mf;
  }
  SyntaxError(ctx, tok) << "library not found: " << str;
}
// Handles the parenthesized body of a GROUP or INPUT command. Each
// operand is resolved and loaded immediately via read_file(). An
// AS_NEEDED(...) sub-group is processed recursively with ctx.as_needed
// temporarily forced on. Returns the tokens after the closing ")".
template <typename E>
static std::span<std::string_view>
read_group(Context<E> &ctx, std::span<std::string_view> tok) {
  tok = skip(ctx, tok, "(");
  while (!tok.empty() && tok[0] != ")") {
    if (tok[0] == "AS_NEEDED") {
      // Save and restore the flag so nested groups behave correctly.
      bool orig = ctx.as_needed;
      ctx.as_needed = true;
      tok = read_group(ctx, tok.subspan(1));
      ctx.as_needed = orig;
      continue;
    }
    MappedFile<Context<E>> *mf = resolve_path(ctx, tok[0]);
    read_file(ctx, mf);
    tok = tok.subspan(1);
  }
  if (tok.empty())
    Fatal(ctx) << current_file<E>->name << ": expected ')', but got EOF";
  return tok.subspan(1);
}
// Parses a (very limited subset of a) linker script and executes its
// commands. Recognized constructs: OUTPUT_FORMAT(...), INPUT/GROUP(...),
// VERSION { ... }, "alias = target;" symbol assignments, and stray
// semicolons. Anything else is a fatal syntax error.
template <typename E>
void parse_linker_script(Context<E> &ctx, MappedFile<Context<E>> *mf) {
  current_file<E> = mf;
  std::vector<std::string_view> vec = tokenize(ctx, mf->get_contents());
  std::span<std::string_view> tok = vec;
  while (!tok.empty()) {
    if (tok[0] == "OUTPUT_FORMAT") {
      tok = read_output_format(ctx, tok.subspan(1));
    } else if (tok[0] == "INPUT" || tok[0] == "GROUP") {
      tok = read_group(ctx, tok.subspan(1));
    } else if (tok[0] == "VERSION") {
      tok = tok.subspan(1);
      tok = skip(ctx, tok, "{");
      read_version_script(ctx, tok);
      tok = skip(ctx, tok, "}");
    } else if (tok.size() > 3 && tok[1] == "=" && tok[3] == ";") {
      // "alias = target;" — record a defsym pair.
      ctx.arg.defsyms.emplace_back(get_symbol(ctx, unquote(tok[0])),
                                   get_symbol(ctx, unquote(tok[2])));
      tok = tok.subspan(4);
    } else if (tok[0] == ";") {
      tok = tok.subspan(1);
    } else {
      SyntaxError(ctx, tok[0]) << "unknown linker script token";
    }
  }
}
// Guesses which target machine a linker script is for, either from an
// explicit OUTPUT_FORMAT(bfd-name) or by probing the first file named by
// an INPUT/GROUP command. Returns "" if the type cannot be determined.
template <typename E>
std::string_view
get_script_output_type(Context<E> &ctx, MappedFile<Context<E>> *mf) {
  current_file<E> = mf;
  std::vector<std::string_view> vec = tokenize(ctx, mf->get_contents());
  std::span<std::string_view> tok = vec;
  // Only the two BFD names below are recognized here.
  if (tok.size() >= 3 && tok[0] == "OUTPUT_FORMAT" && tok[1] == "(") {
    if (tok[2] == "elf64-x86-64")
      return X86_64::target_name;
    if (tok[2] == "elf32-i386")
      return I386::target_name;
  }
  // Otherwise, open the first input file and ask what it is.
  if (tok.size() >= 3 && (tok[0] == "INPUT" || tok[0] == "GROUP") &&
      tok[1] == "(")
    if (MappedFile<Context<E>> *mf =
        MappedFile<Context<E>>::open(ctx, std::string(unquote(tok[2]))))
      return get_machine_type(ctx, mf);
  return "";
}
// Consumes a version-script section label ("global:" / "local:") from the
// front of `tok`. The tokenizer may deliver it either as one "label:"
// token or as "label" followed by ":". Returns true iff it was consumed.
static bool read_label(std::span<std::string_view> &tok,
                       std::string label) {
  if (!tok.empty() && tok[0] == label + ":") {
    tok = tok.subspan(1);
    return true;
  }
  if (tok.size() > 1 && tok[0] == label && tok[1] == ":") {
    tok = tok.subspan(2);
    return true;
  }
  return false;
}
// Parses the body of one version block: "{ global: pat; local: pat; }".
// Patterns seen while in the "global" part are recorded with `ver_idx`;
// patterns in the "local" part get VER_NDX_LOCAL. extern "C" / "C++"
// sub-blocks recurse with the demangling flag (`is_cpp`) adjusted. A bare
// "*" sets the default version instead of adding a pattern. Stops at the
// closing "}" without consuming it.
template <typename E>
static void
read_version_script_commands(Context<E> &ctx, std::span<std::string_view> &tok,
                             std::string_view ver_str, u16 ver_idx, bool is_cpp) {
  // A block starts in the "global" part until a "local:" label is seen.
  bool is_global = true;
  while (!tok.empty() && tok[0] != "}") {
    if (read_label(tok, "global")) {
      is_global = true;
      continue;
    }
    if (read_label(tok, "local")) {
      is_global = false;
      continue;
    }
    if (tok[0] == "extern") {
      tok = tok.subspan(1);
      if (!tok.empty() && tok[0] == "\"C\"") {
        tok = tok.subspan(1);
        tok = skip(ctx, tok, "{");
        read_version_script_commands(ctx, tok, ver_str, ver_idx, false);
      } else {
        tok = skip(ctx, tok, "\"C++\"");
        tok = skip(ctx, tok, "{");
        read_version_script_commands(ctx, tok, ver_str, ver_idx, true);
      }
      tok = skip(ctx, tok, "}");
      tok = skip(ctx, tok, ";");
      continue;
    }
    if (tok[0] == "*") {
      // "*" sets the default version for otherwise-unmatched symbols.
      ctx.default_version = (is_global ? ver_idx : (u32)VER_NDX_LOCAL);
      ctx.default_version_from_version_script = true;
    } else if (is_global) {
      ctx.version_patterns.push_back({unquote(tok[0]), current_file<E>->name,
                                      ver_str, ver_idx, is_cpp});
    } else {
      ctx.version_patterns.push_back({unquote(tok[0]), current_file<E>->name,
                                      ver_str, VER_NDX_LOCAL, is_cpp});
    }
    tok = tok.subspan(1);
    // Tolerate a missing ";" before the closing brace.
    if (!tok.empty() && tok[0] == "}")
      return;
    tok = skip(ctx, tok, ";");
  }
}
// Parses a whole version script: a sequence of version blocks, either
// anonymous ("{ ... };") or named ("VER_1 { ... };"), each optionally
// followed by a parent-version name before the ";". Stops at EOF or at a
// "}" belonging to an enclosing VERSION command.
template <typename E>
void read_version_script(Context<E> &ctx, std::span<std::string_view> &tok) {
  // New version indices start after the reserved ones and after any
  // versions already defined on the command line.
  u16 next_ver = VER_NDX_LAST_RESERVED + ctx.arg.version_definitions.size() + 1;
  while (!tok.empty() && tok[0] != "}") {
    std::string_view ver_str;
    u16 ver_idx;
    if (tok[0] == "{") {
      // An anonymous block uses the "global" pseudo-version.
      ver_str = "global";
      ver_idx = VER_NDX_GLOBAL;
    } else {
      ver_str = tok[0];
      ver_idx = next_ver++;
      ctx.arg.version_definitions.push_back(std::string(tok[0]));
      tok = tok.subspan(1);
    }
    tok = skip(ctx, tok, "{");
    read_version_script_commands(ctx, tok, ver_str, ver_idx, false);
    tok = skip(ctx, tok, "}");
    // An optional token before ";" names the parent version (e.g.
    // "} VER_1;"). We accept and ignore it.
    if (!tok.empty() && tok[0] != ";")
      tok = tok.subspan(1);
    tok = skip(ctx, tok, ";");
  }
}
// Parses a standalone version-script file (--version-script=FILE).
template <typename E>
void parse_version_script(Context<E> &ctx, MappedFile<Context<E>> *mf) {
  current_file<E> = mf;
  std::vector<std::string_view> tokens = tokenize(ctx, mf->get_contents());
  std::span<std::string_view> rest = tokens;
  read_version_script(ctx, rest);
  if (!rest.empty())
    SyntaxError(ctx, rest[0]) << "trailing garbage token";
}
// Parses the body of a --dynamic-list file. Every pattern is recorded
// with the "global" version (VER_NDX_GLOBAL). extern "C" / "C++" blocks
// recurse with the demangling flag (`is_cpp`) adjusted, and a bare "*"
// sets the default version. Stops at the closing "}".
template <typename E>
void read_dynamic_list_commands(Context<E> &ctx, std::span<std::string_view> &tok,
                                bool is_cpp) {
  while (!tok.empty() && tok[0] != "}") {
    if (tok[0] == "extern") {
      tok = tok.subspan(1);
      if (!tok.empty() && tok[0] == "\"C\"") {
        tok = tok.subspan(1);
        tok = skip(ctx, tok, "{");
        read_dynamic_list_commands(ctx, tok, false);
      } else {
        tok = skip(ctx, tok, "\"C++\"");
        tok = skip(ctx, tok, "{");
        read_dynamic_list_commands(ctx, tok, true);
      }
      tok = skip(ctx, tok, "}");
      tok = skip(ctx, tok, ";");
      continue;
    }
    if (tok[0] == "*")
      ctx.default_version = VER_NDX_GLOBAL;
    else
      ctx.version_patterns.push_back({unquote(tok[0]), current_file<E>->name,
                                      "global", VER_NDX_GLOBAL, is_cpp});
    tok = skip(ctx, tok.subspan(1), ";");
  }
}
// Parses a --dynamic-list file, whose top level is "{ pattern; ... };".
template <typename E>
void parse_dynamic_list(Context<E> &ctx, MappedFile<Context<E>> *mf) {
  current_file<E> = mf;
  std::vector<std::string_view> tokens = tokenize(ctx, mf->get_contents());
  std::span<std::string_view> rest = tokens;
  rest = skip(ctx, rest, "{");
  read_dynamic_list_commands(ctx, rest, false);
  rest = skip(ctx, rest, "}");
  rest = skip(ctx, rest, ";");
  if (!rest.empty())
    SyntaxError(ctx, rest[0]) << "trailing garbage token";
}
// Explicit instantiations for the build-time-selected target.
using E = MOLD_TARGET;
template void parse_linker_script(Context<E> &, MappedFile<Context<E>> *);
template std::string_view get_script_output_type(Context<E> &, MappedFile<Context<E>> *);
template void parse_version_script(Context<E> &, MappedFile<Context<E>> *);
template void parse_dynamic_list(Context<E> &, MappedFile<Context<E>> *);
} // namespace mold::elf

739
third_party/mold/elf/lto-unix.cc vendored Normal file
View file

@ -0,0 +1,739 @@
// clang-format off
// This file handles the linker plugin to support LTO (Link-Time
// Optimization).
//
// LTO is a technique to do whole-program optimization to a program. Since
// a linker sees the whole program as opposed to a single compilation
// unit, it in theory can do some optimizations that cannot be done in the
// usual separate compilation model. For example, LTO should be able to
// inline functions that are defined in other compilation unit.
//
// In GCC and Clang, all you have to do to enable LTO is adding the
// `-flto` flag to the compiler and the linker command lines. If `-flto`
// is given, the compiler generates a file that contains not machine code
// but the compiler's IR (intermediate representation). In GCC, the output
// is an ELF file which wraps GCC's IR. In LLVM, it's not even an ELF file
// but just a raw LLVM IR file.
//
// Here is what we have to do if at least one input file is not a usual
// ELF file but an IR object file:
//
// 1. Read symbols both from usual ELF files and from IR object files and
// resolve symbols as usual.
//
// 2. Pass all IR objects to the compiler backend. The compiler backend
// compiles the IRs and returns a few big ELF object files as a
// result.
//
// 3. Parse the returned ELF files and overwrite IR object symbols with
// the returned ones, discarding IR object files.
//
// 4. Continue the rest of the linking process as usual.
//
// When gcc or clang inovkes ld, they pass `-plugin linker-plugin.so` to
// the linker. The given .so file provides a way to call the compiler
// backend.
//
// The linker plugin API is documented at
// https://gcc.gnu.org/wiki/whopr/driver, though the document is a bit
// outdated.
//
// Frankly, the linker plugin API is peculiar and is not very easy to use.
// For some reason, the API functions don't return the result of a
// function call as a return value but instead calls other function with
// the result as its argument to "return" the result.
//
// For example, the first thing you need to do after dlopen()'ing a linker
// plugin .so is to call `onload` function with a list of callback
// functions. `onload` calls callbacks to notify about the pointers to
// other functions the linker plugin provides. I don't know why `onload`
// can't just return a list of functions or why the linker plugin can't
// define not only `onload` but other functions, but that's what it is.
//
// Here is the steps to use the linker plugin:
//
// 1. dlopen() the linker plugin .so and call `onload` to obtain pointers
// to other functions provided by the plugin.
//
// 2. Call `claim_file_hook` with an IR object file to read its symbol
// table. `claim_file_hook` calls the `add_symbols` callback to
// "return" a list of symbols.
//
// 3. `claim_file_hook` returns LDPT_OK only when the plugin wants to
// handle a given file. Since we pass only IR object files to the
// plugin in mold, it always returns LDPT_OK in our case.
//
// 4. Once we made a decision as to which object file to include into the
// output file, we call `all_symbols_read_hook` to compile IR objects
// into a few big ELF files. That function calls the `get_symbols`
// callback to ask us about the symbol resolution results. (The
// compiler backend needs to know whether an undefined symbol in an IR
// object was resolved to a regular object file or a shared object to
// do whole program optimization, for example.)
//
// 5. `all_symbols_read_hook` "returns" the result by calling the
// `add_input_file` callback. The callback is called with a path to an
// LTO'ed ELF file. We parse that ELF file and override symbols
// defined by IR objects with the ELF file's ones.
//
// 6. Lastly, we call `cleanup_hook` to remove temporary files created by
// the compiler backend.
#include "third_party/mold/elf/mold.h"
#include "third_party/mold/elf/lto.h"
#include "third_party/libcxx/cstdarg"
#include "third_party/libcxx/cstring"
#include "libc/runtime/dlfcn.h"
#include "libc/calls/calls.h"
#include "libc/calls/struct/flock.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/at.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fd.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/posix.h"
#include "libc/sysv/consts/s.h"
#include "libc/sysv/consts/splice.h"
#include "third_party/libcxx/sstream"
// MISSING #include <tbb/parallel_for_each.h>
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/runtime/pathconf.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/sysconf.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fileno.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/ok.h"
#include "libc/time/time.h"
#include "third_party/getopt/getopt.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/lockf.h"
#if 0
# define LOG std::cerr
#else
# define LOG std::ostringstream()
#endif
namespace mold::elf {
// Global variables
// We store LTO-related information to global variables,
// as the LTO plugin is not thread-safe by design anyway.

// Linker context, stashed here so plain-C-style plugin callbacks can reach it.
template <typename E> static Context<E> *gctx;
// ELF objects the plugin "returned" via add_input_file().
template <typename E> static std::vector<ObjectFile<E> *> lto_objects;
// 0 = plugin not loaded, 1 = reading symbols, 2 = LTO compilation running.
static int phase = 0;
// Scratch buffer filled by add_symbols() during claim_file_hook().
static std::vector<PluginSymbol> plugin_symbols;
// Plugin entry points registered through the onload() transfer vector.
static ClaimFileHandler *claim_file_hook;
static AllSymbolsReadHandler *all_symbols_read_hook;
static CleanupHandler *cleanup_hook;
// Set when the plugin negotiates linker API v1 (see get_api_version()).
static bool is_gcc_linker_api_v1 = false;
// Event handlers
// Plugin callback: formats a printf-style diagnostic from the plugin and
// routes it to the matching mold channel. LDPL_ERROR/LDPL_FATAL terminate
// the link via Fatal.
template <typename E>
static PluginStatus message(PluginLevel level, const char *fmt, ...) {
  LOG << "message\n";
  Context<E> &ctx = *gctx<E>;
  // Messages longer than this buffer are silently truncated by vsnprintf.
  char buf[1000];
  va_list ap;
  va_start(ap, fmt);
  vsnprintf(buf, sizeof(buf), fmt, ap);
  va_end(ap);
  switch (level) {
  case LDPL_INFO:
    SyncOut(ctx) << buf;
    break;
  case LDPL_WARNING:
    Warn(ctx) << buf;
    break;
  case LDPL_ERROR:
  case LDPL_FATAL:
    Fatal(ctx) << buf;
  }
  return LDPS_OK;
}
// Plugin callback: remembers the plugin's claim_file function.
template <typename E>
static PluginStatus register_claim_file_hook(ClaimFileHandler fn) {
  claim_file_hook = fn;
  LOG << "register_claim_file_hook\n";
  return LDPS_OK;
}
// Plugin callback: remembers the plugin's all_symbols_read function.
template <typename E>
static PluginStatus
register_all_symbols_read_hook(AllSymbolsReadHandler fn) {
  all_symbols_read_hook = fn;
  LOG << "register_all_symbols_read_hook\n";
  return LDPS_OK;
}
// Plugin callback: remembers the plugin's cleanup function, invoked later
// from lto_cleanup().
template <typename E>
static PluginStatus register_cleanup_hook(CleanupHandler fn) {
  cleanup_hook = fn;
  LOG << "register_cleanup_hook\n";
  return LDPS_OK;
}
// Plugin callback: receives the symbol table of the file currently being
// claimed. Valid only during phase 1 (symbol reading); read_lto_object()
// consumes `plugin_symbols` right after claim_file_hook() returns.
static PluginStatus
add_symbols(void *handle, int nsyms, const PluginSymbol *psyms) {
  LOG << "add_symbols: " << nsyms << "\n";
  assert(phase == 1);
  plugin_symbols.assign(psyms, psyms + nsyms);
  return LDPS_OK;
}
// Plugin callback: the compiler backend "returns" an LTO-compiled ELF
// file by calling this with its path. We parse it like a regular input
// object so its symbols override the ones from the IR files.
template <typename E>
static PluginStatus add_input_file(const char *path) {
  LOG << "add_input_file: " << path << "\n";
  Context<E> &ctx = *gctx<E>;
  // Give LTO outputs increasing priorities starting at 100.
  static i64 file_priority = 100;
  MappedFile<Context<E>> *mf = MappedFile<Context<E>>::must_open(ctx, path);
  ObjectFile<E> *file = ObjectFile<E>::create(ctx, mf, "", false);
  ctx.obj_pool.emplace_back(file);
  lto_objects<E>.push_back(file);
  file->priority = file_priority++;
  file->is_alive = true;
  file->parse(ctx);
  file->resolve_symbols(ctx);
  return LDPS_OK;
}
// Plugin callback: no-op stub — mold never hands input files back
// through this interface.
static PluginStatus
get_input_file(const void *handle, struct PluginInputFile *file) {
  LOG << "get_input_file\n";
  return LDPS_OK;
}
// Plugin callback: the plugin is done with a claimed file, so close its
// descriptor (if still open) to conserve fds.
template <typename E>
static PluginStatus release_input_file(const void *handle) {
  LOG << "release_input_file\n";
  ObjectFile<E> &obj = *(ObjectFile<E> *)handle;
  if (obj.mf->fd == -1)
    return LDPS_OK;
  close(obj.mf->fd);
  obj.mf->fd = -1;
  return LDPS_OK;
}
// Plugin callback: no-op stub; mold does not let the plugin add libraries.
static PluginStatus add_input_library(const char *path) {
  LOG << "add_input_library\n";
  return LDPS_OK;
}
// Plugin callback: no-op stub; extra search paths from the plugin are ignored.
static PluginStatus set_extra_library_path(const char *path) {
  LOG << "set_extra_library_path\n";
  return LDPS_OK;
}
// Plugin callback: exposes the raw mapped bytes of a claimed file.
template <typename E>
static PluginStatus get_view(const void *handle, const void **view) {
  LOG << "get_view\n";
  ObjectFile<E> *file = (ObjectFile<E> *)handle;
  *view = (void *)file->mf->data;
  return LDPS_OK;
}
// The section-related callbacks below are no-op stubs: mold does not
// implement the plugin's section-inspection/reordering interface.
static PluginStatus
get_input_section_count(const void *handle, int *count) {
  LOG << "get_input_section_count\n";
  return LDPS_OK;
}
static PluginStatus
get_input_section_type(const PluginSection section, int *type) {
  LOG << "get_input_section_type\n";
  return LDPS_OK;
}
static PluginStatus
get_input_section_name(const PluginSection section,
                       char **section_name) {
  LOG << "get_input_section_name\n";
  return LDPS_OK;
}
static PluginStatus
get_input_section_contents(const PluginSection section,
                           const char **section_contents,
                           size_t *len) {
  LOG << "get_input_section_contents\n";
  return LDPS_OK;
}
static PluginStatus
update_section_order(const PluginSection *section_list,
                     int num_sections) {
  LOG << "update_section_order\n";
  return LDPS_OK;
}
static PluginStatus allow_section_ordering() {
  LOG << "allow_section_ordering\n";
  return LDPS_OK;
}
// Registered for LDPT_GET_SYMBOLS but expected never to be called; mold
// relies on the v2/v3 flavors below.
static PluginStatus
get_symbols_v1(const void *handle, int nsyms, PluginSymbol *psyms) {
  unreachable();
}
// get_symbols teaches the LTO plugin as to how we have resolved symbols.
// The plugin uses the symbol resolution info to optimize the program.
//
// For example, if a definition in an IR file is not referenced by
// non-IR objects at all, the plugin may choose to completely inline
// that definition within the IR objects and remove the symbol from the
// LTO result. On the other hand, if a definition is referenced by a
// non-IR object, it has to keep the symbol in the LTO result.
template <typename E>
static PluginStatus
get_symbols(const void *handle, int nsyms, PluginSymbol *psyms, bool is_v2) {
  ObjectFile<E> &file = *(ObjectFile<E> *)handle;
  assert(file.is_lto_obj);
  // If file is an archive member which was not chosen to be included in
  // the final result, we need to make the plugin ignore all its symbols.
  // Only the v1/v3 protocols can express this (LDPS_NO_SYMS).
  if (!file.is_alive) {
    assert(!is_v2);
    for (int i = 0; i < nsyms; i++)
      psyms[i].resolution = LDPR_PREEMPTED_REG;
    return LDPS_NO_SYMS;
  }
  // Maps mold's resolution state for one symbol onto the plugin's
  // LDPR_* categories.
  auto get_resolution = [&](ElfSym<E> &esym, Symbol<E> &sym) {
    if (!sym.file)
      return LDPR_UNDEF;
    if (sym.file == &file) {
      // This IR file holds the chosen definition.
      if (sym.referenced_by_regular_obj)
        return LDPR_PREVAILING_DEF;
      if (sym.is_exported)
        return is_v2 ? LDPR_PREVAILING_DEF : LDPR_PREVAILING_DEF_IRONLY_EXP;
      return LDPR_PREVAILING_DEF_IRONLY;
    }
    if (sym.file->is_dso)
      return LDPR_RESOLVED_DYN;
    if (((ObjectFile<E> *)sym.file)->is_lto_obj && !sym.is_wrapped)
      return esym.is_undef() ? LDPR_RESOLVED_IR : LDPR_PREEMPTED_IR;
    return esym.is_undef() ? LDPR_RESOLVED_EXEC : LDPR_PREEMPTED_REG;
  };
  // Set the symbol resolution results to psyms. Index 0 of
  // elf_syms/symbols is the null symbol, hence the +1 offset.
  for (i64 i = 0; i < nsyms; i++) {
    ElfSym<E> &esym = file.elf_syms[i + 1];
    Symbol<E> &sym = *file.symbols[i + 1];
    psyms[i].resolution = get_resolution(esym, sym);
  }
  return LDPS_OK;
}
// This function restarts mold itself with `--:lto-pass2` and
// `--:ignore-ir-file` flags. We do this as a workaround for the old
// linker plugins that do not support the get_symbols_v3 API.
//
// get_symbols_v1 and get_symbols_v2 don't provide a way to ignore an
// object file we previously passed to the linker plugin. So we can't
// "unload" object files in archives that we ended up not choosing to
// include into the final output.
//
// As a workaround, we restart the linker with a list of object files
// the linker has to ignore, so that it won't read the object files
// from archives next time.
//
// This is an ugly hack and should be removed once GCC adopts the v3 API.
// Re-executes mold with --:lto-pass2 plus an --:ignore-ir-file= flag for
// every IR object that ended up excluded, so the second pass will not
// feed those files to the plugin again. Never returns (execv or _exit).
template <typename E>
static void restart_process(Context<E> &ctx) {
  std::vector<const char *> args;
  // The strdup'ed strings are deliberately never freed; the process
  // image is replaced by execv below.
  for (std::string_view arg : ctx.cmdline_args)
    args.push_back(strdup(std::string(arg).c_str()));
  for (std::unique_ptr<ObjectFile<E>> &file : ctx.obj_pool)
    if (file->is_lto_obj && !file->is_alive)
      args.push_back(strdup(("--:ignore-ir-file=" +
                             file->mf->get_identifier()).c_str()));
  args.push_back("--:lto-pass2");
  args.push_back(nullptr);
  // Flush C++ streams; buffered output would otherwise be lost across execv.
  std::cout << std::flush;
  std::cerr << std::flush;
  std::string self = get_self_path();
  execv(self.c_str(), (char * const *)args.data());
  std::cerr << "execv failed: " << errno_string() << "\n";
  _exit(1);
}
// Wrappers selecting the protocol flavor for get_symbols(). Note that
// only v2 passes is_v2=true: the LDPS_NO_SYMS path for dropped archive
// members asserts !is_v2, i.e. it is reachable only via v3 (and v1).
template <typename E>
static PluginStatus
get_symbols_v2(const void *handle, int nsyms, PluginSymbol *psyms) {
  LOG << "get_symbols_v2\n";
  return get_symbols<E>(handle, nsyms, psyms, true);
}
template <typename E>
static PluginStatus
get_symbols_v3(const void *handle, int nsyms, PluginSymbol *psyms) {
  LOG << "get_symbols_v3\n";
  return get_symbols<E>(handle, nsyms, psyms, false);
}
// The callbacks below are no-op stubs: mold does not implement the
// plugin's unique-segment, section-size, or wrap-symbol interfaces.
static PluginStatus allow_unique_segment_for_sections() {
  LOG << "allow_unique_segment_for_sections\n";
  return LDPS_OK;
}
static PluginStatus
unique_segment_for_sections(const char *segment_name,
                            uint64_t flags,
                            uint64_t align,
                            const PluginSection *section_list,
                            int num_sections) {
  LOG << "unique_segment_for_sections\n";
  return LDPS_OK;
}
static PluginStatus
get_input_section_alignment(const PluginSection section,
                            int *addralign) {
  LOG << "get_input_section_alignment\n";
  return LDPS_OK;
}
static PluginStatus
get_input_section_size(const PluginSection section, uint64_t *size) {
  LOG << "get_input_section_size\n";
  return LDPS_OK;
}
template <typename E>
static PluginStatus
register_new_input_hook(NewInputHandler fn) {
  LOG << "register_new_input_hook\n";
  return LDPS_OK;
}
static PluginStatus
get_wrap_symbols(uint64_t *num_symbols, const char ***wrap_symbols) {
  LOG << "get_wrap_symbols\n";
  return LDPS_OK;
}
// Plugin callback (GCC linker API): negotiates the plugin API version and
// reports the linker's identity. mold supports V0 and V1; requesting a
// higher minimum is fatal.
template <typename E>
static PluginLinkerAPIVersion
get_api_version(const char *plugin_identifier,
                unsigned plugin_version,
                int minimal_api_supported,
                int maximal_api_supported,
                const char **linker_identifier,
                const char **linker_version) {
  if (LAPI_V1 < minimal_api_supported)
    Fatal(*gctx<E>) << "LTO plugin does not support V0 or V1 API";
  // `static` so the buffer outlives this call: we hand the plugin a raw
  // pointer via *linker_version, and a function-local string would leave
  // it dangling as soon as we return.
  static std::string version = mold_version + "\0"s;
  *linker_identifier = "mold";
  *linker_version = version.data();
  if (LAPI_V1 <= maximal_api_supported) {
    is_gcc_linker_api_v1 = true;
    return LAPI_V1;
  }
  return LAPI_V0;
}
// dlopens the linker plugin named by -plugin and calls its onload()
// entry point with the full transfer vector of mold's callbacks and
// options. Must be called exactly once (read_lto_object guards this with
// std::call_once); advances the global state machine to phase 1.
template <typename E>
static void load_plugin(Context<E> &ctx) {
  assert(phase == 0);
  phase = 1;
  gctx<E> = &ctx;
  void *handle = dlopen(ctx.arg.plugin.c_str(), RTLD_NOW | RTLD_GLOBAL);
  if (!handle)
    Fatal(ctx) << "could not open plugin file: " << dlerror();
  OnloadFn *onload = (OnloadFn *)dlsym(handle, "onload");
  if (!onload)
    Fatal(ctx) << "failed to load plugin " << ctx.arg.plugin << ": "
               << dlerror();
  // Copies a transient string into context-owned storage so the C string
  // we hand the plugin stays valid after this function returns.
  auto save = [&](std::string_view str) {
    return save_string(ctx, std::string(str).c_str()).data();
  };
  // Build the transfer vector ("tv") that onload() consumes; it is a
  // list of (tag, value) pairs terminated by LDPT_NULL.
  std::vector<PluginTagValue> tv;
  tv.emplace_back(LDPT_MESSAGE, message<E>);
  // Tell the plugin what kind of output we are producing.
  if (ctx.arg.shared)
    tv.emplace_back(LDPT_LINKER_OUTPUT, LDPO_DYN);
  else if (ctx.arg.pie)
    tv.emplace_back(LDPT_LINKER_OUTPUT, LDPO_PIE);
  else
    tv.emplace_back(LDPT_LINKER_OUTPUT, LDPO_EXEC);
  // Forward every -plugin-opt argument verbatim.
  for (std::string_view opt : ctx.arg.plugin_opt)
    tv.emplace_back(LDPT_OPTION, save(opt));
  tv.emplace_back(LDPT_REGISTER_CLAIM_FILE_HOOK, register_claim_file_hook<E>);
  tv.emplace_back(LDPT_REGISTER_ALL_SYMBOLS_READ_HOOK,
                  register_all_symbols_read_hook<E>);
  tv.emplace_back(LDPT_REGISTER_CLEANUP_HOOK, register_cleanup_hook<E>);
  tv.emplace_back(LDPT_ADD_SYMBOLS, add_symbols);
  tv.emplace_back(LDPT_GET_SYMBOLS, get_symbols_v1);
  tv.emplace_back(LDPT_ADD_INPUT_FILE, add_input_file<E>);
  tv.emplace_back(LDPT_GET_INPUT_FILE, get_input_file);
  tv.emplace_back(LDPT_RELEASE_INPUT_FILE, release_input_file<E>);
  tv.emplace_back(LDPT_ADD_INPUT_LIBRARY, add_input_library);
  tv.emplace_back(LDPT_OUTPUT_NAME, save(ctx.arg.output));
  tv.emplace_back(LDPT_SET_EXTRA_LIBRARY_PATH, set_extra_library_path);
  tv.emplace_back(LDPT_GET_VIEW, get_view<E>);
  tv.emplace_back(LDPT_GET_INPUT_SECTION_COUNT, get_input_section_count);
  tv.emplace_back(LDPT_GET_INPUT_SECTION_TYPE, get_input_section_type);
  tv.emplace_back(LDPT_GET_INPUT_SECTION_NAME, get_input_section_name);
  tv.emplace_back(LDPT_GET_INPUT_SECTION_CONTENTS, get_input_section_contents);
  tv.emplace_back(LDPT_UPDATE_SECTION_ORDER, update_section_order);
  tv.emplace_back(LDPT_ALLOW_SECTION_ORDERING, allow_section_ordering);
  tv.emplace_back(LDPT_ADD_SYMBOLS_V2, add_symbols);
  tv.emplace_back(LDPT_GET_SYMBOLS_V2, get_symbols_v2<E>);
  tv.emplace_back(LDPT_ALLOW_UNIQUE_SEGMENT_FOR_SECTIONS,
                  allow_unique_segment_for_sections);
  tv.emplace_back(LDPT_UNIQUE_SEGMENT_FOR_SECTIONS, unique_segment_for_sections);
  tv.emplace_back(LDPT_GET_SYMBOLS_V3, get_symbols_v3<E>);
  tv.emplace_back(LDPT_GET_INPUT_SECTION_ALIGNMENT, get_input_section_alignment);
  tv.emplace_back(LDPT_GET_INPUT_SECTION_SIZE, get_input_section_size);
  tv.emplace_back(LDPT_REGISTER_NEW_INPUT_HOOK, register_new_input_hook<E>);
  tv.emplace_back(LDPT_GET_WRAP_SYMBOLS, get_wrap_symbols);
  tv.emplace_back(LDPT_GET_API_VERSION, get_api_version<E>);
  tv.emplace_back(LDPT_NULL, 0);
  // onload() "returns" the plugin's functions by invoking the register_*
  // callbacks above; its direct return value only signals success.
  [[maybe_unused]] PluginStatus status = onload(tv.data());
  assert(status == LDPS_OK);
}
// Converts a plugin-reported symbol into a mold ELF symbol. Definedness,
// binding, type, visibility, and size are mapped; all other fields stay
// zeroed. Defined symbols get SHN_ABS as a placeholder section index —
// they are replaced by real ELF symbols after the LTO compilation.
template <typename E>
static ElfSym<E> to_elf_sym(PluginSymbol &psym) {
  ElfSym<E> esym;
  memset(&esym, 0, sizeof(esym));
  // Definedness and binding.
  switch (psym.def) {
  case LDPK_DEF:
    esym.st_shndx = SHN_ABS;
    break;
  case LDPK_WEAKDEF:
    esym.st_shndx = SHN_ABS;
    esym.st_bind = STB_WEAK;
    break;
  case LDPK_UNDEF:
    esym.st_shndx = SHN_UNDEF;
    break;
  case LDPK_WEAKUNDEF:
    esym.st_shndx = SHN_UNDEF;
    esym.st_bind = STB_WEAK;
    break;
  case LDPK_COMMON:
    esym.st_shndx = SHN_COMMON;
    break;
  }
  // Symbol type.
  switch (psym.symbol_type) {
  case LDST_UNKNOWN:
    break;
  case LDST_FUNCTION:
    esym.st_type = STT_FUNC;
    break;
  case LDST_VARIABLE:
    esym.st_type = STT_OBJECT;
    break;
  };
  // Visibility.
  switch (psym.visibility) {
  case LDPV_DEFAULT:
    break;
  case LDPV_PROTECTED:
    esym.st_visibility = STV_PROTECTED;
    break;
  case LDPV_INTERNAL:
    esym.st_visibility = STV_INTERNAL;
    break;
  case LDPV_HIDDEN:
    esym.st_visibility = STV_HIDDEN;
    break;
  }
  esym.st_size = psym.size;
  return esym;
}
// Returns true if a given linker plugin looks like LLVM's one.
// Returns false if it's GCC. Note this is a file-name heuristic: a
// renamed LLVMgold.so would be misdetected as GCC's plugin.
template <typename E>
static bool is_llvm(Context<E> &ctx) {
  return ctx.arg.plugin.ends_with("LLVMgold.so");
}
// Returns true if a given linker plugin supports the get_symbols_v3 API.
// Any version of LLVM and GCC 12 or newer support it.
template <typename E>
static bool supports_v3_api(Context<E> &ctx) {
  return is_gcc_linker_api_v1 || is_llvm(ctx);
}
// Reads the symbol table of an IR object file by handing the file to the
// linker plugin (loading the plugin first if needed) and turning the
// symbols it reports back into a mold ObjectFile.
template <typename E>
ObjectFile<E> *read_lto_object(Context<E> &ctx, MappedFile<Context<E>> *mf) {
  // V0 API's claim_file is not thread-safe.
  static std::mutex mu;
  std::unique_lock lock(mu, std::defer_lock);
  if (!is_gcc_linker_api_v1)
    lock.lock();
  if (ctx.arg.plugin.empty())
    Fatal(ctx) << mf->name << ": don't know how to handle this LTO object file "
               << "because no -plugin option was given. Please make sure you "
               << "added -flto not only for creating object files but also for "
               << "creating the final executable.";
  // dlopen the linker plugin file
  static std::once_flag flag;
  std::call_once(flag, [&] { load_plugin(ctx); });
  // Create mold's object instance
  ObjectFile<E> *obj = new ObjectFile<E>;
  ctx.obj_pool.emplace_back(obj);
  obj->filename = mf->name;
  obj->symbols.push_back(new Symbol<E>);  // slot 0 is the null symbol
  obj->first_global = 1;
  obj->is_lto_obj = true;
  obj->mf = mf;
  // Create plugin's object instance. For an archive member, the plugin
  // gets the archive's fd plus an offset into it.
  PluginInputFile file = {};
  MappedFile<Context<E>> *mf2 = mf->parent ? mf->parent : mf;
  file.name = save_string(ctx, mf2->name).data();
  if (mf2->fd == -1)
    mf2->fd = open(file.name, O_RDONLY);
  file.fd = mf2->fd;
  if (file.fd == -1)
    Fatal(ctx) << "cannot open " << file.name << ": " << errno_string();
  if (mf->parent)
    obj->archive_name = mf->parent->name;
  file.offset = mf->get_offset();
  file.filesize = mf->size;
  file.handle = (void *)obj;
  LOG << "read_lto_symbols: " << mf->name << "\n";
  // claim_file_hook() calls add_symbols() which initializes `plugin_symbols`
  int claimed = false;
  claim_file_hook(&file, &claimed);
  if (!claimed)
    Fatal(ctx) << mf->name << ": not claimed by the LTO plugin;"
               << " please make sure you are using the same compiler of the"
               << " same version for all object files";
  // It looks like GCC doesn't need fd after claim_file_hook() while
  // LLVM needs it and takes the ownership of fd. To prevent "too many
  // open files" issue, we close fd only for GCC. This is ugly, though.
  if (!is_llvm(ctx)) {
    close(mf2->fd);
    mf2->fd = -1;
  }
  // Initialize object symbols
  // NOTE(review): this vector is heap-allocated and never freed on
  // purpose, it appears — obj->elf_syms seems to retain a view into it,
  // so it must outlive this function. Confirm before "fixing" the leak.
  std::vector<ElfSym<E>> *esyms = new std::vector<ElfSym<E>>(1);
  for (PluginSymbol &psym : plugin_symbols) {
    esyms->push_back(to_elf_sym<E>(psym));
    obj->symbols.push_back(get_symbol(ctx, save_string(ctx, psym.name)));
  }
  obj->elf_syms = *esyms;
  obj->has_symver.resize(esyms->size());
  plugin_symbols.clear();
  return obj;
}
// Entry point
// Drive link-time optimization: tell the plugin which symbols must
// survive, then invoke its all_symbols_read hook (which hands back the
// compiled ELF objects via add_input_file()/add_input_library()).
template <typename E>
std::vector<ObjectFile<E> *> do_lto(Context<E> &ctx) {
  Timer t(ctx, "do_lto");

  if (!ctx.arg.lto_pass2 && !supports_v3_api(ctx))
    restart_process(ctx);

  assert(phase == 1);
  phase = 2;

  // Mark every symbol that a non-LTO object resolves to an LTO object,
  // so the plugin knows regular code references it.
  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *obj) {
    if (obj->is_lto_obj)
      return;

    for (i64 idx = obj->first_global; idx < (i64)obj->symbols.size(); idx++) {
      Symbol<E> &sym = *obj->symbols[idx];
      bool defined_in_lto_obj =
        sym.file && !sym.file->is_dso && ((ObjectFile<E> *)sym.file)->is_lto_obj;
      if (defined_in_lto_obj) {
        std::scoped_lock lock(sym.mu);
        sym.referenced_by_regular_obj = true;
      }
    }
  });

  // Symbols named by --wrap (plus their __wrap_/__real_ aliases) must
  // also stay visible from regular object files.
  for (std::string_view name : ctx.arg.wrap) {
    get_symbol(ctx, name)->referenced_by_regular_obj = true;

    std::string_view wrap_name = save_string(ctx, "__wrap_" + std::string(name));
    std::string_view real_name = save_string(ctx, "__real_" + std::string(name));

    get_symbol(ctx, wrap_name)->referenced_by_regular_obj = true;
    get_symbol(ctx, real_name)->referenced_by_regular_obj = true;
  }

  // all_symbols_read_hook() calls add_input_file() and add_input_library()
  LOG << "all symbols read\n";
  if (PluginStatus status = all_symbols_read_hook(); status != LDPS_OK)
    Fatal(ctx) << "LTO: all_symbols_read_hook returns " << status;

  return lto_objects<E>;
}
// Invoke the plugin's registered cleanup hook, if it installed one.
template <typename E>
void lto_cleanup(Context<E> &ctx) {
  Timer timer(ctx, "lto_cleanup");
  if (cleanup_hook)
    cleanup_hook();
}
// Explicit instantiations for the concrete target (MOLD_TARGET expands
// to one target type per translation unit, e.g. X86_64).
using E = MOLD_TARGET;
template ObjectFile<E> *read_lto_object(Context<E> &, MappedFile<Context<E>> *);
template std::vector<ObjectFile<E> *> do_lto(Context<E> &);
template void lto_cleanup(Context<E> &);
} // namespace mold::elf

26
third_party/mold/elf/lto-win32.cc vendored Normal file
View file

@ -0,0 +1,26 @@
// clang-format off
#include "third_party/mold/elf/mold.h"
#include "third_party/mold/elf/lto.h"
namespace mold::elf {
// Windows build stub: mold's LTO support is built on the Unix linker
// plugin mechanism, so encountering an LTO object here is fatal.
template <typename E>
ObjectFile<E> *read_lto_object(Context<E> &ctx, MappedFile<Context<E>> *mf) {
  Fatal(ctx) << "LTO is not supported on Windows";
}
// Windows build stub: no LTO objects can ever exist, so there is
// nothing to compile; return an empty list.
template <typename E>
std::vector<ObjectFile<E> *> do_lto(Context<E> &ctx) {
  return {};
}
// Windows build stub: no plugin was loaded, so nothing to clean up.
template <typename E>
void lto_cleanup(Context<E> &ctx) {}
// Explicit instantiations for the concrete target (MOLD_TARGET expands
// to one target type per translation unit).
using E = MOLD_TARGET;
template ObjectFile<E> *read_lto_object(Context<E> &, MappedFile<Context<E>> *);
template std::vector<ObjectFile<E> *> do_lto(Context<E> &);
template void lto_cleanup(Context<E> &);
} // namespace mold::elf

6
third_party/mold/elf/lto.cc vendored Normal file
View file

@ -0,0 +1,6 @@
// clang-format off
#ifdef _WIN32
#include "third_party/mold/elf/lto-win32.cc"
#else
#include "third_party/mold/elf/lto-unix.cc"
#endif

167
third_party/mold/elf/lto.h vendored Normal file
View file

@ -0,0 +1,167 @@
// clang-format off
#pragma once
// MISSING #include "../common/integers.h"
namespace mold {
// Status code returned by the plugin and by linker-provided callbacks.
// These definitions mirror the GNU ld linker plugin interface
// (plugin-api.h). Enumerator order fixes the ABI values exchanged with
// compiler plugins — never reorder or insert in the middle.
enum PluginStatus {
  LDPS_OK,
  LDPS_NO_SYMS,
  LDPS_BAD_HANDLE,
  LDPS_ERR,
};

// Tag for each entry of the transfer vector handed to the plugin's
// onload() entry point; the tag selects how the entry's payload
// (int or pointer) is interpreted.
enum PluginTag {
  LDPT_NULL,
  LDPT_API_VERSION,
  LDPT_GOLD_VERSION,
  LDPT_LINKER_OUTPUT,
  LDPT_OPTION,
  LDPT_REGISTER_CLAIM_FILE_HOOK,
  LDPT_REGISTER_ALL_SYMBOLS_READ_HOOK,
  LDPT_REGISTER_CLEANUP_HOOK,
  LDPT_ADD_SYMBOLS,
  LDPT_GET_SYMBOLS,
  LDPT_ADD_INPUT_FILE,
  LDPT_MESSAGE,
  LDPT_GET_INPUT_FILE,
  LDPT_RELEASE_INPUT_FILE,
  LDPT_ADD_INPUT_LIBRARY,
  LDPT_OUTPUT_NAME,
  LDPT_SET_EXTRA_LIBRARY_PATH,
  LDPT_GNU_LD_VERSION,
  LDPT_GET_VIEW,
  LDPT_GET_INPUT_SECTION_COUNT,
  LDPT_GET_INPUT_SECTION_TYPE,
  LDPT_GET_INPUT_SECTION_NAME,
  LDPT_GET_INPUT_SECTION_CONTENTS,
  LDPT_UPDATE_SECTION_ORDER,
  LDPT_ALLOW_SECTION_ORDERING,
  LDPT_GET_SYMBOLS_V2,
  LDPT_ALLOW_UNIQUE_SEGMENT_FOR_SECTIONS,
  LDPT_UNIQUE_SEGMENT_FOR_SECTIONS,
  LDPT_GET_SYMBOLS_V3,
  LDPT_GET_INPUT_SECTION_ALIGNMENT,
  LDPT_GET_INPUT_SECTION_SIZE,
  LDPT_REGISTER_NEW_INPUT_HOOK,
  LDPT_GET_WRAP_SYMBOLS,
  LDPT_ADD_SYMBOLS_V2,
  LDPT_GET_API_VERSION,
};

// Plugin API version this linker implements (LDPT_API_VERSION payload).
enum PluginApiVersion {
  LD_PLUGIN_API_VERSION = 1,
};
// One entry of the onload() transfer vector: a tag plus either an
// integer or a pointer payload, selected by the tag.
struct PluginTagValue {
  PluginTagValue(PluginTag tag, int val) : tag(tag), val(val) {}

  template <typename T>
  PluginTagValue(PluginTag tag, T *ptr) : tag(tag), ptr((void *)ptr) {}

  PluginTag tag;
  union {
    int val;
    void *ptr;
  };
};

// Kind of output being produced, reported via LDPT_LINKER_OUTPUT.
enum PluginOutputFileType {
  LDPO_REL,
  LDPO_EXEC,
  LDPO_DYN,
  LDPO_PIE,
};

// Describes an input file passed to the plugin's claim-file hook.
struct PluginInputFile {
  const char *name;
  i32 fd;          // open descriptor; LLVM's plugin takes ownership of it
  u64 offset;      // byte offset of the member when inside an archive
  u64 filesize;
  void *handle;    // linker-private cookie (mold stores the ObjectFile *)
};
// Identifies one section of a claimed input file.
struct PluginSection {
  const void *handle;
  u32 shndx;
};

// Symbol record exchanged with the plugin (add_symbols/get_symbols).
// NOTE(review): the endianness-dependent ordering of the four u8
// fields appears intended to match the packed-int field layout of
// GNU plugin-api.h on each host byte order — confirm against that
// header before changing anything here.
struct PluginSymbol {
  char *name;
  char *version;
#ifdef __LITTLE_ENDIAN__
  u8 def;            // a PluginSymbolKind value
  u8 symbol_type;    // a PluginSymbolType value
  u8 section_kind;   // a PluginSymbolSectionKind value
  u8 padding;
#else
  u8 padding;
  u8 section_kind;
  u8 symbol_type;
  u8 def;
#endif
  i32 visibility;    // a PluginSymbolVisibility value
  u64 size;
  char *comdat_key;
  i32 resolution;    // a PluginSymbolResolution value
};
// Definition strength of a plugin symbol (PluginSymbol::def).
// Enumerator order fixes the ABI values — do not reorder.
enum PluginSymbolKind {
  LDPK_DEF,
  LDPK_WEAKDEF,
  LDPK_UNDEF,
  LDPK_WEAKUNDEF,
  LDPK_COMMON,
};

// ELF-style visibility of a plugin symbol (PluginSymbol::visibility).
enum PluginSymbolVisibility {
  LDPV_DEFAULT,
  LDPV_PROTECTED,
  LDPV_INTERNAL,
  LDPV_HIDDEN,
};

// Rough type of a plugin symbol (PluginSymbol::symbol_type).
enum PluginSymbolType {
  LDST_UNKNOWN,
  LDST_FUNCTION,
  LDST_VARIABLE,
};

// Section classification of a plugin symbol (PluginSymbol::section_kind).
enum PluginSymbolSectionKind {
  LDSSK_DEFAULT,
  LDSSK_BSS,
};

// How the linker resolved a symbol; reported back to the plugin
// through get_symbols (PluginSymbol::resolution).
enum PluginSymbolResolution {
  LDPR_UNKNOWN,
  LDPR_UNDEF,
  LDPR_PREVAILING_DEF,
  LDPR_PREVAILING_DEF_IRONLY,
  LDPR_PREEMPTED_REG,
  LDPR_PREEMPTED_IR,
  LDPR_RESOLVED_IR,
  LDPR_RESOLVED_EXEC,
  LDPR_RESOLVED_DYN,
  LDPR_PREVAILING_DEF_IRONLY_EXP,
};

// Severity for the plugin's LDPT_MESSAGE callback.
enum PluginLevel {
  LDPL_INFO,
  LDPL_WARNING,
  LDPL_ERROR,
  LDPL_FATAL,
};

// Linker-side API revision, reported via LDPT_GET_API_VERSION.
enum PluginLinkerAPIVersion {
  LAPI_V0,
  LAPI_V1,
};
// Function types for the plugin's onload() entry point and for the
// hooks it registers through the transfer vector.
typedef PluginStatus OnloadFn(PluginTagValue *tv);
typedef PluginStatus ClaimFileHandler(const PluginInputFile *, int *);
typedef PluginStatus AllSymbolsReadHandler();
typedef PluginStatus CleanupHandler();
typedef PluginStatus NewInputHandler(const PluginInputFile *);
} // namespace mold

812
third_party/mold/elf/main.cc vendored Normal file
View file

@ -0,0 +1,812 @@
// clang-format off
#include "third_party/mold/elf/mold.h"
// MISSING #include "../common/archive-file.h"
// MISSING #include "../common/cmdline.h"
// MISSING #include "../common/output-file.h"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/functional"
#include "third_party/libcxx/iomanip"
#include "third_party/libcxx/map"
#include "third_party/libcxx/regex"
#include "libc/calls/calls.h"
#include "libc/calls/sigtimedwait.h"
#include "libc/calls/struct/sigaction.h"
#include "libc/calls/struct/siginfo.h"
#include "libc/sysv/consts/sa.h"
#include "libc/sysv/consts/sicode.h"
#include "libc/sysv/consts/ss.h"
#include "libc/calls/calls.h"
#include "libc/calls/struct/stat.h"
#include "libc/calls/struct/stat.macros.h"
#include "libc/calls/struct/timespec.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/s.h"
#include "libc/sysv/consts/utime.h"
#include "libc/time/time.h"
#include "libc/calls/makedev.h"
#include "libc/calls/weirdtypes.h"
#include "libc/thread/thread.h"
#include "libc/calls/typedef/u.h"
#include "libc/calls/weirdtypes.h"
#include "libc/intrin/newbie.h"
#include "libc/sock/select.h"
#include "libc/sysv/consts/endian.h"
// MISSING #include <tbb/global_control.h>
// MISSING #include <tbb/parallel_for_each.h>
#include "third_party/libcxx/unordered_set"
#ifdef _WIN32
// MISSING #include <direct.h>
// On Windows the POSIX chdir() is spelled _chdir() in the CRT, so map
// the name the code below uses onto the CRT one (upstream mold does
// `#define chdir _chdir`; the transcribed direction was reversed and
// would leave chdir() unresolved on a real MSVC build).
# define chdir _chdir
#else
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/runtime/pathconf.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/sysconf.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fileno.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/ok.h"
#include "libc/time/time.h"
#include "third_party/getopt/getopt.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/lockf.h"
#endif
namespace mold::elf {
// Read the beginning of a given file and returns its machine type
// (e.g. EM_X86_64 or EM_386).
template <typename E>
std::string_view get_machine_type(Context<E> &ctx, MappedFile<Context<E>> *mf) {
  // Decode e_machine from a raw ELF header. The ElfEhdr<I386> /
  // ElfEhdr<M68K> casts merely select a little-endian or big-endian
  // view of the header fields; any LE/BE target type would do.
  auto get_elf_type = [&](u8 *buf) -> std::string_view {
    bool is_le = (((ElfEhdr<I386> *)buf)->e_ident[EI_DATA] == ELFDATA2LSB);
    bool is_64;
    u32 e_machine;

    if (is_le) {
      auto &ehdr = *(ElfEhdr<I386> *)buf;
      is_64 = (ehdr.e_ident[EI_CLASS] == ELFCLASS64);
      e_machine = ehdr.e_machine;
    } else {
      auto &ehdr = *(ElfEhdr<M68K> *)buf;
      is_64 = (ehdr.e_ident[EI_CLASS] == ELFCLASS64);
      e_machine = ehdr.e_machine;
    }

    switch (e_machine) {
    case EM_386:
      return I386::target_name;
    case EM_X86_64:
      return X86_64::target_name;
    case EM_ARM:
      return ARM32::target_name;
    case EM_AARCH64:
      return ARM64::target_name;
    case EM_RISCV:
      // RISC-V comes in four flavors: {32,64}-bit x {little,big}-endian.
      if (is_le)
        return is_64 ? RV64LE::target_name : RV32LE::target_name;
      return is_64 ? RV64BE::target_name : RV32BE::target_name;
    case EM_PPC:
      return PPC32::target_name;
    case EM_PPC64:
      // Little-endian PPC64 uses the ELFv2 ABI, big-endian ELFv1.
      return is_le ? PPC64V2::target_name : PPC64V1::target_name;
    case EM_S390X:
      return S390X::target_name;
    case EM_SPARC64:
      return SPARC64::target_name;
    case EM_68K:
      return M68K::target_name;
    case EM_SH:
      return SH4::target_name;
    case EM_ALPHA:
      return ALPHA::target_name;
    default:
      return "";
    }
  };

  switch (get_file_type(ctx, mf)) {
  case FileType::ELF_OBJ:
  case FileType::ELF_DSO:
  case FileType::GCC_LTO_OBJ:
    return get_elf_type(mf->data);
  case FileType::AR:
    // For an archive, the first ELF object member decides the type.
    for (MappedFile<Context<E>> *child : read_fat_archive_members(ctx, mf))
      if (get_file_type(ctx, child) == FileType::ELF_OBJ)
        return get_elf_type(child->data);
    return "";
  case FileType::THIN_AR:
    for (MappedFile<Context<E>> *child : read_thin_archive_members(ctx, mf))
      if (get_file_type(ctx, child) == FileType::ELF_OBJ)
        return get_elf_type(child->data);
    return "";
  case FileType::TEXT:
    // A linker script may declare its output format; use that.
    return get_script_output_type(ctx, mf);
  default:
    return "";
  }
}
// Abort with a diagnostic unless `mf` is for the machine type we are
// linking for (ctx.arg.emulation).
template <typename E>
static void
check_file_compatibility(Context<E> &ctx, MappedFile<Context<E>> *mf) {
  std::string_view detected = get_machine_type(ctx, mf);
  if (detected == ctx.arg.emulation)
    return;
  Fatal(ctx) << mf->name << ": incompatible file type: "
             << ctx.arg.emulation << " is expected but got " << detected;
}
// Create an ObjectFile for a regular ELF relocatable file and schedule
// its parsing on the worker task group. `archive_name` is empty for a
// file given directly on the command line.
template <typename E>
static ObjectFile<E> *new_object_file(Context<E> &ctx, MappedFile<Context<E>> *mf,
                                      std::string archive_name) {
  static Counter num_parsed_objs("parsed_objs");
  num_parsed_objs++;

  check_file_compatibility(ctx, mf);

  // Archive members (without --whole-archive) and files between
  // --start-lib/--end-lib are treated as library members.
  bool lazy = ctx.in_lib || (!archive_name.empty() && !ctx.whole_archive);

  ObjectFile<E> *obj = ObjectFile<E>::create(ctx, mf, archive_name, lazy);
  obj->priority = ctx.file_priority++;
  ctx.tg.run([obj, &ctx] { obj->parse(ctx); });
  if (ctx.arg.trace)
    SyncOut(ctx) << "trace: " << *obj;
  return obj;
}
// Create an ObjectFile backed by the LTO plugin for a GCC/LLVM IR file.
// Returns nullptr when the file is listed in --ignore-ir-file.
template <typename E>
static ObjectFile<E> *new_lto_obj(Context<E> &ctx, MappedFile<Context<E>> *mf,
                                  std::string archive_name) {
  static Counter num_parsed_lto_objs("parsed_lto_objs");
  num_parsed_lto_objs++;

  if (ctx.arg.ignore_ir_file.count(mf->get_identifier()))
    return nullptr;

  ObjectFile<E> *obj = read_lto_object(ctx, mf);
  obj->priority = ctx.file_priority++;
  obj->archive_name = archive_name;
  obj->is_in_lib = ctx.in_lib || (!archive_name.empty() && !ctx.whole_archive);
  obj->is_alive = !obj->is_in_lib;
  ctx.has_lto_object = true;
  if (ctx.arg.trace)
    SyncOut(ctx) << "trace: " << *obj;
  return obj;
}
// Create a SharedFile for a DSO input and schedule its parsing on the
// worker task group.
template <typename E>
static SharedFile<E> *
new_shared_file(Context<E> &ctx, MappedFile<Context<E>> *mf) {
  check_file_compatibility(ctx, mf);

  SharedFile<E> *dso = SharedFile<E>::create(ctx, mf);
  dso->priority = ctx.file_priority++;
  ctx.tg.run([dso, &ctx] { dso->parse(ctx); });
  if (ctx.arg.trace)
    SyncOut(ctx) << "trace: " << *dso;
  return dso;
}
// Load one input file, dispatching on its detected type. Archives are
// expanded member by member; linker scripts are parsed (which may load
// further files). DSOs and archives are recorded in ctx.visited so
// that a path given twice is only read once; plain object files are
// deliberately not registered there.
template <typename E>
void read_file(Context<E> &ctx, MappedFile<Context<E>> *mf) {
  if (ctx.visited.contains(mf->name))
    return;

  switch (get_file_type(ctx, mf)) {
  case FileType::ELF_OBJ:
    ctx.objs.push_back(new_object_file(ctx, mf, ""));
    return;
  case FileType::ELF_DSO:
    ctx.dsos.push_back(new_shared_file(ctx, mf));
    ctx.visited.insert(mf->name);
    return;
  case FileType::AR:
  case FileType::THIN_AR:
    for (MappedFile<Context<E>> *child : read_archive_members(ctx, mf)) {
      switch (get_file_type(ctx, child)) {
      case FileType::ELF_OBJ:
        ctx.objs.push_back(new_object_file(ctx, child, mf->name));
        break;
      case FileType::GCC_LTO_OBJ:
      case FileType::LLVM_BITCODE:
        // new_lto_obj() returns nullptr for --ignore-ir-file entries.
        if (ObjectFile<E> *file = new_lto_obj(ctx, child, mf->name))
          ctx.objs.push_back(file);
        break;
      case FileType::ELF_DSO:
        Warn(ctx) << mf->name << "(" << child->name
                  << "): shared object file in an archive is ignored";
        break;
      default:
        // Unknown members (e.g. archive metadata) are silently skipped.
        break;
      }
    }
    ctx.visited.insert(mf->name);
    return;
  case FileType::TEXT:
    parse_linker_script(ctx, mf);
    return;
  case FileType::GCC_LTO_OBJ:
  case FileType::LLVM_BITCODE:
    if (ObjectFile<E> *file = new_lto_obj(ctx, mf, ""))
      ctx.objs.push_back(file);
    return;
  default:
    Fatal(ctx) << mf->name << ": unknown file type";
  }
}
// Infer the output machine type from the first positional argument
// that names an openable file of a recognizable type. Fatal if no
// file reveals one (the user must then pass -m explicitly).
template <typename E>
static std::string_view
deduce_machine_type(Context<E> &ctx, std::span<std::string> args) {
  for (std::string_view arg : args) {
    if (arg.starts_with('-'))
      continue;
    auto *mf = MappedFile<Context<E>>::open(ctx, std::string(arg));
    if (!mf)
      continue;
    std::string_view target = get_machine_type(ctx, mf);
    if (!target.empty())
      return target;
  }
  Fatal(ctx) << "-m option is missing";
}
// Open a library candidate. Returns nullptr if the path cannot be
// opened, or if the file is for a different architecture (with a
// warning), so the caller can keep searching.
template <typename E>
MappedFile<Context<E>> *open_library(Context<E> &ctx, std::string path) {
  MappedFile<Context<E>> *mf = MappedFile<Context<E>>::open(ctx, path);
  if (!mf)
    return nullptr;

  std::string_view machine = get_machine_type(ctx, mf);
  if (!machine.empty() && machine != E::target_name) {
    Warn(ctx) << path << ": skipping incompatible file " << machine
              << " " << (int)E::e_machine;
    return nullptr;
  }
  return mf;
}
// Resolve a -l<name> option. "-l:foo" looks for the literal filename
// "foo"; otherwise each -L directory is tried for lib<name>.so (unless
// linking statically) and then lib<name>.a. Fatal when nothing matches.
template <typename E>
MappedFile<Context<E>> *find_library(Context<E> &ctx, std::string name) {
  // The "-l:filename" form requests an exact-filename search.
  if (name.starts_with(':')) {
    for (std::string_view dir : ctx.arg.library_paths) {
      std::string candidate = std::string(dir) + "/" + name.substr(1);
      if (MappedFile<Context<E>> *mf = open_library(ctx, candidate))
        return mf;
    }
    Fatal(ctx) << "library not found: " << name;
  }

  for (std::string_view dir : ctx.arg.library_paths) {
    std::string base = std::string(dir) + "/lib" + name;
    if (!ctx.is_static) {
      if (MappedFile<Context<E>> *mf = open_library(ctx, base + ".so"))
        return mf;
    }
    if (MappedFile<Context<E>> *mf = open_library(ctx, base + ".a"))
      return mf;
  }
  Fatal(ctx) << "library not found: " << name;
}
// Resolve `name` either as given or relative to each -L directory.
// Returns nullptr when the file is nowhere to be found.
template <typename E>
MappedFile<Context<E>> *find_from_search_paths(Context<E> &ctx, std::string name) {
  if (MappedFile<Context<E>> *mf = MappedFile<Context<E>>::open(ctx, name))
    return mf;

  for (std::string_view dir : ctx.arg.library_paths) {
    std::string path = std::string(dir) + "/" + name;
    if (MappedFile<Context<E>> *mf = MappedFile<Context<E>>::open(ctx, path))
      return mf;
  }
  return nullptr;
}
// Walk the positional command line in order, toggling reader state
// (--as-needed, --Bstatic, ...) and loading every named input file.
// Note: remove_prefix() both tests for and strips the prefix from
// `arg` in place, so on a match `arg` is the option's value.
template <typename E>
static void read_input_files(Context<E> &ctx, std::span<std::string> args) {
  Timer t(ctx, "read_input_files");

  // Snapshots of (as_needed, whole_archive, is_static, in_lib) saved
  // by --push-state and restored by --pop-state.
  std::vector<std::tuple<bool, bool, bool, bool>> state;
  ctx.is_static = ctx.arg.is_static;

  while (!args.empty()) {
    std::string_view arg = args[0];
    args = args.subspan(1);

    if (arg == "--as-needed") {
      ctx.as_needed = true;
    } else if (arg == "--no-as-needed") {
      ctx.as_needed = false;
    } else if (arg == "--whole-archive") {
      ctx.whole_archive = true;
    } else if (arg == "--no-whole-archive") {
      ctx.whole_archive = false;
    } else if (arg == "--Bstatic") {
      ctx.is_static = true;
    } else if (arg == "--Bdynamic") {
      ctx.is_static = false;
    } else if (arg == "--start-lib") {
      ctx.in_lib = true;
    } else if (arg == "--end-lib") {
      ctx.in_lib = false;
    } else if (remove_prefix(arg, "--version-script=")) {
      MappedFile<Context<E>> *mf = find_from_search_paths(ctx, std::string(arg));
      if (!mf)
        Fatal(ctx) << "--version-script: file not found: " << arg;
      parse_version_script(ctx, mf);
    } else if (remove_prefix(arg, "--dynamic-list=")) {
      MappedFile<Context<E>> *mf = find_from_search_paths(ctx, std::string(arg));
      if (!mf)
        Fatal(ctx) << "--dynamic-list: file not found: " << arg;
      parse_dynamic_list(ctx, mf);
    } else if (remove_prefix(arg, "--export-dynamic-symbol=")) {
      // "*" exports everything; otherwise record a glob pattern.
      if (arg == "*")
        ctx.default_version = VER_NDX_GLOBAL;
      else
        ctx.version_patterns.push_back({arg, "--export-dynamic-symbol",
                                        "global", VER_NDX_GLOBAL, false});
    } else if (remove_prefix(arg, "--export-dynamic-symbol-list=")) {
      MappedFile<Context<E>> *mf = find_from_search_paths(ctx, std::string(arg));
      if (!mf)
        Fatal(ctx) << "--export-dynamic-symbol-list: file not found: " << arg;
      parse_dynamic_list(ctx, mf);
    } else if (arg == "--push-state") {
      state.push_back({ctx.as_needed, ctx.whole_archive, ctx.is_static,
                       ctx.in_lib});
    } else if (arg == "--pop-state") {
      if (state.empty())
        Fatal(ctx) << "no state pushed before popping";
      std::tie(ctx.as_needed, ctx.whole_archive, ctx.is_static, ctx.in_lib) =
        state.back();
      state.pop_back();
    } else if (remove_prefix(arg, "-l")) {
      MappedFile<Context<E>> *mf = find_library(ctx, std::string(arg));
      mf->given_fullpath = false;
      read_file(ctx, mf);
    } else {
      read_file(ctx, MappedFile<Context<E>>::must_open(ctx, std::string(arg)));
    }
  }

  if (ctx.objs.empty())
    Fatal(ctx) << "no input files";

  // Wait for the parse tasks queued by the new_*_file() helpers.
  ctx.tg.wait();
}
// Since elf_main is a template, we can't run it without a type parameter.
// We speculatively run elf_main with X86_64, and if the speculation was
// wrong, re-run it with an actual machine type.
// Re-enter the linker with the elf_main instantiation matching the
// deduced target. Called when the speculative X86_64 run discovered
// the inputs are actually for another machine.
template <typename E>
static int redo_main(int argc, char **argv, std::string_view target) {
  if (target == I386::target_name)
    return elf_main<I386>(argc, argv);
  if (target == ARM64::target_name)
    return elf_main<ARM64>(argc, argv);
  if (target == ARM32::target_name)
    return elf_main<ARM32>(argc, argv);
  if (target == RV64LE::target_name)
    return elf_main<RV64LE>(argc, argv);
  if (target == RV64BE::target_name)
    return elf_main<RV64BE>(argc, argv);
  if (target == RV32LE::target_name)
    return elf_main<RV32LE>(argc, argv);
  if (target == RV32BE::target_name)
    return elf_main<RV32BE>(argc, argv);
  if (target == PPC32::target_name)
    return elf_main<PPC32>(argc, argv);
  if (target == PPC64V1::target_name)
    return elf_main<PPC64V1>(argc, argv);
  if (target == PPC64V2::target_name)
    return elf_main<PPC64V2>(argc, argv);
  if (target == S390X::target_name)
    return elf_main<S390X>(argc, argv);
  if (target == SPARC64::target_name)
    return elf_main<SPARC64>(argc, argv);
  if (target == M68K::target_name)
    return elf_main<M68K>(argc, argv);
  if (target == SH4::target_name)
    return elf_main<SH4>(argc, argv);
  if (target == ALPHA::target_name)
    return elf_main<ALPHA>(argc, argv);
  // `target` always comes from get_machine_type(), which only returns
  // one of the names handled above (or "", caught earlier).
  unreachable();
}
// Top-level driver of the ELF linker for target E. Runs the whole
// pipeline in strict phase order: command-line parsing, input loading,
// symbol resolution (including LTO), section layout, relocation, and
// output-file writing. Returns the process exit status.
template <typename E>
int elf_main(int argc, char **argv) {
  Context<E> ctx;

  // Process -run option first. process_run_subcommand() does not return.
  if (argc >= 2 && (argv[1] == "-run"sv || argv[1] == "--run"sv)) {
#if defined(_WIN32) || defined(__APPLE__)
    Fatal(ctx) << "-run is supported only on Unix";
#endif
    process_run_subcommand(ctx, argc, argv);
  }

  // Parse non-positional command line options
  ctx.cmdline_args = expand_response_files(ctx, argv);
  std::vector<std::string> file_args = parse_nonpositional_args(ctx);

  // If no -m option is given, deduce it from input files.
  if (ctx.arg.emulation.empty())
    ctx.arg.emulation = deduce_machine_type(ctx, file_args);

  // Redo if -m is not x86-64. We speculatively started as X86_64.
  if constexpr (is_x86_64<E>)
    if (ctx.arg.emulation != X86_64::target_name)
      return redo_main<E>(argc, argv, ctx.arg.emulation);

  Timer t_all(ctx, "all");

  install_signal_handler();

  // Honor --directory before any input file is opened.
  if (!ctx.arg.directory.empty())
    if (chdir(ctx.arg.directory.c_str()) == -1)
      Fatal(ctx) << "chdir failed: " << ctx.arg.directory
                 << ": " << errno_string();

  // Fork a subprocess unless --no-fork is given.
  std::function<void()> on_complete;

#if !defined(_WIN32) && !defined(__APPLE__)
  if (ctx.arg.fork)
    on_complete = fork_child();
#endif

  acquire_global_lock(ctx);

  // Cap TBB's worker count at --thread-count.
  tbb::global_control tbb_cont(tbb::global_control::max_allowed_parallelism,
                               ctx.arg.thread_count);

  // Handle --wrap options if any.
  for (std::string_view name : ctx.arg.wrap)
    get_symbol(ctx, name)->is_wrapped = true;

  // Handle --retain-symbols-file options if any.
  if (ctx.arg.retain_symbols_file)
    for (std::string_view name : *ctx.arg.retain_symbols_file)
      get_symbol(ctx, name)->write_to_symtab = true;

  for (std::string_view arg : ctx.arg.trace_symbol)
    get_symbol(ctx, arg)->is_traced = true;

  // Parse input files
  read_input_files(ctx, file_args);

  // Uniquify shared object files by soname
  {
    std::unordered_set<std::string_view> seen;
    std::erase_if(ctx.dsos, [&](SharedFile<E> *file) {
      return !seen.insert(file->soname).second;
    });
  }

  Timer t_total(ctx, "total");
  Timer t_before_copy(ctx, "before_copy");

  // Apply -exclude-libs
  apply_exclude_libs(ctx);

  // Create a dummy file containing linker-synthesized symbols.
  if (!ctx.arg.relocatable)
    create_internal_file(ctx);

  // resolve_symbols is 4 things in 1 phase:
  //
  // - Determine the set of object files to extract from archives.
  // - Remove redundant COMDAT sections (e.g. duplicate inline functions).
  // - Finally, the actual symbol resolution.
  // - LTO, which requires preliminary symbol resolution before running
  //   and a follow-up re-resolution after the LTO objects are emitted.
  //
  // These passes have complex interactions and unfortunately have to be
  // put together in a single phase.
  resolve_symbols(ctx);

  // "Kill" .eh_frame input sections after symbol resolution.
  kill_eh_frame_sections(ctx);

  // Resolve mergeable section pieces to merge them.
  resolve_section_pieces(ctx);

  // Handle --relocatable. Since the linker's behavior is quite different
  // from the normal one when the option is given, the logic is implemented
  // to a separate file.
  if (ctx.arg.relocatable) {
    combine_objects(ctx);
    return 0;
  }

  // Create .bss sections for common symbols.
  convert_common_symbols(ctx);

  // Apply version scripts.
  apply_version_script(ctx);

  // Parse symbol version suffixes (e.g. "foo@ver1").
  parse_symbol_version(ctx);

  // Set is_imported and is_exported bits for each symbol.
  compute_import_export(ctx);

  // Read address-significant section information.
  if (ctx.arg.icf && !ctx.arg.icf_all)
    mark_addrsig(ctx);

  // Garbage-collect unreachable sections.
  if (ctx.arg.gc_sections)
    gc_sections(ctx);

  // Merge identical read-only sections.
  if (ctx.arg.icf)
    icf_sections(ctx);

  // Compute sizes of sections containing mergeable strings.
  compute_merged_section_sizes(ctx);

  // Create linker-synthesized sections such as .got or .plt.
  create_synthetic_sections(ctx);

  // Make sure that there's no duplicate symbol
  if (!ctx.arg.allow_multiple_definition)
    check_duplicate_symbols(ctx);

  // Warn if symbols with different types are defined under the same name.
  check_symbol_types(ctx);

  if constexpr (is_ppc64v1<E>)
    ppc64v1_rewrite_opd(ctx);

  // Bin input sections into output sections.
  create_output_sections(ctx);

  // Add synthetic symbols such as __ehdr_start or __end.
  add_synthetic_symbols(ctx);

  // Beyond this point, no new files will be added to ctx.objs
  // or ctx.dsos.

  // Handle `-z cet-report`.
  if (ctx.arg.z_cet_report != CET_REPORT_NONE)
    check_cet_errors(ctx);

  // Handle `-z execstack-if-needed`.
  if (ctx.arg.z_execstack_if_needed)
    for (ObjectFile<E> *file : ctx.objs)
      if (file->needs_executable_stack)
        ctx.arg.z_execstack = true;

  // If we are linking a .so file, remaining undefined symbols does
  // not cause a linker error. Instead, they are treated as if they
  // were imported symbols.
  //
  // If we are linking an executable, weak undefs are converted to
  // weakly imported symbols so that they'll have another chance to be
  // resolved.
  claim_unresolved_symbols(ctx);

  // Beyond this point, no new symbols will be added to the result.

  // Handle --print-dependencies
  if (ctx.arg.print_dependencies)
    print_dependencies(ctx);

  // Handle -repro
  if (ctx.arg.repro)
    write_repro_file(ctx);

  // Handle --require-defined
  for (std::string_view name : ctx.arg.require_defined)
    if (!get_symbol(ctx, name)->file)
      Error(ctx) << "--require-defined: undefined symbol: " << name;

  // .init_array and .fini_array contents have to be sorted by
  // a special rule. Sort them.
  sort_init_fini(ctx);

  // Likewise, .ctors and .dtors have to be sorted. They are rare
  // because they are superseded by .init_array/.fini_array, though.
  sort_ctor_dtor(ctx);

  // Handle --shuffle-sections
  if (ctx.arg.shuffle_sections != SHUFFLE_SECTIONS_NONE)
    shuffle_sections(ctx);

  // Copy string referred by .dynamic to .dynstr.
  for (SharedFile<E> *file : ctx.dsos)
    ctx.dynstr->add_string(file->soname);
  for (std::string_view str : ctx.arg.auxiliary)
    ctx.dynstr->add_string(str);
  for (std::string_view str : ctx.arg.filter)
    ctx.dynstr->add_string(str);
  if (!ctx.arg.rpaths.empty())
    ctx.dynstr->add_string(ctx.arg.rpaths);
  if (!ctx.arg.soname.empty())
    ctx.dynstr->add_string(ctx.arg.soname);

  if constexpr (is_ppc64v1<E>)
    ppc64v1_scan_symbols(ctx);

  // Scan relocations to find symbols that need entries in .got, .plt,
  // .got.plt, .dynsym, .dynstr, etc.
  scan_relocations(ctx);

  // Compute sizes of output sections while assigning offsets
  // within an output section to input sections.
  compute_section_sizes(ctx);

  // Sort sections by section attributes so that we'll have to
  // create as few segments as possible.
  sort_output_sections(ctx);

  // If --packed_dyn_relocs=relr was given, base relocations are stored
  // to a .relr.dyn section in a compressed form. Construct a compressed
  // relocations now so that we can fix section sizes and file layout.
  if (ctx.arg.pack_dyn_relocs_relr)
    construct_relr(ctx);

  // Reserve a space for dynamic symbol strings in .dynstr and sort
  // .dynsym contents if necessary. Beyond this point, no symbol will
  // be added to .dynsym.
  ctx.dynsym->finalize(ctx);

  // Print reports about undefined symbols, if needed.
  if (ctx.arg.unresolved_symbols == UNRESOLVED_ERROR)
    report_undef_errors(ctx);

  // Fill .gnu.version_d section contents.
  if (ctx.verdef)
    ctx.verdef->construct(ctx);

  // Fill .gnu.version_r section contents.
  ctx.verneed->construct(ctx);

  // Compute .symtab and .strtab sizes for each file.
  create_output_symtab(ctx);

  // .eh_frame is a special section from the linker's point of view,
  // as its contents are parsed and reconstructed by the linker,
  // unlike other sections that are regarded as opaque bytes.
  // Here, we construct output .eh_frame contents.
  ctx.eh_frame->construct(ctx);

  // Handle --gdb-index.
  if (ctx.arg.gdb_index)
    ctx.gdb_index->construct(ctx);

  // If --emit-relocs is given, we'll copy relocation sections from input
  // files to an output file.
  if (ctx.arg.emit_relocs)
    create_reloc_sections(ctx);

  // Compute the section header values for all sections.
  compute_section_headers(ctx);

  // Assign offsets to output sections
  i64 filesize = set_osec_offsets(ctx);

  // On RISC-V, branches are encode using multiple instructions so
  // that they can jump to anywhere in ±2 GiB by default. They may
  // be replaced with shorter instruction sequences if destinations
  // are close enough. Do this optimization.
  if constexpr (is_riscv<E>)
    filesize = riscv_resize_sections(ctx);

  // At this point, memory layout is fixed.

  // Set actual addresses to linker-synthesized symbols.
  fix_synthetic_symbols(ctx);

  // Beyond this, you can assume that symbol addresses including their
  // GOT or PLT addresses have a correct final value.

  // If --compress-debug-sections is given, compress .debug_* sections
  // using zlib.
  if (ctx.arg.compress_debug_sections != COMPRESS_NONE)
    filesize = compress_debug_sections(ctx);

  // At this point, both memory and file layouts are fixed.

  t_before_copy.stop();

  // Create an output file
  ctx.output_file =
    OutputFile<Context<E>>::open(ctx, ctx.arg.output, filesize, 0777);
  ctx.buf = ctx.output_file->buf;

  Timer t_copy(ctx, "copy");

  // Copy input sections to the output file and apply relocations.
  copy_chunks(ctx);

  // Some part of .gdb_index couldn't be computed until other debug
  // sections are complete. We have complete debug sections now, so
  // write the rest of .gdb_index.
  if (ctx.gdb_index)
    ctx.gdb_index->write_address_areas(ctx);

  // Dynamic linker works better with sorted .rela.dyn section,
  // so we sort them.
  ctx.reldyn->sort(ctx);

  // Zero-clear paddings between sections
  clear_padding(ctx);

  // .note.gnu.build-id section contains a cryptographic hash of the
  // entire output file. Now that we wrote everything except build-id,
  // we can compute it.
  if (ctx.buildid)
    ctx.buildid->write_buildid(ctx);

  t_copy.stop();
  ctx.checkpoint();

  // Close the output file. This is the end of the linker's main job.
  ctx.output_file->close(ctx);

  // Handle --dependency-file
  if (!ctx.arg.dependency_file.empty())
    write_dependency_file(ctx);

  if (ctx.has_lto_object)
    lto_cleanup(ctx);

  t_total.stop();
  t_all.stop();

  if (ctx.arg.print_map)
    print_map(ctx);

  // Show stats numbers
  if (ctx.arg.stats)
    show_stats(ctx);

  if (ctx.arg.perf)
    print_timer_records(ctx.timer_records);

  std::cout << std::flush;
  std::cerr << std::flush;
  if (on_complete)
    on_complete();

  release_global_lock(ctx);

  if (ctx.arg.quick_exit)
    _exit(0);

  for (std::function<void()> &fn : ctx.on_exit)
    fn();
  ctx.checkpoint();
  return 0;
}
// Explicit instantiations for this translation unit's target.
using E = MOLD_TARGET;

template void read_file(Context<E> &, MappedFile<Context<E>> *);
template MappedFile<Context<E>> *open_library(Context<E> &, std::string);

#ifdef MOLD_X86_64
// The X86_64 translation unit owns main(); every other target's
// elf_main lives in its own translation unit and is reached through
// redo_main() after the machine type is deduced.
extern template int elf_main<I386>(int, char **);
extern template int elf_main<ARM32>(int, char **);
extern template int elf_main<ARM64>(int, char **);
extern template int elf_main<RV32BE>(int, char **);
extern template int elf_main<RV32LE>(int, char **);
extern template int elf_main<RV64LE>(int, char **);
extern template int elf_main<RV64BE>(int, char **);
extern template int elf_main<PPC32>(int, char **);
extern template int elf_main<PPC64V1>(int, char **);
extern template int elf_main<PPC64V2>(int, char **);
extern template int elf_main<S390X>(int, char **);
extern template int elf_main<SPARC64>(int, char **);
extern template int elf_main<M68K>(int, char **);
extern template int elf_main<SH4>(int, char **);
extern template int elf_main<ALPHA>(int, char **);

// Speculatively start as x86-64; elf_main redoes itself if wrong.
int main(int argc, char **argv) {
  return elf_main<X86_64>(argc, argv);
}
#else
template int elf_main<E>(int, char **);
#endif
} // namespace mold::elf

117
third_party/mold/elf/mapfile.cc vendored Normal file
View file

@ -0,0 +1,117 @@
// clang-format off
#include "third_party/mold/elf/mold.h"
#include "third_party/libcxx/fstream"
#include "third_party/libcxx/iomanip"
#include "third_party/libcxx/ios"
#include "third_party/libcxx/sstream"
// MISSING #include <tbb/parallel_for_each.h>
#include "third_party/libcxx/unordered_map"
namespace mold::elf {
// Maps each input section to the symbols defined inside it.
// Concurrent so it can be filled from parallel loops below.
template <typename E>
using Map =
    tbb::concurrent_hash_map<InputSection<E> *, std::vector<Symbol<E> *>>;
// Open the file named by --Map for writing; fatal if it cannot be
// created. Returns the owning stream pointer.
template <typename E>
static std::unique_ptr<std::ofstream> open_output_file(Context<E> &ctx) {
  // make_unique + construct-and-open replaces the raw `new` plus a
  // separate open() call; behavior (including errno on failure) is
  // unchanged.
  auto file = std::make_unique<std::ofstream>(ctx.arg.Map.c_str());
  if (!file->is_open())
    Fatal(ctx) << "cannot open " << ctx.arg.Map << ": " << errno_string();
  return file;
}
// Build the section-to-symbols map in parallel, then sort each
// section's symbol list by address so the mapfile prints them in
// layout order.
template <typename E>
static Map<E> get_map(Context<E> &ctx) {
  Map<E> map;

  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
    for (Symbol<E> *sym : file->symbols) {
      // Only symbols this file defines; STT_SECTION symbols carry no
      // useful name for the mapfile.
      if (sym->file != file || sym->get_type() == STT_SECTION)
        continue;

      if (InputSection<E> *isec = sym->get_input_section()) {
        assert(file == &isec->file);
        // The accessor holds exclusive access to the entry, so the
        // push_back is safe against concurrent writers.
        typename Map<E>::accessor acc;
        map.insert(acc, {isec, {}});
        acc->second.push_back(sym);
      }
    }
  });

  if (map.size() <= 1)
    return map;

  // Sort each per-section symbol vector by symbol value (address).
  tbb::parallel_for(map.range(), [](const typename Map<E>::range_type &range) {
    for (auto it = range.begin(); it != range.end(); it++) {
      std::vector<Symbol<E> *> &vec = it->second;
      sort(vec, [](Symbol<E> *a, Symbol<E> *b) { return a->value < b->value; });
    }
  });
  return map;
}
// Emit a link map (--print-map / --Map): for each output chunk its
// address, size and alignment, followed by every member input section
// and the symbols defined in it.
template <typename E>
void print_map(Context<E> &ctx) {
  std::ostream *out = &std::cout;
  std::unique_ptr<std::ofstream> file;

  // --Map=<file> redirects the map from stdout to that file.
  if (!ctx.arg.Map.empty()) {
    file = open_output_file(ctx);
    out = file.get();
  }

  // Construct a section-to-symbol map.
  Map<E> map = get_map(ctx);

  // Print a mapfile.
  *out << " VMA Size Align Out In Symbol\n";
  for (Chunk<E> *osec : ctx.chunks) {
    *out << std::showbase
         << std::setw(18) << std::hex << (u64)osec->shdr.sh_addr << std::dec
         << std::setw(11) << (u64)osec->shdr.sh_size
         << std::setw(6) << (u64)osec->shdr.sh_addralign
         << " " << osec->name << "\n";

    // Only OUTPUT_SECTION chunks have input-section members to list.
    if (osec->kind() != OUTPUT_SECTION)
      continue;

    // Format each member's lines in parallel into per-member string
    // buffers, then emit them sequentially in order.
    std::span<InputSection<E> *> members = ((OutputSection<E> *)osec)->members;
    std::vector<std::string> bufs(members.size());

    tbb::parallel_for((i64)0, (i64)members.size(), [&](i64 i) {
      InputSection<E> *mem = members[i];
      std::ostringstream ss;
      opt_demangle = ctx.arg.demangle;
      u64 addr = osec->shdr.sh_addr + mem->offset;

      ss << std::showbase
         << std::setw(18) << std::hex << addr << std::dec
         << std::setw(11) << (u64)mem->sh_size
         << std::setw(6) << (1 << (u64)mem->p2align)
         << " " << *mem << "\n";

      typename Map<E>::const_accessor acc;
      if (map.find(acc, mem))
        for (Symbol<E> *sym : acc->second)
          ss << std::showbase
             << std::setw(18) << std::hex << sym->get_addr(ctx) << std::dec
             << " 0 0 "
             << *sym << "\n";

      bufs[i] = ss.str();
    });

    for (std::string &str : bufs)
      *out << str;
  }
}
using E = MOLD_TARGET;
template void print_map(Context<E> &ctx);
} // namespace mold::elf

171
third_party/mold/elf/mold-wrapper.c vendored Normal file
View file

@ -0,0 +1,171 @@
// clang-format off
#define _GNU_SOURCE 1
#if !defined(__OpenBSD__) && !defined(__FreeBSD__)
#include "libc/mem/alloca.h"
#endif
#include "libc/runtime/dlfcn.h"
#include "libc/calls/weirdtypes.h"
#include "libc/stdio/posix_spawn.h"
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
#include "libc/calls/calls.h"
#include "libc/calls/termios.h"
#include "libc/fmt/conv.h"
#include "libc/limits.h"
#include "libc/mem/alg.h"
#include "libc/mem/alloca.h"
#include "libc/mem/mem.h"
#include "libc/runtime/runtime.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/rand.h"
#include "libc/stdio/temp.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/exit.h"
#include "third_party/getopt/getopt.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/rand48.h"
#include "libc/mem/alg.h"
#include "libc/mem/mem.h"
#include "libc/str/str.h"
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/runtime/pathconf.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/sysconf.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fileno.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/ok.h"
#include "libc/time/time.h"
#include "third_party/getopt/getopt.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/lockf.h"
extern char **environ;
// Returns the path to the real mold executable from $MOLD_PATH.
// The wrapper cannot work without it, so bail out if it is unset.
static char *get_mold_path() {
  char *p = getenv("MOLD_PATH");
  if (!p) {
    fprintf(stderr, "MOLD_PATH is not set\n");
    exit(1);
  }
  return p;
}
// Prints a debug message to stderr, but only when the user opted in
// by setting $MOLD_WRAPPER_DEBUG. fmt is a printf-style format string.
static void debug_print(const char *fmt, ...) {
  if (getenv("MOLD_WRAPPER_DEBUG")) {
    va_list args;
    va_start(args, fmt);
    fprintf(stderr, "mold-wrapper.so: ");
    vfprintf(stderr, fmt, args);
    fflush(stderr);
    va_end(args);
  }
}
// Counts the NULL-terminated arguments remaining in *ap without
// consuming them (a copy of the va_list is walked instead).
static int count_args(va_list *ap) {
  va_list probe;
  va_copy(probe, *ap);
  int n = 0;
  while (va_arg(probe, char *) != NULL)
    n++;
  va_end(probe);
  return n;
}
// Fills argv with arg0 followed by the NULL-terminated arguments
// consumed from *ap; argv must have room for them plus the trailing NULL.
static void copy_args(char **argv, const char *arg0, va_list *ap) {
  char *cur;
  int n = 1;
  while ((cur = va_arg(*ap, char *)) != NULL)
    argv[n++] = cur;
  ((const char **)argv)[0] = arg0;
  ((const char **)argv)[n] = NULL;
}
// Returns true if the basename of `path` is one of the well-known
// system linker names that mold should stand in for.
static bool is_ld(const char *path) {
  const char *base = strrchr(path, '/');
  base = base ? base + 1 : path;

  static const char *const names[] = {
    "ld", "ld.lld", "ld.gold", "ld.bfd", "ld.mold",
  };
  for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++)
    if (!strcmp(base, names[i]))
      return true;
  return false;
}
// Wrapper for execvpe(3). If the program looks like a linker, run mold
// instead. The given environment is merged into this process's
// environment via putenv() so that we can delegate to execvp(), which
// performs the $PATH search for us.
int execvpe(const char *file, char *const *argv, char *const *envp) {
  debug_print("execvpe %s\n", file);

  if (!strcmp(file, "ld") || is_ld(file))
    file = get_mold_path();

  // Merge envp into our own environment; execvp below uses `environ`.
  for (int i = 0; envp[i]; i++)
    putenv(envp[i]);

  // Look up the real execvp(3) and give it the correct type. The
  // original code cast the execvp symbol to execvpe's three-argument
  // type and passed a third argument, which only worked by accident of
  // the calling convention.
  typeof(execvp) *real = dlsym(RTLD_NEXT, "execvp");
  return real(file, argv);
}
int execve(const char *path, char *const *argv, char *const *envp) {
debug_print("execve %s\n", path);
if (is_ld(path))
path = get_mold_path();
typeof(execve) *real = dlsym(RTLD_NEXT, "execve");
return real(path, argv, envp);
}
// Wrapper for execl(3): gathers the variadic argument list into a
// stack-allocated argv and forwards to our execve() wrapper.
int execl(const char *path, const char *arg0, ...) {
  va_list args;
  va_start(args, arg0);
  int nargs = count_args(&args);
  char **vec = alloca((nargs + 2) * sizeof(char *));
  copy_args(vec, arg0, &args);
  va_end(args);
  return execve(path, vec, environ);
}
// Wrapper for execlp(3): like execl(), but forwards to execvpe() so
// that the file is searched for in $PATH.
int execlp(const char *file, const char *arg0, ...) {
  va_list args;
  va_start(args, arg0);
  int nargs = count_args(&args);
  char **vec = alloca((nargs + 2) * sizeof(char *));
  copy_args(vec, arg0, &args);
  va_end(args);
  return execvpe(file, vec, environ);
}
// Wrapper for execle(3): the environment pointer trails the
// NULL-terminated argument list, so read it after copy_args() has
// consumed the arguments.
int execle(const char *path, const char *arg0, ...) {
  va_list args;
  va_start(args, arg0);
  char **vec = alloca((count_args(&args) + 2) * sizeof(char *));
  copy_args(vec, arg0, &args);
  char **envp = va_arg(args, char **);
  va_end(args);
  return execve(path, vec, envp);
}
// Wrapper for execv(3): forwards to our execve() wrapper with the
// current environment, so the ld-to-mold substitution applies.
int execv(const char *path, char *const *argv) {
  return execve(path, argv, environ);
}
// Wrapper for execvp(3): forwards to our execvpe() wrapper with the
// current environment, so the ld-to-mold substitution applies.
int execvp(const char *file, char *const *argv) {
  return execvpe(file, argv, environ);
}
int posix_spawn(pid_t *pid, const char *path,
const posix_spawn_file_actions_t *file_actions,
const posix_spawnattr_t *attrp,
char *const *argv, char *const *envp) {
debug_print("posix_spawn %s\n", path);
if (is_ld(path))
path = get_mold_path();
typeof(posix_spawn) *real = dlsym(RTLD_NEXT, "posix_spawn");
return real(pid, path, file_actions, attrp, argv, envp);
}

2852
third_party/mold/elf/mold.h vendored Normal file

File diff suppressed because it is too large Load diff

3153
third_party/mold/elf/output-chunks.cc vendored Normal file

File diff suppressed because it is too large Load diff

2653
third_party/mold/elf/passes.cc vendored Normal file

File diff suppressed because it is too large Load diff

198
third_party/mold/elf/relocatable.cc vendored Normal file
View file

@ -0,0 +1,198 @@
// clang-format off
// This file implements -r or --relocatable. That option forces the linker
// to combine input object files into another single large object file.
// Since the behavior of the linker when the option is given is quite
// different from that of the normal execution mode, we separate code for
// the feature into this separate file.
//
// The --relocatable option isn't used very often. After all, if you want
// to combine object files into a single file, you could use `ar`.
// However, some programs use it in a creative manner which is hard to be
// substituted with static archives, so we need to support this option in
// the same way as GNU ld does. A notable example is GHC (Glasgow Haskell
// Compiler). GHC has its own dynamic linker which can load a .o file (as
// opposed to a .so) into memory. GHC's module is not a shared object file
// but a combined object file.
//
// There are many different ways to combine object files into a single file.
// The simplest approach would be to just copy all sections from input files
// to an output file as-is with a few exceptions for singleton sections such
// as the symbol table or the string table. That works, but that's not
// compatible with GNU ld.
//
// To be compatible with GNU ld, we need to do the following:
//
// - Regular sections containing opaque data (e.g. ".text" or ".data")
// are just copied as-is. Two sections with the same name are merged.
//
// - .symtab, .strtab and .shstrtab are merged.
//
// - COMDAT groups are uniquified.
//
// - Relocations are copied, but we need to fix symbol indices.
#include "third_party/mold/elf/mold.h"
// MISSING #include <tbb/parallel_for.h>
// MISSING #include <tbb/parallel_for_each.h>
namespace mold::elf {
// Create linker-synthesized sections needed for a relocatable (-r)
// output: headers, .eh_frame (+ its relocations), symbol/string tables
// and the .note.gnu.property section.
template <typename E>
static void r_create_synthetic_sections(Context<E> &ctx) {
  // Register a chunk with the context (both the ordered chunk list and
  // the owning pool) and hand it back to the caller.
  auto add = [&](auto *chunk) {
    ctx.chunks.push_back(chunk);
    ctx.chunk_pool.emplace_back(chunk);
    return chunk;
  };

  ctx.ehdr = add(new OutputEhdr<E>(0));
  ctx.shdr = add(new OutputShdr<E>);
  ctx.eh_frame = add(new EhFrameSection<E>);
  ctx.eh_frame_reloc = add(new EhFrameRelocSection<E>);
  ctx.strtab = add(new StrtabSection<E>);
  ctx.symtab = add(new SymtabSection<E>);
  ctx.shstrtab = add(new ShstrtabSection<E>);
  ctx.note_property = add(new NotePropertySection<E>);
}
// Create SHT_GROUP (i.e. comdat group) sections. We uniquify comdat
// sections by signature. We want to propagate input comdat groups as
// output comdat groups if they are still alive after uniquification.
template <typename E>
static void create_comdat_group_sections(Context<E> &ctx) {
  Timer t(ctx, "create_comdat_group_sections");

  // One result vector per input file so the parallel loop below can
  // append without synchronization; merged sequentially afterwards.
  std::vector<std::vector<Chunk<E> *>> vec{ctx.objs.size()};

  tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) {
    ObjectFile<E> &file = *ctx.objs[i];
    for (ComdatGroupRef<E> &ref : file.comdat_groups) {
      // Skip groups whose uniquification was won by another file.
      if (ref.group->owner != file.priority)
        continue;

      // The group's signature symbol, referenced by the group section's
      // sh_info field.
      Symbol<E> *sym = file.symbols[file.elf_sections[ref.sect_idx].sh_info];
      assert(sym);

      // Translate each member input section to its output-file chunk.
      std::vector<Chunk<E> *> members;
      for (u32 j : ref.members) {
        const ElfShdr<E> &shdr = file.elf_sections[j];
        if (shdr.sh_type == (E::is_rela ? SHT_RELA : SHT_REL)) {
          // A relocation member maps to the output relocation section of
          // the section it applies to (identified by sh_info).
          InputSection<E> &isec = *file.sections[shdr.sh_info];
          members.push_back(isec.output_section->reloc_sec.get());
        } else {
          InputSection<E> &isec = *file.sections[j];
          members.push_back(isec.output_section);
        }
      }

      vec[i].push_back(new ComdatGroupSection<E>(*sym, std::move(members)));
    }
  });

  // Register all newly created group sections with the context.
  for (std::vector<Chunk<E> *> &vec2 : vec) {
    for (Chunk<E> *chunk : vec2) {
      ctx.chunks.push_back(chunk);
      ctx.chunk_pool.emplace_back(chunk);
    }
  }
}
// Unresolved undefined symbols in the -r mode are simply propagated to an
// output file as undefined symbols. This function guarantees that
// unresolved undefined symbols belongs to some input file.
template <typename E>
static void r_claim_unresolved_symbols(Context<E> &ctx) {
  Timer t(ctx, "r_claim_unresolved_symbols");
  tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
    if (!file->is_alive)
      return;

    // Only global symbols can be claimed; locals always belong to
    // their defining file.
    for (i64 i = file->first_global; i < file->elf_syms.size(); i++) {
      const ElfSym<E> &esym = file->elf_syms[i];
      Symbol<E> &sym = *file->symbols[i];
      if (!esym.is_undef())
        continue;

      // Symbols are shared between files; take the per-symbol lock
      // before inspecting or modifying resolution state.
      std::scoped_lock lock(sym.mu);

      // Leave the symbol alone if it is already resolved to a
      // definition, or already claimed by a file with an equal or
      // higher priority (smaller value).
      if (sym.file &&
          (!sym.esym().is_undef() || sym.file->priority <= file->priority))
        continue;

      // Claim the undefined symbol for this file.
      sym.file = file;
      sym.origin = 0;
      sym.value = 0;
      sym.sym_idx = i;
    }
  });
}
// Set output section in-file offsets. Output section memory addresses
// are left as zero. Returns the resulting file size.
template <typename E>
static u64 r_set_osec_offsets(Context<E> &ctx) {
  u64 off = 0;
  for (Chunk<E> *chunk : ctx.chunks) {
    // Respect each chunk's alignment requirement, then lay it out
    // immediately after the previous one.
    off = align_to(off, chunk->shdr.sh_addralign);
    chunk->shdr.sh_offset = off;
    off += chunk->shdr.sh_size;
  }
  return off;
}
// Entry point for the -r/--relocatable mode: combine the input object
// files into one relocatable object. The steps below form an
// order-dependent pipeline; do not reorder them.
template <typename E>
void combine_objects(Context<E> &ctx) {
  compute_merged_section_sizes(ctx);
  create_output_sections(ctx);
  r_create_synthetic_sections(ctx);
  r_claim_unresolved_symbols(ctx);
  compute_section_sizes(ctx);
  sort_output_sections(ctx);
  create_output_symtab(ctx);
  ctx.eh_frame->construct(ctx);
  create_reloc_sections(ctx);
  create_comdat_group_sections(ctx);
  compute_section_headers(ctx);

  // Assign file offsets; memory addresses stay zero in a relocatable
  // output (see r_set_osec_offsets).
  i64 filesize = r_set_osec_offsets(ctx);

  ctx.output_file =
    OutputFile<Context<E>>::open(ctx, ctx.arg.output, filesize, 0666);
  ctx.buf = ctx.output_file->buf;

  copy_chunks(ctx);
  clear_padding(ctx);
  ctx.output_file->close(ctx);
  ctx.checkpoint();

  // Optional post-link reports.
  if (ctx.arg.print_map)
    print_map(ctx);
  if (ctx.arg.stats)
    show_stats(ctx);
  if (ctx.arg.perf)
    print_timer_records(ctx.timer_records);

  // Skip static destructors for a faster exit when requested.
  if (ctx.arg.quick_exit)
    _exit(0);
}
using E = MOLD_TARGET;
template void combine_objects(Context<E> &);
} // namespace mold::elf

166
third_party/mold/elf/subprocess.cc vendored Normal file
View file

@ -0,0 +1,166 @@
// clang-format off
#if !defined(_WIN32) && !defined(__APPLE__)
#include "third_party/mold/elf/mold.h"
// MISSING #include "config.h"
#include "third_party/libcxx/filesystem"
#include "libc/calls/calls.h"
#include "libc/calls/sigtimedwait.h"
#include "libc/calls/struct/sigaction.h"
#include "libc/calls/struct/siginfo.h"
#include "libc/sysv/consts/sa.h"
#include "libc/sysv/consts/sicode.h"
#include "libc/sysv/consts/ss.h"
#include "libc/calls/calls.h"
#include "libc/calls/struct/stat.h"
#include "libc/calls/struct/stat.macros.h"
#include "libc/calls/struct/timespec.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/s.h"
#include "libc/sysv/consts/utime.h"
#include "libc/time/time.h"
#include "libc/calls/struct/itimerval.h"
#include "libc/calls/struct/timeval.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sock/select.h"
#include "libc/sysv/consts/clock.h"
#include "libc/sysv/consts/itimer.h"
#include "libc/time/struct/timezone.h"
#include "libc/time/time.h"
#include "libc/calls/makedev.h"
#include "libc/calls/weirdtypes.h"
#include "libc/thread/thread.h"
#include "libc/calls/typedef/u.h"
#include "libc/calls/weirdtypes.h"
#include "libc/intrin/newbie.h"
#include "libc/sock/select.h"
#include "libc/sysv/consts/endian.h"
#include "libc/calls/calls.h"
#include "libc/calls/struct/siginfo.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/w.h"
#include "libc/sysv/consts/waitid.h"
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/runtime/pathconf.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/sysconf.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fileno.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/ok.h"
#include "libc/time/time.h"
#include "third_party/getopt/getopt.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/lockf.h"
namespace mold::elf {
#ifdef MOLD_X86_64
// Exiting from a program with large memory usage is slow --
// it may take a few hundred milliseconds. To hide the latency,
// we fork a child and let it do the actual linking work.
//
// Returns (in the child) a callback that signals success to the parent
// through a pipe; the parent exits as soon as that byte arrives, or
// mirrors the child's exit status / termination signal otherwise.
std::function<void()> fork_child() {
  int pipefd[2];
  if (pipe(pipefd) == -1) {
    perror("pipe");
    exit(1);
  }

  pid_t pid = fork();
  if (pid == -1) {
    perror("fork");
    exit(1);
  }

  if (pid > 0) {
    // Parent. Wait for either the success byte or child termination.
    close(pipefd[1]);

    char buf[1];
    if (read(pipefd[0], buf, 1) == 1)
      _exit(0);

    // Pipe closed without a byte: the child failed. Propagate how.
    int status;
    waitpid(pid, &status, 0);
    if (WIFEXITED(status))
      _exit(WEXITSTATUS(status));
    if (WIFSIGNALED(status))
      raise(WTERMSIG(status));
    _exit(1);
  }

  // Child. The returned callback writes one byte to tell the parent it
  // may exit immediately.
  close(pipefd[0]);
  return [=] {
    char buf[] = {1};
    [[maybe_unused]] int n = write(pipefd[1], buf, 1);
    assert(n == 1);
  };
}
#endif
// Locates mold-wrapper.so, trying in order: the directory containing
// the mold executable, $(MOLD_LIBDIR)/mold (/usr/local/lib/mold by
// default), and ../lib/mold relative to the executable. Fatal if the
// file is found in none of them.
template <typename E>
static std::string find_dso(Context<E> &ctx, std::filesystem::path self) {
  std::filesystem::path candidates[] = {
    self.parent_path() / "mold-wrapper.so",
    MOLD_LIBDIR "/mold/mold-wrapper.so",
    self.parent_path() / "../lib/mold/mold-wrapper.so",
  };

  for (std::filesystem::path &p : candidates) {
    std::error_code ec;
    if (std::filesystem::is_regular_file(p, ec) && !ec)
      return p;
  }

  Fatal(ctx) << "mold-wrapper.so is missing";
}
// Implements `mold -run <command> ...`: runs <command> with
// mold-wrapper.so LD_PRELOADed so that any exec of a system linker
// inside it is redirected to this mold binary. Never returns.
template <typename E>
[[noreturn]]
void process_run_subcommand(Context<E> &ctx, int argc, char **argv) {
  assert(argv[1] == "-run"s || argv[1] == "--run"s);

  if (!argv[2])
    Fatal(ctx) << "-run: argument missing";

  // Get the mold-wrapper.so path
  std::string self = get_self_path();
  std::string dso_path = find_dso(ctx, self);

  // Set environment variables. strdup() is intentional: putenv() keeps
  // the pointer, so the strings must outlive this scope.
  putenv(strdup(("LD_PRELOAD=" + dso_path).c_str()));
  putenv(strdup(("MOLD_PATH=" + self).c_str()));

  // If ld, ld.lld or ld.gold is specified, run mold itself
  if (std::string cmd = filepath(argv[2]).filename();
      cmd == "ld" || cmd == "ld.lld" || cmd == "ld.gold") {
    std::vector<char *> args;
    args.push_back(argv[0]);
    args.insert(args.end(), argv + 3, argv + argc);
    args.push_back(nullptr);
    execv(self.c_str(), args.data());
    Fatal(ctx) << "mold -run failed: " << self << ": " << errno_string();
  }

  // Execute a given command
  execvp(argv[2], argv + 2);
  Fatal(ctx) << "mold -run failed: " << argv[2] << ": " << errno_string();
}
using E = MOLD_TARGET;
template void process_run_subcommand(Context<E> &, int, char **);
} // namespace mold::elf
#endif

318
third_party/mold/elf/thunks.cc vendored Normal file
View file

@ -0,0 +1,318 @@
// clang-format off
// RISC instructions are usually up to 4 bytes long, so the immediates of
// their branch instructions are naturally smaller than 32 bits. This is
// contrary to x86-64 on which branch instructions take 4 bytes immediates
// and can jump to anywhere within PC ± 2 GiB.
//
// In fact, ARM32's branch instructions can jump only within ±16 MiB and
// ARM64's ±128 MiB, for example. If a branch target is further than that,
// we need to let it branch to a linker-synthesized code sequence that
// construct a full 32 bit address in a register and jump there. That
// linker-synthesized code is called "thunk".
//
// The function in this file creates thunks.
//
// Note that although thunks play an important role in an executable, they
// don't take up too much space in it. For example, among the clang-16's
// text segment whose size is ~300 MiB on ARM64, thunks in total occupy
// only ~30 KiB or 0.01%. Of course the number depends on an ISA; we would
// need more thunks on ARM32 whose branch range is shorter than ARM64.
// That said, the total size of thunks still isn't that much. Therefore,
// we don't need to try too hard to reduce thunk size to the absolute
// minimum.
#if MOLD_ARM32 || MOLD_ARM64 || MOLD_PPC32 || MOLD_PPC64V1 || MOLD_PPC64V2
#include "third_party/mold/elf/mold.h"
// MISSING #include <tbb/parallel_for.h>
// MISSING #include <tbb/parallel_for_each.h>
namespace mold::elf {
// Returns a branch reach in bytes for a given target.
template <typename E>
static consteval i64 max_distance() {
  if constexpr (is_arm64<E>) {
    // ARM64's branch has a 26-bit immediate that is implicitly padded
    // with two zero bits (all instructions are 4-byte aligned), giving
    // an effective 28-bit operand: [-2^27, 2^27), i.e. PC ± 128 MiB.
    return 1 << 27;
  } else if constexpr (is_arm32<E>) {
    // ARM32's Thumb branch has a 24-bit immediate with 2-byte
    // instruction alignment, effectively 25 bits: [-2^24, 2^24), i.e.
    // PC ± 16 MiB. Non-Thumb branches reach twice as far, but we
    // conservatively use the Thumb limitation.
    return 1 << 24;
  } else {
    // PPC's branch has a 24-bit immediate with 4-byte instruction
    // alignment: [-2^25, 2^25), i.e. PC ± 32 MiB.
    assert(is_ppc<E>);
    return 1 << 25;
  }
}
// We create thunks for each 12.8/1.6/3.2 MiB code block for
// ARM64/ARM32/PPC, respectively (one tenth of the branch reach,
// leaving ample margin for the thunks themselves).
template <typename E>
static constexpr i64 batch_size = max_distance<E>() / 10;

// We assume that a single thunk group is smaller than 100 KiB.
static constexpr i64 max_thunk_size = 102400;
// Returns true if a given relocation is of type used for function calls.
template <typename E>
static bool needs_thunk_rel(const ElfRel<E> &r) {
  u32 ty = r.r_type;

  if constexpr (is_arm64<E>) {
    return ty == R_AARCH64_CALL26 || ty == R_AARCH64_JUMP26;
  } else if constexpr (is_arm32<E>) {
    // Both ARM and Thumb branch/call relocations are candidates.
    return ty == R_ARM_CALL || ty == R_ARM_THM_CALL ||
           ty == R_ARM_JUMP24 || ty == R_ARM_THM_JUMP24 ||
           ty == R_ARM_PLT32;
  } else if constexpr (is_ppc32<E>) {
    return ty == R_PPC_LOCAL24PC || ty == R_PPC_PLTREL24 || ty == R_PPC_REL24;
  } else {
    static_assert(is_ppc64<E>);
    return ty == R_PPC64_REL24_NOTOC || ty == R_PPC64_REL24;
  }
}
// Returns true if the branch at `rel` inside `isec` can reach `sym`
// directly, i.e. without going through a range-extension thunk.
template <typename E>
static bool is_reachable(Context<E> &ctx, InputSection<E> &isec,
                         Symbol<E> &sym, const ElfRel<E> &rel) {
  // We create thunks with a pessimistic assumption that all
  // out-of-section relocations would be out-of-range.
  InputSection<E> *isec2 = sym.get_input_section();
  if (!isec2 || isec.output_section != isec2->output_section)
    return false;

  // Even if the target is the same section, we branch to its PLT
  // if it has one. So a symbol with a PLT is also considered an
  // out-of-section reference.
  if (sym.has_plt(ctx))
    return false;

  // If the target section is in the same output section but
  // hasn't got any address yet, that's unreachable.
  if (isec2->offset == -1)
    return false;

  // Thumb and ARM B instructions cannot be converted to BX, so we
  // always have to make them jump to a thunk to switch processor mode
  // even if their destinations are within their ranges.
  if constexpr (is_arm32<E>) {
    bool is_thumb = sym.get_addr(ctx) & 1;
    if ((rel.r_type == R_ARM_THM_JUMP24 && !is_thumb) ||
        (rel.r_type == R_ARM_JUMP24 && is_thumb) ||
        (rel.r_type == R_ARM_PLT32 && is_thumb))
      return false;
  }

  // PowerPC before Power9 lacks PC-relative load/store instructions.
  // Functions compiled for Power9 or earlier assume that r2 points to
  // GOT+0x8000, while those for Power10 uses r2 as a scratch register.
  // We need a thunk to recompute r2 for interworking.
  if constexpr (is_ppc64v2<E>) {
    if (rel.r_type == R_PPC64_REL24 && !sym.esym().preserves_r2())
      return false;
    if (rel.r_type == R_PPC64_REL24_NOTOC && sym.esym().uses_toc())
      return false;
  }

  // Compute a distance between the relocated place and the symbol
  // and check if they are within reach.
  i64 S = sym.get_addr(ctx, NO_OPD);
  i64 A = get_addend(isec, rel);
  i64 P = isec.get_addr() + rel.r_offset;
  i64 val = S + A - P;
  return -max_distance<E>() <= val && val < max_distance<E>();
}
// Forgets the thunk assignment of every symbol routed through `thunk`,
// so that later batches may assign those symbols to a newer thunk.
template <typename E>
static void reset_thunk(RangeExtensionThunk<E> &thunk) {
  for (i64 i = 0; i < (i64)thunk.symbols.size(); i++) {
    Symbol<E> &sym = *thunk.symbols[i];
    sym.extra.thunk_idx = -1;
    sym.extra.thunk_sym_idx = -1;
    sym.flags = 0;
  }
}
// Scan relocations to collect symbols that need thunks.
// Runs concurrently for many input sections feeding the same thunk.
template <typename E>
static void scan_rels(Context<E> &ctx, InputSection<E> &isec,
                      RangeExtensionThunk<E> &thunk) {
  std::span<const ElfRel<E>> rels = isec.get_rels(ctx);
  std::vector<RangeExtensionRef> &range_extn = isec.extra.range_extn;
  range_extn.resize(rels.size());

  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel<E> &rel = rels[i];
    if (!needs_thunk_rel(rel))
      continue;

    // Skip if the symbol is undefined. apply_reloc() will report an error.
    Symbol<E> &sym = *isec.file.symbols[rel.r_sym];
    if (!sym.file)
      continue;

    // Skip if the destination is within reach.
    if (is_reachable(ctx, isec, sym, rel))
      continue;

    // This relocation needs a thunk. If the symbol is already in a
    // previous thunk, reuse it.
    if (sym.extra.thunk_idx != -1) {
      range_extn[i].thunk_idx = sym.extra.thunk_idx;
      range_extn[i].sym_idx = sym.extra.thunk_sym_idx;
      continue;
    }

    // Otherwise, add the symbol to the current thunk if it's not
    // added already. sym_idx stays -1 here; it is fixed up after the
    // thunk's symbol list is finalized and sorted.
    range_extn[i].thunk_idx = thunk.thunk_idx;
    range_extn[i].sym_idx = -1;

    // sym.flags acts as an atomic "already queued" latch: exchange()
    // returns the previous value, so only the first thread to flip it
    // appends the symbol to the thunk.
    if (sym.flags.exchange(-1) == 0) {
      std::scoped_lock lock(thunk.mu);
      thunk.symbols.push_back(&sym);
    }
  }
}
// Lays out the members of `osec` and inserts range-extension thunks so
// that every function-call relocation can reach its destination (or a
// thunk that can). Sets each member's offset and the section's final
// sh_size as a side effect.
template <typename E>
void create_range_extension_thunks(Context<E> &ctx, OutputSection<E> &osec) {
  std::span<InputSection<E> *> m = osec.members;
  if (m.empty())
    return;

  m[0]->offset = 0;

  // Initialize input sections with a dummy offset so that we can
  // distinguish sections that have got an address with the one who
  // haven't.
  tbb::parallel_for((i64)1, (i64)m.size(), [&](i64 i) {
    m[i]->offset = -1;
  });

  // We create thunks from the beginning of the section to the end.
  // We manage progress using four offsets which increase monotonically.
  // The locations they point to are always A <= B <= C <= D.
  //
  // Input sections between B and C are in the current batch.
  //
  // A is the input section with the smallest address than can reach
  // anywhere from the current batch.
  //
  // D is the input section with the largest address such that the thunk
  // is reachable from the current batch if it's inserted right before D.
  //
  // ................................ <input sections> ............
  //     A    B    C    D
  //                    ^ We insert a thunk for the current batch just before D
  //          <--->       The current batch, which is smaller than batch_size
  //     <--------->      Smaller than max_distance
  //          <--------->  Smaller than max_distance
  //     <-------------->  Reachable from the current batch
  i64 a = 0;
  i64 b = 0;
  i64 c = 0;
  i64 d = 0;
  i64 offset = 0;

  // Index into osec.thunks of the oldest thunk that is still reachable
  // from the current batch.
  i64 thunk_idx = 0;

  while (b < m.size()) {
    // Move D forward as far as we can jump from B to anywhere in a thunk at D.
    // This also assigns final offsets to the sections we move past.
    while (d < m.size() &&
           align_to(offset, 1 << m[d]->p2align) + m[d]->sh_size + max_thunk_size <
           m[b]->offset + max_distance<E>()) {
      offset = align_to(offset, 1 << m[d]->p2align);
      m[d]->offset = offset;
      offset += m[d]->sh_size;
      d++;
    }

    // Move C forward so that C is apart from B by BATCH_SIZE. We want
    // to make sure that there's at least one section between B and C
    // to ensure progress.
    c = b + 1;
    while (c < m.size() &&
           m[c]->offset + m[c]->sh_size < m[b]->offset + batch_size<E>)
      c++;

    // Move A forward so that A is reachable from C.
    i64 c_offset = (c == m.size()) ? offset : m[c]->offset;
    while (a < m.size() && m[a]->offset + max_distance<E>() < c_offset)
      a++;

    // Erase references to out-of-range thunks.
    while (thunk_idx < osec.thunks.size() &&
           osec.thunks[thunk_idx]->offset < m[a]->offset)
      reset_thunk(*osec.thunks[thunk_idx++]);

    // Create a thunk for input sections between B and C and place it at D.
    offset = align_to(offset, RangeExtensionThunk<E>::alignment);
    RangeExtensionThunk<E> *thunk =
      new RangeExtensionThunk<E>(osec, osec.thunks.size(), offset);
    osec.thunks.emplace_back(thunk);

    // Scan relocations between B and C to collect symbols that need thunks.
    tbb::parallel_for_each(m.begin() + b, m.begin() + c,
                           [&](InputSection<E> *isec) {
      scan_rels(ctx, *isec, *thunk);
    });

    // Now that we know the number of symbols in the thunk, we can compute
    // its size.
    assert(thunk->size() < max_thunk_size);
    offset += thunk->size();

    // Sort symbols added to the thunk to make the output deterministic.
    sort(thunk->symbols, [](Symbol<E> *a, Symbol<E> *b) {
      return std::tuple{a->file->priority, a->sym_idx} <
             std::tuple{b->file->priority, b->sym_idx};
    });

    // Assign offsets within the thunk to the symbols.
    for (i64 i = 0; i < thunk->symbols.size(); i++) {
      Symbol<E> &sym = *thunk->symbols[i];
      sym.extra.thunk_idx = thunk->thunk_idx;
      sym.extra.thunk_sym_idx = i;
    }

    // Scan relocations again to fix symbol offsets in the last thunk.
    tbb::parallel_for_each(m.begin() + b, m.begin() + c,
                           [&](InputSection<E> *isec) {
      std::span<Symbol<E> *> syms = isec->file.symbols;
      std::span<const ElfRel<E>> rels = isec->get_rels(ctx);
      std::span<RangeExtensionRef> range_extn = isec->extra.range_extn;

      for (i64 i = 0; i < rels.size(); i++)
        if (range_extn[i].thunk_idx == thunk->thunk_idx)
          range_extn[i].sym_idx = syms[rels[i].r_sym]->extra.thunk_sym_idx;
    });

    // Move B forward to point to the beginning of the next batch.
    b = c;
  }

  while (thunk_idx < osec.thunks.size())
    reset_thunk(*osec.thunks[thunk_idx++]);

  osec.shdr.sh_size = offset;
}
using E = MOLD_TARGET;
static_assert(max_thunk_size / E::thunk_size < INT16_MAX);
template void create_range_extension_thunks(Context<E> &, OutputSection<E> &);
} // namespace mold::elf
#endif

215
third_party/mold/elf/tls.cc vendored Normal file
View file

@ -0,0 +1,215 @@
// clang-format off
// This file contains helper functions for thread-local storage (TLS).
// TLS is probably the most obscure feature the linker has to support,
// so I'll explain it in detail in this comment.
//
// TLS is a per-thread storage. Thread-local variables (TLVs) are in a TLS
// so that each thread has its own set of thread-local variables. Taking
// an address of a TLV returns a unique value for each thread. For example,
// `&foo` for the following code returns different pointer values for
// different threads.
//
// thread_local int foo;
//
// TLV is a relatively new feature. C for example didn't provide the
// official support for it through the keyword `thread_local` until C11.
// TLV needs a coordination between the compiler, the linker and the
// runtime to work correctly.
//
// An ELF exectuable or a shared library using TLV contains a "TLS template
// image" in the PT_TLS segment. For each newly created thread including the
// initial one, the runtime allocates a contiguous memory for an executable
// and its depending shared libraries and copies template images there. That
// per-thread memory is called the "TLS block". After allocating and
// initializing a TLS block, the runtime sets a register to refer to the TLS
// block, so that the thread-local variables are accessible relative to the
// register.
//
// The register referring to the per-thread storage is called the Thread
// Pointer (TP). TP is part of the thread's context. When the kernel
// scheduler switches threads, TP is saved and restored automatically just
// like other registers are.
//
// The TLS template image is read-only. It contains TLVs' initial values
// for new threads, and no one writes to it at runtime.
//
// Now, let's think about how to access a TLV. We need to know the TLV's
// address to access it which can be done in several different ways as
// follows:
//
// 1. If we are creating an executable, we know the exact size of the TLS
// template image we are creating, and we know where the TP will be
// set to after the template is copied to the TLS block. Therefore,
// the TP-relative address of a TLV in the main executable is known at
// link-time. That means, computing a TLV's address can be as easy as
// `add %dst, %tp, <link-time constant>`.
//
// 2. If we are creating a shared library, we don't exactly know where
// its TLS template image will be copied to in terms of the
// TP-relative address, because we don't know how large the main
// executable's and other libraries' TLS template images are. Only the
// runtime knows the exact TP-relative address.
//
// We can solve the problem with an indirection. Specifically, for
// each TLV whose TP-relative address is only known at process startup
// time, we create a GOT entry to store its TP-relative address. We
// also emit a dynamic relocation to let the runtime to fill the GOT
// entry with a TP-relative address.
//
// Computing a TLV address in this scheme needs at least two machine
// instructions in most ISAs; first instruction loads a value from a
// GOT entry, and the second one adds the loaded value to TP.
//
// 3. Now, think about libraries that you dynamically load with dlopen.
// The TLS block for such library has to be allocated separately from
// the initial TLS block, so we now have two or more discontiguous
// TLS blocks. There's no easy formula to compute an address of a TLV
// in a separate TLS block.
//
// The address of a TLV in a separate TLS block can be obtained by
// calling a libc-provided function, __tls_get_addr(). The function
// takes two arguments; a module ID to identify the ELF file and the
// TLV's offset within the ELF file's TLS template image. Accessing a
// TLV is sometimes compiled to a function call! The module ID and the
// offset are usually stored to GOT as two consecutive words.
//
// The last access method is the most generic, so the compiler emits such
// code by default. But that's the most expensive one, so the linker
// rewrites instructions if possible so that 3) is relaxed to 2) or even
// to 1).
//
// 1) is called the Local Exec access model. 2) is Initial Exec, and 3) is
// General Dynamic.
//
// There's another little trick that the compiler can use if it knows two
// TLVs are in the same ELF file (usually in the same file as the code is).
// In this case, we can call __tls_get_addr() only once with a module ID and
// the offset 0 to obtain the base address of the ELF file's TLS block. The
// base address obtained this way is sometimes called Dynamic Thread Pointer
// or DTP. We can then compute TLVs' addresses by adding their DTP-relative
// addresses to DTP. This access model is called the Local Dynamic.
//
//
// === TLS Descriptor access model ===
//
// As described above, there are arguably too many different TLS access
// models from the most generic one you can use in any ELF file to the most
// efficient one you can use only when building a main executable. Compiling
// source code with an appropriate TLS access model is bothersome. To solve
// the problem, a new TLS access model was proposed. That is called the TLS
// Descriptor (TLSDESC) model.
//
// For a TLV compiled with TLSDESC, we allocate two consecutive GOT slots
// and create a TLSDESC dynamic relocation for them. The dynamic linker
// sets a function pointer to the first GOT slot and its argument to the
// second slot.
//
// To access the TLV, we call the function pointer with the argument we
// read from the second GOT slot. The function returns the TLV's
// TP-relative address.
//
// The runtime chooses the best access method depending on the situation
// and sets a pointer to the most efficient code to the first GOT slot.
// For example, if a TLV's TP-relative address is known at process startup
// time, the runtime sets that address to the second GOT slot and set a
// function that just returns its argument to the first GOT slot.
//
// With TLSDESC, the compiler can always emit the same code for TLVs
// without sacrificing runtime performance.
//
// TLSDESC is better than the traditional, non-TLSDESC TLS access models.
// It's the default on ARM64, but on other targets, TLSDESC is
// unfortunately either optional or even not supported at all. So we still
// need to support both the traditional TLS models and the TLSDESC model.
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
template <typename E>
static ElfPhdr<E> *get_tls_segment(Context<E> &ctx) {
  // Scan the program headers for the PT_TLS segment. An output file has
  // at most one; return null if there is none (or no PHDR table at all).
  if (!ctx.phdr)
    return nullptr;
  for (ElfPhdr<E> &seg : ctx.phdr->phdrs)
    if (seg.p_type == PT_TLS)
      return &seg;
  return nullptr;
}
template <typename E>
u64 get_tls_begin(Context<E> &ctx) {
  // The TLS template image starts at the PT_TLS segment's virtual
  // address; without a TLS segment the begin address is simply 0.
  ElfPhdr<E> *seg = get_tls_segment(ctx);
  return seg ? seg->p_vaddr : 0;
}
// Returns the TP address which can be used for efficient TLV accesses in
// the main executable. TP at runtime refers to a per-process TLS block
// whose address is not known at link-time. So the address returned from
// this function is the TP if the TLS template image were a TLS block.
//
// Returns 0 if the output file doesn't have a PT_TLS segment.
template <typename E>
u64 get_tp_addr(Context<E> &ctx) {
ElfPhdr<E> *phdr = get_tls_segment(ctx);
if (!phdr)
return 0;
// On x86, SPARC and s390x, TP (%gs on i386, %fs on x86-64, %g7 on SPARC
// and %a0/%a1 on s390x) refers to past the end of the TLS block for
// historical reasons. TLVs are accessed with negative offsets from TP.
if constexpr (is_x86<E> || is_sparc<E> || is_s390x<E>)
return align_to(phdr->p_vaddr + phdr->p_memsz, phdr->p_align);
// On ARM, SH4 and Alpha, the runtime appends two words at the beginning
// of TLV template image when copying TLVs to the TLS block, so we need
// to offset it.
if constexpr (is_arm<E> || is_sh4<E> || is_alpha<E>)
return align_down(phdr->p_vaddr - sizeof(Word<E>) * 2, phdr->p_align);
// On PPC and m68k, TP is 0x7000 (28 KiB) past the beginning of the TLV
// block to maximize the addressable range for load/store instructions
// with 16-bit signed immediates. It's not exactly 0x8000 (32 KiB) off
// because there's a small implementation-defined piece of data before
// the TLV block, and the runtime wants to access them efficiently too.
if constexpr (is_ppc<E> || is_m68k<E>)
return phdr->p_vaddr + 0x7000;
// RISC-V just uses the beginning of the main executable's TLV block as
// TP. RISC-V load/store instructions usually take 12-bit signed
// immediates, so the beginning of TLV ± 2 KiB is accessible with a
// single load/store instruction.
assert(is_riscv<E>);
return phdr->p_vaddr;
}
// Returns the address __tls_get_addr() would return if it's called
// with offset 0, i.e. the Dynamic Thread Pointer (DTP).
//
// Returns 0 if the output file doesn't have a PT_TLS segment.
template <typename E>
u64 get_dtp_addr(Context<E> &ctx) {
ElfPhdr<E> *phdr = get_tls_segment(ctx);
if (!phdr)
return 0;
// On PPC64 and m68k, R_DTPOFF is resolved to the address 0x8000 (32
// KiB) past the start of the TLS block. The bias maximizes the
// accessible range for load/store instructions with 16-bit signed
// immediates. That is, if the offset were right at the beginning of
// the start of the TLS block, the half of addressable space (negative
// immediates) would have been wasted.
if constexpr (is_ppc<E> || is_m68k<E>)
return phdr->p_vaddr + 0x8000;
// On RISC-V, the bias is 0x800 as the load/store instructions in the
// ISA usually have a 12-bit immediate.
if constexpr (is_riscv<E>)
return phdr->p_vaddr + 0x800;
// On other targets, DTP simply refers to the beginning of the TLS block.
return phdr->p_vaddr;
}
using E = MOLD_TARGET;
template u64 get_tls_begin<E>(Context<E> &);
template u64 get_tp_addr<E>(Context<E> &);
template u64 get_dtp_addr<E>(Context<E> &);
} // namespace mold::elf

15
third_party/mold/fake_tbb.h vendored Normal file
View file

@ -0,0 +1,15 @@
#ifndef __TBB_FAKE_H
#define __TBB_FAKE_H
// Minimal serial drop-in replacements for the oneTBB entry points mold
// uses. Real TBB distributes iterations across worker threads; these
// fallbacks run every iteration on the calling thread so the program
// computes the same result, just without parallelism. (The previous
// empty stubs silently skipped all the work.)
namespace tbb {
// Serial tbb::parallel_for_each: applies f to each element of
// [first, last) in order.
template<typename InputIterator, typename Function>
void parallel_for_each(InputIterator first, InputIterator last, const Function& f) {
  for (; first != last; ++first)
    f(*first);
}
// Serial range overload: applies body to each element of rng in order.
template<typename Range, typename Body>
void parallel_for_each(Range& rng, const Body& body) {
  for (auto& elem : rng)
    body(elem);
}
}
#endif

37
third_party/mold/filepath.cc vendored Normal file
View file

@ -0,0 +1,37 @@
// clang-format off
#include "third_party/mold/common.h"
#include "third_party/libcxx/filesystem"
#include "libc/calls/calls.h"
#include "libc/calls/struct/stat.h"
#include "libc/calls/struct/stat.macros.h"
#include "libc/calls/struct/timespec.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/s.h"
#include "libc/sysv/consts/utime.h"
#include "libc/time/time.h"
namespace mold {
std::string get_realpath(std::string_view path) {
  // Resolve one level of symlink. If `path` is not a symlink (or can't
  // be read as one), hand back the input unchanged.
  std::error_code ec;
  std::filesystem::path target = std::filesystem::read_symlink(path, ec);
  if (ec)
    return std::string(path);
  // Interpret a relative link target against the link's own directory,
  // then lexically normalize away the "/.." hop.
  std::filesystem::path resolved = filepath(path) / ".." / target;
  return resolved.lexically_normal().string();
}
// Removes redundant '/..' or '/.' from a given path.
// The transformation is done purely by lexical processing.
// This function does not access file system.
std::string path_clean(std::string_view path) {
  std::filesystem::path p = filepath(path);
  return p.lexically_normal().string();
}
// Converts a path to an absolute, lexically-normalized form. Relative
// paths are anchored at the current working directory.
std::filesystem::path to_abs_path(std::filesystem::path path) {
  if (!path.is_absolute())
    path = std::filesystem::current_path() / path;
  return path.lexically_normal();
}
} // namespace mold

194
third_party/mold/filetype.h vendored Normal file
View file

@ -0,0 +1,194 @@
// clang-format off
#pragma once
#include "third_party/mold/common.h"
// MISSING #include "../elf/elf.h"
namespace mold {
// Classification of an input file, as determined by get_file_type()
// below from the file's leading magic bytes.
enum class FileType {
UNKNOWN,
EMPTY,          // zero-byte file
ELF_OBJ,        // ELF relocatable object
ELF_DSO,        // ELF shared object
MACH_OBJ,       // Mach-O relocatable object
MACH_EXE,       // Mach-O executable
MACH_DYLIB,     // Mach-O dynamic library
MACH_BUNDLE,    // Mach-O bundle
MACH_UNIVERSAL, // Mach-O fat/universal binary
AR,             // ar(1) archive
THIN_AR,        // thin ar(1) archive
TAPI,           // macOS text-based API stub (.tbd)
TEXT,           // file starting with printable characters
GCC_LTO_OBJ,    // GCC LTO object (FAT or slim)
LLVM_BITCODE,   // LLVM bitcode (LLVM LTO object)
};
template <typename MappedFile>
bool is_text_file(MappedFile *mf) {
u8 *data = mf->data;
return mf->size >= 4 && isprint(data[0]) && isprint(data[1]) &&
isprint(data[2]) && isprint(data[3]);
}
// Returns true if a given ELF relocatable file is a GCC LTO object.
// Inspects section headers and the symbol table, so `mf` must contain a
// complete ELF image.
template <typename E, typename Context, typename MappedFile>
inline bool is_gcc_lto_obj(Context &ctx, MappedFile *mf) {
using namespace mold::elf;
const char *data = mf->get_contents().data();
ElfEhdr<E> &ehdr = *(ElfEhdr<E> *)data;
ElfShdr<E> *sh_begin = (ElfShdr<E> *)(data + ehdr.e_shoff);
std::span<ElfShdr<E>> shdrs{(ElfShdr<E> *)(data + ehdr.e_shoff), ehdr.e_shnum};
// e_shstrndx is a 16-bit field. If .shstrtab's section index is
// too large, the actual number is stored to sh_link field.
i64 shstrtab_idx = (ehdr.e_shstrndx == SHN_XINDEX)
? sh_begin->sh_link : ehdr.e_shstrndx;
for (ElfShdr<E> &sec : shdrs) {
// GCC FAT LTO objects contain both regular ELF sections and GCC-
// specific LTO sections, so that they can be linked as LTO objects if
// the LTO linker plugin is available and falls back as regular
// objects otherwise. GCC FAT LTO object can be identified by the
// presence of `.gnu.lto_.symtab` section.
if (!ctx.arg.plugin.empty()) {
std::string_view name = data + shdrs[shstrtab_idx].sh_offset + sec.sh_name;
if (name.starts_with(".gnu.lto_.symtab."))
return true;
}
if (sec.sh_type != SHT_SYMTAB)
continue;
// GCC non-FAT LTO object contains only section symbols followed by
// a common symbol whose name is `__gnu_lto_slim` (or `__gnu_lto_v1`
// for older GCC releases).
std::span<ElfSym<E>> elf_syms{(ElfSym<E> *)(data + sec.sh_offset),
(size_t)sec.sh_size / sizeof(ElfSym<E>)};
// These symbol types don't rule out an LTO object; skip over them.
auto skip = [](u8 type) {
return type == STT_NOTYPE || type == STT_FILE || type == STT_SECTION;
};
// Index 0 is the mandatory null symbol, so start scanning at 1.
i64 i = 1;
while (i < elf_syms.size() && skip(elf_syms[i].st_type))
i++;
if (i < elf_syms.size() && elf_syms[i].st_shndx == SHN_COMMON) {
std::string_view name =
data + shdrs[sec.sh_link].sh_offset + elf_syms[i].st_name;
if (name.starts_with("__gnu_lto_"))
return true;
}
break;
}
return false;
}
// Sniffs the type of an input file from its leading magic bytes.
// Little- and big-endian ELF relocatable files are additionally probed
// for GCC LTO sections so they can be routed to the LTO plugin.
template <typename Context, typename MappedFile>
FileType get_file_type(Context &ctx, MappedFile *mf) {
using namespace elf;
std::string_view data = mf->get_contents();
if (data.empty())
return FileType::EMPTY;
if (data.starts_with("\177ELF")) {
u8 byte_order = ((ElfEhdr<I386> *)data.data())->e_ident[EI_DATA];
if (byte_order == ELFDATA2LSB) {
// I386/X86_64 here stand in for "any 32/64-bit little-endian
// target"; only the header layout matters for sniffing.
auto &ehdr = *(ElfEhdr<I386> *)data.data();
if (ehdr.e_type == ET_REL) {
if (ehdr.e_ident[EI_CLASS] == ELFCLASS32) {
if (is_gcc_lto_obj<I386>(ctx, mf))
return FileType::GCC_LTO_OBJ;
} else {
if (is_gcc_lto_obj<X86_64>(ctx, mf))
return FileType::GCC_LTO_OBJ;
}
return FileType::ELF_OBJ;
}
if (ehdr.e_type == ET_DYN)
return FileType::ELF_DSO;
} else {
// Likewise M68K/SPARC64 stand in for 32/64-bit big-endian targets.
auto &ehdr = *(ElfEhdr<M68K> *)data.data();
if (ehdr.e_type == ET_REL) {
if (ehdr.e_ident[EI_CLASS] == ELFCLASS32) {
if (is_gcc_lto_obj<M68K>(ctx, mf))
return FileType::GCC_LTO_OBJ;
} else {
if (is_gcc_lto_obj<SPARC64>(ctx, mf))
return FileType::GCC_LTO_OBJ;
}
return FileType::ELF_OBJ;
}
if (ehdr.e_type == ET_DYN)
return FileType::ELF_DSO;
}
return FileType::UNKNOWN;
}
// Mach-O (64-bit little-endian magic); byte 12 holds the filetype.
if (data.starts_with("\xcf\xfa\xed\xfe")) {
switch (*(ul32 *)(data.data() + 12)) {
case 1: // MH_OBJECT
return FileType::MACH_OBJ;
case 2: // MH_EXECUTE
return FileType::MACH_EXE;
case 6: // MH_DYLIB
return FileType::MACH_DYLIB;
case 8: // MH_BUNDLE
return FileType::MACH_BUNDLE;
}
return FileType::UNKNOWN;
}
if (data.starts_with("!<arch>\n"))
return FileType::AR;
if (data.starts_with("!<thin>\n"))
return FileType::THIN_AR;
if (data.starts_with("--- !tapi-tbd"))
return FileType::TAPI;
if (data.starts_with("\xca\xfe\xba\xbe"))
return FileType::MACH_UNIVERSAL;
if (is_text_file(mf))
return FileType::TEXT;
// LLVM bitcode: raw magic, or wrapped in a bitcode wrapper header.
if (data.starts_with("\xde\xc0\x17\x0b"))
return FileType::LLVM_BITCODE;
if (data.starts_with("BC\xc0\xde"))
return FileType::LLVM_BITCODE;
return FileType::UNKNOWN;
}
// Human-readable name of a FileType, mainly for diagnostics.
inline std::string filetype_to_string(FileType type) {
  const char *name = "UNKNOWN";
  switch (type) {
  case FileType::UNKNOWN:        name = "UNKNOWN"; break;
  case FileType::EMPTY:          name = "EMPTY"; break;
  case FileType::ELF_OBJ:        name = "ELF_OBJ"; break;
  case FileType::ELF_DSO:        name = "ELF_DSO"; break;
  case FileType::MACH_EXE:       name = "MACH_EXE"; break;
  case FileType::MACH_OBJ:       name = "MACH_OBJ"; break;
  case FileType::MACH_DYLIB:     name = "MACH_DYLIB"; break;
  case FileType::MACH_BUNDLE:    name = "MACH_BUNDLE"; break;
  case FileType::MACH_UNIVERSAL: name = "MACH_UNIVERSAL"; break;
  case FileType::AR:             name = "AR"; break;
  case FileType::THIN_AR:        name = "THIN_AR"; break;
  case FileType::TAPI:           name = "TAPI"; break;
  case FileType::TEXT:           name = "TEXT"; break;
  case FileType::GCC_LTO_OBJ:    name = "GCC_LTO_OBJ"; break;
  case FileType::LLVM_BITCODE:   name = "LLVM_BITCODE"; break;
  }
  return name;
}
// Allows streaming a FileType directly into log/error messages.
inline std::ostream &operator<<(std::ostream &out, FileType type) {
  return out << filetype_to_string(type);
}
} // namespace mold

150
third_party/mold/glob.cc vendored Normal file
View file

@ -0,0 +1,150 @@
// clang-format off
#include "third_party/mold/common.h"
#include "third_party/libcxx/cstring"
namespace mold {
// Compiles a glob pattern into a flat sequence of matching elements
// (literal runs, '?', '*', and '[...]' character classes). Returns
// nullopt for a malformed pattern (unclosed bracket, dangling
// backslash, or an inverted range like "[z-a]").
std::optional<Glob> Glob::compile(std::string_view pat) {
std::vector<Element> vec;
while (!pat.empty()) {
u8 c = pat[0];
pat = pat.substr(1);
switch (c) {
case '[': {
// Here are a few bracket pattern examples:
//
// [abc]: a, b or c
// [$\]!]: $, ] or !
// [a-czg-i]: a, b, c, z, g, h, or i
// [^a-z]: Any character except lowercase letters
vec.push_back({BRACKET});
std::bitset<256> &bitset = vec.back().bitset;
bool negate = false;
if (!pat.empty() && pat[0] == '^') {
negate = true;
pat = pat.substr(1);
}
bool closed = false;
while (!pat.empty()) {
if (pat[0] == ']') {
pat = pat.substr(1);
closed = true;
break;
}
// A backslash escapes the next character (e.g. "\]").
if (pat[0] == '\\') {
pat = pat.substr(1);
if (pat.empty())
return {};
}
// "x-y" adds the inclusive character range [x, y].
if (pat.size() >= 3 && pat[1] == '-') {
u8 start = pat[0];
u8 end = pat[2];
pat = pat.substr(3);
// The range's upper bound may itself be escaped ("[a-\]]").
if (end == '\\') {
if (pat.empty())
return {};
end = pat[0];
pat = pat.substr(1);
}
if (end < start)
return {};
for (i64 i = start; i <= end; i++)
bitset[i] = true;
} else {
bitset[(u8)pat[0]] = true;
pat = pat.substr(1);
}
}
// An unclosed bracket makes the whole pattern malformed.
if (!closed)
return {};
if (negate)
bitset.flip();
break;
}
case '?':
vec.push_back({QUESTION});
break;
case '*':
vec.push_back({STAR});
break;
default:
// Consecutive literal characters accumulate into one STRING element.
if (vec.empty() || vec.back().kind != STRING)
vec.push_back({STRING});
vec.back().str += c;
break;
}
}
return {Glob{std::move(vec)}};
}
// Returns true if `str` matches this compiled pattern in its entirety.
bool Glob::match(std::string_view str) {
return do_match(str, elements);
}
bool Glob::do_match(std::string_view str, std::span<Element> elements) {
while (!elements.empty()) {
Element &e = elements[0];
elements = elements.subspan(1);
switch (e.kind) {
case STRING:
if (str.empty() || !str.starts_with(e.str))
return false;
str = str.substr(e.str.size());
break;
case STAR:
if (elements.empty())
return true;
// Patterns like "*foo*bar*" should be much more common than more
// complex ones like "*foo*[abc]*" or "*foo**?bar*", so we optimize
// the former case here.
if (elements[0].kind == STRING) {
for (;;) {
size_t pos = str.find(elements[0].str);
if (pos == str.npos)
break;
if (do_match(str.substr(pos + elements[0].str.size()),
elements.subspan(1)))
return true;
str = str.substr(pos + 1);
}
return false;
}
// Other cases are handled here.
for (i64 j = 0; j < str.size(); j++)
if (do_match(str.substr(j), elements))
return true;
return false;
case QUESTION:
if (str.empty())
return false;
str = str.substr(1);
break;
case BRACKET:
if (str.empty() || !e.bitset[str[0]])
return false;
str = str.substr(1);
break;
}
}
return str.empty();
}
} // namespace mold

21
third_party/mold/hyperloglog.cc vendored Normal file
View file

@ -0,0 +1,21 @@
// clang-format off
// This file implements HyperLogLog algorithm, which estimates
// the number of unique items in a given multiset.
//
// For more info, read
// https://engineering.fb.com/2018/12/13/data-infrastructure/hyperloglog
#include "third_party/mold/common.h"
#include "third_party/libcxx/cmath"
namespace mold {
// Estimates the number of distinct items observed so far using the
// standard HyperLogLog formula: a bias-corrected, normalized harmonic
// mean of 2^bucket over all registers.
i64 HyperLogLog::get_cardinality() const {
  double harmonic = 0;
  for (i64 reg : buckets)
    harmonic += pow(2, -reg);
  return ALPHA * NBUCKETS * NBUCKETS / harmonic;
}
} // namespace mold

222
third_party/mold/integers.h vendored Normal file
View file

@ -0,0 +1,222 @@
// clang-format off
// This file defines integral types for file input/output. We need to use
// these types instead of the plain integers (such as uint32_t or int32_t)
// when reading from/writing to an mmap'ed file area for the following
// reasons:
//
// 1. mold is always a cross linker and should not depend on what host it
// is running on. Users should be able to run mold on a big-endian
// SPARC machine to create a little-endian RV64 binary, for example.
//
// 2. Even though data members in all ELF data structures are naturally
// aligned, they are not guaranteed to be aligned on memory. Because
// archive file (.a file) aligns each member only to a 2 byte boundary,
// anything larger than 2 bytes may be unaligned in an mmap'ed memory.
// Unaligned access is an undefined behavior in C/C++, so we shouldn't
// cast an arbitrary pointer to a uint32_t, for example, to read a
// 32-bits value.
//
// The data types defined in this file don't depend on host byte order and
// don't do unaligned access.
#pragma once
#include "third_party/libcxx/bit"
#include "third_party/libcxx/cstdint"
#include "third_party/libcxx/cstring"
#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
# define __LITTLE_ENDIAN__ 1
# elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
# define __BIG_ENDIAN__ 1
# else
# error "unknown host byte order"
# endif
#endif
namespace mold {
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int8_t i8;
typedef int16_t i16;
typedef int32_t i32;
typedef int64_t i64;
// Byte-swaps a 2-, 4- or 8-byte integer. The size dispatch is resolved
// at compile time, so each instantiation reduces to one bswap builtin.
template <typename T>
static inline T bswap(T val) {
  if constexpr (sizeof(T) == 2)
    return __builtin_bswap16(val);
  else if constexpr (sizeof(T) == 4)
    return __builtin_bswap32(val);
  else if constexpr (sizeof(T) == 8)
    return __builtin_bswap64(val);
  else
    __builtin_unreachable();
}
// An integer of SIZE bytes stored in little-endian byte order at an
// arbitrary (possibly unaligned) address. All access goes through
// memcpy or byte-wise reads, so overlaying this type on mmap'ed file
// data is well-defined; values are byte-swapped on big-endian hosts.
// SIZE == 3 provides a packed 24-bit variant.
template <typename T, int SIZE = sizeof(T)>
class LittleEndian {
public:
LittleEndian() = default;
LittleEndian(T x) { *this = x; }
// Read: decode the stored bytes into a native-endian value.
operator T() const {
if constexpr (sizeof(T) == SIZE) {
T x;
memcpy(&x, val, sizeof(T));
if constexpr (std::endian::native == std::endian::big)
x = bswap(x);
return x;
} else {
// 24-bit variant: assemble the value from three bytes, LSB first.
static_assert(SIZE == 3);
return (val[2] << 16) | (val[1] << 8) | val[0];
}
}
// Write: encode a native-endian value into the stored bytes.
LittleEndian &operator=(T x) {
if constexpr (sizeof(T) == SIZE) {
if constexpr (std::endian::native == std::endian::big)
x = bswap(x);
memcpy(val, &x, sizeof(T));
} else {
static_assert(SIZE == 3);
val[2] = x >> 16;
val[1] = x >> 8;
val[0] = x;
}
return *this;
}
// The operators below are read-modify-write conveniences; they are
// not atomic.
LittleEndian &operator++() {
return *this = *this + 1;
}
LittleEndian operator++(int) {
T ret = *this;
*this = *this + 1;
return ret;
}
LittleEndian &operator--() {
return *this = *this - 1;
}
LittleEndian operator--(int) {
T ret = *this;
*this = *this - 1;
return ret;
}
LittleEndian &operator+=(T x) {
return *this = *this + x;
}
LittleEndian &operator-=(T x) {
return *this = *this - x;
}
LittleEndian &operator&=(T x) {
return *this = *this & x;
}
LittleEndian &operator|=(T x) {
return *this = *this | x;
}
private:
// Raw storage, always SIZE bytes, no alignment requirement.
u8 val[SIZE];
};
// Aliases: i/u = signed/unsigned, l = little-endian, number = bit width.
using il16 = LittleEndian<i16>;
using il32 = LittleEndian<i32>;
using il64 = LittleEndian<i64>;
using ul16 = LittleEndian<u16>;
using ul24 = LittleEndian<u32, 3>;
using ul32 = LittleEndian<u32>;
using ul64 = LittleEndian<u64>;
// Big-endian counterpart of LittleEndian above: an integer of SIZE
// bytes stored big-endian at an arbitrary (possibly unaligned)
// address; values are byte-swapped on little-endian hosts.
// SIZE == 3 provides a packed 24-bit variant.
template <typename T, int SIZE = sizeof(T)>
class BigEndian {
public:
BigEndian() = default;
BigEndian(T x) { *this = x; }
// Read: decode the stored bytes into a native-endian value.
operator T() const {
if constexpr (sizeof(T) == SIZE) {
T x;
memcpy(&x, val, sizeof(T));
if constexpr (std::endian::native == std::endian::little)
x = bswap(x);
return x;
} else {
// 24-bit variant: assemble the value from three bytes, MSB first.
static_assert(SIZE == 3);
return (val[0] << 16) | (val[1] << 8) | val[2];
}
}
// Write: encode a native-endian value into the stored bytes.
BigEndian &operator=(T x) {
if constexpr (sizeof(T) == SIZE) {
if constexpr (std::endian::native == std::endian::little)
x = bswap(x);
memcpy(val, &x, sizeof(T));
} else {
static_assert(SIZE == 3);
val[0] = x >> 16;
val[1] = x >> 8;
val[2] = x;
}
return *this;
}
// Read-modify-write conveniences; not atomic.
BigEndian &operator++() {
return *this = *this + 1;
}
BigEndian operator++(int) {
T ret = *this;
*this = *this + 1;
return ret;
}
BigEndian &operator--() {
return *this = *this - 1;
}
BigEndian operator--(int) {
T ret = *this;
*this = *this - 1;
return ret;
}
BigEndian &operator+=(T x) {
return *this = *this + x;
}
BigEndian &operator-=(T x) {
return *this = *this - x;
}
BigEndian &operator&=(T x) {
return *this = *this & x;
}
BigEndian &operator|=(T x) {
return *this = *this | x;
}
private:
// Raw storage, always SIZE bytes, no alignment requirement.
u8 val[SIZE];
};
// Aliases: i/u = signed/unsigned, b = big-endian, number = bit width.
using ib16 = BigEndian<i16>;
using ib32 = BigEndian<i32>;
using ib64 = BigEndian<i64>;
using ub16 = BigEndian<u16>;
using ub24 = BigEndian<u32, 3>;
using ub32 = BigEndian<u32>;
using ub64 = BigEndian<u64>;
} // namespace mold

188
third_party/mold/main.cc vendored Normal file
View file

@ -0,0 +1,188 @@
// clang-format off
#include "third_party/mold/common.h"
// MISSING #include "config.h"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/filesystem"
#include "libc/calls/calls.h"
#include "libc/calls/sigtimedwait.h"
#include "libc/calls/struct/sigaction.h"
#include "libc/calls/struct/siginfo.h"
#include "libc/sysv/consts/sa.h"
#include "libc/sysv/consts/sicode.h"
#include "libc/sysv/consts/ss.h"
// MISSING #include <tbb/global_control.h>
#ifdef USE_SYSTEM_MIMALLOC
// MISSING #include <mimalloc-new-delete.h>
#endif
#ifdef __FreeBSD__
// MISSING #include <sys/sysctl.h>
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/runtime/pathconf.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/sysconf.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fileno.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/ok.h"
#include "libc/time/time.h"
#include "third_party/getopt/getopt.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/lockf.h"
#endif
namespace mold {
std::string mold_version_string = MOLD_VERSION;
namespace elf {
int main(int argc, char **argv);
}
namespace macho {
int main(int argc, char **argv);
}
// Builds the full version banner, e.g.
// "mold 1.x (<git-hash>; compatible with GNU ld)". The git hash is
// omitted when unknown; "sold" builds get a distinct product name.
static std::string get_mold_version() {
  std::string base = MOLD_IS_SOLD ? "mold (sold) " : "mold ";
  std::string suffix = mold_git_hash.empty()
      ? std::string(" (compatible with GNU ld)")
      : " (" + mold_git_hash + "; compatible with GNU ld)";
  return base + MOLD_VERSION + suffix;
}
// Removes the temporary output file, if one is currently live, so that
// aborted runs don't leave a half-written ".mold-XXXXXX" file behind.
void cleanup() {
if (output_tmpfile)
unlink(output_tmpfile);
}
// Formats the current errno value as a message string.
std::string errno_string() {
  // strerror is not thread-safe, so guard it with a lock.
  static std::mutex mu;
  std::lock_guard<std::mutex> lock(mu);
  return strerror(errno);
}
// Returns the path of the mold executable itself
std::string get_self_path() {
#ifdef __FreeBSD__
// /proc may not be mounted on FreeBSD. The proper way to get the
// current executable's path is to use sysctl(2).
int mib[4];
mib[0] = CTL_KERN;
mib[1] = KERN_PROC;
mib[2] = KERN_PROC_PATHNAME;
mib[3] = -1;
// First call queries the required buffer size; second fills it in.
size_t size;
sysctl(mib, 4, NULL, &size, NULL, 0);
std::string path;
path.resize(size);
sysctl(mib, 4, path.data(), &size, NULL, 0);
return path;
#else
// On Linux-like systems /proc/self/exe is a symlink to the binary.
return std::filesystem::read_symlink("/proc/self/exe").string();
#endif
}
// mold mmap's an output file, and the mmap succeeds even if there's
// no enough space left on the filesystem. The actual disk blocks are
// not allocated on the mmap call but when the program writes to it
// for the first time.
//
// If a disk becomes full as a result of a write to an mmap'ed memory
// region, the failure of the write is reported as a SIGBUS or structured
// exception with code EXCEPTION_IN_PAGE_ERROR on Windows. This
// signal handler catches that signal and prints out a user-friendly
// error message. Without this, it is very hard to realize that the
// disk might be full.
#ifdef _WIN32
// Windows analogue of the POSIX handler below: an
// EXCEPTION_IN_PAGE_ERROR whose faulting address falls inside the
// output mapping means a write to the memory-mapped output file failed,
// which is almost always a full disk.
static LONG WINAPI vectored_handler(_EXCEPTION_POINTERS *exception_info) {
static std::mutex mu;
std::scoped_lock lock{mu};
PEXCEPTION_RECORD exception_record = exception_info->ExceptionRecord;
ULONG_PTR *exception_information = exception_record->ExceptionInformation;
// For EXCEPTION_IN_PAGE_ERROR, ExceptionInformation[1] holds the
// faulting data address.
if (exception_record->ExceptionCode == EXCEPTION_IN_PAGE_ERROR &&
(ULONG_PTR)output_buffer_start <= exception_information[1] &&
exception_information[1] < (ULONG_PTR)output_buffer_end) {
const char msg[] = "mold: failed to write to an output file. Disk full?\n";
(void)!write(_fileno(stderr), msg, sizeof(msg) - 1);
}
cleanup();
_exit(1);
}
// Registers the vectored handler above for the whole process.
void install_signal_handler() {
AddVectoredExceptionHandler(0, vectored_handler);
}
#else
// Fatal-signal handler. A SIGSEGV/SIGBUS whose faulting address lies
// inside the output mapping almost always means the disk filled up
// while writing the mmap'ed output, so print a friendly hint.
//
// NOTE(review): strictly speaking only write(2)/_exit(2) are
// async-signal-safe; locking a std::mutex here is not — presumably
// tolerated because the process exits immediately. Confirm.
static void sighandler(int signo, siginfo_t *info, void *ucontext) {
static std::mutex mu;
std::scoped_lock lock{mu};
switch (signo) {
case SIGSEGV:
case SIGBUS:
if (output_buffer_start <= info->si_addr &&
info->si_addr < output_buffer_end) {
const char msg[] = "mold: failed to write to an output file. Disk full?\n";
(void)!write(STDERR_FILENO, msg, sizeof(msg) - 1);
}
break;
case SIGABRT: {
const char msg[] =
"mold: aborted\n"
"mold: If mold failed due to a spurious failure of pthread_create, "
"it's likely because of https://github.com/oneapi-src/oneTBB/pull/824. "
"You should ensure that you are using 2021.9.0 or newer version of libtbb.\n";
(void)!write(STDERR_FILENO, msg, sizeof(msg) - 1);
break;
}
}
_exit(1);
}
// Installs sighandler() (with siginfo so the faulting address is
// available) for the signals we care about.
//
// NOTE(review): sighandler() has a SIGSEGV case, but SIGSEGV is not
// registered here — confirm whether that's intentional (some platforms
// report mmap write failures as SIGSEGV rather than SIGBUS).
void install_signal_handler() {
struct sigaction action;
action.sa_sigaction = sighandler;
sigemptyset(&action.sa_mask);
action.sa_flags = SA_SIGINFO;
sigaction(SIGABRT, &action, NULL);
sigaction(SIGINT, &action, NULL);
sigaction(SIGTERM, &action, NULL);
sigaction(SIGBUS, &action, NULL);
}
#endif
// Picks the worker-thread count used when the user doesn't specify one.
i64 get_default_thread_count() {
  // mold doesn't scale well above 32 threads, so cap the parallelism
  // level that TBB reports as available.
  int avail = tbb::global_control::active_value(
      tbb::global_control::max_allowed_parallelism);
  return avail < 32 ? avail : 32;
}
} // namespace mold
// Entry point. In "sold" builds, dispatch on the invoked program name:
// invocations named "ld64" or "ld64.*" run the Mach-O linker; all other
// invocations run the ELF linker.
int main(int argc, char **argv) {
mold::mold_version = mold::get_mold_version();
#if MOLD_IS_SOLD
std::string cmd = mold::filepath(argv[0]).filename().string();
if (cmd == "ld64" || cmd.starts_with("ld64."))
return mold::macho::main(argc, argv);
#endif
return mold::elf::main(argc, argv);
}

61
third_party/mold/mold.mk vendored Normal file
View file

@ -0,0 +1,61 @@
#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
#───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘

# Build rules for the vendored mold linker.
PKGS += THIRD_PARTY_MOLD

private CPPFLAGS += -std=c++20

THIRD_PARTY_MOLD_ARTIFACTS += THIRD_PARTY_MOLD_A
THIRD_PARTY_MOLD = $(THIRD_PARTY_MOLD_A_DEPS) $(THIRD_PARTY_MOLD_A)
THIRD_PARTY_MOLD_A = o/$(MODE)/third_party/mold/mold.a
THIRD_PARTY_MOLD_FILES := $(wildcard third_party/mold/*)
THIRD_PARTY_MOLD_HDRS = $(filter %.h,$(THIRD_PARTY_MOLD_FILES))
THIRD_PARTY_MOLD_SRCS = $(filter %.cc,$(THIRD_PARTY_MOLD_FILES))
THIRD_PARTY_MOLD_OBJS = $(THIRD_PARTY_MOLD_SRCS:%.cc=o/$(MODE)/%.o)

THIRD_PARTY_MOLD_A_DIRECTDEPS = \
	THIRD_PARTY_LIBCXX \
	THIRD_PARTY_XXHASH

THIRD_PARTY_MOLD_A_DEPS := \
	$(call uniq,$(foreach x,$(THIRD_PARTY_MOLD_A_DIRECTDEPS),$($(x))))

# Flags mirror upstream's build configuration:
# https://github.com/rui314/mold/blob/d4d93d7fb72dd19c44aafa4dd5397e35787d33ad/CMakeLists.txt#L62
# Note: the flags list must not end with a trailing backslash, or the
# following variable assignment would be spliced into CPPFLAGS.
$(THIRD_PARTY_MOLD_OBJS): private \
	CPPFLAGS += \
		-std=gnu++20 \
		-fno-exceptions \
		-fno-unwind-tables \
		-fno-asynchronous-unwind-tables \
		-Wno-sign-compare \
		-Wno-unused-function

THIRD_PARTY_MOLD_CHECKS = \
	$(THIRD_PARTY_MOLD_A).pkg \
	$(THIRD_PARTY_MOLD_HDRS:%=o/$(MODE)/%.ok)

$(THIRD_PARTY_MOLD_A): \
		third_party/mold/ \
		$(THIRD_PARTY_MOLD_A).pkg \
		$(THIRD_PARTY_MOLD_OBJS)

$(THIRD_PARTY_MOLD_A).pkg: \
		$(THIRD_PARTY_MOLD_OBJS) \
		$(foreach x,$(THIRD_PARTY_MOLD_A_DIRECTDEPS),$($(x)_A).pkg)

# NOTE(review): the third_party/awk object and AWK checks referenced
# below look like copy-paste from the awk package; confirm intent.
o/$(MODE)/third_party/mold/mold.com.dbg: \
		$(THIRD_PARTY_MOLD) \
		o/$(MODE)/third_party/awk/main.o \
		$(CRT) \
		$(APE_NO_MODIFY_SELF)
	@$(APELINK)

THIRD_PARTY_MOLD_COMS = o/$(MODE)/third_party/mold/mold.com
THIRD_PARTY_MOLD_BINS = $(THIRD_PARTY_MOLD_COMS) $(THIRD_PARTY_MOLD_COMS:%=%.dbg)
THIRD_PARTY_MOLD_LIBS = $(THIRD_PARTY_MOLD_A)
$(THIRD_PARTY_MOLD_OBJS): $(BUILD_FILES) third_party/mold/mold.mk

.PHONY: o/$(MODE)/third_party/mold
o/$(MODE)/third_party/mold: \
		$(THIRD_PARTY_MOLD_BINS) \
		$(THIRD_PARTY_AWK_CHECKS)

167
third_party/mold/multi-glob.cc vendored Normal file
View file

@ -0,0 +1,167 @@
// clang-format off
// This file implements the Aho-Corasick algorithm to match multiple
// glob patterns to symbol strings as quickly as possible.
//
// Here are some examples of glob patterns:
//
// qt_private_api_tag*
// *16QAccessibleCache*
// *32QAbstractFileIconProviderPrivate*
// *17QPixmapIconEngine*
//
// `*` is a wildcard that matches any substring. We sometimes have
// hundreds of glob patterns and have to match them against millions
// of symbol strings.
//
// Aho-Corasick cannot handle complex patterns such as `*foo*bar*`.
// We handle such patterns with the Glob class. Glob is relatively
// slow, but complex patterns are rare in practice, so it should be
// OK.
#include "third_party/mold/common.h"
#include "third_party/libcxx/queue"
#include "third_party/libcxx/regex"
namespace mold {
// Returns the smallest value among all registered patterns that match
// `str`, or nullopt if none matches. The Aho-Corasick automaton is
// lazily compiled on first call.
std::optional<u32> MultiGlob::find(std::string_view str) {
std::call_once(once, [&] { compile(); });
u32 val = UINT32_MAX;
if (root) {
// Match against simple glob patterns
TrieNode *node = root.get();
// Advance the automaton by one input byte, following suffix links
// on mismatch, and record the best value seen along the way.
auto walk = [&](u8 c) {
for (;;) {
if (node->children[c]) {
node = node->children[c].get();
val = std::min(val, node->value);
return;
}
if (!node->suffix_link)
return;
node = node->suffix_link;
}
};
// '\0' is the begin/end-of-string marker inserted by handle_stars().
walk('\0');
for (u8 c : str)
walk(c);
walk('\0');
}
// Match against complex glob patterns
for (std::pair<Glob, u32> &glob : globs)
if (glob.first.match(str))
val = std::min(val, glob.second);
if (val == UINT32_MAX)
return {};
return val;
}
static bool is_simple_pattern(std::string_view pat) {
static std::regex re(R"(\*?[^*[?]+\*?)", std::regex_constants::optimize);
return std::regex_match(pat.begin(), pat.end(), re);
}
// Converts a simple pattern into the string stored in the trie.
// Aho-Corasick can do only substring matching, so anchoring is encoded
// with explicit '\0' markers: "foo" -> "\0foo\0", "*foo" -> "foo\0",
// "foo*" -> "\0foo", and "*foo*" -> "foo".
static std::string handle_stars(std::string_view pat) {
  std::string str(pat);
  bool leading = str.starts_with('*');
  bool trailing = str.ends_with('*');
  std::string mark(1, '\0');
  if (leading && trailing)
    return str.substr(1, str.size() - 2);
  if (leading)
    return str.substr(1) + mark;
  if (trailing)
    return mark + str.substr(0, str.size() - 1);
  return mark + str + mark;
}
// Registers a glob pattern with an associated value. Returns false if
// the pattern can't be compiled. When multiple patterns match the same
// string, find() reports the smallest associated value. Must not be
// called after the automaton has been compiled (i.e. after find()).
bool MultiGlob::add(std::string_view pat, u32 val) {
assert(!is_compiled);
assert(!pat.empty());
strings.push_back(std::string(pat));
// Complex glob pattern
if (!is_simple_pattern(pat)) {
if (std::optional<Glob> glob = Glob::compile(pat)) {
globs.push_back({std::move(*glob), val});
return true;
}
return false;
}
// Simple glob pattern: insert the anchored literal into the trie.
if (!root)
root.reset(new TrieNode);
TrieNode *node = root.get();
for (u8 c : handle_stars(pat)) {
if (!node->children[c])
node->children[c].reset(new TrieNode);
node = node->children[c].get();
}
// Keep the minimum if several patterns end at the same node.
node->value = std::min(node->value, val);
return true;
}
// Finalizes the Aho-Corasick automaton. Invoked exactly once (via
// std::call_once in find()) after all patterns have been added.
void MultiGlob::compile() {
is_compiled = true;
if (root) {
fix_suffix_links(*root);
fix_values();
}
}
// Computes Aho-Corasick suffix links, depth-first: each child's link
// points at the node for the longest proper suffix of its string that
// also exists in the trie (falling back to the root).
void MultiGlob::fix_suffix_links(TrieNode &node) {
for (i64 i = 0; i < 256; i++) {
if (!node.children[i])
continue;
TrieNode &child = *node.children[i];
// Walk the parent's suffix chain until a node with an edge for
// byte i is found, or the chain runs out (then link to root).
TrieNode *cur = node.suffix_link;
for (;;) {
if (!cur) {
child.suffix_link = root.get();
break;
}
if (cur->children[i]) {
child.suffix_link = cur->children[i].get();
break;
}
cur = cur->suffix_link;
}
fix_suffix_links(child);
}
}
// Propagates pattern values through the automaton in BFS order so each
// node also holds the minimum value reachable via its suffix link.
// This lets find() take the minimum while walking, without chasing
// suffix links for every matched position.
void MultiGlob::fix_values() {
std::queue<TrieNode *> queue;
queue.push(root.get());
do {
TrieNode *node = queue.front();
queue.pop();
for (std::unique_ptr<TrieNode> &child : node->children) {
if (!child)
continue;
// BFS order guarantees the suffix-link target (which is strictly
// shorter) has already been fixed up.
child->value = std::min(child->value, child->suffix_link->value);
queue.push(child.get());
}
} while (!queue.empty());
}
} // namespace mold

203
third_party/mold/output-file-unix.h vendored Normal file
View file

@ -0,0 +1,203 @@
// clang-format off
#include "third_party/mold/common.h"
#include "libc/calls/calls.h"
#include "libc/calls/struct/flock.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/at.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fd.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/posix.h"
#include "libc/sysv/consts/s.h"
#include "libc/sysv/consts/splice.h"
#include "third_party/libcxx/filesystem"
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/runtime/runtime.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/mlock.h"
#include "libc/sysv/consts/msync.h"
#include "libc/sysv/consts/posix.h"
#include "libc/sysv/consts/prot.h"
#include "libc/sysv/consts/madv.h"
#include "libc/sysv/consts/mfd.h"
#include "libc/sysv/consts/mremap.h"
#include "libc/calls/calls.h"
#include "libc/calls/struct/stat.h"
#include "libc/calls/struct/stat.macros.h"
#include "libc/calls/struct/timespec.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/s.h"
#include "libc/sysv/consts/utime.h"
#include "libc/time/time.h"
#include "libc/calls/makedev.h"
#include "libc/calls/weirdtypes.h"
#include "libc/thread/thread.h"
#include "libc/calls/typedef/u.h"
#include "libc/calls/weirdtypes.h"
#include "libc/intrin/newbie.h"
#include "libc/sock/select.h"
#include "libc/sysv/consts/endian.h"
namespace mold {
// Reads the process umask without changing it. umask(2) has no pure
// "get" operation, so set a scratch mask and immediately restore the
// original value.
inline u32 get_umask() {
  u32 saved = umask(0);
  umask(saved);
  return saved;
}
// Creates a unique temporary file (".mold-XXXXXX") in the output
// file's directory, sized to `filesize` and chmod'ed to `perm` masked
// by the umask. Returns the open fd and the temp path (stored in ctx's
// string pool, so it outlives this call). The caller later rename()s
// the temp file over `path`. Fatal()s on unrecoverable errors.
template <typename Context>
static std::pair<i64, char *>
open_or_create_file(Context &ctx, std::string path, i64 filesize, i64 perm) {
std::string tmpl = filepath(path).parent_path() / ".mold-XXXXXX";
char *path2 = (char *)save_string(ctx, tmpl).data();
i64 fd = mkstemp(path2);
if (fd == -1)
Fatal(ctx) << "cannot open " << path2 << ": " << errno_string();
// Reuse an existing file if exists and writable because on Linux,
// writing to an existing file is much faster than creating a fresh
// file and writing to it.
if (ctx.overwrite_output_file && rename(path.c_str(), path2) == 0) {
::close(fd);
fd = ::open(path2, O_RDWR | O_CREAT, perm);
if (fd != -1 && !ftruncate(fd, filesize) && !fchmod(fd, perm & ~get_umask()))
return {fd, path2};
// Reusing the old file failed; fall back to a fresh file under the
// same temporary name.
unlink(path2);
fd = ::open(path2, O_RDWR | O_CREAT, perm);
if (fd == -1)
Fatal(ctx) << "cannot open " << path2 << ": " << errno_string();
}
if (ftruncate(fd, filesize))
Fatal(ctx) << "ftruncate failed: " << errno_string();
if (fchmod(fd, (perm & ~get_umask())) == -1)
Fatal(ctx) << "fchmod failed: " << errno_string();
return {fd, path2};
}
// Output file backed by an mmap of a temporary file in the destination
// directory. On close(), the temp file is atomically rename()d over
// the real output path.
template <typename Context>
class MemoryMappedOutputFile : public OutputFile<Context> {
public:
MemoryMappedOutputFile(Context &ctx, std::string path, i64 filesize, i64 perm)
: OutputFile<Context>(path, filesize, true) {
i64 fd;
std::tie(fd, output_tmpfile) = open_or_create_file(ctx, path, filesize, perm);
this->buf = (u8 *)mmap(nullptr, filesize, PROT_READ | PROT_WRITE,
MAP_SHARED, fd, 0);
if (this->buf == MAP_FAILED)
Fatal(ctx) << path << ": mmap failed: " << errno_string();
// The mapping remains valid after the fd is closed.
::close(fd);
// Publish the mapping's bounds for the disk-full signal handler.
mold::output_buffer_start = this->buf;
mold::output_buffer_end = this->buf + filesize;
}
~MemoryMappedOutputFile() {
if (fd2 != -1)
::close(fd2);
}
void close(Context &ctx) override {
Timer t(ctx, "close_file");
if (!this->is_unmapped)
munmap(this->buf, this->filesize);
// If an output file already exists, open a file and then remove it.
// This is the fastest way to unlink a file, as it does not make the
// system to immediately release disk blocks occupied by the file.
fd2 = ::open(this->path.c_str(), O_RDONLY);
if (fd2 != -1)
unlink(this->path.c_str());
if (rename(output_tmpfile, this->path.c_str()) == -1)
Fatal(ctx) << this->path << ": rename failed: " << errno_string();
output_tmpfile = nullptr;
}
private:
// Holds the replaced output file open (its blocks get released when
// this object is destroyed and the fd is closed).
int fd2 = -1;
};
// An OutputFile that buffers the whole output in an anonymous memory
// mapping and writes it out in one go on close(). Used when the output
// is not a regular file (stdout, character devices, ...).
template <typename Context>
class MallocOutputFile : public OutputFile<Context> {
public:
  MallocOutputFile(Context &ctx, std::string path, i64 filesize, i64 perm)
      : OutputFile<Context>(path, filesize, false), perm(perm) {
    this->buf = (u8 *)mmap(NULL, filesize, PROT_READ | PROT_WRITE,
                           MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (this->buf == MAP_FAILED)
      Fatal(ctx) << "mmap failed: " << errno_string();
  }

  void close(Context &ctx) override {
    Timer t(ctx, "close_file");

    // "-" denotes the standard output.
    if (this->path == "-") {
      fwrite(this->buf, this->filesize, 1, stdout);
      fclose(stdout);
      return;
    }

    i64 fd = ::open(this->path.c_str(), O_RDWR | O_CREAT, perm);
    if (fd == -1)
      Fatal(ctx) << "cannot open " << this->path << ": " << errno_string();

    FILE *fp = fdopen(fd, "w");
    // fdopen can fail (e.g. on allocation failure); writing through a
    // null FILE* would crash, so fail loudly instead.
    if (!fp)
      Fatal(ctx) << this->path << ": fdopen failed: " << errno_string();
    // Detect short writes (e.g. disk full) instead of silently
    // producing a truncated output file. The filesize guard avoids a
    // false error for a zero-byte output.
    if (this->filesize && fwrite(this->buf, this->filesize, 1, fp) != 1)
      Fatal(ctx) << this->path << ": fwrite failed: " << errno_string();
    fclose(fp);
  }

private:
  i64 perm; // permission bits for the output file
};
// Factory: opens the output file, choosing a memory-mapped file for
// regular files and a malloc-style buffer for everything else
// (stdout or non-regular files such as devices).
template <typename Context>
std::unique_ptr<OutputFile<Context>>
OutputFile<Context>::open(Context &ctx, std::string path, i64 filesize, i64 perm) {
  Timer t(ctx, "open_file");

  // With --chroot, absolute output paths are relative to the chroot dir.
  if (path.starts_with('/') && !ctx.arg.chroot.empty())
    path = ctx.arg.chroot + "/" + path_clean(path);

  // "-" means the standard output; anything that stat() reports as a
  // non-regular file also cannot be mmap'ed for writing.
  bool is_special = false;
  if (path == "-") {
    is_special = true;
  } else {
    struct stat st;
    if (stat(path.c_str(), &st) == 0 && (st.st_mode & S_IFMT) != S_IFREG)
      is_special = true;
  }

  OutputFile<Context> *file;
  if (is_special)
    file = new MallocOutputFile(ctx, path, filesize, perm);
  else
    file = new MemoryMappedOutputFile(ctx, path, filesize, perm);

#ifdef MADV_HUGEPAGE
  // Enable transparent huge page for an output memory-mapped file.
  // On Linux, it has an effect only on tmpfs mounted with `huge=advise`,
  // but it can make the linker ~10% faster. You can try it by creating
  // a tmpfs with the following commands
  //
  //   $ mkdir tmp
  //   $ sudo mount -t tmpfs -o size=2G,huge=advise none tmp
  //
  // and then specifying a path under the directory as an output file.
  madvise(file->buf, filesize, MADV_HUGEPAGE);
#endif

  // --filler pre-fills the output buffer with a byte value.
  if (ctx.arg.filler != -1)
    memset(file->buf, ctx.arg.filler, filesize);
  return std::unique_ptr<OutputFile>(file);
}
} // namespace mold

85
third_party/mold/output-file-win32.h vendored Normal file
View file

@ -0,0 +1,85 @@
// clang-format off
#include "third_party/mold/common.h"
#include "libc/calls/calls.h"
#include "libc/calls/struct/flock.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/at.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fd.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/posix.h"
#include "libc/sysv/consts/s.h"
#include "libc/sysv/consts/splice.h"
#include "third_party/libcxx/filesystem"
#include "libc/nt/accounting.h"
#include "libc/nt/automation.h"
#include "libc/nt/console.h"
#include "libc/nt/debug.h"
#include "libc/nt/dll.h"
#include "libc/nt/enum/keyaccess.h"
#include "libc/nt/enum/regtype.h"
#include "libc/nt/errors.h"
#include "libc/nt/events.h"
#include "libc/nt/files.h"
#include "libc/nt/ipc.h"
#include "libc/nt/memory.h"
#include "libc/nt/paint.h"
#include "libc/nt/process.h"
#include "libc/nt/registry.h"
#include "libc/nt/synchronization.h"
#include "libc/nt/thread.h"
#include "libc/nt/windows.h"
#include "libc/nt/winsock.h"
namespace mold {
// Windows OutputFile: buffers the whole output on the heap and writes
// it out in one go on close().
template <typename Context>
class MallocOutputFile : public OutputFile<Context> {
public:
  MallocOutputFile(Context &ctx, std::string path, i64 filesize, i64 perm)
      : OutputFile<Context>(path, filesize, false), perm(perm) {
    this->buf = (u8 *)malloc(filesize);
    if (!this->buf)
      Fatal(ctx) << "malloc failed";
  }

  void close(Context &ctx) override {
    Timer t(ctx, "close_file");

    // "-" denotes the standard output. Free the buffer on this path
    // too; the original leaked it here while freeing it below.
    if (this->path == "-") {
      fwrite(this->buf, this->filesize, 1, stdout);
      fclose(stdout);
      free(this->buf);
      return;
    }

    i64 fd = ::open(this->path.c_str(), O_RDWR | O_CREAT, perm);
    if (fd == -1)
      Fatal(ctx) << "cannot open " << this->path << ": " << errno_string();

    FILE *fp = fdopen(fd, "w");
    // Guard against fdopen failure rather than writing through a null
    // FILE*.
    if (!fp)
      Fatal(ctx) << this->path << ": fdopen failed: " << errno_string();
    // Detect short writes instead of silently truncating the output.
    if (this->filesize && fwrite(this->buf, this->filesize, 1, fp) != 1)
      Fatal(ctx) << this->path << ": fwrite failed: " << errno_string();
    fclose(fp);
    free(this->buf);
  }

private:
  i64 perm; // permission bits for the output file
};
// Factory for the Windows build: every output goes through the
// heap-buffered MallocOutputFile and is flushed on close().
template <typename Context>
std::unique_ptr<OutputFile<Context>>
OutputFile<Context>::open(Context &ctx, std::string path, i64 filesize, i64 perm) {
  Timer t(ctx, "open_file");

  // With --chroot, absolute output paths are relative to the chroot dir.
  if (!ctx.arg.chroot.empty() && path.starts_with('/'))
    path = ctx.arg.chroot + "/" + path_clean(path);

  auto *file = new MallocOutputFile(ctx, path, filesize, perm);

  // --filler pre-fills the output buffer with a byte value.
  if (ctx.arg.filler != -1)
    memset(file->buf, ctx.arg.filler, filesize);
  return std::unique_ptr<OutputFile<Context>>(file);
}
} // namespace mold

6
third_party/mold/output-file.h vendored Normal file
View file

@ -0,0 +1,6 @@
// clang-format off
// Dispatch header: picks the platform-specific OutputFile
// implementation at compile time.
#if _WIN32
#include "third_party/mold/output-file-win32.h"
#else
#include "third_party/mold/output-file-unix.h"
#endif

140
third_party/mold/perf.cc vendored Normal file
View file

@ -0,0 +1,140 @@
// clang-format off
#include "third_party/mold/common.h"
#include "third_party/libcxx/functional"
#include "third_party/libcxx/iomanip"
#include "third_party/libcxx/ios"
#ifndef _WIN32
#include "libc/calls/calls.h"
#include "libc/calls/struct/rlimit.h"
#include "libc/calls/struct/rusage.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/prio.h"
#include "libc/sysv/consts/rlim.h"
#include "libc/sysv/consts/rlimit.h"
#include "libc/sysv/consts/rusage.h"
#include "libc/calls/struct/itimerval.h"
#include "libc/calls/struct/timeval.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sock/select.h"
#include "libc/sysv/consts/clock.h"
#include "libc/sysv/consts/itimer.h"
#include "libc/time/struct/timezone.h"
#include "libc/time/time.h"
#endif
namespace mold {
// Sums the per-thread counter slots into a single total.
i64 Counter::get_value() {
  return values.combine(std::plus());
}
// Prints all registered counters to stdout, largest value first,
// one "name=value" per line.
void Counter::print() {
  sort(instances, [](Counter *a, Counter *b) {
    return a->get_value() > b->get_value();
  });

  for (Counter *c : instances)
    std::cout << std::setw(20) << std::right << c->name
              << "=" << c->get_value() << "\n";
}
// Returns a monotonic timestamp in nanoseconds.
static i64 now_nsec() {
#ifdef _WIN32
  return (i64)std::chrono::steady_clock::now().time_since_epoch().count();
#else
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return (i64)ts.tv_sec * 1'000'000'000 + ts.tv_nsec;
#endif
}
// Returns the CPU time this process has consumed so far, in
// nanoseconds, as a {user, system} pair.
static std::pair<i64, i64> get_usage() {
#ifdef _WIN32
  auto to_nsec = [](FILETIME t) -> i64 {
    // FILETIME is a 64-bit count of 100ns ticks split into two 32-bit
    // halves. Note the parentheses: `+`/`|` bind tighter than `<<`, so
    // the original `high << 32 + low` actually computed
    // `high << (32 + low)`.
    return (i64)((((u64)t.dwHighDateTime << 32) | (u64)t.dwLowDateTime) * 100);
  };

  FILETIME creation, exit, kernel, user;
  GetProcessTimes(GetCurrentProcess(), &creation, &exit, &kernel, &user);
  return {to_nsec(user), to_nsec(kernel)};
#else
  auto to_nsec = [](struct timeval t) -> i64 {
    return (i64)t.tv_sec * 1'000'000'000 + t.tv_usec * 1'000;
  };

  struct rusage ru;
  getrusage(RUSAGE_SELF, &ru);
  return {to_nsec(ru.ru_utime), to_nsec(ru.ru_stime)};
#endif
}
// Starts a timing span: samples the wall clock and CPU usage now
// (stop() later converts them into elapsed amounts) and registers
// this record as a child of `parent`, if any, for the report tree.
TimerRecord::TimerRecord(std::string name, TimerRecord *parent)
    : name(name), parent(parent) {
  start = now_nsec();
  std::tie(user, sys) = get_usage();
  if (parent)
    parent->children.push_back(this);
}
// Finalizes the record: converts the absolute start-time samples taken
// in the constructor into elapsed user/sys durations and records the
// end timestamp. Idempotent — only the first call has an effect.
void TimerRecord::stop() {
  if (stopped)
    return;
  stopped = true;

  auto [cur_user, cur_sys] = get_usage();
  end = now_nsec();
  user = cur_user - user;
  sys = cur_sys - sys;
}
// Recursively prints one timer record and its children, indenting two
// spaces per nesting level. Columns are user, sys and wall-clock time
// in seconds.
static void print_rec(TimerRecord &rec, i64 indent) {
  printf(" % 8.3f % 8.3f % 8.3f %s%s\n",
         ((double)rec.user / 1'000'000'000),
         ((double)rec.sys / 1'000'000'000),
         (((double)rec.end - rec.start) / 1'000'000'000),
         std::string(indent * 2, ' ').c_str(),
         rec.name.c_str());

  // Show children in the order they started.
  sort(rec.children, [](TimerRecord *a, TimerRecord *b) {
    return a->start < b->start;
  });

  for (TimerRecord *child : rec.children)
    print_rec(*child, indent + 1);
}
// Prints a hierarchical timing report for all collected records.
void print_timer_records(
    tbb::concurrent_vector<std::unique_ptr<TimerRecord>> &records) {
  // Finalize every record, newest first.
  for (i64 i = records.size() - 1; i >= 0; i--)
    records[i]->stop();

  // Adopt each record that has no parent into the most recently
  // started earlier record whose [start, end] interval fully contains
  // it, so the report forms a proper tree.
  for (i64 i = 0; i < records.size(); i++) {
    TimerRecord &inner = *records[i];
    if (inner.parent)
      continue;
    for (i64 j = i - 1; j >= 0; j--) {
      TimerRecord &outer = *records[j];
      if (outer.start <= inner.start && inner.end <= outer.end) {
        inner.parent = &outer;
        outer.children.push_back(&inner);
        break;
      }
    }
  }

  std::cout << " User System Real Name\n";
  // Print only roots; print_rec recurses into children.
  for (std::unique_ptr<TimerRecord> &rec : records)
    if (!rec->parent)
      print_rec(*rec, 0);
  std::cout << std::flush;
}
} // namespace mold

82
third_party/mold/sha.h vendored Normal file
View file

@ -0,0 +1,82 @@
// clang-format off
#pragma once
#include "third_party/libcxx/cstdint"
// Byte type used by the hashing helpers below.
typedef uint8_t u8;
// Size of a SHA-256 digest in bytes.
static constexpr int64_t SHA256_SIZE = 32;
#ifdef _WIN32
// On Windows, we use Microsoft CNG.
// MISSING #include <Windows.h>
// MISSING #include <bcrypt.h>
// MISSING #include <ntstatus.h>
// Returns a process-wide CNG algorithm provider handle for SHA-256,
// opened lazily and exactly once (thread-safe via std::call_once).
inline static BCRYPT_ALG_HANDLE get_sha256_handle() {
  static std::once_flag once;
  static BCRYPT_ALG_HANDLE alg;
  std::call_once(once, [&] {
    BCryptOpenAlgorithmProvider(&alg, BCRYPT_SHA256_ALGORITHM, nullptr, 0);
  });
  return alg;
}
// One-shot SHA-256: hashes `len` bytes at `in` into the 32-byte
// buffer at `out` using the shared CNG provider.
inline void sha256_hash(u8 *in, size_t len, u8 *out) {
  BCryptHash(get_sha256_handle(), nullptr, 0, in, len, out, SHA256_SIZE);
}
class SHA256Hash {
public:
SHA256Hash() {
BCryptCreateHash(get_sha256_handle(), &handle, nullptr, 0, nullptr, 0, 0);
}
void update(u8 *data, size_t len) {
BCryptHashData(handle, data, len, 0);
}
void finish(u8 *out) {
BCryptFinishHash(handle, out, SHA256_SIZE, 0);
}
private:
BCRYPT_HASH_HANDLE handle;
};
#else
// On Unix, we use OpenSSL or the Apple's OpenSSL-compatible API.
#ifdef __APPLE__
# define COMMON_DIGEST_FOR_OPENSSL
// MISSING #include <CommonCrypto/CommonDigest.h>
# define SHA256(data, len, md) CC_SHA256(data, len, md)
#else
# define OPENSSL_SUPPRESS_DEPRECATED 1
// MISSING #include <openssl/sha.h>
#endif
// One-shot SHA-256: hashes `len` bytes at `in` into the 32-byte
// buffer at `out` (SHA256 is OpenSSL's, or CC_SHA256 on Apple).
inline void sha256_hash(u8 *in, size_t len, u8 *out) {
  SHA256(in, len, out);
}
// Incremental SHA-256 hasher on top of the OpenSSL SHA256_* API
// (or Apple's CommonCrypto compatibility macros).
class SHA256Hash {
public:
  SHA256Hash() {
    SHA256_Init(&ctx);
  }

  // Feeds `len` more bytes into the running hash.
  void update(u8 *data, size_t len) {
    SHA256_Update(&ctx, data, len);
  }

  // Writes the final 32-byte digest to `out`.
  void finish(u8 *out) {
    SHA256_Final(out, &ctx);
  }

private:
  SHA256_CTX ctx;
};
#endif

113
third_party/mold/tar.cc vendored Normal file
View file

@ -0,0 +1,113 @@
// clang-format off
#include "third_party/mold/common.h"
namespace mold {
// A tar file consists of one or more Ustar header followed by data.
// Each Ustar header represents a single file in an archive.
//
// tar is an old file format, and its `name` field is only 100 bytes long.
// If `name` is longer than 100 bytes, we can emit a PAX header before a
// Ustar header to store a long filename.
//
// For simplicity, we always emit a PAX header even for a short filename.
// One 512-byte Ustar header block. The field layout and sizes are
// fixed by the tar format; numeric fields hold octal ASCII.
struct UstarHeader {
  UstarHeader() {
    // The struct is exactly 512 bytes of char arrays (no padding),
    // so byte-wise zeroing is well-defined.
    memset(this, 0, sizeof(*this));
  }

  void finalize() {
    // Per the format, the checksum is computed with the checksum field
    // itself filled with spaces.
    memset(checksum, ' ', sizeof(checksum));
    memcpy(magic, "ustar", 5);
    memcpy(version, "00", 2);

    // Compute checksum
    int sum = 0;
    for (i64 i = 0; i < sizeof(*this); i++)
      sum += ((u8 *)this)[i];

    // We need to convince the compiler that sum isn't too big to silence
    // -Werror=format-truncation.
    ASSUME(sum < 01'000'000);
    snprintf(checksum, sizeof(checksum), "%06o", sum);
  }

  char name[100];
  char mode[8];
  char uid[8];
  char gid[8];
  char size[12];
  char mtime[12];
  char checksum[8];
  char typeflag[1];
  char linkname[100];
  char magic[6];
  char version[2];
  char uname[32];
  char gname[32];
  char devmajor[8];
  char devminor[8];
  char prefix[155];
  char pad[12];
};
// Builds a PAX extended-header record "<len> path=<cleaned-path>\n"
// where <len> is the decimal length of the entire record, including
// the digits of <len> itself.
static std::string encode_path(std::string basedir, std::string path) {
  path = path_clean(basedir + "/" + path);

  // Construct a string which contains something like
  // "16 path=foo/bar\n" where 16 is the size of the string
  // including the size string itself.
  i64 len = std::string(" path=\n").size() + path.size();
  i64 total = std::to_string(len).size() + len;
  // Recompute once more in case adding the length digits changed the
  // number of digits (e.g. 98 -> 100).
  total = std::to_string(total).size() + len;
  return std::to_string(total) + " path=" + path + "\n";
}
// Creates a TarWriter that appends entries (with `basedir` prepended
// to their paths) to a fresh archive at `output_path`. Returns null
// if the file cannot be created.
std::unique_ptr<TarWriter>
TarWriter::open(std::string output_path, std::string basedir) {
  if (FILE *out = fopen(output_path.c_str(), "w"))
    return std::unique_ptr<TarWriter>(new TarWriter(out, basedir));
  return nullptr;
}
// Closes the archive stream, flushing buffered data.
TarWriter::~TarWriter() {
  fclose(out);
}
// Appends one file (PAX header + pathname + Ustar header + contents)
// to the archive, keeping everything aligned to BLOCK_SIZE and keeping
// a valid two-block terminator at the end of the file.
void TarWriter::append(std::string path, std::string_view data) {
  // Write PAX header
  static_assert(sizeof(UstarHeader) == BLOCK_SIZE);
  UstarHeader pax;
  std::string attr = encode_path(basedir, path);
  snprintf(pax.size, sizeof(pax.size), "%011zo", attr.size());
  pax.typeflag[0] = 'x';
  pax.finalize();
  fwrite(&pax, sizeof(pax), 1, out);

  // Write pathname, then pad up to the next block boundary.
  fwrite(attr.data(), attr.size(), 1, out);
  fseek(out, align_to(ftell(out), BLOCK_SIZE), SEEK_SET);

  // Write Ustar header
  UstarHeader ustar;
  memcpy(ustar.mode, "0000664", 8);
  snprintf(ustar.size, sizeof(ustar.size), "%011zo", data.size());
  ustar.finalize();
  fwrite(&ustar, sizeof(ustar), 1, out);

  // Write file contents, padded to the next block boundary.
  fwrite(data.data(), data.size(), 1, out);
  fseek(out, align_to(ftell(out), BLOCK_SIZE), SEEK_SET);

  // A tar file must ends with two empty blocks, so write such
  // terminator and seek back.
  u8 terminator[BLOCK_SIZE * 2] = {};
  fwrite(&terminator, BLOCK_SIZE * 2, 1, out);
  fseek(out, -BLOCK_SIZE * 2, SEEK_END);

  assert(ftell(out) % BLOCK_SIZE == 0);
}
} // namespace mold

View file

@ -0,0 +1,74 @@
# clang-format off
# NOTE: the original first line used a C++-style `//` comment, which is
# a parse error in CMake; CMake comments use `#`.
option(MOLD_ENABLE_QEMU_TESTS "Enable tests on non-native targets" OFF)
option(MOLD_ENABLE_QEMU_TESTS_RV32 "Enable tests for RV32" OFF)
option(MOLD_ENABLE_QEMU_TESTS_POWER10 "Enable tests for Power10" OFF)

# Registers the shell-script tests for one target triple. Tests run
# natively when the triple matches the host; otherwise only when the
# QEMU test options are enabled.
function(add_target TRIPLE)
  # Normalize the host processor name to match the triple prefixes.
  set(HOST ${CMAKE_HOST_SYSTEM_PROCESSOR})
  if(${HOST} MATCHES "amd64")
    set(HOST x86_64)
  elseif(${HOST} MATCHES "arm.*")
    set(HOST arm)
  elseif(${HOST} STREQUAL "ppc64")
    set(HOST powerpc64)
  endif()

  if(${TRIPLE} MATCHES "${HOST}-.*")
    set(IS_NATIVE 1)
  endif()

  # MACHINE is the architecture part of the triple.
  if(${TRIPLE} MATCHES "([^-]+)-.")
    set(MACHINE ${CMAKE_MATCH_1})
  endif()

  if(IS_NATIVE OR MOLD_ENABLE_QEMU_TESTS)
    # Generic tests are the *.sh files without an underscore;
    # machine-specific ones are named "<machine>_*.sh".
    file(GLOB ALL_TESTS RELATIVE ${CMAKE_CURRENT_LIST_DIR} CONFIGURE_DEPENDS
      "*.sh")
    list(FILTER ALL_TESTS EXCLUDE REGEX "_")

    file(GLOB TESTS RELATIVE ${CMAKE_CURRENT_LIST_DIR} CONFIGURE_DEPENDS
      "${MACHINE}_*.sh")
    list(APPEND TESTS ${ALL_TESTS})

    foreach(TEST IN LISTS TESTS)
      string(REGEX REPLACE "\\.sh$" "" TESTNAME "${MACHINE}-${TEST}")
      add_test(NAME ${TESTNAME}
        COMMAND bash -x ${CMAKE_CURRENT_LIST_DIR}/${TEST}
        WORKING_DIRECTORY ${mold_BINARY_DIR})

      if(IS_NATIVE)
        # Native tests may self-skip by printing "skipped".
        set_tests_properties(${TESTNAME} PROPERTIES
          SKIP_REGULAR_EXPRESSION "skipped")
      else()
        # Cross tests get the triple so common.inc can pick a QEMU.
        set_tests_properties(${TESTNAME} PROPERTIES
          ENVIRONMENT "TRIPLE=${TRIPLE}")
      endif()
    endforeach()
  endif()
endfunction()

add_target(x86_64-linux-gnu)
add_target(i686-linux-gnu)
add_target(aarch64-linux-gnu)
add_target(arm-linux-gnueabihf)
add_target(riscv64-linux-gnu)
add_target(powerpc-linux-gnu)
add_target(powerpc64-linux-gnu)
add_target(powerpc64le-linux-gnu)
add_target(sparc64-linux-gnu)
add_target(s390x-linux-gnu)
add_target(m68k-linux-gnu)
add_target(sh4-linux-gnu)
add_target(alpha-linux-gnu)

if(MOLD_ENABLE_QEMU_TESTS_RV32)
  add_target(riscv32-linux-gnu)
endif()

if(MOLD_ENABLE_QEMU_TESTS_POWER10)
  add_target(powerpc64le_power10-linux-gnu)
endif()

View file

@ -0,0 +1,30 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# AArch64 only: place two mutually-calling sections 256MB apart and
# check the disassembly contains a range-extension thunk for fn1.
[ $MACHINE = aarch64 ] || skip

cat <<EOF | $CC -c -o $t/a.o -fPIC -xc -
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
void fn1();
void fn2();
__attribute__((section(".low"))) void fn1() { fn2(); }
__attribute__((section(".high"))) void fn2() { fn1(); }
int main() {
  fn1();
}
EOF

$CC -B. -o $t/exe $t/a.o \
  -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000

$OBJDUMP -dr $t/exe | grep -Fq '<fn1$thunk>:'

29
third_party/mold/test/elf/abs-error.sh vendored Executable file
View file

@ -0,0 +1,29 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# Linking a non-PIC reference to an absolute symbol into a PIE with
# -z text must fail with a "recompile with -fPIC" diagnostic.
[ $MACHINE = aarch64 ] && skip
[ $MACHINE = ppc64 ] && skip
[ $MACHINE = ppc64le ] && skip
[ $MACHINE = s390x ] && skip
[ $MACHINE = alpha ] && skip

cat <<EOF | $CC -fPIC -c -o $t/a.o -xassembler -
.globl foo
foo = 3;
EOF

cat <<EOF | $CC -fno-PIC -c -o $t/b.o -xc -
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
extern char foo;
int main() { printf("foo=%p\n", &foo); }
EOF

! $CC -B. -o $t/exe -pie $t/a.o $t/b.o -Wl,-z,text >& $t/log
grep -q 'recompile with -fPIC' $t/log

67
third_party/mold/test/elf/absolute-symbols.sh vendored Executable file
View file

@ -0,0 +1,67 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# This test crashes only on qemu-sparc64 running on GitHub Actions,
# even though it works on a local x86-64 machine and on an actual
# SPARC machine.
[ $MACHINE = sparc64 ] && skip

# Writing through an absolute symbol placed in unmapped memory should
# fault at the absolute address; the SIGSEGV handler prints it.
cat <<EOF | $CC -o $t/a.o -c -x assembler -
.globl foo
foo = 0x800008
EOF

cat <<EOF | $CC -o $t/b.o -c -fno-PIC -xc -
#define _GNU_SOURCE 1
#include "libc/calls/calls.h"
#include "libc/calls/sigtimedwait.h"
#include "libc/calls/struct/sigaction.h"
#include "libc/calls/struct/siginfo.h"
#include "libc/sysv/consts/sa.h"
#include "libc/sysv/consts/sicode.h"
#include "libc/sysv/consts/ss.h"
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
#include "libc/calls/calls.h"
#include "libc/calls/termios.h"
#include "libc/fmt/conv.h"
#include "libc/limits.h"
#include "libc/mem/alg.h"
#include "libc/mem/alloca.h"
#include "libc/mem/mem.h"
#include "libc/runtime/runtime.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/rand.h"
#include "libc/stdio/temp.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/exit.h"
#include "third_party/getopt/getopt.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/rand48.h"
// MISSING #include <ucontext.h>

void handler(int signum, siginfo_t *info, void *ptr) {
  printf("ip=%p\n", info->si_addr);
  exit(0);
}

extern volatile int foo;

int main() {
  struct sigaction act;
  act.sa_flags = SA_SIGINFO | SA_RESETHAND;
  act.sa_sigaction = handler;
  sigemptyset(&act.sa_mask);
  sigaction(SIGSEGV, &act, 0);
  foo = 5;
}
EOF

$CC -B. -o $t/exe -no-pie $t/a.o $t/b.o
$QEMU $t/exe | grep -q '^ip=0x80000.$'

View file

@ -0,0 +1,10 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# Duplicate strong definitions must fail by default but be accepted
# with -allow-multiple-definition or -z muldefs.
echo 'int main() { return 0; }' | $CC -c -o $t/a.o -xc -
echo 'int main() { return 1; }' | $CC -c -o $t/b.o -xc -

! $CC -B. -o $t/exe $t/a.o $t/b.o 2> /dev/null || false
$CC -B. -o $t/exe $t/a.o $t/b.o -Wl,-allow-multiple-definition
$CC -B. -o $t/exe $t/a.o $t/b.o -Wl,-z,muldefs

35
third_party/mold/test/elf/ar-alignment.sh vendored Executable file
View file

@ -0,0 +1,35 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# Archive members whose size is odd (a.o gets one extra byte appended)
# must still link correctly from a .a file.
cat <<EOF | $CC -o $t/a.o -c -xc -
int two() { return 2; }
EOF

head -c 1 /dev/zero >> $t/a.o

cat <<EOF | $CC -o $t/b.o -c -xc -
int three() { return 3; }
EOF

cat <<EOF | $CC -o $t/c.o -c -xc -
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
int two();
int three();
int main() {
  printf("%d\n", two() + three());
}
EOF

rm -f $t/d.a
ar rcs $t/d.a $t/a.o $t/b.o
$CC -B. -o $t/exe $t/c.o $t/d.a

View file

@ -0,0 +1,34 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# ARM only: far-apart sections must get a range-extension thunk; check
# the thunk's instruction sequence in the disassembly.
[ $MACHINE = arm ] || skip

cat <<EOF | $CC -c -o $t/a.o -fPIC -xc -
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
void fn1();
void fn2();
__attribute__((section(".low"))) void fn1() { fn2(); }
__attribute__((section(".high"))) void fn2() { fn1(); }
int main() {
  fn1();
}
EOF

$CC -B. -o $t/exe $t/a.o \
  -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000

$OBJDUMP -dr $t/exe | grep -F -A7 '<fn1$thunk>:' > $t/log
grep -Eq 'mov\s+ip, pc' $t/log
grep -Eq 'bx\s+ip' $t/log
grep -Eq 'add\s+ip, ip, pc' $t/log

View file

@ -0,0 +1,60 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# ARM only: mixed Thumb (.low) and ARM (.high) code calling across a
# large address gap must run correctly at -O0 and -O2.
[ $MACHINE = arm ] || skip

echo 'int main() {}' | $CC -c -o /dev/null -xc - -O0 -mthumb >& /dev/null \
  || skip

cat <<EOF > $t/a.c
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
void fn3();
void fn4();
__attribute__((section(".low"))) void fn1() { printf(" fn1"); fn3(); }
__attribute__((section(".low"))) void fn2() { printf(" fn2"); fn4(); }
int main() {
  printf(" main");
  fn1();
  printf("\n");
}
EOF

cat <<EOF > $t/b.c
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
void fn1();
void fn2();
__attribute__((section(".high"))) void fn3() { printf(" fn3"); fn2(); }
__attribute__((section(".high"))) void fn4() { printf(" fn4"); }
EOF

$CC -c -o $t/c.o $t/a.c -O0 -mthumb
$CC -c -o $t/d.o $t/b.c -O0 -marm
$CC -B. -o $t/exe $t/c.o $t/d.o \
  -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000
$QEMU $t/exe | grep -q 'main fn1 fn3 fn2 fn4'

$CC -c -o $t/e.o $t/a.c -O2 -mthumb
$CC -c -o $t/f.o $t/b.c -O2 -marm
$CC -B. -o $t/exe $t/e.o $t/f.o \
  -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000
$QEMU $t/exe | grep -q 'main fn1 fn3 fn2 fn4'

View file

@ -0,0 +1,45 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# ARM only: a Thumb object calling into an ARM object (and back) must
# interwork correctly.
[[ $MACHINE == arm* ]] || skip

echo 'int foo() { return 0; }' | $CC -o /dev/null -c -xc - -mthumb 2> /dev/null || skip

cat <<EOF | $CC -o $t/a.o -c -xc - -mthumb
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
int bar();
int foo() {
  printf(" foo");
  bar();
}
EOF

cat <<EOF | $CC -o $t/b.o -c -xc - -marm
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
int bar() {
  printf(" bar\n");
}
int foo();
int main() {
  printf("main");
  foo();
}
EOF

$CC -B. -o $t/exe $t/a.o $t/b.o
$QEMU $t/exe | grep -q 'main foo bar'

72
third_party/mold/test/elf/arm_tlsdesc.sh vendored Executable file
View file

@ -0,0 +1,72 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# ARM only: TLS descriptor (-mtls-dialect=gnu2) access must work in
# ARM and Thumb mode, with and without linker relaxation, and with the
# sections forced far apart.
[ $MACHINE = arm ] || skip

echo 'int main() {}' | $GCC -c -o /dev/null -xc - -O0 -mthumb >& /dev/null \
  || skip

cat <<EOF > $t/a.c
extern _Thread_local int foo;

__attribute__((section(".low")))
int get_foo() {
  int y = foo;
  return y;
}

static _Thread_local int bar = 5;

__attribute__((section(".high")))
int get_bar() {
  return bar;
}
EOF

cat <<EOF > $t/b.c
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
_Thread_local int foo;
int get_foo();
int get_bar();
int main() {
  foo = 42;
  printf("%d %d\n", get_foo(), get_bar());
  return 0;
}
EOF

$GCC -fPIC -mtls-dialect=gnu2 -c -o $t/c.o $t/a.c -marm
$GCC -fPIC -mtls-dialect=gnu2 -c -o $t/d.o $t/b.c -marm
$CC -B. -o $t/exe1 $t/c.o $t/d.o
$QEMU $t/exe1 | grep -q '42 5'

$CC -B. -o $t/exe2 $t/c.o $t/d.o -Wl,-no-relax
$QEMU $t/exe2 | grep -q '42 5'

$CC -B. -o $t/exe3 $t/c.o $t/d.o -Wl,-no-relax \
  -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000
$QEMU $t/exe3 | grep -q '42 5'

$GCC -fPIC -mtls-dialect=gnu2 -c -o $t/e.o $t/a.c -mthumb
$GCC -fPIC -mtls-dialect=gnu2 -c -o $t/f.o $t/b.c -mthumb
$CC -B. -o $t/exe4 $t/e.o $t/f.o
$QEMU $t/exe4 | grep -q '42 5'

$CC -B. -o $t/exe5 $t/e.o $t/f.o -Wl,-no-relax
$QEMU $t/exe5 | grep -q '42 5'

$CC -B. -o $t/exe6 $t/e.o $t/f.o -Wl,-no-relax \
  -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000
$QEMU $t/exe6 | grep -q '42 5'

25
third_party/mold/test/elf/as-needed-dso.sh vendored Executable file
View file

@ -0,0 +1,25 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# --as-needed must drop libfoo when only libbar is referenced, and keep
# it when listed explicitly on the command line.
cat <<EOF | $CC -o $t/libfoo.so -shared -fPIC -Wl,-soname,libfoo.so -xc -
int fn1() { return 42; }
EOF

cat <<EOF | $CC -o $t/libbar.so -shared -fPIC -Wl,-soname,libbar.so -xc -
int fn1();
int fn2() { return fn1(); }
EOF

cat <<EOF | $CC -o $t/a.o -c -xc -
int fn2();
int main() { fn2(); }
EOF

$CC -B. -o $t/exe1 $t/a.o -L$t -Wl,--as-needed -lbar -Wl,--allow-shlib-undefined
readelf -W --dynamic $t/exe1 > $t/log1
! grep -q libfoo $t/log1 || false

$CC -B. -o $t/exe2 $t/a.o -L$t -Wl,--as-needed -lbar -lfoo
readelf -W --dynamic $t/exe2 > $t/log2
grep -q libfoo $t/log2

32
third_party/mold/test/elf/as-needed-weak.sh vendored Executable file
View file

@ -0,0 +1,32 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# A weakly-referenced DSO should be listed as DT_NEEDED without
# -as-needed but dropped with it.
cat <<EOF | $CC -fPIC -o $t/a.o -c -xc -
__attribute__((weak)) int fn1();
int main() {
  if (fn1)
    fn1();
}
EOF

cat <<EOF | $CC -o $t/libfoo.so -shared -fPIC -Wl,-soname,libfoo.so -xc -
int fn1() { return 42; }
EOF

cat <<EOF | $CC -o $t/libbar.so -shared -fPIC -Wl,-soname,libbar.so -xc -
int fn2() { return 42; }
EOF

$CC -o $t/exe1 $t/a.o -Wl,-no-as-needed -L$t -lbar -lfoo
readelf --dynamic $t/exe1 > $t/log1
grep -Fq 'Shared library: [libfoo.so]' $t/log1
grep -Fq 'Shared library: [libbar.so]' $t/log1

$CC -o $t/exe2 $t/a.o -Wl,-as-needed -L$t -lbar -lfoo
readelf --dynamic $t/exe2 > $t/log2
! grep -Fq 'Shared library: [libfoo.so]' $t/log2 || false
! grep -Fq 'Shared library: [libbar.so]' $t/log2 || false

30
third_party/mold/test/elf/as-needed.sh vendored Executable file
View file

@ -0,0 +1,30 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# --no-as-needed records both DSOs as DT_NEEDED; --as-needed keeps only
# the one actually referenced (libfoo via fn1).
cat <<EOF | $CC -o $t/a.o -c -xc -
void fn1();
int main() {
  fn1();
}
EOF

cat <<EOF | $CC -o $t/b.so -shared -fPIC -Wl,-soname,libfoo.so -xc -
int fn1() { return 42; }
EOF

cat <<EOF | $CC -o $t/c.so -shared -fPIC -Wl,-soname,libbar.so -xc -
int fn2() { return 42; }
EOF

$CC -B. -o $t/exe $t/a.o -Wl,--no-as-needed $t/b.so $t/c.so
readelf --dynamic $t/exe > $t/readelf
grep -Fq 'Shared library: [libfoo.so]' $t/readelf
grep -Fq 'Shared library: [libbar.so]' $t/readelf

$CC -B. -o $t/exe $t/a.o -Wl,--as-needed $t/b.so $t/c.so
readelf --dynamic $t/exe > $t/readelf
grep -Fq 'Shared library: [libfoo.so]' $t/readelf
! grep -Fq 'Shared library: [libbar.so]' $t/readelf || false

39
third_party/mold/test/elf/as-needed2.sh vendored Executable file
View file

@ -0,0 +1,39 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# --as-needed with transitive DSO dependencies: libbaz (needed) pulls
# libfoo (needed transitively) while libbar (unreferenced) is dropped.
cat <<EOF | $CC -shared -fPIC -o $t/libfoo.so -Wl,--soname,libfoo.so -xc -
int foo() { return 3; }
EOF

cat <<EOF | $CC -shared -fPIC -o $t/libbar.so -Wl,--soname,libbar.so -xc -
int bar() { return 3; }
EOF

cat <<EOF | $CC -fPIC -c -o $t/a.o -xc -
int foo();
int baz() { return foo(); }
EOF

$CC -B. -shared -o $t/libbaz.so -Wl,--soname,libbaz.so -L$t $t/a.o -lfoo

cat <<EOF | $CC -c -o $t/b.o -xc -
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
int baz();
int main() {
  printf("%d\n", baz());
}
EOF

$CC -B. -o $t/exe $t/b.o -L$t -Wl,--as-needed -lbaz -lbar -lfoo
readelf --dynamic $t/exe > $t/log
grep -q libbaz $t/log || false
! grep -q libbar $t/log || false
grep -q libfoo $t/log || false

16
third_party/mold/test/elf/auxiliary.sh vendored Executable file
View file

@ -0,0 +1,16 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# -auxiliary and its alias -f should emit DT_AUXILIARY entries in a
# shared object.
cat <<EOF | $CC -o $t/a.o -c -x assembler -
.text
.globl _start
_start:
  nop
EOF

./mold -o $t/b.so $t/a.o -auxiliary foo -f bar -shared

readelf --dynamic $t/b.so > $t/log
grep -Fq 'Auxiliary library: [foo]' $t/log
grep -Fq 'Auxiliary library: [bar]' $t/log

43
third_party/mold/test/elf/bno-symbolic.sh vendored Executable file
View file

@ -0,0 +1,43 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# -Bno-symbolic after -Bsymbolic should cancel it: the executable's
# definitions interpose the DSO's.
# GCC produces buggy code for this test case on s390x.
# https://sourceware.org/bugzilla/show_bug.cgi?id=29655
[ $MACHINE = s390x ] && $CC -v 2>&1 | grep -E '^gcc version 1[0-3]\.' && skip

cat <<EOF | $CC -c -fPIC -o$t/a.o -xc -
int foo = 4;

int get_foo() {
  return foo;
}

void *bar() {
  return bar;
}
EOF

$CC -B. -shared -fPIC -o $t/b.so $t/a.o -Wl,-Bsymbolic -Wl,-Bno-symbolic

cat <<EOF | $CC -c -o $t/c.o -xc - -fno-PIE
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
extern int foo;
int get_foo();
void *bar();

int main() {
  foo = 3;
  printf("%d %d %d\n", foo, get_foo(), bar == bar());
}
EOF

$CC -B. -no-pie -o $t/exe $t/c.o $t/b.so
$QEMU $t/exe | grep -q '3 3 1'

View file

@ -0,0 +1,34 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# -Bsymbolic-functions binds function symbols inside the DSO but still
# lets data symbols (foo) be interposed by the executable.
cat <<EOF | $CC -c -o $t/a.o -fPIC -xc -
int foo = 4;
int get_foo() { return foo; }
void *bar() { return bar; }
EOF

$CC -B. -shared -o $t/b.so $t/a.o -Wl,-Bsymbolic-functions

cat <<EOF | $CC -c -o $t/c.o -xc - -fno-PIE
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
int foo = 3;
int x = 5;
int get_foo();
void *bar() { return &x; }

int main() {
  printf("%d %d %d\n", foo, get_foo(), bar == bar());
}
EOF

$CC -B. -no-pie -o $t/exe $t/c.o $t/b.so
$QEMU $t/exe | grep -q '3 3 0'

30
third_party/mold/test/elf/bsymbolic.sh vendored Executable file
View file

@ -0,0 +1,30 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# -Bsymbolic binds all symbols inside the DSO: the executable's `foo`
# does not interpose the library's copy seen by get_foo().
cat <<EOF | $CC -c -fPIC -o$t/a.o -xc -
int foo = 4;
int get_foo() { return foo; }
EOF

$CC -B. -shared -fPIC -o $t/b.so $t/a.o -Wl,-Bsymbolic

cat <<EOF | $CC -c -o $t/c.o -xc - -fno-PIE
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
int foo = 3;
int get_foo();
int main() {
  printf("%d %d\n", foo, get_foo());
}
EOF

$CC -B. -no-pie -o $t/exe $t/c.o $t/b.so
$QEMU $t/exe | grep -q '3 4'

17
third_party/mold/test/elf/bug178.sh vendored Executable file
View file

@ -0,0 +1,17 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# Verify that mold does not crash if no object file is included
# in the output. The resulting executable doesn't contain any
# meaningful code or data, so this is an edge case, though.
cat <<EOF | $CC -x assembler -c -o $t/a.o -
.globl foo
foo:
EOF

rm -f $t/a.a
ar rcs $t/a.a $t/a.o

./mold -o $t/exe $t/a.a

24
third_party/mold/test/elf/build-id.sh vendored Executable file
View file

@ -0,0 +1,24 @@
# clang-format off
#!/bin/bash
. $(dirname $0)/common.inc

# Exercise every -build-id flavor and check the note sizes/contents
# reported by readelf.
echo 'int main() { return 0; }' > $t/a.c

$CC -B. -o $t/exe $t/a.c -Wl,-build-id
readelf -n $t/exe | grep -qv 'GNU.*0x00000010.*NT_GNU_BUILD_ID'

$CC -B. -o $t/exe $t/a.c -Wl,-build-id=uuid
readelf -nW $t/exe |
  grep -Eq 'GNU.*0x00000010.*NT_GNU_BUILD_ID.*Build ID: ............4...[89abcdef]'

$CC -B. -o $t/exe $t/a.c -Wl,-build-id=md5
readelf -n $t/exe | grep -q 'GNU.*0x00000010.*NT_GNU_BUILD_ID'

$CC -B. -o $t/exe $t/a.c -Wl,-build-id=sha1
readelf -n $t/exe | grep -q 'GNU.*0x00000014.*NT_GNU_BUILD_ID'

$CC -B. -o $t/exe $t/a.c -Wl,-build-id=sha256
readelf -n $t/exe | grep -q 'GNU.*0x00000020.*NT_GNU_BUILD_ID'

$CC -B. -o $t/exe $t/a.c -Wl,-build-id=0xdeadbeefdeadbeef
readelf -n $t/exe | grep -q 'Build ID: deadbeefdeadbeef'

46
third_party/mold/test/elf/canonical-plt.sh vendored Executable file
View file

@@ -0,0 +1,46 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
# GCC produces buggy code for this test case on s390x.
# https://sourceware.org/bugzilla/show_bug.cgi?id=29655
[ $MACHINE = s390x ] && $CC -v 2>&1 | grep -E '^gcc version 1[0-3]\.' && skip
# DSO whose functions return their own addresses — used below to check that
# function-pointer equality holds across the executable/DSO boundary.
cat <<EOF | $CC -o $t/a.so -fPIC -shared -xc -
void *foo() {
return foo;
}
void *bar() {
return bar;
}
EOF
# PIC object that also takes bar's address.
cat <<EOF | $CC -o $t/b.o -c -xc - -fPIC
void *bar();
void *baz() {
return bar;
}
EOF
# Non-PIC main object: taking the address of a DSO function from non-PIC
# code makes the linker use a canonical PLT entry as the symbol's address.
cat <<EOF | $CC -o $t/c.o -c -xc - -fno-PIC
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
void *foo();
void *bar();
void *baz();
int main() {
printf("%d %d %d\n", foo == foo(), bar == bar(), bar == baz());
}
EOF
$CC -B. -no-pie -o $t/exe $t/a.so $t/b.o $t/c.o
# All three comparisons must hold: every reference to foo/bar — from the DSO,
# the PIC object, and the non-PIC main — resolves to one canonical address.
$QEMU $t/exe | grep -q '^1 1 1$'

8
third_party/mold/test/elf/cmdline.sh vendored Executable file
View file

@@ -0,0 +1,8 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
# Unknown command-line options must be diagnosed with a clear message, in
# both joined and separate -z forms and as single/double-dash long options.
# `|| true` keeps the expected non-zero exit from tripping the ERR trap.
{ ./mold -zfoo || true; } 2>&1 | grep -q 'unknown command line option: -zfoo'
{ ./mold -z foo || true; } 2>&1 | grep -q 'unknown command line option: -z foo'
{ ./mold -abcdefg || true; } 2>&1 | grep -q 'unknown command line option: -abcdefg'
{ ./mold --abcdefg || true; } 2>&1 | grep -q 'unknown command line option: --abcdefg'

View file

@@ -0,0 +1,20 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
# Object with an unresolved reference so every link below fails and mold
# emits a diagnostic; we then check for ANSI escape bytes (\033) in stderr
# depending on the --color-diagnostics mode. stderr is a file, not a tty.
cat <<EOF | $CC -o $t/a.o -c -xc -
int foo();
int main() { foo(); }
EOF
# Bare --color-diagnostics with non-tty stderr: no escape sequences expected.
! ./mold -o $t/exe $t/a.o --color-diagnostics 2> $t/log
! grep -q $'\033' $t/log || false
# =always colors even when stderr is not a terminal.
! ./mold -o $t/exe $t/a.o --color-diagnostics=always 2> $t/log
grep -q $'\033' $t/log
# =never must not color.
! ./mold -o $t/exe $t/a.o --color-diagnostics=never 2> $t/log
! grep -q $'\033' $t/log || false
# =auto with non-tty stderr must not color.
! ./mold -o $t/exe $t/a.o --color-diagnostics=auto 2> $t/log
! grep -q $'\033' $t/log || false

11
third_party/mold/test/elf/comment.sh vendored Executable file
View file

@@ -0,0 +1,11 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
cat <<EOF | $CC -c -o $t/a.o -xc -
int main() {}
EOF
$CC -B. -o $t/exe $t/a.o
# The linker records its own identification string in .comment; the pattern
# accepts either "mold" or "sold".
readelf -p .comment $t/exe | grep -q '[ms]old'
# .comment must carry the mergeable-strings section flags (readelf prints
# SHF_MERGE|SHF_STRINGS as "MS").
readelf -SW $t/exe | grep -Eq '\.comment.*\bMS\b'

53
third_party/mold/test/elf/common-archive.sh vendored Executable file
View file

@@ -0,0 +1,53 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
# Resolution of common symbols (-fcommon tentative definitions) against
# archive members.
# a.o: tentative foo and bar, a real undefined reference to baz, and a weak
# undefined two() (printed as -1 when no definition is linked in).
cat <<EOF | $CC -fcommon -xc -c -o $t/a.o -
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
int foo;
int bar;
extern int baz;
__attribute__((weak)) int two();
int main() {
printf("%d %d %d %d\n", foo, bar, baz, two ? two() : -1);
}
EOF
cat <<EOF | $CC -fcommon -xc -c -o $t/b.o -
int foo = 5;
EOF
cat <<EOF | $CC -fcommon -xc -c -o $t/c.o -
int bar;
int two() { return 2; }
EOF
cat <<EOF | $CC -fcommon -xc -c -o $t/d.o -
int baz;
EOF
rm -f $t/e.a
ar rcs $t/e.a $t/b.o $t/c.o $t/d.o
$CC -B. -o $t/exe $t/a.o $t/e.a
# Expected "5 0 0 -1": b.o's strong foo=5 wins over the tentative foo; c.o is
# NOT extracted (its bar is only tentative and a.o already has one, hence
# two() stays undefined -> -1); baz resolves to 0 via d.o's tentative def.
$QEMU $t/exe | grep -q '5 0 0 -1'
# f.o provides strong definitions of bar/baz and a strong two().
cat <<EOF | $CC -fcommon -xc -c -o $t/f.o -
int bar = 0;
int baz = 7;
int two() { return 2; }
EOF
rm -f $t/f.a
ar rcs $t/f.a $t/b.o $t/f.o
$CC -B. -o $t/exe $t/a.o $t/f.a
# Expected "5 0 7 2": f.o is extracted for its strong definitions, so baz=7
# and the weak two() now resolves to the real two() returning 2.
$QEMU $t/exe | grep -q '5 0 7 2'

38
third_party/mold/test/elf/common-ref.sh vendored Executable file
View file

@@ -0,0 +1,38 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
# a.o has only a tentative (-fcommon) definition of bar and prints it.
cat <<EOF | $CC -fcommon -xc -c -o $t/a.o -
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
int bar;
int main() {
printf("%d\n", bar);
}
EOF
# c.a holds only another tentative symbol (foo) that nothing references.
cat <<EOF | $CC -fcommon -xc -c -o $t/b.o -
int foo;
EOF
rm -f $t/c.a
ar rcs $t/c.a $t/b.o
# e.a's member carries a strong bar = 5 alongside its own tentative foo.
cat <<EOF | $CC -fcommon -xc -c -o $t/d.o -
int foo;
int bar = 5;
int get_foo() { return foo; }
EOF
rm -f $t/e.a
ar rcs $t/e.a $t/d.o
$CC -B. -o $t/exe $t/a.o $t/c.a $t/e.a
# The strong bar=5 in e.a must override a.o's tentative bar, i.e. the common
# symbol reference pulls in d.o; output must be 5, not 0.
$QEMU $t/exe | grep -q 5

92
third_party/mold/test/elf/common.inc vendored Normal file
View file

@@ -0,0 +1,92 @@
// clang-format off
# -*- mode: sh -*-
# Shared setup sourced by every ELF test script: selects host or cross
# toolchains, defines helper predicates, and installs ERR/EXIT traps so each
# test prints "OK", "skipped", or the failing command.
# Make sure all commands print out messages in English
export LC_ALL=C
# Map `uname -m`-style machine names onto the canonical names used for test
# directories and qemu binaries.
canonical_name() {
case $1 in
i?86) echo i386 ;;
arm*) echo arm ;;
powerpc) echo ppc ;;
powerpc64) echo ppc64 ;;
powerpc64le) echo ppc64le ;;
*) echo $1
esac
}
HOST=$(canonical_name $(uname -m))
# Set tool names
# No $TRIPLE: native run with the host toolchain; QEMU stays empty so test
# binaries are executed directly.
if [ "$TRIPLE" = "" ]; then
MACHINE=$HOST
TESTDIR=out/test/elf/$HOST
CC="${TEST_CC:-cc}"
CXX="${TEST_CXX:-c++}"
GCC="${TEST_GCC:-gcc}"
GXX="${TEST_GXX:-g++}"
OBJDUMP=objdump
OBJCOPY=objcopy
STRIP=strip
QEMU=
# Special case: the POWER10 flavor reuses the plain ppc64le triple but adds
# -mcpu=power10 to the compilers and -cpu power10 to qemu.
elif [ "$TRIPLE" = powerpc64le_power10-linux-gnu ]; then
TRIPLE=powerpc64le-linux-gnu
MACHINE=ppc64le
TESTDIR=out/test/elf/ppc64le-power10
CC="${TEST_CC:-$TRIPLE-gcc} -mcpu=power10"
CXX="${TEST_CXX:-$TRIPLE-g++} -mcpu=power10"
GCC="${TEST_GCC:-$TRIPLE-gcc} -mcpu=power10"
GXX="${TEST_GXX:-$TRIPLE-g++} -mcpu=power10"
OBJDUMP="$TRIPLE-objdump"
OBJCOPY="$TRIPLE-objcopy"
STRIP="$TRIPLE-strip"
QEMU="qemu-ppc64le -L /usr/$TRIPLE -cpu power10"
# Any other triple: generic cross toolchain plus qemu user-mode emulation.
else
MACHINE=$(canonical_name $(echo $TRIPLE | sed 's/-.*//'))
TESTDIR=out/test/elf/$MACHINE
CC="${TEST_CC:-$TRIPLE-gcc}"
CXX="${TEST_CXX:-$TRIPLE-g++}"
GCC="${TEST_GCC:-$TRIPLE-gcc}"
GXX="${TEST_GXX:-$TRIPLE-g++}"
OBJDUMP="$TRIPLE-objdump"
OBJCOPY="$TRIPLE-objcopy"
STRIP="$TRIPLE-strip"
QEMU="qemu-$MACHINE -L /usr/$TRIPLE"
fi
# Common functions
# True if the compiler accepts the given flags for a trivial program.
test_cflags() {
echo 'int main() {}' | $CC "$@" -o /dev/null -xc - >& /dev/null
}
# True if the toolchain can compile a GNU ifunc symbol.
supports_ifunc() {
echo 'void x() __attribute__((ifunc("y"))); void *y() { return 0; }' | \
$CC -c -o /dev/null -xc - >& /dev/null
}
# Report the test as skipped and exit successfully; the EXIT trap is cleared
# so on_exit does not also print OK.
skip() {
echo skipped
trap - EXIT
exit 0
}
# ERR trap handler: report the failing line/command and propagate its status.
on_error() {
code=$?
echo "command failed: $1: $BASH_COMMAND"
trap - EXIT
exit $code
}
# EXIT trap handler: reached only when no command failed.
on_exit() {
echo OK
exit 0
}
trap 'on_error $LINENO' ERR
trap on_exit EXIT
# Print out the startup message
testname=$(basename "$0" .sh)
echo -n "Testing $testname ... "
# Per-test scratch directory.
t=$TESTDIR/$testname
mkdir -p $t

33
third_party/mold/test/elf/common.sh vendored Executable file
View file

@@ -0,0 +1,33 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
# Two objects with overlapping -fcommon symbols: tentative definitions must
# merge with each other and lose to strong (initialized) definitions.
cat <<EOF | $CC -fcommon -xc -c -o $t/a.o -
int foo;
int bar;
int baz = 42;
EOF
cat <<EOF | $CC -fcommon -xc -c -o $t/b.o -
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
int foo;
int bar = 5;
int baz;
int main() {
printf("%d %d %d\n", foo, bar, baz);
}
EOF
$CC -B. -o $t/exe $t/a.o $t/b.o
# foo: two tentative defs -> 0; bar: strong 5 wins; baz: strong 42 wins.
$QEMU $t/exe | grep -q '0 5 42'
readelf --sections $t/exe > $t/log
# Merged common symbols must land in a NOBITS .common section.
grep -q '.common .*NOBITS' $t/log

View file

@@ -0,0 +1,29 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
# arm-linux-gnueabihf-objcopy crashes on x86-64
[ $MACHINE = arm ] && skip
[ $MACHINE = riscv32 ] && skip
command -v zstdcat >& /dev/null || skip
cat <<EOF | $CC -c -g -o $t/a.o -xc -
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
int main() {
printf("Hello world\n");
return 0;
}
EOF
$CC -B. -o $t/exe $t/a.o -Wl,--compress-debug-sections=zstd
# Dump the compressed .debug_info section and verify the payload is valid
# zstd data.
$OBJCOPY --dump-section .debug_info=$t/debug_info $t/exe
# Skip the first 24 bytes — presumably the Elf64_Chdr compression header
# that precedes the compressed stream (TODO confirm for 32-bit targets).
dd if=$t/debug_info of=$t/debug_info.zstd bs=24 skip=1 status=none
zstdcat $t/debug_info.zstd > /dev/null

View file

@@ -0,0 +1,25 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
command -v dwarfdump >& /dev/null || skip
cat <<EOF | $CC -c -g -o $t/a.o -xc -
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
int main() {
printf("Hello world\n");
return 0;
}
EOF
$CC -B. -o $t/exe $t/a.o -Wl,--compress-debug-sections=zlib
# dwarfdump must be able to read the output, and must report the debug
# sections as SHF_COMPRESSED.
dwarfdump $t/exe > $t/log
grep -Fq '.debug_info SHF_COMPRESSED' $t/log
grep -Fq '.debug_str SHF_COMPRESSED' $t/log

View file

@@ -0,0 +1,21 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
command -v dwarfdump >& /dev/null || skip
# Inputs are compiled with -gz=zlib, i.e. their debug sections arrive
# compressed; the linker must decompress and merge them.
cat <<EOF | $CXX -c -o $t/a.o -g -gz=zlib -xc++ -
int main() {
return 0;
}
EOF
cat <<EOF | $CXX -c -o $t/b.o -g -gz=zlib -xc++ -
int foo() {
return 0;
}
EOF
$CC -B. -o $t/exe $t/a.o $t/b.o
# The merged output must be parseable by dwarfdump and still contain a
# .debug_info section.
dwarfdump $t/exe > /dev/null
readelf --sections $t/exe | grep -Fq .debug_info

View file

@@ -0,0 +1,43 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
[ $MACHINE = ppc64 ] && skip
[ $MACHINE = ppc64le ] && skip
[ $MACHINE = alpha ] && skip
# Three DSOs export the same variable with different alignments (32/8/256).
cat <<EOF | $CC -fPIC -shared -o $t/a.so -xc -
__attribute__((aligned(32))) int foo = 5;
EOF
cat <<EOF | $CC -fPIC -shared -o $t/b.so -xc -
__attribute__((aligned(8))) int foo = 5;
EOF
cat <<EOF | $CC -fPIC -shared -o $t/c.so -xc -
__attribute__((aligned(256))) int foo = 5;
EOF
# Non-PIE main object that takes foo's address, forcing a copy relocation.
cat <<EOF | $CC -fno-PIE -o $t/d.o -c -xc -
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
extern int foo;
int main() { printf("%d %p\n", foo, &foo); }
EOF
# For each DSO, the .copyrel section's alignment (last column of readelf -S)
# must match the alignment of the symbol being copied.
$CC -B. -o $t/exe1 $t/d.o $t/a.so -no-pie
$QEMU $t/exe1 > /dev/null
readelf -W --sections $t/exe1 | grep -q '\.copyrel.* 32$'
$CC -B. -o $t/exe2 $t/d.o $t/b.so -no-pie
$QEMU $t/exe2 > /dev/null
readelf -W --sections $t/exe2 | grep -q '\.copyrel.* 8$'
$CC -B. -o $t/exe3 $t/d.o $t/c.so -no-pie
$QEMU $t/exe3 > /dev/null
readelf -W --sections $t/exe3 | grep -q '\.copyrel.* 256$'

View file

@@ -0,0 +1,22 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
[ $MACHINE = ppc64 ] && skip
[ $MACHINE = ppc64le ] && skip
[ $MACHINE = alpha ] && skip
# Non-PIE code referencing a data symbol exported by a DSO needs a copy
# relocation in the executable.
cat <<EOF | $CC -o $t/a.o -c -xc -fno-PIE -
extern int foo;
int main() {
return foo;
}
EOF
cat <<EOF | $CC -shared -o $t/b.so -xc -
__attribute__((visibility("protected"))) int foo;
EOF
# A protected symbol must not be copy-relocated (the DSO assumes it always
# accesses its own copy), so the link must fail with this exact diagnostic.
! $CC -B. $t/a.o $t/b.so -o $t/exe >& $t/log -no-pie || false
grep -Fq 'cannot make copy relocation for protected symbol' $t/log

53
third_party/mold/test/elf/copyrel-relro.sh vendored Executable file
View file

@@ -0,0 +1,53 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
# A copy-relocated const object must end up in read-only memory at run time:
# storing to `readonly` must SIGSEGV while storing to `readwrite` must not.
# The program records which store faulted via a SIGSEGV handler + longjmp.
cat <<EOF | $CC -o $t/a.o -c -xc -fno-PIE -
#include "libc/runtime/runtime.h"
#include "libc/calls/calls.h"
#include "libc/calls/sigtimedwait.h"
#include "libc/calls/struct/sigaction.h"
#include "libc/calls/struct/siginfo.h"
#include "libc/sysv/consts/sa.h"
#include "libc/sysv/consts/sicode.h"
#include "libc/sysv/consts/ss.h"
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
extern const char readonly[100];
extern char readwrite[100];
static int segv = 0;
static jmp_buf buf;
void handler(int sig) {
segv = 1;
longjmp(buf, 1);
}
int main() {
signal(SIGSEGV, handler);
readwrite[0] = 5;
int x = segv;
if (setjmp(buf) == 0)
*(char *)readonly = 5;
int y = segv;
printf("sigsegv %d %d\n", x, y);
}
EOF
# The DSO provides one const and one mutable array; both get copy relocations
# in the non-PIE executable.
cat <<EOF | $CC -fPIC -shared -o $t/b.so -xc -
const char readonly[100] = "abc";
char readwrite[100] = "abc";
EOF
$CC -B. $t/a.o $t/b.so -o $t/exe -no-pie
# "sigsegv 0 1": the readwrite store succeeded, the readonly store faulted.
$QEMU $t/exe | grep -q '^sigsegv 0 1$'

36
third_party/mold/test/elf/copyrel.sh vendored Executable file
View file

@@ -0,0 +1,36 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
# foo/bar/baz are aliases of one variable in the DSO; copy relocations
# requested from two different non-PIC objects must all resolve to a single
# copy in the executable.
cat <<EOF | $CC -fno-PIC -o $t/a.o -c -xc -
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/stdio/dprintf.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
extern int foo;
extern int *get_bar();
int main() {
printf("%d %d %d\n", foo, *get_bar(), &foo == get_bar());
return 0;
}
EOF
cat <<EOF | $CC -fno-PIC -o $t/b.o -c -xc -
extern int bar;
int *get_bar() { return &bar; }
EOF
cat <<EOF | $CC -fPIC -o $t/c.o -c -xc -
int foo = 42;
extern int bar __attribute__((alias("foo")));
extern int baz __attribute__((alias("foo")));
EOF
$CC -B. -shared -o $t/c.so $t/c.o
$CC -B. -no-pie -o $t/exe $t/a.o $t/b.o $t/c.so
# "42 42 1": foo and bar read the same value AND share one address, i.e. the
# aliases were copy-relocated to the same location.
$QEMU $t/exe | grep -q '42 42 1'

View file

@@ -0,0 +1,32 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
command -v dwarfdump >& /dev/null || skip
# Two translation units carry identical debug info for the same inline
# constructor (struct Foo); the linked debug info must remain parseable.
cat <<EOF | $CXX -c -o $t/a.o -g -xc++ -
extern const char *msg;
struct Foo {
Foo() { msg = "Hello world"; }
};
Foo x;
EOF
cat <<EOF | $CXX -c -o $t/b.o -g -xc++ -
extern const char *msg;
struct Foo {
Foo() { msg = "Hello world"; }
};
Foo y;
EOF
cat <<EOF | $CXX -o $t/c.o -c -xc++ -g -
#include "third_party/libcxx/cstdio"
const char *msg;
int main() { printf("%s\n", msg); }
EOF
# NOTE(review): no -B. here, so this link uses the default system linker
# rather than ./mold — confirm whether -B. was intended.
$CXX -o $t/exe $t/a.o $t/b.o $t/c.o -g
$QEMU $t/exe | grep -q 'Hello world'
# dwarfdump must walk the merged debug info without errors.
dwarfdump $t/exe > /dev/null

View file

@@ -0,0 +1,23 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
# Check that -g3 macro debug info (.debug_macro) survives linking: when two
# TUs share a macro table for a.h, the DW_MACRO_import entries in the linked
# output must not end up pointing at offset 0x0.
cat <<EOF > $t/a.h
#define A 23
#define B 99
EOF
# Fix: the vendoring tool had replaced these includes with
# "// MISSING #include \"a.h\"" placeholders, leaving A and B undefined —
# the sources could not compile and the shared macro table this test
# exercises was never created. Restore the include; a.h is found via -I$t.
cat <<EOF | $GCC -o $t/b.o -c -xc - -I$t -g3
#include "a.h"
extern int z();
int main () { return z() - 122; }
EOF
cat <<EOF | $GCC -o $t/c.o -c -xc - -I$t -g3
#include "a.h"
int z() { return A + B; }
EOF
$GCC -B. -o $t/exe $t/b.o $t/c.o
$OBJDUMP --dwarf=macro $t/exe > $t/log
# No DW_MACRO_import entry may have been rewritten to offset 0x0.
! grep 'DW_MACRO_import -.* 0x0$' $t/log || false

14
third_party/mold/test/elf/default-symver.sh vendored Executable file
View file

@@ -0,0 +1,14 @@
// clang-format off
#!/bin/bash
. $(dirname $0)/common.inc
cat <<EOF | $CC -o $t/a.o -c -xc -
void foo() {}
EOF
# With -default-symver and no soname, dynamic symbols are versioned with the
# output file's own name.
$CC -B. -o $t/b.so -shared $t/a.o -Wl,-default-symver
readelf --dyn-syms $t/b.so | grep -q ' foo@@b\.so$'
# With an explicit --soname, that soname becomes the default version string.
$CC -B. -o $t/b.so -shared $t/a.o \
-Wl,--soname=bar -Wl,-default-symver
readelf --dyn-syms $t/b.so | grep -q ' foo@@bar$'

Some files were not shown because too many files have changed in this diff Show more