This commit is contained in:
Farid Zakaria 2023-07-08 20:57:13 +00:00
parent 6bc04598bf
commit 2b4d6124d9
20 changed files with 63 additions and 6666 deletions


@ -1,331 +0,0 @@
// clang-format off
// Alpha is a 64-bit RISC ISA developed by DEC (Digital Equipment
// Corporation) in the early '90s. It aimed to be an ISA that would last
// 25 years. DEC expected Alpha would become 1000x faster during that time
// span. Since the ISA was developed from scratch for future machines,
// it's 64-bit from the beginning. There's no 32-bit variant.
//
// DEC ported its own Unix (Tru64) to Alpha. Microsoft also ported Windows
// NT to it. But it wasn't a huge commercial success.
//
// DEC was acquired by Compaq in 1997. In the late '90s, Intel and
// Hewlett-Packard were advertising that their upcoming Itanium processor
// would achieve significantly better performance than RISC processors, so
// Compaq decided to discontinue the Alpha processor line to switch to
// Itanium. Itanium resulted in a miserable failure, but it still succeeded
// in wiping out several RISC processors just by promising overly
// optimistic performance numbers. Alpha as an ISA would probably have
// lasted the full 25 years after its introduction (1992 + 25 = 2017), but
// the company and its market didn't last that long.
//
// From the linker's point of view, there are a few peculiarities in its
// psABI as shown below:
//
// - Alpha lacks PC-relative memory load/store instructions, so it uses
// register-relative load/store instructions in position-independent
// code. Specifically, GP (which is an alias for $r29) is always
// maintained to refer to .got+0x8000, and global variables' addresses
// are loaded in a GP-relative manner.
//
// - Even function addresses appear to be first loaded into a register
// in a GP-relative manner before being called. We can relax such an
// instruction sequence into a direct branch instruction, but by
// default, object files don't use a direct branch to call a function.
// Therefore, by default, we don't need to create a PLT. Every function
// call is made by first reading the function's address from the GOT and
// then jumping to that address, as sketched below.
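//
// For example, a function call typically looks like this (an
// illustrative sketch, not code taken from a real object file):
//
//   ldq $27, func($gp)   !literal    # load func's address from the GOT
//   jsr $26, ($27)       !lituse_jsr # indirect call through $27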
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = ALPHA;
// A 32-bit immediate can be materialized in a register with a "load high"
// and a "load low" instruction sequence. The first instruction sets the
// upper 16 bits of a register, and the second one sets the lower 16
// bits. Both instructions sign-extend their immediates. Therefore, if the
// 15th bit of an immediate happens to be 1, adding the sign-extended
// "low half" value subtracts 0x10000 from the upper half that has already
// been set in the register. To compensate for that, we need to add 0x8000
// when computing the upper 16 bits.
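//
// For example, to materialize 0x18000: hi(0x18000) below returns
// bits(0x20000, 31, 16) = 2, and the low half is 0x8000, which
// sign-extends to -0x8000. The result is (2 << 16) - 0x8000 = 0x18000,
// as desired.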
static u32 hi(u32 val) {
return bits(val + 0x8000, 31, 16);
}
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_ALPHA_SREL32:
*(ul32 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
u64 GP = ctx.got->shdr.sh_addr + 0x8000;
switch (rel.r_type) {
case R_ALPHA_REFQUAD:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_ALPHA_GPREL32:
*(ul32 *)loc = S + A - GP;
break;
case R_ALPHA_LITERAL:
if (A)
*(ul16 *)loc = ctx.extra.got->get_addr(sym, A) - GP;
else
*(ul16 *)loc = GOT + G - GP;
break;
case R_ALPHA_BRSGP:
*(ul32 *)loc |= bits(S + A - P - 4, 22, 0);
break;
case R_ALPHA_GPDISP:
*(ul16 *)loc = hi(GP - P);
*(ul16 *)(loc + A) = GP - P;
break;
case R_ALPHA_SREL32:
*(ul32 *)loc = S + A - P;
break;
case R_ALPHA_GPRELHIGH:
*(ul16 *)loc = hi(S + A - GP);
break;
case R_ALPHA_GPRELLOW:
*(ul16 *)loc = S + A - GP;
break;
case R_ALPHA_TLSGD:
*(ul16 *)loc = sym.get_tlsgd_addr(ctx) - GP;
break;
case R_ALPHA_TLSLDM:
*(ul16 *)loc = ctx.got->get_tlsld_addr(ctx) - GP;
break;
case R_ALPHA_DTPRELHI:
*(ul16 *)loc = hi(S + A - ctx.dtp_addr);
break;
case R_ALPHA_DTPRELLO:
*(ul16 *)loc = S + A - ctx.dtp_addr;
break;
case R_ALPHA_GOTTPREL:
*(ul16 *)loc = sym.get_gottp_addr(ctx) + A - GP;
break;
case R_ALPHA_TPRELHI:
*(ul16 *)loc = hi(S + A - ctx.tp_addr);
break;
case R_ALPHA_TPRELLO:
*(ul16 *)loc = S + A - ctx.tp_addr;
break;
case R_ALPHA_LITUSE:
case R_ALPHA_HINT:
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_ALPHA_REFLONG:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A;
break;
case R_ALPHA_REFQUAD:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul64 *)loc = *val;
else
*(ul64 *)loc = S + A;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
Error(ctx) << sym << ": GNU ifunc symbol is not supported on Alpha";
switch (rel.r_type) {
case R_ALPHA_REFQUAD:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_ALPHA_LITERAL:
if (rel.r_addend)
ctx.extra.got->add_symbol(sym, rel.r_addend);
else
sym.flags |= NEEDS_GOT;
break;
case R_ALPHA_SREL32:
scan_pcrel(ctx, sym, rel);
break;
case R_ALPHA_BRSGP:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_ALPHA_TLSGD:
sym.flags |= NEEDS_TLSGD;
break;
case R_ALPHA_TLSLDM:
ctx.needs_tlsld = true;
break;
case R_ALPHA_GOTTPREL:
sym.flags |= NEEDS_GOTTP;
break;
case R_ALPHA_TPRELHI:
case R_ALPHA_TPRELLO:
check_tlsle(ctx, sym, rel);
break;
case R_ALPHA_GPREL32:
case R_ALPHA_LITUSE:
case R_ALPHA_GPDISP:
case R_ALPHA_HINT:
case R_ALPHA_GPRELHIGH:
case R_ALPHA_GPRELLOW:
case R_ALPHA_DTPRELHI:
case R_ALPHA_DTPRELLO:
break;
default:
Fatal(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
// An R_ALPHA_LITERAL relocation may request the linker to create a GOT
// entry for an external symbol with a non-zero addend. This is an unusual
// request which is not found on any other target.
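//
// For example (illustrative), the following two loads need two distinct
// GOT slots even though they refer to the same symbol:
//
//   ldq $1, sym+8($gp)     !literal
//   ldq $2, sym+16($gp)    !literal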
//
// Referring to an external symbol with a non-zero addend is bad
// practice because we need to create as many dynamic relocations as
// there are distinct addends for the same symbol.
//
// We don't want to mess up the implementation of the common GOT section
// for Alpha. So we create another GOT-like section, .alpha_got. Any GOT
// entry for an R_ALPHA_LITERAL reloc with a non-zero addend is created
// not in .got but in .alpha_got.
//
// Since .alpha_got entries are accessed relative to GP, .alpha_got
// needs to be close enough to .got. It's actually placed next to .got.
void AlphaGotSection::add_symbol(Symbol<E> &sym, i64 addend) {
assert(addend);
std::scoped_lock lock(mu);
entries.push_back({&sym, addend});
}
bool operator<(const AlphaGotSection::Entry &a, const AlphaGotSection::Entry &b) {
return std::tuple(a.sym->file->priority, a.sym->sym_idx, a.addend) <
std::tuple(b.sym->file->priority, b.sym->sym_idx, b.addend);
}
u64 AlphaGotSection::get_addr(Symbol<E> &sym, i64 addend) {
auto it = std::lower_bound(entries.begin(), entries.end(), Entry{&sym, addend});
assert(it != entries.end());
return this->shdr.sh_addr + (it - entries.begin()) * sizeof(Word<E>);
}
i64 AlphaGotSection::get_reldyn_size(Context<E> &ctx) const {
i64 n = 0;
for (const Entry &e : entries)
if (e.sym->is_imported || (ctx.arg.pic && !e.sym->is_absolute()))
n++;
return n;
}
void AlphaGotSection::finalize() {
sort(entries);
remove_duplicates(entries);
shdr.sh_size = entries.size() * sizeof(Word<E>);
}
void AlphaGotSection::copy_buf(Context<E> &ctx) {
ElfRel<E> *dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
reldyn_offset);
for (i64 i = 0; i < entries.size(); i++) {
Entry &e = entries[i];
u64 P = this->shdr.sh_addr + sizeof(Word<E>) * i;
ul64 *buf = (ul64 *)(ctx.buf + this->shdr.sh_offset + sizeof(Word<E>) * i);
if (e.sym->is_imported) {
*buf = ctx.arg.apply_dynamic_relocs ? e.addend : 0;
*dynrel++ = ElfRel<E>(P, E::R_ABS, e.sym->get_dynsym_idx(ctx), e.addend);
} else {
*buf = e.sym->get_addr(ctx) + e.addend;
if (ctx.arg.pic && !e.sym->is_absolute())
*dynrel++ = ElfRel<E>(P, E::R_RELATIVE, 0, *buf);
}
}
}
} // namespace mold::elf


@ -1,737 +0,0 @@
// clang-format off
// ARM32 is a bit special from the linker's viewpoint because ARM
// processors support two different instruction encodings: Thumb and
// ARM (in a narrower sense). Thumb instructions are either 16 bits or
// 32 bits, while ARM instructions are all 32 bits. Feature-wise,
// Thumb is a subset of ARM, so not all ARM instructions are
// representable in Thumb.
//
// ARM processors originally supported only ARM instructions. Thumb
// instructions were later added to increase code density.
//
// ARM processors run in either ARM mode or Thumb mode. The mode can
// be switched using BX (branch and exchange)-family instructions.
// We need to use those instructions to, for example, call a function
// encoded in Thumb from a function encoded in ARM. Sometimes, the
// linker even has to emit an interworking thunk to switch modes.
//
// ARM instructions are aligned to 4-byte boundaries; Thumb
// instructions to 2-byte boundaries.
//
// You can distinguish Thumb functions from ARM functions by looking
// at the least significant bit (LSB) of their "address". If the LSB
// is 0, it's ARM; otherwise, Thumb.
//
// For example, if a symbol `foo` is of type STT_FUNC and has value
// 0x2001, `foo` is a function using Thumb instructions whose address
// is 0x2000 (not 0x2001, as Thumb instructions are always 2-byte
// aligned). Likewise, if a function pointer has value 0x2001, it
// refers to a Thumb function at 0x2000.
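//
// In code, the convention boils down to something like this (an
// illustrative sketch, not code from this file):
//
//   bool is_thumb = addr & 1;  // Thumb if the LSB is set
//   u64 entry = addr & ~1;     // the actual entry point address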
//
// https://github.com/ARM-software/abi-aa/blob/main/aaelf32/aaelf32.rst
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = ARM32;
template <>
i64 get_addend(u8 *loc, const ElfRel<E> &rel) {
switch (rel.r_type) {
case R_ARM_ABS32:
case R_ARM_REL32:
case R_ARM_TARGET1:
case R_ARM_BASE_PREL:
case R_ARM_GOTOFF32:
case R_ARM_GOT_PREL:
case R_ARM_GOT_BREL:
case R_ARM_TLS_GD32:
case R_ARM_TLS_LDM32:
case R_ARM_TLS_LDO32:
case R_ARM_TLS_IE32:
case R_ARM_TLS_LE32:
case R_ARM_TLS_GOTDESC:
case R_ARM_TARGET2:
return *(il32 *)loc;
case R_ARM_THM_JUMP11:
return sign_extend(*(ul16 *)loc, 10) << 1;
case R_ARM_THM_CALL:
case R_ARM_THM_JUMP24:
case R_ARM_THM_TLS_CALL: {
u32 S = bit(*(ul16 *)loc, 10);
u32 J1 = bit(*(ul16 *)(loc + 2), 13);
u32 J2 = bit(*(ul16 *)(loc + 2), 11);
u32 I1 = !(J1 ^ S);
u32 I2 = !(J2 ^ S);
u32 imm10 = bits(*(ul16 *)loc, 9, 0);
u32 imm11 = bits(*(ul16 *)(loc + 2), 10, 0);
u32 val = (S << 24) | (I1 << 23) | (I2 << 22) | (imm10 << 12) | (imm11 << 1);
return sign_extend(val, 24);
}
case R_ARM_CALL:
case R_ARM_JUMP24:
case R_ARM_PLT32:
case R_ARM_TLS_CALL:
return sign_extend(*(ul32 *)loc, 23) << 2;
case R_ARM_MOVW_PREL_NC:
case R_ARM_MOVW_ABS_NC:
case R_ARM_MOVT_PREL:
case R_ARM_MOVT_ABS: {
u32 imm12 = bits(*(ul32 *)loc, 11, 0);
u32 imm4 = bits(*(ul32 *)loc, 19, 16);
return sign_extend((imm4 << 12) | imm12, 15);
}
case R_ARM_PREL31:
return sign_extend(*(ul32 *)loc, 30);
case R_ARM_THM_MOVW_PREL_NC:
case R_ARM_THM_MOVW_ABS_NC:
case R_ARM_THM_MOVT_PREL:
case R_ARM_THM_MOVT_ABS: {
u32 imm4 = bits(*(ul16 *)loc, 3, 0);
u32 i = bit(*(ul16 *)loc, 10);
u32 imm3 = bits(*(ul16 *)(loc + 2), 14, 12);
u32 imm8 = bits(*(ul16 *)(loc + 2), 7, 0);
u32 val = (imm4 << 12) | (i << 11) | (imm3 << 8) | imm8;
return sign_extend(val, 15);
}
default:
return 0;
}
}
static void write_mov_imm(u8 *loc, u32 val) {
u32 imm12 = bits(val, 11, 0);
u32 imm4 = bits(val, 15, 12);
*(ul32 *)loc = (*(ul32 *)loc & 0xfff0f000) | (imm4 << 16) | imm12;
}
static void write_thm_b_imm(u8 *loc, u32 val) {
// https://developer.arm.com/documentation/ddi0406/cb/Application-Level-Architecture/Instruction-Details/Alphabetical-list-of-instructions/BL--BLX--immediate-
u32 sign = bit(val, 24);
u32 I1 = bit(val, 23);
u32 I2 = bit(val, 22);
u32 J1 = !I1 ^ sign;
u32 J2 = !I2 ^ sign;
u32 imm10 = bits(val, 21, 12);
u32 imm11 = bits(val, 11, 1);
ul16 *buf = (ul16 *)loc;
buf[0] = (buf[0] & 0b1111'1000'0000'0000) | (sign << 10) | imm10;
buf[1] = (buf[1] & 0b1101'0000'0000'0000) | (J1 << 13) | (J2 << 11) | imm11;
}
static void write_thm_mov_imm(u8 *loc, u32 val) {
// https://developer.arm.com/documentation/ddi0406/cb/Application-Level-Architecture/Instruction-Details/Alphabetical-list-of-instructions/MOVT
u32 imm4 = bits(val, 15, 12);
u32 i = bit(val, 11);
u32 imm3 = bits(val, 10, 8);
u32 imm8 = bits(val, 7, 0);
ul16 *buf = (ul16 *)loc;
buf[0] = (buf[0] & 0b1111'1011'1111'0000) | (i << 10) | imm4;
buf[1] = (buf[1] & 0b1000'1111'0000'0000) | (imm3 << 12) | imm8;
}
template <>
void write_addend(u8 *loc, i64 val, const ElfRel<E> &rel) {
switch (rel.r_type) {
case R_ARM_NONE:
break;
case R_ARM_ABS32:
case R_ARM_REL32:
case R_ARM_TARGET1:
case R_ARM_BASE_PREL:
case R_ARM_GOTOFF32:
case R_ARM_GOT_PREL:
case R_ARM_GOT_BREL:
case R_ARM_TLS_GD32:
case R_ARM_TLS_LDM32:
case R_ARM_TLS_LDO32:
case R_ARM_TLS_IE32:
case R_ARM_TLS_LE32:
case R_ARM_TLS_GOTDESC:
case R_ARM_TARGET2:
*(ul32 *)loc = val;
break;
case R_ARM_THM_JUMP11:
*(ul16 *)loc = (*(ul16 *)loc & 0xf800) | bits(val, 11, 1);
break;
case R_ARM_THM_CALL:
case R_ARM_THM_JUMP24:
case R_ARM_THM_TLS_CALL:
write_thm_b_imm(loc, val);
break;
case R_ARM_CALL:
case R_ARM_JUMP24:
case R_ARM_PLT32:
*(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
break;
case R_ARM_MOVW_PREL_NC:
case R_ARM_MOVW_ABS_NC:
case R_ARM_MOVT_PREL:
case R_ARM_MOVT_ABS:
write_mov_imm(loc, val);
break;
case R_ARM_PREL31:
*(ul32 *)loc = (*(ul32 *)loc & 0x8000'0000) | (val & 0x7fff'ffff);
break;
case R_ARM_THM_MOVW_PREL_NC:
case R_ARM_THM_MOVW_ABS_NC:
case R_ARM_THM_MOVT_PREL:
case R_ARM_THM_MOVT_ABS:
write_thm_mov_imm(loc, val);
break;
default:
unreachable();
}
}
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static const ul32 insn[] = {
0xe52d'e004, // push {lr}
0xe59f'e004, // ldr lr, 2f
0xe08f'e00e, // 1: add lr, pc, lr
0xe5be'f008, // ldr pc, [lr, #8]!
0x0000'0000, // 2: .word .got.plt - 1b - 8
0xe320'f000, // nop
0xe320'f000, // nop
0xe320'f000, // nop
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 16) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 16;
}
static const ul32 plt_entry[] = {
0xe59f'c004, // 1: ldr ip, 2f
0xe08c'c00f, // add ip, ip, pc
0xe59c'f000, // ldr pc, [ip]
0x0000'0000, // 2: .word sym@GOT - 1b
};
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
memcpy(buf, plt_entry, sizeof(plt_entry));
*(ul32 *)(buf + 12) = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 12;
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
memcpy(buf, plt_entry, sizeof(plt_entry));
*(ul32 *)(buf + 12) = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 12;
}
// ARM does not use .eh_frame for exception handling. Instead, it uses
// .ARM.exidx and .ARM.extab. So this function is empty.
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {}
// ARM and Thumb branch instructions can jump within ±16 MiB.
static bool is_jump_reachable(i64 val) {
return sign_extend(val, 24) == val;
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
auto get_tls_trampoline_addr = [&, i = 0](u64 addr) mutable {
for (; i < output_section->thunks.size(); i++) {
i64 disp = output_section->shdr.sh_addr + output_section->thunks[i]->offset -
addr;
if (is_jump_reachable(disp))
return disp;
}
unreachable();
};
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || rel.r_type == R_ARM_V4BX)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
u64 S = sym.get_addr(ctx);
u64 A = get_addend(*this, rel);
u64 P = get_addr() + rel.r_offset;
u64 T = S & 1;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
auto get_thumb_thunk_addr = [&] { return get_thunk_addr(i); };
auto get_arm_thunk_addr = [&] { return get_thunk_addr(i) + 4; };
switch (rel.r_type) {
case R_ARM_ABS32:
case R_ARM_TARGET1:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_ARM_REL32:
*(ul32 *)loc = S + A - P;
break;
case R_ARM_THM_CALL: {
if (sym.is_remaining_undef_weak()) {
// On ARM, calling a weak undefined symbol jumps to the
// next instruction.
*(ul32 *)loc = 0x8000'f3af; // NOP.W
break;
}
// A THM_CALL relocation refers to either a BL or a BLX instruction.
// They differ in only one bit. We need to use BL if the jump target
// is Thumb; otherwise, BLX.
i64 val = S + A - P;
if (is_jump_reachable(val)) {
if (T) {
write_thm_b_imm(loc, val);
*(ul16 *)(loc + 2) |= 0x1000; // rewrite to BL
} else {
write_thm_b_imm(loc, align_to(val, 4));
*(ul16 *)(loc + 2) &= ~0x1000; // rewrite to BLX
}
} else {
write_thm_b_imm(loc, align_to(get_arm_thunk_addr() + A - P, 4));
*(ul16 *)(loc + 2) &= ~0x1000; // rewrite to BLX
}
break;
}
case R_ARM_BASE_PREL:
*(ul32 *)loc = GOT + A - P;
break;
case R_ARM_GOTOFF32:
*(ul32 *)loc = ((S + A) | T) - GOT;
break;
case R_ARM_GOT_PREL:
case R_ARM_TARGET2:
*(ul32 *)loc = GOT + G + A - P;
break;
case R_ARM_GOT_BREL:
*(ul32 *)loc = G + A;
break;
case R_ARM_CALL: {
if (sym.is_remaining_undef_weak()) {
*(ul32 *)loc = 0xe320'f000; // NOP
break;
}
// Just like THM_CALL, an ARM_CALL relocation refers to either a BL or
// a BLX instruction. We may need to rewrite BL → BLX or BLX → BL.
bool is_bl = ((*(ul32 *)loc & 0xff00'0000) == 0xeb00'0000);
bool is_blx = ((*(ul32 *)loc & 0xfe00'0000) == 0xfa00'0000);
if (!is_bl && !is_blx)
Fatal(ctx) << *this << ": R_ARM_CALL refers neither BL nor BLX";
u64 val = S + A - P;
if (is_jump_reachable(val)) {
if (T) {
*(ul32 *)loc = 0xfa00'0000; // BLX
*(ul32 *)loc |= (bit(val, 1) << 24) | bits(val, 25, 2);
} else {
*(ul32 *)loc = 0xeb00'0000; // BL
*(ul32 *)loc |= bits(val, 25, 2);
}
} else {
*(ul32 *)loc = 0xeb00'0000; // BL
*(ul32 *)loc |= bits(get_arm_thunk_addr() + A - P, 25, 2);
}
break;
}
case R_ARM_JUMP24: {
if (sym.is_remaining_undef_weak()) {
*(ul32 *)loc = 0xe320'f000; // NOP
break;
}
// This reloc refers to a B (unconditional branch) instruction.
// Unlike BL or BLX, we can't rewrite B to BX in place when a
// processor mode switch is required, because BX doesn't take an
// immediate; it takes only a register. So if a mode switch is
// required, we jump to a linker-synthesized thunk which does the
// job with a longer code sequence.
u64 val = S + A - P;
if (!is_jump_reachable(val) || T)
val = get_arm_thunk_addr() + A - P;
*(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
break;
}
case R_ARM_PLT32:
if (sym.is_remaining_undef_weak()) {
*(ul32 *)loc = 0xe320'f000; // NOP
} else {
u64 val = (T ? get_arm_thunk_addr() : S) + A - P;
*(ul32 *)loc = (*(ul32 *)loc & 0xff00'0000) | bits(val, 25, 2);
}
break;
case R_ARM_THM_JUMP11:
assert(T);
check(S + A - P, -(1 << 11), 1 << 11);
*(ul16 *)loc &= 0xf800;
*(ul16 *)loc |= bits(S + A - P, 11, 1);
break;
case R_ARM_THM_JUMP19: {
i64 val = S + A - P;
check(val, -(1 << 19), 1 << 19);
// sign:J2:J1:imm6:imm11:'0'
u32 sign = bit(val, 20);
u32 J2 = bit(val, 19);
u32 J1 = bit(val, 18);
u32 imm6 = bits(val, 17, 12);
u32 imm11 = bits(val, 11, 1);
*(ul16 *)loc &= 0b1111'1011'1100'0000;
*(ul16 *)loc |= (sign << 10) | imm6;
*(ul16 *)(loc + 2) &= 0b1101'0000'0000'0000;
*(ul16 *)(loc + 2) |= (J2 << 13) | (J1 << 11) | imm11;
break;
}
case R_ARM_THM_JUMP24: {
if (sym.is_remaining_undef_weak()) {
*(ul32 *)loc = 0x8000'f3af; // NOP
break;
}
// Just like R_ARM_JUMP24, we need to jump to a thunk if we need to
// switch processor mode.
u64 val = S + A - P;
if (!is_jump_reachable(val) || !T)
val = get_thumb_thunk_addr() + A - P;
write_thm_b_imm(loc, val);
break;
}
case R_ARM_MOVW_PREL_NC:
write_mov_imm(loc, ((S + A) | T) - P);
break;
case R_ARM_MOVW_ABS_NC:
write_mov_imm(loc, (S + A) | T);
break;
case R_ARM_THM_MOVW_PREL_NC:
write_thm_mov_imm(loc, ((S + A) | T) - P);
break;
case R_ARM_PREL31:
check(S + A - P, -(1LL << 30), 1LL << 30);
*(ul32 *)loc &= 0x8000'0000;
*(ul32 *)loc |= (S + A - P) & 0x7fff'ffff;
break;
case R_ARM_THM_MOVW_ABS_NC:
write_thm_mov_imm(loc, (S + A) | T);
break;
case R_ARM_MOVT_PREL:
write_mov_imm(loc, (S + A - P) >> 16);
break;
case R_ARM_THM_MOVT_PREL:
write_thm_mov_imm(loc, (S + A - P) >> 16);
break;
case R_ARM_MOVT_ABS:
write_mov_imm(loc, (S + A) >> 16);
break;
case R_ARM_THM_MOVT_ABS:
write_thm_mov_imm(loc, (S + A) >> 16);
break;
case R_ARM_TLS_GD32:
*(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - P;
break;
case R_ARM_TLS_LDM32:
*(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - P;
break;
case R_ARM_TLS_LDO32:
*(ul32 *)loc = S + A - ctx.dtp_addr;
break;
case R_ARM_TLS_IE32:
*(ul32 *)loc = sym.get_gottp_addr(ctx) + A - P;
break;
case R_ARM_TLS_LE32:
*(ul32 *)loc = S + A - ctx.tp_addr;
break;
case R_ARM_TLS_GOTDESC:
if (sym.has_tlsdesc(ctx)) {
// A is odd if the corresponding TLS_CALL is Thumb.
if (A & 1)
*(ul32 *)loc = sym.get_tlsdesc_addr(ctx) - P + A - 6;
else
*(ul32 *)loc = sym.get_tlsdesc_addr(ctx) - P + A - 4;
} else {
*(ul32 *)loc = S - ctx.tp_addr;
}
break;
case R_ARM_TLS_CALL:
if (sym.has_tlsdesc(ctx)) {
// BL <tls_trampoline>
*(ul32 *)loc = 0xeb00'0000 | bits(get_tls_trampoline_addr(P + 8), 25, 2);
} else {
// BL -> NOP
*(ul32 *)loc = 0xe320'f000;
}
break;
case R_ARM_THM_TLS_CALL:
if (sym.has_tlsdesc(ctx)) {
u64 val = align_to(get_tls_trampoline_addr(P + 4), 4);
write_thm_b_imm(loc, val);
*(ul16 *)(loc + 2) &= ~0x1000; // rewrite BL with BLX
} else {
// BL -> NOP.W
*(ul32 *)loc = 0x8000'f3af;
}
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : get_addend(*this, rel);
switch (rel.r_type) {
case R_ARM_ABS32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A;
break;
case R_ARM_TLS_LDO32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A - ctx.dtp_addr;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
break;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_ARM_ABS32:
case R_ARM_MOVT_ABS:
case R_ARM_THM_MOVT_ABS:
case R_ARM_TARGET1:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_ARM_THM_CALL:
case R_ARM_CALL:
case R_ARM_JUMP24:
case R_ARM_PLT32:
case R_ARM_THM_JUMP24:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_ARM_GOT_PREL:
case R_ARM_GOT_BREL:
case R_ARM_TARGET2:
sym.flags |= NEEDS_GOT;
break;
case R_ARM_MOVT_PREL:
case R_ARM_THM_MOVT_PREL:
case R_ARM_PREL31:
scan_pcrel(ctx, sym, rel);
break;
case R_ARM_TLS_GD32:
sym.flags |= NEEDS_TLSGD;
break;
case R_ARM_TLS_LDM32:
ctx.needs_tlsld = true;
break;
case R_ARM_TLS_IE32:
sym.flags |= NEEDS_GOTTP;
break;
case R_ARM_TLS_GOTDESC:
if (!relax_tlsdesc(ctx, sym))
sym.flags |= NEEDS_TLSDESC;
break;
case R_ARM_TLS_LE32:
check_tlsle(ctx, sym, rel);
break;
case R_ARM_REL32:
case R_ARM_BASE_PREL:
case R_ARM_GOTOFF32:
case R_ARM_THM_JUMP11:
case R_ARM_THM_JUMP19:
case R_ARM_MOVW_PREL_NC:
case R_ARM_MOVW_ABS_NC:
case R_ARM_THM_MOVW_PREL_NC:
case R_ARM_THM_MOVW_ABS_NC:
case R_ARM_TLS_LDO32:
case R_ARM_TLS_CALL:
case R_ARM_THM_TLS_CALL:
case R_ARM_V4BX:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
// TLS trampoline code. ARM32's TLSDESC is designed so that this
// common piece of code is factored out from object files to reduce
// output size. Since no object file provides it, the linker has to
// synthesize it.
static ul32 hdr[] = {
0xe08e'0000, // add r0, lr, r0
0xe590'1004, // ldr r1, [r0, #4]
0xe12f'ff11, // bx r1
};
// This is a range extension and mode switch thunk.
// It has two entry points: +0 for Thumb and +4 for ARM.
const u8 entry[] = {
// .thumb
0xfc, 0x46, // mov ip, pc
0x60, 0x47, // bx ip # jumps to the following `ldr` insn
// .arm
0x04, 0xc0, 0x9f, 0xe5, // ldr ip, 2f
0x0f, 0xc0, 0x8c, 0xe0, // 1: add ip, ip, pc
0x1c, 0xff, 0x2f, 0xe1, // bx ip
0x00, 0x00, 0x00, 0x00, // 2: .word sym - 1b
};
static_assert(E::thunk_hdr_size == sizeof(hdr));
static_assert(E::thunk_size == sizeof(entry));
memcpy(buf, hdr, sizeof(hdr));
for (i64 i = 0; i < symbols.size(); i++) {
u8 *loc = buf + sizeof(hdr) + i * sizeof(entry);
memcpy(loc, entry, sizeof(entry));
u64 S = symbols[i]->get_addr(ctx);
u64 P = output_section.shdr.sh_addr + offset + sizeof(hdr) + i * sizeof(entry);
*(ul32 *)(loc + 16) = S - P - 16;
}
}
// ARM executables use an .ARM.exidx section to look up an exception
// handling record for the current instruction pointer. The table needs
// to be sorted by address.
//
// Other targets use .eh_frame_hdr for the same purpose instead.
// I don't know why only ARM uses a different mechanism, but it's
// likely due to some historical reason.
//
// This function sorts .ARM.exidx records.
void fixup_arm_exidx_section(Context<E> &ctx) {
Timer t(ctx, "fixup_arm_exidx_section");
OutputSection<E> *osec = find_section(ctx, SHT_ARM_EXIDX);
if (!osec)
return;
// .ARM.exidx records consist of a signed 31-bit relative address
// and a 32-bit value. The relative address indicates the start
// address of the function that the record covers. The value is one of
// the following:
//
// 1. CANTUNWIND indicating that there's no unwinding info for the function,
// 2. a compact unwinding record encoded into a 32-bit value, or
// 3. a 31-bit relative address which points to a larger record in
// the .ARM.extab section.
//
// CANTUNWIND is the value 1. The most significant bit is set in (2) but
// not in (3), so we can distinguish them just by looking at the value.
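//
// For example (illustrative values): 1 means CANTUNWIND, 0x80b0'b0b0
// would be a compact inline record (MSB set), and 0x0000'1234 would be
// a 31-bit relative pointer into .ARM.extab (MSB clear).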
const u32 EXIDX_CANTUNWIND = 1;
struct Entry {
ul32 addr;
ul32 val;
};
if (osec->shdr.sh_size % sizeof(Entry))
Fatal(ctx) << "invalid .ARM.exidx section size";
Entry *ent = (Entry *)(ctx.buf + osec->shdr.sh_offset);
i64 num_entries = osec->shdr.sh_size / sizeof(Entry);
// Each entry's address is relative to the entry itself. In order to
// sort records by address, we first translate the addresses so that
// they are relative to the beginning of the section.
auto is_relative = [](u32 val) {
return val != EXIDX_CANTUNWIND && !(val & 0x8000'0000);
};
tbb::parallel_for((i64)0, num_entries, [&](i64 i) {
i64 offset = sizeof(Entry) * i;
ent[i].addr = sign_extend(ent[i].addr, 30) + offset;
if (is_relative(ent[i].val))
ent[i].val = 0x7fff'ffff & (ent[i].val + offset);
});
tbb::parallel_sort(ent, ent + num_entries, [](const Entry &a, const Entry &b) {
return a.addr < b.addr;
});
// Translate the addresses back so that they are relative to their
// entries again.
tbb::parallel_for((i64)0, num_entries, [&](i64 i) {
i64 offset = sizeof(Entry) * i;
ent[i].addr = 0x7fff'ffff & (ent[i].addr - offset);
if (is_relative(ent[i].val))
ent[i].val = 0x7fff'ffff & (ent[i].val - offset);
});
// .ARM.exidx's sh_link should be set to the .text section index.
// The runtime doesn't care about it, but binutils's strip command does.
if (ctx.shdr) {
if (Chunk<E> *text = find_section(ctx, ".text")) {
osec->shdr.sh_link = text->shndx;
ctx.shdr->copy_buf(ctx);
}
}
}
} // namespace mold::elf


@ -1,595 +0,0 @@
// clang-format off
// This file contains ARM64-specific code. Being new, ARM64's ELF
// psABI doesn't have anything peculiar. ARM64 is a clean RISC
// instruction set that supports PC-relative load/store instructions.
//
// Unlike ARM32, instruction length doesn't vary. All ARM64
// instructions are 4 bytes long.
//
// Branch instructions used for function call can jump within ±128 MiB.
// We need to create range extension thunks to support binaries whose
// .text is larger than that.
//
// Unlike most other targets, the TLSDESC access model is used by default
// for -fPIC to access thread-local variables instead of the less
// efficient GD model. You can still enable GD, but it needs the
// -mtls-dialect=trad flag. Since GD is rarely used, we don't need to
// implement GD → LE relaxation.
//
// https://github.com/ARM-software/abi-aa/blob/main/aaelf64/aaelf64.rst
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = ARM64;
static void write_adrp(u8 *buf, u64 val) {
*(ul32 *)buf |= (bits(val, 13, 12) << 29) | (bits(val, 32, 14) << 5);
}
static void write_adr(u8 *buf, u64 val) {
*(ul32 *)buf |= (bits(val, 1, 0) << 29) | (bits(val, 20, 2) << 5);
}
static void write_movn_movz(u8 *buf, i64 val) {
*(ul32 *)buf &= 0b0000'0000'0110'0000'0000'0000'0001'1111;
if (val >= 0)
*(ul32 *)buf |= 0xd280'0000 | (bits(val, 15, 0) << 5); // rewrite to movz
else
*(ul32 *)buf |= 0x9280'0000 | (bits(~val, 15, 0) << 5); // rewrite to movn
}
static u64 page(u64 val) {
return val & 0xffff'ffff'ffff'f000;
}
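// For example, page(0x2000'1234) == 0x2000'1000: page() simply clears
// the low 12 bits, i.e., the offset within a 4 KiB page.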
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static const ul32 insn[] = {
0xa9bf'7bf0, // stp x16, x30, [sp,#-16]!
0x9000'0010, // adrp x16, .got.plt[2]
0xf940'0211, // ldr x17, [x16, .got.plt[2]]
0x9100'0210, // add x16, x16, .got.plt[2]
0xd61f'0220, // br x17
0xd503'201f, // nop
0xd503'201f, // nop
0xd503'201f, // nop
};
u64 gotplt = ctx.gotplt->shdr.sh_addr + 16;
u64 plt = ctx.plt->shdr.sh_addr;
memcpy(buf, insn, sizeof(insn));
write_adrp(buf + 4, page(gotplt) - page(plt + 4));
*(ul32 *)(buf + 8) |= bits(gotplt, 11, 3) << 10;
*(ul32 *)(buf + 12) |= (gotplt & 0xfff) << 10;
}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static const ul32 insn[] = {
0x9000'0010, // adrp x16, .got.plt[n]
0xf940'0211, // ldr x17, [x16, .got.plt[n]]
0x9100'0210, // add x16, x16, .got.plt[n]
0xd61f'0220, // br x17
};
u64 gotplt = sym.get_gotplt_addr(ctx);
u64 plt = sym.get_plt_addr(ctx);
memcpy(buf, insn, sizeof(insn));
write_adrp(buf, page(gotplt) - page(plt));
*(ul32 *)(buf + 4) |= bits(gotplt, 11, 3) << 10;
*(ul32 *)(buf + 8) |= (gotplt & 0xfff) << 10;
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static const ul32 insn[] = {
0x9000'0010, // adrp x16, GOT[n]
0xf940'0211, // ldr x17, [x16, GOT[n]]
0xd61f'0220, // br x17
0xd503'201f, // nop
};
u64 got = sym.get_got_addr(ctx);
u64 plt = sym.get_plt_addr(ctx);
memcpy(buf, insn, sizeof(insn));
write_adrp(buf, page(got) - page(plt));
*(ul32 *)(buf + 4) |= bits(got, 11, 3) << 10;
}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_AARCH64_ABS64:
*(ul64 *)loc = val;
break;
case R_AARCH64_PREL32:
*(ul32 *)loc = val - this->shdr.sh_addr - offset;
break;
case R_AARCH64_PREL64:
*(ul64 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
static bool is_adrp(u8 *loc) {
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADRP--Form-PC-relative-address-to-4KB-page-
u32 insn = *(ul32 *)loc;
return (bits(insn, 31, 24) & 0b1001'1111) == 0b1001'0000;
}
static bool is_ldr(u8 *loc) {
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate--
u32 insn = *(ul32 *)loc;
return (bits(insn, 31, 20) & 0b1111'1111'1100) == 0b1111'1001'0100;
}
static bool is_add(u8 *loc) {
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADD--immediate---Add--immediate--
u32 insn = *(ul32 *)loc;
return (bits(insn, 31, 20) & 0b1111'1111'1100) == 0b1001'0001'0000;
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_AARCH64_ABS64:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_AARCH64_LDST8_ABS_LO12_NC:
case R_AARCH64_ADD_ABS_LO12_NC:
*(ul32 *)loc |= bits(S + A, 11, 0) << 10;
break;
case R_AARCH64_LDST16_ABS_LO12_NC:
*(ul32 *)loc |= bits(S + A, 11, 1) << 10;
break;
case R_AARCH64_LDST32_ABS_LO12_NC:
*(ul32 *)loc |= bits(S + A, 11, 2) << 10;
break;
case R_AARCH64_LDST64_ABS_LO12_NC:
*(ul32 *)loc |= bits(S + A, 11, 3) << 10;
break;
case R_AARCH64_LDST128_ABS_LO12_NC:
*(ul32 *)loc |= bits(S + A, 11, 4) << 10;
break;
case R_AARCH64_MOVW_UABS_G0:
check(S + A, 0, 1 << 16);
*(ul32 *)loc |= bits(S + A, 15, 0) << 5;
break;
case R_AARCH64_MOVW_UABS_G0_NC:
*(ul32 *)loc |= bits(S + A, 15, 0) << 5;
break;
case R_AARCH64_MOVW_UABS_G1:
check(S + A, 0, 1LL << 32);
*(ul32 *)loc |= bits(S + A, 31, 16) << 5;
break;
case R_AARCH64_MOVW_UABS_G1_NC:
*(ul32 *)loc |= bits(S + A, 31, 16) << 5;
break;
case R_AARCH64_MOVW_UABS_G2:
check(S + A, 0, 1LL << 48);
*(ul32 *)loc |= bits(S + A, 47, 32) << 5;
break;
case R_AARCH64_MOVW_UABS_G2_NC:
*(ul32 *)loc |= bits(S + A, 47, 32) << 5;
break;
case R_AARCH64_MOVW_UABS_G3:
*(ul32 *)loc |= bits(S + A, 63, 48) << 5;
break;
case R_AARCH64_ADR_GOT_PAGE:
if (sym.has_got(ctx)) {
i64 val = page(G + GOT + A) - page(P);
check(val, -(1LL << 32), 1LL << 32);
write_adrp(loc, val);
} else {
// Relax GOT-loading ADRP+LDR to an immediate ADRP+ADD
i64 val = page(S + A) - page(P);
check(val, -(1LL << 32), 1LL << 32);
write_adrp(loc, val);
u32 reg = bits(*(ul32 *)loc, 4, 0);
*(ul32 *)(loc + 4) = 0x9100'0000 | (reg << 5) | reg; // ADD
*(ul32 *)(loc + 4) |= bits(S + A, 11, 0) << 10;
i++;
}
break;
case R_AARCH64_ADR_PREL_PG_HI21: {
// The ARM64 psABI defines that an `ADRP x0, foo` / `ADD x0, x0,
// :lo12:foo` instruction pair, which materializes a PC-relative
// address in a register, can be relaxed to a `NOP` followed by
// `ADR x0, foo` if foo is within PC ± 1 MiB.
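//
// That is (illustrative):
//
//   before:  adrp x0, foo            after:  nop
//            add  x0, x0, :lo12:foo          adr x0, foo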
if (ctx.arg.relax && i + 1 < rels.size() &&
sign_extend(S + A - P - 4, 20) == S + A - P - 4) {
const ElfRel<E> &rel2 = rels[i + 1];
if (rel2.r_type == R_AARCH64_ADD_ABS_LO12_NC &&
rel2.r_sym == rel.r_sym &&
rel2.r_offset == rel.r_offset + 4 &&
rel2.r_addend == rel.r_addend &&
is_adrp(loc) &&
is_add(loc + 4)) {
u32 reg1 = bits(*(ul32 *)loc, 4, 0);
u32 reg2 = bits(*(ul32 *)(loc + 4), 4, 0);
if (reg1 == reg2) {
*(ul32 *)loc = 0xd503'201f; // nop
*(ul32 *)(loc + 4) = 0x1000'0000 | reg1; // adr
write_adr(loc + 4, S + A - P - 4);
i++;
break;
}
}
}
i64 val = page(S + A) - page(P);
check(val, -(1LL << 32), 1LL << 32);
write_adrp(loc, val);
break;
}
case R_AARCH64_ADR_PREL_LO21:
check(S + A - P, -(1LL << 20), 1LL << 20);
write_adr(loc, S + A - P);
break;
case R_AARCH64_CALL26:
case R_AARCH64_JUMP26: {
if (sym.is_remaining_undef_weak()) {
// On ARM64, calling a weak undefined symbol jumps to the
// next instruction.
*(ul32 *)loc = 0xd503'201f; // nop
break;
}
i64 val = S + A - P;
if (val < -(1 << 27) || (1 << 27) <= val)
val = get_thunk_addr(i) + A - P;
*(ul32 *)loc |= bits(val, 27, 2);
break;
}
case R_AARCH64_PLT32:
check(S + A - P, -(1LL << 31), 1LL << 31);
*(ul32 *)loc = S + A - P;
break;
case R_AARCH64_CONDBR19:
case R_AARCH64_LD_PREL_LO19:
check(S + A - P, -(1LL << 20), 1LL << 20);
*(ul32 *)loc |= bits(S + A - P, 20, 2) << 5;
break;
case R_AARCH64_PREL16:
check(S + A - P, -(1LL << 15), 1LL << 15);
*(ul16 *)loc = S + A - P;
break;
case R_AARCH64_PREL32:
check(S + A - P, -(1LL << 31), 1LL << 32);
*(ul32 *)loc = S + A - P;
break;
case R_AARCH64_PREL64:
*(ul64 *)loc = S + A - P;
break;
case R_AARCH64_LD64_GOT_LO12_NC:
*(ul32 *)loc |= bits(G + GOT + A, 11, 3) << 10;
break;
case R_AARCH64_LD64_GOTPAGE_LO15: {
i64 val = G + GOT + A - page(GOT);
check(val, 0, 1 << 15);
*(ul32 *)loc |= bits(val, 14, 3) << 10;
break;
}
case R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: {
i64 val = page(sym.get_gottp_addr(ctx) + A) - page(P);
check(val, -(1LL << 32), 1LL << 32);
write_adrp(loc, val);
break;
}
case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC:
*(ul32 *)loc |= bits(sym.get_gottp_addr(ctx) + A, 11, 3) << 10;
break;
case R_AARCH64_TLSLE_MOVW_TPREL_G0: {
i64 val = S + A - ctx.tp_addr;
check(val, -(1 << 15), 1 << 15);
write_movn_movz(loc, val);
break;
}
case R_AARCH64_TLSLE_MOVW_TPREL_G0_NC:
*(ul32 *)loc |= bits(S + A - ctx.tp_addr, 15, 0) << 5;
break;
case R_AARCH64_TLSLE_MOVW_TPREL_G1: {
i64 val = S + A - ctx.tp_addr;
check(val, -(1LL << 31), 1LL << 31);
write_movn_movz(loc, val >> 16);
break;
}
case R_AARCH64_TLSLE_MOVW_TPREL_G1_NC:
*(ul32 *)loc |= bits(S + A - ctx.tp_addr, 31, 16) << 5;
break;
case R_AARCH64_TLSLE_MOVW_TPREL_G2: {
i64 val = S + A - ctx.tp_addr;
check(val, -(1LL << 47), 1LL << 47);
write_movn_movz(loc, val >> 32);
break;
}
case R_AARCH64_TLSLE_ADD_TPREL_HI12: {
i64 val = S + A - ctx.tp_addr;
check(val, 0, 1LL << 24);
*(ul32 *)loc |= bits(val, 23, 12) << 10;
break;
}
case R_AARCH64_TLSLE_ADD_TPREL_LO12:
check(S + A - ctx.tp_addr, 0, 1 << 12);
*(ul32 *)loc |= bits(S + A - ctx.tp_addr, 11, 0) << 10;
break;
case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
*(ul32 *)loc |= bits(S + A - ctx.tp_addr, 11, 0) << 10;
break;
case R_AARCH64_TLSGD_ADR_PAGE21: {
i64 val = page(sym.get_tlsgd_addr(ctx) + A) - page(P);
check(val, -(1LL << 32), 1LL << 32);
write_adrp(loc, val);
break;
}
case R_AARCH64_TLSGD_ADD_LO12_NC:
*(ul32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A, 11, 0) << 10;
break;
case R_AARCH64_TLSDESC_ADR_PAGE21:
if (sym.has_tlsdesc(ctx)) {
i64 val = page(sym.get_tlsdesc_addr(ctx) + A) - page(P);
check(val, -(1LL << 32), 1LL << 32);
write_adrp(loc, val);
} else {
// adrp x0, 0 -> movz x0, #tls_offset_hi, lsl #16
i64 val = (S + A - ctx.tp_addr);
check(val, -(1LL << 32), 1LL << 32);
*(ul32 *)loc = 0xd2a0'0000 | (bits(val, 32, 16) << 5);
}
break;
case R_AARCH64_TLSDESC_LD64_LO12:
if (sym.has_tlsdesc(ctx)) {
*(ul32 *)loc |= bits(sym.get_tlsdesc_addr(ctx) + A, 11, 3) << 10;
} else {
// ldr x2, [x0] -> movk x0, #tls_offset_lo
u32 offset_lo = (S + A - ctx.tp_addr) & 0xffff;
*(ul32 *)loc = 0xf280'0000 | (offset_lo << 5);
}
break;
case R_AARCH64_TLSDESC_ADD_LO12:
if (sym.has_tlsdesc(ctx)) {
*(ul32 *)loc |= bits(sym.get_tlsdesc_addr(ctx) + A, 11, 0) << 10;
} else {
// add x0, x0, #0 -> nop
*(ul32 *)loc = 0xd503'201f;
}
break;
case R_AARCH64_TLSDESC_CALL:
if (!sym.has_tlsdesc(ctx)) {
// blr x2 -> nop
*(ul32 *)loc = 0xd503'201f;
}
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_AARCH64_ABS64:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul64 *)loc = *val;
else
*(ul64 *)loc = S + A;
break;
case R_AARCH64_ABS32: {
i64 val = S + A;
check(val, 0, 1LL << 32);
*(ul32 *)loc = val;
break;
}
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
break;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = (u8 *)(contents.data() + rel.r_offset);
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_AARCH64_ABS64:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_AARCH64_ADR_GOT_PAGE:
// An ADR_GOT_PAGE and GOT_LO12_NC relocation pair is used to load a
// symbol's address from the GOT. If the GOT value is a link-time
// constant, we may be able to rewrite the ADRP+LDR instruction pair
// with an ADRP+ADD, eliminating the GOT memory load, as shown below.
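//
// That is (an illustrative sketch):
//
//   before:  adrp x0, :got:foo             after:  adrp x0, foo
//            ldr  x0, [x0, :got_lo12:foo]          add  x0, x0, :lo12:foo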
if (ctx.arg.relax && sym.is_relative() && !sym.is_imported &&
!sym.is_ifunc() && i + 1 < rels.size()) {
// ADRP+LDR must be consecutive and use the same register to relax.
const ElfRel<E> &rel2 = rels[i + 1];
if (rel2.r_type == R_AARCH64_LD64_GOT_LO12_NC &&
rel2.r_offset == rel.r_offset + 4 &&
rel2.r_sym == rel.r_sym &&
rel.r_addend == 0 &&
rel2.r_addend == 0 &&
is_adrp(loc) &&
is_ldr(loc + 4)) {
u32 rd = bits(*(ul32 *)loc, 4, 0);
u32 rn = bits(*(ul32 *)(loc + 4), 9, 5);
u32 rt = bits(*(ul32 *)(loc + 4), 4, 0);
if (rd == rn && rn == rt) {
i++;
break;
}
}
}
sym.flags |= NEEDS_GOT;
break;
case R_AARCH64_LD64_GOT_LO12_NC:
case R_AARCH64_LD64_GOTPAGE_LO15:
sym.flags |= NEEDS_GOT;
break;
case R_AARCH64_CALL26:
case R_AARCH64_JUMP26:
case R_AARCH64_PLT32:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21:
case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC:
sym.flags |= NEEDS_GOTTP;
break;
case R_AARCH64_ADR_PREL_PG_HI21:
scan_pcrel(ctx, sym, rel);
break;
case R_AARCH64_TLSGD_ADR_PAGE21:
sym.flags |= NEEDS_TLSGD;
break;
case R_AARCH64_TLSDESC_ADR_PAGE21:
case R_AARCH64_TLSDESC_LD64_LO12:
case R_AARCH64_TLSDESC_ADD_LO12:
if (!relax_tlsdesc(ctx, sym))
sym.flags |= NEEDS_TLSDESC;
break;
case R_AARCH64_TLSLE_MOVW_TPREL_G0:
case R_AARCH64_TLSLE_MOVW_TPREL_G0_NC:
case R_AARCH64_TLSLE_MOVW_TPREL_G1:
case R_AARCH64_TLSLE_MOVW_TPREL_G1_NC:
case R_AARCH64_TLSLE_MOVW_TPREL_G2:
case R_AARCH64_TLSLE_ADD_TPREL_HI12:
case R_AARCH64_TLSLE_ADD_TPREL_LO12:
case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC:
check_tlsle(ctx, sym, rel);
break;
case R_AARCH64_ADD_ABS_LO12_NC:
case R_AARCH64_ADR_PREL_LO21:
case R_AARCH64_CONDBR19:
case R_AARCH64_LD_PREL_LO19:
case R_AARCH64_LDST16_ABS_LO12_NC:
case R_AARCH64_LDST32_ABS_LO12_NC:
case R_AARCH64_LDST64_ABS_LO12_NC:
case R_AARCH64_LDST128_ABS_LO12_NC:
case R_AARCH64_LDST8_ABS_LO12_NC:
case R_AARCH64_MOVW_UABS_G0:
case R_AARCH64_MOVW_UABS_G0_NC:
case R_AARCH64_MOVW_UABS_G1:
case R_AARCH64_MOVW_UABS_G1_NC:
case R_AARCH64_MOVW_UABS_G2:
case R_AARCH64_MOVW_UABS_G2_NC:
case R_AARCH64_MOVW_UABS_G3:
case R_AARCH64_PREL16:
case R_AARCH64_PREL32:
case R_AARCH64_PREL64:
case R_AARCH64_TLSGD_ADD_LO12_NC:
case R_AARCH64_TLSDESC_CALL:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
static const ul32 data[] = {
0x9000'0010, // adrp x16, 0 # R_AARCH64_ADR_PREL_PG_HI21
0x9100'0210, // add x16, x16 # R_AARCH64_ADD_ABS_LO12_NC
0xd61f'0200, // br x16
};
static_assert(E::thunk_size == sizeof(data));
for (i64 i = 0; i < symbols.size(); i++) {
u64 S = symbols[i]->get_addr(ctx);
u64 P = output_section.shdr.sh_addr + offset + i * E::thunk_size;
u8 *loc = buf + i * E::thunk_size;
memcpy(loc , data, sizeof(data));
write_adrp(loc, page(S) - page(P));
*(ul32 *)(loc + 4) |= bits(S, 11, 0) << 10;
}
}
} // namespace mold::elf


@ -1,565 +0,0 @@
// clang-format off
// i386 is similar to x86-64 but lacks PC-relative memory access
// instructions. So it's not straightforward to support position-
// independent code (PIC) on that target.
//
// If an object file is compiled with -fPIC, a function that needs to load
// a value from memory first obtains its own address with the following
// code:
//
//   call __x86.get_pc_thunk.bx
//
// where __x86.get_pc_thunk.bx is defined as follows:
//
//   __x86.get_pc_thunk.bx:
//     mov (%esp), %ebx  # move the return address to %ebx
//     ret
//
// With the function's own address (or, more precisely, the address
// immediately after the call instruction), the function can compute the
// absolute address of a variable as that address plus a link-time
// constant, as in the sketch below.
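//
// For example, position-independent code typically computes a global
// variable's address like this (an illustrative sketch of common
// compiler output, not code from this file):
//
//   call __x86.get_pc_thunk.bx          # %ebx = address after the call
//   add  $_GLOBAL_OFFSET_TABLE_, %ebx   # %ebx = address of .got
//   mov  var@GOTOFF(%ebx), %eax         # load `var` at a GOT-relative
//                                       # link-time constant offset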
//
// Executing call-mov-ret isn't very cheap, and allocating one register
// to store the PC isn't cheap either, especially given that i386 has
// only 8 general-purpose registers. But that's the cost of PIC on i386.
// You need to pay it when creating a .so or a position-independent
// executable.
//
// When a position-independent function calls another function, it sets
// %ebx to the address of .got. Position-independent PLT entries use that
// register to load values from .got.plt/.got.
//
// If we are creating a position-dependent executable (PDE), we can't
// assume that %ebx is set to .got. For PDE, we need to create position-
// dependent PLT entries which don't use %ebx.
//
// https://github.com/rui314/psabi/blob/main/i386.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = I386;
template <>
i64 get_addend(u8 *loc, const ElfRel<E> &rel) {
switch (rel.r_type) {
case R_386_8:
case R_386_PC8:
return *loc;
case R_386_16:
case R_386_PC16:
return *(ul16 *)loc;
case R_386_32:
case R_386_PC32:
case R_386_GOT32:
case R_386_GOT32X:
case R_386_PLT32:
case R_386_GOTOFF:
case R_386_GOTPC:
case R_386_TLS_LDM:
case R_386_TLS_GOTIE:
case R_386_TLS_LE:
case R_386_TLS_IE:
case R_386_TLS_GD:
case R_386_TLS_LDO_32:
case R_386_SIZE32:
case R_386_TLS_GOTDESC:
return *(ul32 *)loc;
default:
return 0;
}
}
template <>
void write_addend(u8 *loc, i64 val, const ElfRel<E> &rel) {
switch (rel.r_type) {
case R_386_NONE:
break;
case R_386_8:
case R_386_PC8:
*loc = val;
break;
case R_386_16:
case R_386_PC16:
*(ul16 *)loc = val;
break;
case R_386_32:
case R_386_PC32:
case R_386_GOT32:
case R_386_GOT32X:
case R_386_PLT32:
case R_386_GOTOFF:
case R_386_GOTPC:
case R_386_TLS_LDM:
case R_386_TLS_GOTIE:
case R_386_TLS_LE:
case R_386_TLS_IE:
case R_386_TLS_GD:
case R_386_TLS_LDO_32:
case R_386_SIZE32:
case R_386_TLS_GOTDESC:
*(ul32 *)loc = val;
break;
default:
unreachable();
}
}
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
if (ctx.arg.pic) {
static const u8 insn[] = {
0xf3, 0x0f, 0x1e, 0xfb, // endbr32
0x51, // push %ecx
0x8d, 0x8b, 0, 0, 0, 0, // lea GOTPLT+4(%ebx), %ecx
0xff, 0x31, // push (%ecx)
0xff, 0x61, 0x04, // jmp *0x4(%ecx)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 7) = ctx.gotplt->shdr.sh_addr - ctx.got->shdr.sh_addr + 4;
} else {
static const u8 insn[] = {
0xf3, 0x0f, 0x1e, 0xfb, // endbr32
0x51, // push %ecx
0xb9, 0, 0, 0, 0, // mov GOTPLT+4, %ecx
0xff, 0x31, // push (%ecx)
0xff, 0x61, 0x04, // jmp *0x4(%ecx)
0xcc, // (padding)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 6) = ctx.gotplt->shdr.sh_addr + 4;
}
}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
if (ctx.arg.pic) {
static const u8 insn[] = {
0xf3, 0x0f, 0x1e, 0xfb, // endbr32
0xb9, 0, 0, 0, 0, // mov $reloc_offset, %ecx
0xff, 0xa3, 0, 0, 0, 0, // jmp *foo@GOT(%ebx)
0xcc, // (padding)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
*(ul32 *)(buf + 11) = sym.get_gotplt_addr(ctx) - ctx.got->shdr.sh_addr;
} else {
static const u8 insn[] = {
0xf3, 0x0f, 0x1e, 0xfb, // endbr32
0xb9, 0, 0, 0, 0, // mov $reloc_offset, %ecx
0xff, 0x25, 0, 0, 0, 0, // jmp *foo@GOT
0xcc, // (padding)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
*(ul32 *)(buf + 11) = sym.get_gotplt_addr(ctx);
}
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
if (ctx.arg.pic) {
static const u8 insn[] = {
0xf3, 0x0f, 0x1e, 0xfb, // endbr32
0xff, 0xa3, 0, 0, 0, 0, // jmp *foo@GOT(%ebx)
0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // (padding)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 6) = sym.get_got_addr(ctx) - ctx.got->shdr.sh_addr;
} else {
static const u8 insn[] = {
0xf3, 0x0f, 0x1e, 0xfb, // endbr32
0xff, 0x25, 0, 0, 0, 0, // jmp *foo@GOT
0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // (padding)
};
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 6) = sym.get_got_addr(ctx);
}
}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_386_32:
*(ul32 *)loc = val;
break;
case R_386_PC32:
*(ul32 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
static u32 relax_got32x(u8 *loc) {
// mov imm(%reg1), %reg2 -> lea imm(%reg1), %reg2
if (loc[0] == 0x8b)
return 0x8d00 | loc[1];
return 0;
}
// Relax GD to LE
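//
// A typical GD access sequence and its relaxed LE form look like the
// following (an illustrative sketch; actual register allocation is up
// to the compiler):
//
//   leal sym@tlsgd(,%ebx,1), %eax    # before (R_386_TLS_GD)
//   call __tls_get_addr@plt          # (R_386_PLT32)
//
//   mov %gs:0, %eax                  # after: %eax = thread pointer
//   add $tp_offset, %eax             # add the symbol's TP offset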
static void relax_gd_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
static const u8 insn[] = {
0x65, 0xa1, 0, 0, 0, 0, // mov %gs:0, %eax
0x81, 0xc0, 0, 0, 0, 0, // add $tp_offset, %eax
};
switch (rel.r_type) {
case R_386_PLT32:
case R_386_PC32:
memcpy(loc - 3, insn, sizeof(insn));
*(ul32 *)(loc + 5) = val;
break;
case R_386_GOT32:
case R_386_GOT32X:
memcpy(loc - 2, insn, sizeof(insn));
*(ul32 *)(loc + 6) = val;
break;
default:
unreachable();
}
}
// Relax LD to LE
static void relax_ld_to_le(u8 *loc, ElfRel<E> rel, u64 val) {
switch (rel.r_type) {
case R_386_PLT32:
case R_386_PC32: {
static const u8 insn[] = {
0x65, 0xa1, 0, 0, 0, 0, // mov %gs:0, %eax
0x2d, 0, 0, 0, 0, // sub $tls_size, %eax
};
memcpy(loc - 2, insn, sizeof(insn));
*(ul32 *)(loc + 5) = val;
break;
}
case R_386_GOT32:
case R_386_GOT32X: {
static const u8 insn[] = {
0x65, 0xa1, 0, 0, 0, 0, // mov %gs:0, %eax
0x2d, 0, 0, 0, 0, // sub $tls_size, %eax
0x90, // nop
};
memcpy(loc - 2, insn, sizeof(insn));
*(ul32 *)(loc + 5) = val;
break;
}
default:
unreachable();
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
u64 S = sym.get_addr(ctx);
u64 A = get_addend(*this, rel);
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_386_8:
check(S + A, 0, 1 << 8);
*loc = S + A;
break;
case R_386_16:
check(S + A, 0, 1 << 16);
*(ul16 *)loc = S + A;
break;
case R_386_32:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_386_PC8:
check(S + A - P, -(1 << 7), 1 << 7);
*loc = S + A - P;
break;
case R_386_PC16:
check(S + A - P, -(1 << 15), 1 << 15);
*(ul16 *)loc = S + A - P;
break;
case R_386_PC32:
case R_386_PLT32:
*(ul32 *)loc = S + A - P;
break;
case R_386_GOT32:
*(ul32 *)loc = G + A;
break;
case R_386_GOT32X:
if (sym.has_got(ctx)) {
*(ul32 *)loc = G + A;
} else {
u32 insn = relax_got32x(loc - 2);
assert(insn);
loc[-2] = insn >> 8;
loc[-1] = insn;
*(ul32 *)loc = S + A - GOT;
}
break;
case R_386_GOTOFF:
*(ul32 *)loc = S + A - GOT;
break;
case R_386_GOTPC:
*(ul32 *)loc = GOT + A - P;
break;
case R_386_TLS_GOTIE:
*(ul32 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
break;
case R_386_TLS_LE:
*(ul32 *)loc = S + A - ctx.tp_addr;
break;
case R_386_TLS_IE:
*(ul32 *)loc = sym.get_gottp_addr(ctx) + A;
break;
case R_386_TLS_GD:
if (sym.has_tlsgd(ctx)) {
*(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
} else {
relax_gd_to_le(loc, rels[i + 1], S - ctx.tp_addr);
i++;
}
break;
case R_386_TLS_LDM:
if (ctx.got->has_tlsld(ctx)) {
*(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
} else {
relax_ld_to_le(loc, rels[i + 1], ctx.tp_addr - ctx.tls_begin);
i++;
}
break;
case R_386_TLS_LDO_32:
*(ul32 *)loc = S + A - ctx.dtp_addr;
break;
case R_386_SIZE32:
*(ul32 *)loc = sym.esym().st_size + A;
break;
case R_386_TLS_GOTDESC:
if (sym.has_tlsdesc(ctx)) {
*(ul32 *)loc = sym.get_tlsdesc_addr(ctx) + A - GOT;
} else {
static const u8 insn[] = {
0x8d, 0x05, 0, 0, 0, 0, // lea 0, %eax
};
memcpy(loc - 2, insn, sizeof(insn));
*(ul32 *)loc = S + A - ctx.tp_addr;
}
break;
case R_386_TLS_DESC_CALL:
if (!sym.has_tlsdesc(ctx)) {
// call *(%eax) -> nop
loc[0] = 0x66;
loc[1] = 0x90;
}
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : get_addend(*this, rel);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_386_8:
check(S + A, 0, 1 << 8);
*loc = S + A;
break;
case R_386_16:
check(S + A, 0, 1 << 16);
*(ul16 *)loc = S + A;
break;
case R_386_32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A;
break;
case R_386_PC8:
check(S + A, -(1 << 7), 1 << 7);
*loc = S + A;
break;
case R_386_PC16:
check(S + A, -(1 << 15), 1 << 15);
*(ul16 *)loc = S + A;
break;
case R_386_PC32:
*(ul32 *)loc = S + A;
break;
case R_386_GOTPC:
*(ul32 *)loc = GOT + A;
break;
case R_386_GOTOFF:
*(ul32 *)loc = S + A - GOT;
break;
case R_386_TLS_LDO_32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A - ctx.dtp_addr;
break;
case R_386_SIZE32:
*(ul32 *)loc = sym.esym().st_size + A;
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = (u8 *)(contents.data() + rel.r_offset);
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_386_8:
case R_386_16:
scan_absrel(ctx, sym, rel);
break;
case R_386_32:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_386_PC8:
case R_386_PC16:
case R_386_PC32:
scan_pcrel(ctx, sym, rel);
break;
case R_386_GOT32:
case R_386_GOTPC:
sym.flags |= NEEDS_GOT;
break;
case R_386_GOT32X: {
// We always want to relax GOT32X because static PIE doesn't
// work without it.
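// The relaxation (a sketch; see relax_got32x below) turns a GOT load
// such as `movl foo@GOT(%ebx), %eax` into `leal foo@GOTOFF(%ebx), %eax`,
// so that the address is computed GOT-relative instead of being loaded
// from a GOT entry.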
bool do_relax = !sym.is_imported && sym.is_relative() &&
relax_got32x(loc - 2);
if (!do_relax)
sym.flags |= NEEDS_GOT;
break;
}
case R_386_PLT32:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_386_TLS_GOTIE:
case R_386_TLS_IE:
sym.flags |= NEEDS_GOTTP;
break;
case R_386_TLS_GD:
if (i + 1 == rels.size())
Fatal(ctx) << *this << ": TLS_GD reloc must be followed by PLT or GOT32";
if (u32 ty = rels[i + 1].r_type;
ty != R_386_PLT32 && ty != R_386_PC32 &&
ty != R_386_GOT32 && ty != R_386_GOT32X)
Fatal(ctx) << *this << ": TLS_GD reloc must be followed by PLT or GOT32";
// We always relax if -static because libc.a doesn't contain
// __tls_get_addr().
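// Relaxing rewrites the code sequence so that it computes the
// TP-relative address directly (GD -> LE). That also consumes the
// following PLT32/GOT32 relocation, which is why we skip it with i++.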
if (ctx.arg.is_static ||
(ctx.arg.relax && !ctx.arg.shared && !sym.is_imported))
i++;
else
sym.flags |= NEEDS_TLSGD;
break;
case R_386_TLS_LDM:
if (i + 1 == rels.size())
Fatal(ctx) << *this << ": TLS_LDM reloc must be followed by PLT or GOT32";
if (u32 ty = rels[i + 1].r_type;
ty != R_386_PLT32 && ty != R_386_PC32 &&
ty != R_386_GOT32 && ty != R_386_GOT32X)
Fatal(ctx) << *this << ": TLS_LDM reloc must be followed by PLT or GOT32";
// We always relax if -static because libc.a doesn't contain
// __tls_get_addr().
if (ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared))
i++;
else
ctx.needs_tlsld = true;
break;
case R_386_TLS_GOTDESC:
if (!relax_tlsdesc(ctx, sym))
sym.flags |= NEEDS_TLSDESC;
break;
case R_386_TLS_LE:
check_tlsle(ctx, sym, rel);
break;
case R_386_GOTOFF:
case R_386_TLS_LDO_32:
case R_386_SIZE32:
case R_386_TLS_DESC_CALL:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
} // namespace mold::elf

View file

@@ -1,326 +0,0 @@
// clang-format off
// This file contains code for the Motorola 68000 series microprocessors,
// which is often abbreviated as m68k. Running a Unix-like system on a
// m68k-based machine today is probably a retro-computing hobby activity,
// but the processor was a popular choice to build Unix computers during
// the '80s. Early Sun workstations, for example, used m68k. Macintoshes
// were based on m68k until they switched to PowerPC in 1994 (and later
// to x86 and then to ARM).
//
// From the linker's point of view, it is not hard to support m68k. It's
// just a 32-bit big-endian CISC ISA. Compared to the contemporary i386,
// m68k's psABI is actually simpler because m68k has PC-relative memory
// access instructions and therefore can support position-independent
// code without too much hassle.
//
// https://github.com/rui314/psabi/blob/main/m68k.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = M68K;
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static const u8 insn[] = {
0x2f, 0x00, // move.l %d0, -(%sp)
0x2f, 0x3b, 0x01, 0x70, 0, 0, 0, 0, // move.l (GOTPLT+4, %pc), -(%sp)
0x4e, 0xfb, 0x01, 0x71, 0, 0, 0, 0, // jmp ([GOTPLT+8, %pc])
};
memcpy(buf, insn, sizeof(insn));
*(ub32 *)(buf + 6) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr;
*(ub32 *)(buf + 14) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 4;
}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static const u8 insn[] = {
0x20, 0x3c, 0, 0, 0, 0, // move.l PLT_OFFSET, %d0
0x4e, 0xfb, 0x01, 0x71, 0, 0, 0, 0, // jmp ([GOTPLT_ENTRY, %pc])
};
memcpy(buf, insn, sizeof(insn));
*(ub32 *)(buf + 2) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
*(ub32 *)(buf + 10) = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 8;
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static const u8 insn[] = {
0x4e, 0xfb, 0x01, 0x71, 0, 0, 0, 0, // jmp ([GOT_ENTRY, %pc])
};
memcpy(buf, insn, sizeof(insn));
*(ub32 *)(buf + 4) = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 2;
}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_68K_32:
*(ub32 *)loc = val;
break;
case R_68K_PC32:
*(ub32 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
auto write16 = [&](u64 val) {
check(val, 0, 1 << 16);
*(ub16 *)loc = val;
};
auto write16s = [&](u64 val) {
check(val, -(1 << 15), 1 << 15);
*(ub16 *)loc = val;
};
auto write8 = [&](u64 val) {
check(val, 0, 1 << 8);
*loc = val;
};
auto write8s = [&](u64 val) {
check(val, -(1 << 7), 1 << 7);
*loc = val;
};
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_68K_32:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_68K_16:
write16(S + A);
break;
case R_68K_8:
write8(S + A);
break;
case R_68K_PC32:
case R_68K_PLT32:
*(ub32 *)loc = S + A - P;
break;
case R_68K_PC16:
case R_68K_PLT16:
write16s(S + A - P);
break;
case R_68K_PC8:
case R_68K_PLT8:
write8s(S + A - P);
break;
case R_68K_GOTPCREL32:
*(ub32 *)loc = GOT + A - P;
break;
case R_68K_GOTPCREL16:
write16s(GOT + A - P);
break;
case R_68K_GOTPCREL8:
write8s(GOT + A - P);
break;
case R_68K_GOTOFF32:
*(ub32 *)loc = G + A;
break;
case R_68K_GOTOFF16:
write16(G + A);
break;
case R_68K_GOTOFF8:
write8(G + A);
break;
case R_68K_TLS_GD32:
*(ub32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
break;
case R_68K_TLS_GD16:
write16(sym.get_tlsgd_addr(ctx) + A - GOT);
break;
case R_68K_TLS_GD8:
write8(sym.get_tlsgd_addr(ctx) + A - GOT);
break;
case R_68K_TLS_LDM32:
*(ub32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
break;
case R_68K_TLS_LDM16:
write16(ctx.got->get_tlsld_addr(ctx) + A - GOT);
break;
case R_68K_TLS_LDM8:
write8(ctx.got->get_tlsld_addr(ctx) + A - GOT);
break;
case R_68K_TLS_LDO32:
*(ub32 *)loc = S + A - ctx.dtp_addr;
break;
case R_68K_TLS_LDO16:
write16s(S + A - ctx.dtp_addr);
break;
case R_68K_TLS_LDO8:
write8s(S + A - ctx.dtp_addr);
break;
case R_68K_TLS_IE32:
*(ub32 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
break;
case R_68K_TLS_IE16:
write16(sym.get_gottp_addr(ctx) + A - GOT);
break;
case R_68K_TLS_IE8:
write8(sym.get_gottp_addr(ctx) + A - GOT);
break;
case R_68K_TLS_LE32:
*(ub32 *)loc = S + A - ctx.tp_addr;
break;
case R_68K_TLS_LE16:
write16(S + A - ctx.tp_addr);
break;
case R_68K_TLS_LE8:
write8(S + A - ctx.tp_addr);
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_68K_32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ub32 *)loc = *val;
else
*(ub32 *)loc = S + A;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
Error(ctx) << sym << ": GNU ifunc symbol is not supported on m68k";
switch (rel.r_type) {
case R_68K_32:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_68K_16:
case R_68K_8:
scan_absrel(ctx, sym, rel);
break;
case R_68K_PC32:
case R_68K_PC16:
case R_68K_PC8:
scan_pcrel(ctx, sym, rel);
break;
case R_68K_GOTPCREL32:
case R_68K_GOTPCREL16:
case R_68K_GOTPCREL8:
case R_68K_GOTOFF32:
case R_68K_GOTOFF16:
case R_68K_GOTOFF8:
sym.flags |= NEEDS_GOT;
break;
case R_68K_PLT32:
case R_68K_PLT16:
case R_68K_PLT8:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_68K_TLS_GD32:
case R_68K_TLS_GD16:
case R_68K_TLS_GD8:
sym.flags |= NEEDS_TLSGD;
break;
case R_68K_TLS_LDM32:
case R_68K_TLS_LDM16:
case R_68K_TLS_LDM8:
ctx.needs_tlsld = true;
break;
case R_68K_TLS_IE32:
case R_68K_TLS_IE16:
case R_68K_TLS_IE8:
sym.flags |= NEEDS_GOTTP;
break;
case R_68K_TLS_LE32:
case R_68K_TLS_LE16:
case R_68K_TLS_LE8:
check_tlsle(ctx, sym, rel);
break;
case R_68K_TLS_LDO32:
case R_68K_TLS_LDO16:
case R_68K_TLS_LDO8:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
} // namespace mold::elf

View file

@@ -1,452 +0,0 @@
// clang-format off
// This file implements the PowerPC 32-bit ISA. For 64-bit PowerPC, see
// arch-ppc64v1.cpp and arch-ppc64v2.cpp.
//
// PPC32 is a RISC ISA. It has 32 general-purpose registers (GPRs).
// r0, r11 and r12 are reserved for static linkers, so we can use these
// registers in PLTs and range extension thunks. In addition to that, it
// has a few special registers. Notable ones are LR which holds a return
// address and CTR which we can use to store a branch target address.
//
// The PPC32 psABI may feel unnecessarily complicated at first glance,
// but that complexity mainly stems from the fact that the ISA lacks
// PC-relative load/store instructions. Since machine instructions cannot
// load data relative to their own address, it is not straightforward to
// support position-independent code (PIC) on PPC32.
//
// A position-independent function typically contains the following code
// in the prologue to obtain its own address:
//
// mflr r0 // save the current return address to %r0
// bcl 20, 31, 4 // call the next instruction as if it were a function
//    mflr    r12            // save the return address to %r12
// mtlr r0 // restore the original return address
//
// An object file compiled with -fPIC contains a data section named
// `.got2` to store addresses of locally-defined global variables and
// constants. A PIC function usually computes the address of its own
// .got2+0x8000 and sets it in %r30. This scheme allows the function to
// access global objects
// defined in the same input file with a single %r30-relative load/store
// instruction with a 16-bit offset, given that .got2 is smaller than
// 0x10000 (or 65536) bytes.
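//
// For example, loading such a variable boils down to a single
// %r30-relative instruction of the following shape, where OFFSET is a
// link-time constant in [-0x8000, 0x8000) (an illustrative sketch, not
// actual compiler output):
//
//   lwz r3, OFFSET(r30)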
//
// Since each object file has its own .got2, %r30 refers to different
// places in a merged .got2 for two functions that came from different
// input files. Therefore, %r30 makes sense only within a single function.
//
// Technically, we can reuse a %r30 value in our PLT if we create a PLT
// _for each input file_ (that's what GNU ld seems to be doing), but that
// doesn't seem to be worth the complexity. Our PLT simply doesn't rely
// on a %r30 value.
//
// https://github.com/rui314/psabi/blob/main/ppc32.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = PPC32;
static u64 lo(u64 x) { return x & 0xffff; }
static u64 hi(u64 x) { return x >> 16; }
static u64 ha(u64 x) { return (x + 0x8000) >> 16; }
static u64 high(u64 x) { return (x >> 16) & 0xffff; }
static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; }
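// For example (a quick check with an arbitrary value): materializing
// 0x1234'9abc with an addis/addi pair needs ha()/lo() rather than
// hi()/lo(), because addi sign-extends its 16-bit operand:
//
//   lo(0x1234'9abc) = 0x9abc, which sign-extends to -0x6544
//   hi(0x1234'9abc) = 0x1234; 0x1234'0000 - 0x6544 = 0x1233'9abc (wrong)
//   ha(0x1234'9abc) = 0x1235; 0x1235'0000 - 0x6544 = 0x1234'9abc (right)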
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static const ub32 insn[] = {
// Get the address of this PLT section
0x7c08'02a6, // mflr r0
0x429f'0005, // bcl 20, 31, 4
0x7d88'02a6, // 1: mflr r12
0x7c08'03a6, // mtlr r0
// Compute the runtime address of GOTPLT+12
0x3d8c'0000, // addis r12, r12, (GOTPLT - 1b)@higha
0x398c'0000, // addi r12, r12, (GOTPLT - 1b)@lo
// Compute the PLT entry offset
0x7d6c'5850, // sub r11, r11, r12
0x1d6b'0003, // mulli r11, r11, 3
// Load GOTPLT[2] and branch to GOTPLT[1]
0x800c'fff8, // lwz r0, -8(r12)
0x7c09'03a6, // mtctr r0
0x818c'fffc, // lwz r12, -4(r12)
0x4e80'0420, // bctr
0x6000'0000, // nop
0x6000'0000, // nop
0x6000'0000, // nop
0x6000'0000, // nop
};
static_assert(sizeof(insn) == E::plt_hdr_size);
memcpy(buf, insn, sizeof(insn));
ub32 *loc = (ub32 *)buf;
loc[4] |= higha(ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr + 4);
loc[5] |= lo(ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr + 4);
}
static const ub32 plt_entry[] = {
// Get the address of this PLT entry
0x7c08'02a6, // mflr r0
0x429f'0005, // bcl 20, 31, 4
0x7d88'02a6, // mflr r12
0x7c08'03a6, // mtlr r0
// Load an address from the GOT/GOTPLT entry and jump to that address
0x3d6c'0000, // addis r11, r12, OFFSET@higha
0x396b'0000, // addi r11, r11, OFFSET@lo
0x818b'0000, // lwz r12, 0(r11)
0x7d89'03a6, // mtctr r12
0x4e80'0420, // bctr
};
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static_assert(E::plt_size == sizeof(plt_entry));
memcpy(buf, plt_entry, sizeof(plt_entry));
ub32 *loc = (ub32 *)buf;
i64 offset = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 8;
loc[4] |= higha(offset);
loc[5] |= lo(offset);
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static_assert(E::pltgot_size == sizeof(plt_entry));
memcpy(buf, plt_entry, sizeof(plt_entry));
ub32 *loc = (ub32 *)buf;
i64 offset = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 8;
loc[4] |= higha(offset);
loc[5] |= lo(offset);
}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_PPC_ADDR32:
*(ub32 *)loc = val;
break;
case R_PPC_REL32:
*(ub32 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
u64 GOT2 = file.ppc32_got2 ? file.ppc32_got2->get_addr() : 0;
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_PPC_ADDR32:
case R_PPC_UADDR32:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_PPC_ADDR14:
*(ub32 *)loc |= bits(S + A, 15, 2) << 2;
break;
case R_PPC_ADDR16:
case R_PPC_UADDR16:
case R_PPC_ADDR16_LO:
*(ub16 *)loc = lo(S + A);
break;
case R_PPC_ADDR16_HI:
*(ub16 *)loc = hi(S + A);
break;
case R_PPC_ADDR16_HA:
*(ub16 *)loc = ha(S + A);
break;
case R_PPC_ADDR24:
*(ub32 *)loc |= bits(S + A, 25, 2) << 2;
break;
case R_PPC_ADDR30:
*(ub32 *)loc |= bits(S + A, 31, 2) << 2;
break;
case R_PPC_PLT16_LO:
*(ub16 *)loc = lo(G + GOT - A - GOT2);
break;
case R_PPC_PLT16_HI:
*(ub16 *)loc = hi(G + GOT - A - GOT2);
break;
case R_PPC_PLT16_HA:
*(ub16 *)loc = ha(G + GOT - A - GOT2);
break;
case R_PPC_PLT32:
*(ub32 *)loc = G + GOT - A - GOT2;
break;
case R_PPC_REL14:
*(ub32 *)loc |= bits(S + A - P, 15, 2) << 2;
break;
case R_PPC_REL16:
case R_PPC_REL16_LO:
*(ub16 *)loc = lo(S + A - P);
break;
case R_PPC_REL16_HI:
*(ub16 *)loc = hi(S + A - P);
break;
case R_PPC_REL16_HA:
*(ub16 *)loc = ha(S + A - P);
break;
case R_PPC_REL24:
case R_PPC_LOCAL24PC: {
i64 val = S + A - P;
if (sign_extend(val, 25) != val)
val = get_thunk_addr(i) - P;
*(ub32 *)loc |= bits(val, 25, 2) << 2;
break;
}
case R_PPC_PLTREL24: {
i64 val = S - P;
if (sym.has_plt(ctx) || sign_extend(val, 25) != val)
val = get_thunk_addr(i) - P;
*(ub32 *)loc |= bits(val, 25, 2) << 2;
break;
}
case R_PPC_REL32:
case R_PPC_PLTREL32:
*(ub32 *)loc = S + A - P;
break;
case R_PPC_GOT16:
case R_PPC_GOT16_LO:
*(ub16 *)loc = lo(G + A);
break;
case R_PPC_GOT16_HI:
*(ub16 *)loc = hi(G + A);
break;
case R_PPC_GOT16_HA:
*(ub16 *)loc = ha(G + A);
break;
case R_PPC_TPREL16_LO:
*(ub16 *)loc = lo(S + A - ctx.tp_addr);
break;
case R_PPC_TPREL16_HI:
*(ub16 *)loc = hi(S + A - ctx.tp_addr);
break;
case R_PPC_TPREL16_HA:
*(ub16 *)loc = ha(S + A - ctx.tp_addr);
break;
case R_PPC_DTPREL16_LO:
*(ub16 *)loc = lo(S + A - ctx.dtp_addr);
break;
case R_PPC_DTPREL16_HI:
*(ub16 *)loc = hi(S + A - ctx.dtp_addr);
break;
case R_PPC_DTPREL16_HA:
*(ub16 *)loc = ha(S + A - ctx.dtp_addr);
break;
case R_PPC_GOT_TLSGD16:
*(ub16 *)loc = sym.get_tlsgd_addr(ctx) - GOT;
break;
case R_PPC_GOT_TLSLD16:
*(ub16 *)loc = ctx.got->get_tlsld_addr(ctx) - GOT;
break;
case R_PPC_GOT_TPREL16:
*(ub16 *)loc = sym.get_gottp_addr(ctx) - GOT;
break;
case R_PPC_TLS:
case R_PPC_TLSGD:
case R_PPC_TLSLD:
case R_PPC_PLTSEQ:
case R_PPC_PLTCALL:
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_PPC_ADDR32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ub32 *)loc = *val;
else
*(ub32 *)loc = S + A;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_PPC_ADDR32:
case R_PPC_UADDR32:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_PPC_ADDR14:
case R_PPC_ADDR16:
case R_PPC_UADDR16:
case R_PPC_ADDR16_LO:
case R_PPC_ADDR16_HI:
case R_PPC_ADDR16_HA:
case R_PPC_ADDR24:
case R_PPC_ADDR30:
scan_absrel(ctx, sym, rel);
break;
case R_PPC_REL14:
case R_PPC_REL16:
case R_PPC_REL16_LO:
case R_PPC_REL16_HI:
case R_PPC_REL16_HA:
case R_PPC_REL32:
scan_pcrel(ctx, sym, rel);
break;
case R_PPC_GOT16:
case R_PPC_GOT16_LO:
case R_PPC_GOT16_HI:
case R_PPC_GOT16_HA:
case R_PPC_PLT16_LO:
case R_PPC_PLT16_HI:
case R_PPC_PLT16_HA:
case R_PPC_PLT32:
sym.flags |= NEEDS_GOT;
break;
case R_PPC_REL24:
case R_PPC_PLTREL24:
case R_PPC_PLTREL32:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_PPC_GOT_TLSGD16:
sym.flags |= NEEDS_TLSGD;
break;
case R_PPC_GOT_TLSLD16:
ctx.needs_tlsld = true;
break;
case R_PPC_GOT_TPREL16:
sym.flags |= NEEDS_GOTTP;
break;
case R_PPC_TPREL16_LO:
case R_PPC_TPREL16_HI:
case R_PPC_TPREL16_HA:
check_tlsle(ctx, sym, rel);
break;
case R_PPC_LOCAL24PC:
case R_PPC_TLS:
case R_PPC_TLSGD:
case R_PPC_TLSLD:
case R_PPC_DTPREL16_LO:
case R_PPC_DTPREL16_HI:
case R_PPC_DTPREL16_HA:
case R_PPC_PLTSEQ:
case R_PPC_PLTCALL:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
static const ub32 local_thunk[] = {
// Get this thunk's address
0x7c08'02a6, // mflr r0
0x429f'0005, // bcl 20, 31, 4
0x7d88'02a6, // mflr r12
0x7c08'03a6, // mtlr r0
// Materialize the destination's address in %r11 and jump to that address
0x3d6c'0000, // addis r11, r12, OFFSET@higha
0x396b'0000, // addi r11, r11, OFFSET@lo
0x7d69'03a6, // mtctr r11
0x4e80'0420, // bctr
0x6000'0000, // nop
};
static_assert(E::thunk_size == sizeof(plt_entry));
static_assert(E::thunk_size == sizeof(local_thunk));
for (i64 i = 0; i < symbols.size(); i++) {
ub32 *loc = (ub32 *)(buf + i * E::thunk_size);
Symbol<E> &sym = *symbols[i];
if (sym.has_plt(ctx)) {
memcpy(loc, plt_entry, sizeof(plt_entry));
u64 got = sym.has_got(ctx) ? sym.get_got_addr(ctx) : sym.get_gotplt_addr(ctx);
i64 val = got - get_addr(i) - 8;
loc[4] |= higha(val);
loc[5] |= lo(val);
} else {
memcpy(loc, local_thunk, sizeof(local_thunk));
i64 val = sym.get_addr(ctx) - get_addr(i) - 8;
loc[4] |= higha(val);
loc[5] |= lo(val);
}
}
}
} // namespace mold::elf

View file

@@ -1,687 +0,0 @@
// clang-format off
// This file contains code for the 64-bit PowerPC ELFv1 ABI that is
// commonly used for big-endian PPC systems. Modern PPC systems that use
// the processor in the little-endian mode use the ELFv2 ABI instead. For
// ELFv2, see arch-ppc64v2.cc.
//
// Even though they are similar, ELFv1 differs from ELFv2 in more than
// endianness. The most notable difference is that, in ELFv1, a function
// pointer doesn't directly refer to the entry point of a function but
// instead refers to a data structure called a "function descriptor".
//
// The function descriptor is essentially a pair of a function entry point
// address and a value that should be set to %r2 before calling that
// function. There is also a third member for "the environment pointer for
// languages such as Pascal and PL/1" according to the psABI, but it looks
// like no one actually uses it. In total, the function descriptor is 24
// bytes long. Here is why we need it.
//
// PPC generally lacks PC-relative data access instructions. Position-
// independent code sets %r2 to GOT + 0x8000 and accesses global
// variables relative to %r2.
//
// Each ELF file has its own GOT. If a function calls another function in
// the same ELF file, it doesn't have to reset %r2. However, if the
// callee is in another file (e.g. another .so), it has to set a new
// value in %r2 so that
// the register contains the callee's GOT + 0x8000.
//
// In this way, you can't call a function just by knowing the function's
// entry point address. You also need to know a proper %r2 value for the
// function. This is why a function pointer refers to a tuple of an
// address and a %r2 value.
//
// If a function call is made through PLT, PLT takes care of restoring %r2.
// Therefore, the caller has to restore %r2 only for function calls
// through function pointers.
//
// .opd (short for "official procedure descriptors") contains function
// descriptors.
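//
// In C terms, each .opd entry is laid out as follows (a sketch; the
// struct and field names are made up for illustration):
//
//   struct FunctionDescriptor {
//     u64 entry_addr; // address of the function's first instruction
//     u64 toc;        // the value the function expects in %r2
//     u64 env;        // environment pointer; practically unused
//   };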
//
// You can think of OPD this way: even on other targets, a function can
// have a few different addresses for different purposes. It may not only
// have an entry point address but may also have PLT and/or GOT addresses.
// In PPC64 ELFv1, it may have an OPD address in addition to these. The OPD
// address is used for relocations that refer to the address of a function
// as a function pointer.
//
// https://github.com/rui314/psabi/blob/main/ppc64v1.pdf
#include "third_party/mold/elf/mold.h"
#include "third_party/libcxx/algorithm"
// MISSING #include <tbb/parallel_for_each.h>
namespace mold::elf {
using E = PPC64V1;
static u64 lo(u64 x) { return x & 0xffff; }
static u64 hi(u64 x) { return x >> 16; }
static u64 ha(u64 x) { return (x + 0x8000) >> 16; }
static u64 high(u64 x) { return (x >> 16) & 0xffff; }
static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; }
// .plt is used only for lazy symbol resolution on PPC64. All PLT
// calls are made via range extension thunks even if they are within
// reach. Thunks read addresses from .got.plt and jump there.
// Therefore, once PLT symbols are resolved and final addresses are
// written to .got.plt, thunks just skip .plt and directly jump to the
// resolved addresses.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static const ub32 insn[] = {
0x7d88'02a6, // mflr r12
0x429f'0005, // bcl 20, 31, 4 // obtain PC
0x7d68'02a6, // mflr r11
0xe84b'0024, // ld r2,36(r11)
0x7d88'03a6, // mtlr r12
0x7d62'5a14, // add r11,r2,r11
0xe98b'0000, // ld r12,0(r11)
0xe84b'0008, // ld r2,8(r11)
0x7d89'03a6, // mtctr r12
0xe96b'0010, // ld r11,16(r11)
0x4e80'0420, // bctr
// .quad .got.plt - .plt - 8
0x0000'0000,
0x0000'0000,
};
static_assert(sizeof(insn) == E::plt_hdr_size);
memcpy(buf, insn, sizeof(insn));
*(ub64 *)(buf + 44) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 8;
}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
ub32 *loc = (ub32 *)buf;
i64 idx = sym.get_plt_idx(ctx);
// The PPC64 ELFv1 ABI requires PLT entries to vary in size depending
// on their indices. Unlike other targets, .got.plt is filled not by us
// but by the loader, so we don't have control over where the initial
// call to a PLT entry jumps to. We therefore need to strictly follow
// the PLT section layout that the loader expects.
if (idx < 0x8000) {
static const ub32 insn[] = {
0x3800'0000, // li r0, PLT_INDEX
0x4b00'0000, // b plt0
};
memcpy(loc, insn, sizeof(insn));
loc[0] |= idx;
loc[1] |= (ctx.plt->shdr.sh_addr - sym.get_plt_addr(ctx) - 4) & 0x00ff'ffff;
} else {
static const ub32 insn[] = {
0x3c00'0000, // lis r0, PLT_INDEX@high
0x6000'0000, // ori r0, r0, PLT_INDEX@lo
0x4b00'0000, // b plt0
};
memcpy(loc, insn, sizeof(insn));
loc[0] |= high(idx);
loc[1] |= lo(idx);
loc[2] |= (ctx.plt->shdr.sh_addr - sym.get_plt_addr(ctx) - 8) & 0x00ff'ffff;
}
}
// .plt.got is not necessary on PPC64 because range extension thunks
// directly read GOT entries and jump there.
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_PPC64_ADDR64:
*(ub64 *)loc = val;
break;
case R_PPC64_REL32:
*(ub32 *)loc = val - this->shdr.sh_addr - offset;
break;
case R_PPC64_REL64:
*(ub64 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
u64 TOC = ctx.extra.TOC->value;
switch (rel.r_type) {
case R_PPC64_ADDR64:
apply_toc_rel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_PPC64_TOC:
apply_toc_rel(ctx, *ctx.extra.TOC, rel, loc, TOC, A, P, dynrel);
break;
case R_PPC64_TOC16_HA:
*(ub16 *)loc = ha(S + A - TOC);
break;
case R_PPC64_TOC16_LO:
*(ub16 *)loc = lo(S + A - TOC);
break;
case R_PPC64_TOC16_DS:
check(S + A - TOC, -(1 << 15), 1 << 15);
*(ub16 *)loc |= (S + A - TOC) & 0xfffc;
break;
case R_PPC64_TOC16_LO_DS:
*(ub16 *)loc |= (S + A - TOC) & 0xfffc;
break;
case R_PPC64_REL24: {
i64 val = sym.get_addr(ctx, NO_OPD) + A - P;
if (sym.has_plt(ctx) || sign_extend(val, 25) != val)
val = get_thunk_addr(i) + A - P;
check(val, -(1 << 25), 1 << 25);
*(ub32 *)loc |= bits(val, 25, 2) << 2;
// If a callee is an external function, PLT saves %r2 to the
// caller's r2 save slot. We need to restore it after function
// return. To do so, there's usually a NOP as a placeholder
// after a BL. 0x6000'0000 is a NOP.
if (sym.has_plt(ctx) && *(ub32 *)(loc + 4) == 0x6000'0000)
*(ub32 *)(loc + 4) = 0xe841'0028; // ld r2, 40(r1)
break;
}
case R_PPC64_REL32:
*(ub32 *)loc = S + A - P;
break;
case R_PPC64_REL64:
*(ub64 *)loc = S + A - P;
break;
case R_PPC64_REL16_HA:
*(ub16 *)loc = ha(S + A - P);
break;
case R_PPC64_REL16_LO:
*(ub16 *)loc = lo(S + A - P);
break;
case R_PPC64_PLT16_HA:
*(ub16 *)loc = ha(G + GOT - TOC);
break;
case R_PPC64_PLT16_HI:
*(ub16 *)loc = hi(G + GOT - TOC);
break;
case R_PPC64_PLT16_LO:
*(ub16 *)loc = lo(G + GOT - TOC);
break;
case R_PPC64_PLT16_LO_DS:
*(ub16 *)loc |= (G + GOT - TOC) & 0xfffc;
break;
case R_PPC64_GOT_TPREL16_HA:
*(ub16 *)loc = ha(sym.get_gottp_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSGD16_HA:
*(ub16 *)loc = ha(sym.get_tlsgd_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSGD16_LO:
*(ub16 *)loc = lo(sym.get_tlsgd_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSLD16_HA:
*(ub16 *)loc = ha(ctx.got->get_tlsld_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSLD16_LO:
*(ub16 *)loc = lo(ctx.got->get_tlsld_addr(ctx) - TOC);
break;
case R_PPC64_DTPREL16_HA:
*(ub16 *)loc = ha(S + A - ctx.dtp_addr);
break;
case R_PPC64_DTPREL16_LO:
*(ub16 *)loc = lo(S + A - ctx.dtp_addr);
break;
case R_PPC64_TPREL16_HA:
*(ub16 *)loc = ha(S + A - ctx.tp_addr);
break;
case R_PPC64_TPREL16_LO:
*(ub16 *)loc = lo(S + A - ctx.tp_addr);
break;
case R_PPC64_GOT_TPREL16_LO_DS:
*(ub16 *)loc |= (sym.get_gottp_addr(ctx) - TOC) & 0xfffc;
break;
case R_PPC64_PLTSEQ:
case R_PPC64_PLTCALL:
case R_PPC64_TLS:
case R_PPC64_TLSGD:
case R_PPC64_TLSLD:
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_PPC64_ADDR64:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ub64 *)loc = *val;
else
*(ub64 *)loc = S + A;
break;
case R_PPC64_ADDR32: {
i64 val = S + A;
check(val, 0, 1LL << 32);
*(ub32 *)loc = val;
break;
}
case R_PPC64_DTPREL64:
*(ub64 *)loc = S + A - ctx.dtp_addr;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT | NEEDS_PPC_OPD;
// Any relocation except R_PPC64_REL24 is considered an
// address-taking relocation.
if (rel.r_type != R_PPC64_REL24 && sym.get_type() == STT_FUNC)
sym.flags |= NEEDS_PPC_OPD;
switch (rel.r_type) {
case R_PPC64_ADDR64:
case R_PPC64_TOC:
scan_toc_rel(ctx, sym, rel);
break;
case R_PPC64_GOT_TPREL16_HA:
sym.flags |= NEEDS_GOTTP;
break;
case R_PPC64_REL24:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_PPC64_PLT16_HA:
sym.flags |= NEEDS_GOT;
break;
case R_PPC64_GOT_TLSGD16_HA:
sym.flags |= NEEDS_TLSGD;
break;
case R_PPC64_GOT_TLSLD16_HA:
ctx.needs_tlsld = true;
break;
case R_PPC64_TPREL16_HA:
case R_PPC64_TPREL16_LO:
check_tlsle(ctx, sym, rel);
break;
case R_PPC64_REL32:
case R_PPC64_REL64:
case R_PPC64_TOC16_HA:
case R_PPC64_TOC16_LO:
case R_PPC64_TOC16_LO_DS:
case R_PPC64_TOC16_DS:
case R_PPC64_REL16_HA:
case R_PPC64_REL16_LO:
case R_PPC64_PLT16_HI:
case R_PPC64_PLT16_LO:
case R_PPC64_PLT16_LO_DS:
case R_PPC64_PLTSEQ:
case R_PPC64_PLTCALL:
case R_PPC64_GOT_TPREL16_LO_DS:
case R_PPC64_GOT_TLSGD16_LO:
case R_PPC64_GOT_TLSLD16_LO:
case R_PPC64_TLS:
case R_PPC64_TLSGD:
case R_PPC64_TLSLD:
case R_PPC64_DTPREL16_HA:
case R_PPC64_DTPREL16_LO:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
// If the destination is .plt.got, we save the current r2, read an
// address of a function descriptor from .got, restore %r2 and jump
// to the function.
static const ub32 pltgot_thunk[] = {
// Store the caller's %r2
0xf841'0028, // std %r2, 40(%r1)
// Load an address of a function descriptor
0x3d82'0000, // addis %r12, %r2, foo@got@toc@ha
0xe98c'0000, // ld %r12, foo@got@toc@lo(%r12)
// Restore the callee's %r2
0xe84c'0008, // ld %r2, 8(%r12)
// Jump to the function
0xe98c'0000, // ld %r12, 0(%r12)
0x7d89'03a6, // mtctr %r12
0x4e80'0420, // bctr
};
// If the destination is .plt, read a function descriptor from .got.plt.
static const ub32 plt_thunk[] = {
// Store the caller's %r2
0xf841'0028, // std %r2, 40(%r1)
// Materialize an address of a function descriptor
0x3d82'0000, // addis %r12, %r2, foo@gotplt@toc@ha
0x398c'0000, // addi %r12, %r12, foo@gotplt@toc@lo
// Restore the callee's %r2
0xe84c'0008, // ld %r2, 8(%r12)
// Jump to the function
0xe98c'0000, // ld %r12, 0(%r12)
0x7d89'03a6, // mtctr %r12
0x4e80'0420, // bctr
};
// If the destination is a non-imported function, we directly jump
// to the function entry address.
static const ub32 local_thunk[] = {
0x3d82'0000, // addis r12, r2, foo@toc@ha
0x398c'0000, // addi r12, r12, foo@toc@lo
0x7d89'03a6, // mtctr r12
0x4e80'0420, // bctr
0x6000'0000, // nop
0x6000'0000, // nop
0x6000'0000, // nop
};
static_assert(E::thunk_size == sizeof(pltgot_thunk));
static_assert(E::thunk_size == sizeof(plt_thunk));
static_assert(E::thunk_size == sizeof(local_thunk));
for (i64 i = 0; i < symbols.size(); i++) {
Symbol<E> &sym = *symbols[i];
ub32 *loc = (ub32 *)(buf + i * E::thunk_size);
if (sym.has_got(ctx)) {
memcpy(loc, pltgot_thunk, sizeof(pltgot_thunk));
i64 val = sym.get_got_addr(ctx) - ctx.extra.TOC->value;
loc[1] |= higha(val);
loc[2] |= lo(val);
} else if(sym.has_plt(ctx)) {
memcpy(loc, plt_thunk, sizeof(plt_thunk));
i64 val = sym.get_gotplt_addr(ctx) - ctx.extra.TOC->value;
loc[1] |= higha(val);
loc[2] |= lo(val);
} else {
memcpy(loc, local_thunk, sizeof(local_thunk));
i64 val = sym.get_addr(ctx, NO_OPD) - ctx.extra.TOC->value;
loc[0] |= higha(val);
loc[1] |= lo(val);
}
}
}
static InputSection<E> *get_opd_section(ObjectFile<E> &file) {
for (std::unique_ptr<InputSection<E>> &isec : file.sections)
if (isec && isec->name() == ".opd")
return isec.get();
return nullptr;
}
static ElfRel<E> *
get_relocation_at(Context<E> &ctx, InputSection<E> &isec, i64 offset) {
std::span<ElfRel<E>> rels = isec.get_rels(ctx);
auto it = std::lower_bound(rels.begin(), rels.end(), offset,
[](const ElfRel<E> &r, i64 offset) {
return r.r_offset < offset;
});
if (it == rels.end())
return nullptr;
if (it->r_offset != offset)
return nullptr;
return &*it;
}
struct OpdSymbol {
bool operator<(const OpdSymbol &x) const { return r_offset < x.r_offset; }
u64 r_offset = 0;
Symbol<E> *sym = nullptr;
};
static Symbol<E> *
get_opd_sym_at(Context<E> &ctx, std::span<OpdSymbol> syms, u64 offset) {
auto it = std::lower_bound(syms.begin(), syms.end(), OpdSymbol{offset});
if (it == syms.end())
return nullptr;
if (it->r_offset != offset)
return nullptr;
return it->sym;
}
// The compiler creates an .opd entry for each function symbol. The intention
// is to make it possible to create an output .opd section just by linking
// input .opd sections in the same manner as we do to other normal input
// sections.
//
// However, in reality, .opd isn't a normal input section. It needs many
// special treatments as follows:
//
// 1. A function symbol refers not to .text but to .opd. Its address
// works fine for address-taking relocations such as R_PPC64_ADDR64.
// However, R_PPC64_REL24 (which is used for branch instruction) needs
// a function's real address instead of the function's .opd address.
// We need to read .opd contents to find out a function entry point
// address to apply R_PPC64_REL24.
//
// 2. Output .opd entries are needed only for functions whose addresses
// are taken. Just copying input .opd sections to an output would
//    produce lots of dead .opd entries.
//
// 3. In this design, all function symbols refer to an .opd section, and
// that doesn't work well with graph traversal optimizations such as
//    garbage collection or identical comdat folding. For example, the
//    garbage collector would mark .opd alive, which in turn marks all
//    functions referenced by .opd as alive, effectively keeping all
//    functions alive.
//
// The problem is that the compiler creates a half-baked .opd section, and
// the linker has to figure out what all these .opd entries and
// relocations are trying to achieve. It's as if the compiler emitted a
// half-baked .plt section in an object file and the linker had to deal
// with it. That's not a good design.
//
// So, in this function, we undo what the compiler did to .opd. We remove
// function symbols from .opd and reattach them to their function entry
// points. We also rewrite relocations that directly refer to an input
// .opd section so that they refer to function symbols instead. We then
// mark input .opd sections as dead.
//
// After this function, we mark symbols with the NEEDS_PPC_OPD flag if the
// symbol needs an .opd entry. We then create an output .opd just like we
// do for .plt or .got.
void ppc64v1_rewrite_opd(Context<E> &ctx) {
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
InputSection<E> *opd = get_opd_section(*file);
if (!opd)
return;
opd->is_alive = false;
// Move symbols from .opd to .text.
std::vector<OpdSymbol> opd_syms;
for (Symbol<E> *sym : file->symbols) {
if (sym->file != file || sym->get_input_section() != opd)
continue;
if (u32 ty = sym->get_type(); ty != STT_FUNC && ty != STT_GNU_IFUNC)
continue;
ElfRel<E> *rel = get_relocation_at(ctx, *opd, sym->value);
if (!rel)
Fatal(ctx) << *file << ": cannot find a relocation in .opd for "
<< *sym << " at offset 0x" << std::hex << (u64)sym->value;
Symbol<E> *sym2 = file->symbols[rel->r_sym];
if (sym2->get_type() != STT_SECTION)
Fatal(ctx) << *file << ": bad relocation in .opd referring " << *sym2;
opd_syms.push_back({sym->value, sym});
sym->set_input_section(sym2->get_input_section());
sym->value = rel->r_addend;
}
// Sort symbols so that get_opd_sym_at() can do binary search.
sort(opd_syms);
// Rewrite relocations that refer to .opd so that they directly refer
// to function symbols.
for (std::unique_ptr<InputSection<E>> &isec : file->sections) {
if (!isec || !isec->is_alive || isec.get() == opd)
continue;
for (ElfRel<E> &r : isec->get_rels(ctx)) {
Symbol<E> &sym = *file->symbols[r.r_sym];
if (sym.get_input_section() != opd)
continue;
Symbol<E> *real_sym = get_opd_sym_at(ctx, opd_syms, r.r_addend);
if (!real_sym)
Fatal(ctx) << *isec << ": cannot find a symbol in .opd for " << r
<< " at offset 0x" << std::hex << (u64)r.r_addend;
r.r_sym = real_sym->sym_idx;
r.r_addend = 0;
}
}
});
}
// When a function is exported, the dynamic symbol for the function should
// refer to the function's .opd entry. This function marks such symbols
// with NEEDS_PPC_OPD.
void ppc64v1_scan_symbols(Context<E> &ctx) {
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (Symbol<E> *sym : file->symbols)
if (sym->file == file && sym->is_exported)
if (u32 ty = sym->get_type(); ty == STT_FUNC || ty == STT_GNU_IFUNC)
sym->flags |= NEEDS_PPC_OPD;
});
// Functions referenced by the ELF header also have to have .opd entries.
auto mark = [&](std::string_view name) {
if (!name.empty())
if (Symbol<E> &sym = *get_symbol(ctx, name); !sym.is_imported)
sym.flags |= NEEDS_PPC_OPD;
};
mark(ctx.arg.entry);
mark(ctx.arg.init);
mark(ctx.arg.fini);
}
void PPC64OpdSection::add_symbol(Context<E> &ctx, Symbol<E> *sym) {
sym->set_opd_idx(ctx, symbols.size());
symbols.push_back(sym);
this->shdr.sh_size += ENTRY_SIZE;
}
i64 PPC64OpdSection::get_reldyn_size(Context<E> &ctx) const {
if (ctx.arg.pic)
return symbols.size() * 2;
return 0;
}
void PPC64OpdSection::copy_buf(Context<E> &ctx) {
ub64 *buf = (ub64 *)(ctx.buf + this->shdr.sh_offset);
ElfRel<E> *rel = nullptr;
if (ctx.arg.pic)
rel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset + reldyn_offset);
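// Each .opd entry consists of three 64-bit words: the function's entry
// point, the TOC base, and an (unused) environment pointer.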
for (Symbol<E> *sym : symbols) {
u64 addr = sym->get_addr(ctx, NO_PLT | NO_OPD);
*buf++ = addr;
*buf++ = ctx.extra.TOC->value;
*buf++ = 0;
if (ctx.arg.pic) {
u64 loc = sym->get_opd_addr(ctx);
*rel++ = ElfRel<E>(loc, E::R_RELATIVE, 0, addr);
*rel++ = ElfRel<E>(loc + 8, E::R_RELATIVE, 0, ctx.extra.TOC->value);
}
}
}
} // namespace mold::elf

View file

@@ -1,555 +0,0 @@
// clang-format off
// This file implements the PowerPC ELFv2 ABI which was standardized in
// 2014. Modern little-endian PowerPC systems are based on this ABI.
// The ABI is often referred to as "ppc64le". This shouldn't be confused
// with "ppc64" which refers to the original, big-endian PowerPC systems.
//
// PPC64 is a bit tricky to support because PC-relative load/store
// instructions hadn't been available until Power10, which debuted in 2021.
// Prior to Power10, it wasn't trivial for position-independent code (PIC)
// to load a value from, for example, .got, as we can't do that with [PC +
// the offset to the .got entry].
//
// In the following, I'll explain how PIC is supported on pre-Power10
// systems first and then explain what has changed with Power10.
//
//
// Position-independent code on Power9 or earlier:
//
// We can get the program counter on older PPC64 systems with the
// following four instructions
//
// mflr r1 // save the current link register to r1
// bl .+4 // branch to the next instruction as if it were a function
// mflr r0 // copy the return address to r0
// mtlr r1 // restore the original link register value
//
// , but it's too expensive to do this for each load/store.
//
// As a workaround, most functions are compiled in such a way that r2 is
// assumed to always contain the address of .got + 0x8000. With this, we
// can for example load the first entry of .got with a single instruction
// `ld r0, -0x8000(r2)`. r2 is called the TOC pointer.
//
// There's only one .got for each ELF module. Therefore, if a callee is in
// the same ELF module, r2 doesn't have to be recomputed. Most function
// calls are usually within the same ELF module, so this mechanism is
// efficient.
//
// A function compiled for pre-Power10 usually has two entry points,
// global and local. The global entry point usually precedes the local
// entry point by 8 bytes. In between are the following instructions:
//
// addis r2, r12, .TOC.@ha
//   addi  r2, r2, .TOC.@lo + 4
//
// The global entry point assumes that its own address is in r12,
// and it computes its own TOC pointer from r12. It's easy to do so for
// the callee because the offset between its .got + 0x8000 and the
// function is known at link-time. The above code sequence then falls
// through to the local entry point that assumes r2 is .got + 0x8000.
//
// So, if a callee's TOC pointer is different from the current one
// (e.g. calling a function in another .so), we first load the callee's
// address to r12 (e.g. from .got.plt with a r2-relative load) and branch
// to that address. Then the callee computes its own TOC pointer using
// r12.
//
//
// Position-independent code on Power10:
//
// Power10 added 8-bytes-long instructions to the ISA. Some of them are
// PC-relative load/store instructions that take 34 bits offsets.
// Functions compiled with `-mcpu=power10` use these instructions for PIC.
// r2 does not have a special meaning in such functions.
//
// When a function compiled for Power10 calls a function that uses the
// TOC pointer, we need to compute a correct TOC value and set it in r2
// before transferring the control to the callee. Thunks are responsible
// for doing it.
//
// `_NOTOC` relocations such as `R_PPC64_REL24_NOTOC` indicate that the
// callee does not use TOC (i.e. compiled with `-mcpu=power10`). If a
// function using TOC is referenced via a `_NOTOC` relocation, that call
// is made through a range extension thunk.
//
//
// Note on section names: the PPC64 psABI uses a weird naming convention
// which calls .got.plt .plt. We ignored that part because it's just
// confusing. Since the runtime only cares about segments, we should be
// able to name sections whatever we want.
//
// https://github.com/rui314/psabi/blob/main/ppc64v2.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = PPC64V2;
static u64 lo(u64 x) { return x & 0xffff; }
static u64 hi(u64 x) { return x >> 16; }
static u64 ha(u64 x) { return (x + 0x8000) >> 16; }
static u64 high(u64 x) { return (x >> 16) & 0xffff; }
static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; }
static u64 prefix34(u64 x) {
return bits(x, 33, 16) | (bits(x, 15, 0) << 32);
}
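// prefix34() scatters a 34-bit displacement into the immediate fields of
// an 8-byte Power10 prefixed instruction: the upper 18 bits go into the
// prefix word and the lower 16 bits into the suffix word. Since both
// words are written with a single little-endian 64-bit store, the
// suffix's bits land in the upper half of the value. For example,
// prefix34(0x2'1234'5678) == 0x5678'0002'1234.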
// .plt is used only for lazy symbol resolution on PPC64. All PLT
// calls are made via range extension thunks even if they are within
// reach. Thunks read addresses from .got.plt and jump there.
// Therefore, once PLT symbols are resolved and final addresses are
// written to .got.plt, thunks just skip .plt and directly jump to the
// resolved addresses.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static const ul32 insn[] = {
// Get PC
0x7c08'02a6, // mflr r0
0x429f'0005, // bcl 20, 31, 4 // obtain PC
0x7d68'02a6, // mflr r11
0x7c08'03a6, // mtlr r0
// Compute the PLT entry index
0xe80b'002c, // ld r0, 44(r11)
0x7d8b'6050, // subf r12, r11, r12
0x7d60'5a14, // add r11, r0, r11
0x380c'ffcc, // addi r0, r12, -52
0x7800'f082, // rldicl r0, r0, 62, 2
// Load .got.plt[0] and .got.plt[1] and branch to .got.plt[0]
0xe98b'0000, // ld r12, 0(r11)
0x7d89'03a6, // mtctr r12
0xe96b'0008, // ld r11, 8(r11)
0x4e80'0420, // bctr
// .quad .got.plt - .plt - 8
0x0000'0000,
0x0000'0000,
};
memcpy(buf, insn, sizeof(insn));
*(ul64 *)(buf + 52) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 8;
}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
// When the control is transferred to a PLT entry, the PLT entry's
// address is already set to %r12 by the caller.
i64 offset = ctx.plt->shdr.sh_addr - sym.get_plt_addr(ctx);
*(ul32 *)buf = 0x4b00'0000 | (offset & 0x00ff'ffff); // b plt0
}
// .plt.got is not necessary on PPC64 because range extension thunks
// directly read GOT entries and jump there.
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_PPC64_ADDR64:
*(ul64 *)loc = val;
break;
case R_PPC64_REL32:
*(ul32 *)loc = val - this->shdr.sh_addr - offset;
break;
case R_PPC64_REL64:
*(ul64 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
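// A symbol's ppc_local_entry field (part of st_other) encodes the local
// entry offset as a power of two: 0 and 1 mean the local entry point
// coincides with the global one, 2 means +4 bytes, 3 means +8 bytes (the
// common two-instruction prologue), and so on up to 6 meaning +64.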
static u64 get_local_entry_offset(Context<E> &ctx, Symbol<E> &sym) {
i64 val = sym.esym().ppc_local_entry;
assert(val <= 7);
if (val == 7)
Fatal(ctx) << sym << ": local entry offset 7 is reserved";
if (val == 0 || val == 1)
return 0;
return 1 << val;
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
u64 TOC = ctx.extra.TOC->value;
auto r2save_thunk_addr = [&] { return get_thunk_addr(i); };
auto no_r2save_thunk_addr = [&] { return get_thunk_addr(i) + 4; };
switch (rel.r_type) {
case R_PPC64_ADDR64:
if (name() == ".toc")
apply_toc_rel(ctx, sym, rel, loc, S, A, P, dynrel);
else
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_PPC64_TOC16_HA:
*(ul16 *)loc = ha(S + A - TOC);
break;
case R_PPC64_TOC16_LO:
*(ul16 *)loc = lo(S + A - TOC);
break;
case R_PPC64_TOC16_DS:
case R_PPC64_TOC16_LO_DS:
*(ul16 *)loc |= (S + A - TOC) & 0xfffc;
break;
case R_PPC64_REL24:
if (sym.has_plt(ctx) || !sym.esym().preserves_r2()) {
i64 val = r2save_thunk_addr() + A - P;
*(ul32 *)loc |= bits(val, 25, 2) << 2;
// The thunk saves %r2 to the caller's r2 save slot. We need to
// restore it after function return. To do so, there's usually a
// NOP as a placeholder after a BL. 0x6000'0000 is a NOP.
if (*(ul32 *)(loc + 4) == 0x6000'0000)
*(ul32 *)(loc + 4) = 0xe841'0018; // ld r2, 24(r1)
} else {
i64 val = S + get_local_entry_offset(ctx, sym) + A - P;
if (sign_extend(val, 25) != val)
val = no_r2save_thunk_addr() + A - P;
*(ul32 *)loc |= bits(val, 25, 2) << 2;
}
break;
case R_PPC64_REL24_NOTOC:
if (sym.has_plt(ctx) || sym.esym().uses_toc()) {
i64 val = no_r2save_thunk_addr() + A - P;
*(ul32 *)loc |= bits(val, 25, 2) << 2;
} else {
i64 val = S + A - P;
if (sign_extend(val, 25) != val)
val = no_r2save_thunk_addr() + A - P;
*(ul32 *)loc |= bits(val, 25, 2) << 2;
}
break;
case R_PPC64_REL32:
*(ul32 *)loc = S + A - P;
break;
case R_PPC64_REL64:
*(ul64 *)loc = S + A - P;
break;
case R_PPC64_REL16_HA:
*(ul16 *)loc = ha(S + A - P);
break;
case R_PPC64_REL16_LO:
*(ul16 *)loc = lo(S + A - P);
break;
case R_PPC64_PLT16_HA:
*(ul16 *)loc = ha(G + GOT - TOC);
break;
case R_PPC64_PLT16_HI:
*(ul16 *)loc = hi(G + GOT - TOC);
break;
case R_PPC64_PLT16_LO:
*(ul16 *)loc = lo(G + GOT - TOC);
break;
case R_PPC64_PLT16_LO_DS:
*(ul16 *)loc |= (G + GOT - TOC) & 0xfffc;
break;
case R_PPC64_PLT_PCREL34:
case R_PPC64_PLT_PCREL34_NOTOC:
case R_PPC64_GOT_PCREL34:
*(ul64 *)loc |= prefix34(G + GOT - P);
break;
case R_PPC64_PCREL34:
*(ul64 *)loc |= prefix34(S + A - P);
break;
case R_PPC64_GOT_TPREL16_HA:
*(ul16 *)loc = ha(sym.get_gottp_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TPREL16_LO_DS:
*(ul16 *)loc |= (sym.get_gottp_addr(ctx) - TOC) & 0xfffc;
break;
case R_PPC64_GOT_TPREL_PCREL34:
*(ul64 *)loc |= prefix34(sym.get_gottp_addr(ctx) - P);
break;
case R_PPC64_GOT_TLSGD16_HA:
*(ul16 *)loc = ha(sym.get_tlsgd_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSGD16_LO:
*(ul16 *)loc = lo(sym.get_tlsgd_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSGD_PCREL34:
*(ul64 *)loc |= prefix34(sym.get_tlsgd_addr(ctx) - P);
break;
case R_PPC64_GOT_TLSLD16_HA:
*(ul16 *)loc = ha(ctx.got->get_tlsld_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSLD16_LO:
*(ul16 *)loc = lo(ctx.got->get_tlsld_addr(ctx) - TOC);
break;
case R_PPC64_GOT_TLSLD_PCREL34:
*(ul64 *)loc |= prefix34(ctx.got->get_tlsld_addr(ctx) - P);
break;
case R_PPC64_DTPREL16_HA:
*(ul16 *)loc = ha(S + A - ctx.dtp_addr);
break;
case R_PPC64_DTPREL16_LO:
*(ul16 *)loc = lo(S + A - ctx.dtp_addr);
break;
case R_PPC64_DTPREL34:
*(ul64 *)loc |= prefix34(S + A - ctx.dtp_addr);
break;
case R_PPC64_TPREL16_HA:
*(ul16 *)loc = ha(S + A - ctx.tp_addr);
break;
case R_PPC64_TPREL16_LO:
*(ul16 *)loc = lo(S + A - ctx.tp_addr);
break;
case R_PPC64_PLTSEQ:
case R_PPC64_PLTSEQ_NOTOC:
case R_PPC64_PLTCALL:
case R_PPC64_PLTCALL_NOTOC:
case R_PPC64_TLS:
case R_PPC64_TLSGD:
case R_PPC64_TLSLD:
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_PPC64_ADDR64:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul64 *)loc = *val;
else
*(ul64 *)loc = S + A;
break;
case R_PPC64_ADDR32: {
i64 val = S + A;
check(val, 0, 1LL << 32);
*(ul32 *)loc = val;
break;
}
case R_PPC64_DTPREL64:
*(ul64 *)loc = S + A - ctx.dtp_addr;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_PPC64_ADDR64:
if (name() == ".toc")
scan_toc_rel(ctx, sym, rel);
else
scan_dyn_absrel(ctx, sym, rel);
break;
case R_PPC64_GOT_TPREL16_HA:
case R_PPC64_GOT_TPREL_PCREL34:
sym.flags |= NEEDS_GOTTP;
break;
case R_PPC64_REL24:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_PPC64_REL24_NOTOC:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
ctx.extra.is_power10 = true;
break;
case R_PPC64_PLT16_HA:
case R_PPC64_PLT_PCREL34:
case R_PPC64_PLT_PCREL34_NOTOC:
case R_PPC64_GOT_PCREL34:
sym.flags |= NEEDS_GOT;
break;
case R_PPC64_GOT_TLSGD16_HA:
case R_PPC64_GOT_TLSGD_PCREL34:
sym.flags |= NEEDS_TLSGD;
break;
case R_PPC64_GOT_TLSLD16_HA:
case R_PPC64_GOT_TLSLD_PCREL34:
ctx.needs_tlsld = true;
break;
case R_PPC64_TPREL16_HA:
case R_PPC64_TPREL16_LO:
check_tlsle(ctx, sym, rel);
break;
case R_PPC64_REL32:
case R_PPC64_REL64:
case R_PPC64_TOC16_HA:
case R_PPC64_TOC16_LO:
case R_PPC64_TOC16_LO_DS:
case R_PPC64_TOC16_DS:
case R_PPC64_REL16_HA:
case R_PPC64_REL16_LO:
case R_PPC64_PLT16_HI:
case R_PPC64_PLT16_LO:
case R_PPC64_PLT16_LO_DS:
case R_PPC64_PCREL34:
case R_PPC64_PLTSEQ:
case R_PPC64_PLTSEQ_NOTOC:
case R_PPC64_PLTCALL:
case R_PPC64_PLTCALL_NOTOC:
case R_PPC64_GOT_TPREL16_LO_DS:
case R_PPC64_GOT_TLSGD16_LO:
case R_PPC64_GOT_TLSLD16_LO:
case R_PPC64_TLS:
case R_PPC64_TLSGD:
case R_PPC64_TLSLD:
case R_PPC64_DTPREL16_HA:
case R_PPC64_DTPREL16_LO:
case R_PPC64_DTPREL34:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <>
void RangeExtensionThunk<E>::copy_buf(Context<E> &ctx) {
u8 *buf = ctx.buf + output_section.shdr.sh_offset + offset;
// If the destination is PLT, we read an address from .got.plt or .got
// and jump there.
static const ul32 plt_thunk[] = {
0xf841'0018, // std r2, 24(r1)
0x3d82'0000, // addis r12, r2, foo@gotplt@toc@ha
0xe98c'0000, // ld r12, foo@gotplt@toc@lo(r12)
0x7d89'03a6, // mtctr r12
0x4e80'0420, // bctr
};
static const ul32 plt_thunk_power10[] = {
0xf841'0018, // std r2, 24(r1)
0x0410'0000, // pld r12, foo@gotplt@pcrel
0xe580'0000,
0x7d89'03a6, // mtctr r12
0x4e80'0420, // bctr
};
// If the destination is a non-imported function, we directly jump
// to its local entry point.
static const ul32 local_thunk[] = {
0xf841'0018, // std r2, 24(r1)
0x3d82'0000, // addis r12, r2, foo@toc@ha
0x398c'0000, // addi r12, r12, foo@toc@lo
0x7d89'03a6, // mtctr r12
0x4e80'0420, // bctr
};
static const ul32 local_thunk_power10[] = {
0xf841'0018, // std r2, 24(r1)
0x0610'0000, // pla r12, foo@pcrel
0x3980'0000,
0x7d89'03a6, // mtctr r12
0x4e80'0420, // bctr
};
static_assert(E::thunk_size == sizeof(plt_thunk));
static_assert(E::thunk_size == sizeof(plt_thunk_power10));
static_assert(E::thunk_size == sizeof(local_thunk));
static_assert(E::thunk_size == sizeof(local_thunk_power10));
for (i64 i = 0; i < symbols.size(); i++) {
Symbol<E> &sym = *symbols[i];
ul32 *loc = (ul32 *)(buf + i * E::thunk_size);
if (sym.has_plt(ctx)) {
u64 got = sym.has_got(ctx) ? sym.get_got_addr(ctx) : sym.get_gotplt_addr(ctx);
if (ctx.extra.is_power10) {
memcpy(loc, plt_thunk_power10, E::thunk_size);
*(ul64 *)(loc + 1) |= prefix34(got - get_addr(i) - 4);
} else {
i64 val = got - ctx.extra.TOC->value;
memcpy(loc, plt_thunk, E::thunk_size);
loc[1] |= higha(val);
loc[2] |= lo(val);
}
} else {
if (ctx.extra.is_power10) {
memcpy(loc, local_thunk_power10, E::thunk_size);
*(ul64 *)(loc + 1) |= prefix34(sym.get_addr(ctx) - get_addr(i) - 4);
} else {
i64 val = sym.get_addr(ctx) - ctx.extra.TOC->value;
memcpy(loc, local_thunk, E::thunk_size);
loc[1] |= higha(val);
loc[2] |= lo(val);
}
}
}
}
} // namespace mold::elf


@@ -1,938 +0,0 @@
// clang-format off
// RISC-V is a clean RISC ISA. It supports PC-relative load/store for
// position-independent code. Its 32-bit and 64-bit ISAs are almost
// identical. That is, you can think of RV32 as RV64 without the 64-bit
// operations. In this file, we support both RV64 and RV32.
//
// RISC-V is essentially little-endian, but the big-endian version is
// available as an extension. GCC supports `-mbig-endian` to generate
// big-endian code. Even in big-endian mode, machine instructions are
// defined to be encoded in little-endian, though. Only the behavior
// of load/store instructions differs between LE RISC-V and BE RISC-V.
//
// From the linker's point of view, the RISC-V's psABI is unique because
// sections in input object files can be shrunk while being copied to the
// output file. That is contrary to other psABIs in which sections are an
// atomic unit of copying. Let me explain it in more detail.
//
// Since RISC-V instructions are 16 or 32 bits long, there's no way to
// embed a very large immediate into a branch instruction. In fact, the
// JAL (jump and link) instruction can jump only within PC ± 1 MiB
// because its immediate is only 21 bits long. If the destination is out
// of its reach, we need to use two instructions instead; the first
// instruction being AUIPC, which sets the upper 20 bits of a register,
// and the second being JALR with a 12-bit immediate and the register.
// Combined, they specify a 32-bit displacement.
//
// Other RISC ISAs have the same limitation, and they solved the problem
// by letting the linker create so-called "range extension thunks". It
// works as follows: the compiler optimistically emits single jump
// instructions for function calls. If the linker finds that a branch
// target is out of reach, it emits a small piece of machine code near
// the branch instruction and redirects the branch to that
// linker-synthesized code. The code constructs a full 32-bit address in
// a register and jumps to the destination. Such linker-synthesized code
// is called a "range extension thunk" or just a "thunk".
//
// The RISC-V psABI is unique in that it works the other way around.
// That is, for RISC-V, the compiler always emits two instructions
// (AUIPC + JALR) for function calls. If the linker finds the
// destination is reachable with a single instruction, it replaces the
// two instructions with the single one and shrinks the section by one
// instruction length, instead of filling the gap with a nop.
//
// With the presence of this relaxation, sections can no longer be
// considered as an atomic unit. If we delete 4 bytes from the middle of
// a section, all contents after that point need to be shifted by 4.
// Symbol values and relocation offsets have to be adjusted accordingly
// if they refer to anything past the deleted bytes.
//
// In mold, we use `r_deltas` to record how many bytes have been removed
// before each relocation. For symbols, we directly mutate their `value`
// member.
//
// RISC-V object files tend to have way more relocations than those for
// other targets. This is because all branches, including ones that jump
// within the same section, are explicitly expressed with relocations.
// Here is why we need them: all control-flow statements such as `if` or
// `for` are implemented using branch instructions. For other targets,
// the compiler doesn't emit relocations for such branches because it
// knows at compile time exactly how many bytes have to be skipped.
// That's not true for RISC-V because the linker may delete bytes
// between a branch and its destination. Therefore, all branches,
// including in-section ones, have to be explicitly expressed with
// relocations.
//
// Note that this mechanism only shrinks sections and never enlarges
// them, as the compiler always emits the longest instruction sequence. This
// makes the linker implementation a bit simpler because we don't need
// to worry about oscillation.
//
// https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc
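//
// For example (an illustrative, hand-written sketch rather than actual
// compiler output), a function call is emitted as the pair
//
//   auipc ra, %pcrel_hi(foo)        # R_RISCV_CALL_PLT + R_RISCV_RELAX
//   jalr  ra, %pcrel_lo(foo)(ra)
//
// and, if foo turns out to be within PC ± 1 MiB of the call site, the
// linker rewrites the pair into a single
//
//   jal ra, foo
//
// deleting 4 bytes from the section.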
#include "third_party/mold/elf/mold.h"
// MISSING #include <tbb/parallel_for.h>
// MISSING #include <tbb/parallel_for_each.h>
namespace mold::elf {
static void write_itype(u8 *loc, u32 val) {
*(ul32 *)loc &= 0b000000'00000'11111'111'11111'1111111;
*(ul32 *)loc |= bits(val, 11, 0) << 20;
}
static void write_stype(u8 *loc, u32 val) {
*(ul32 *)loc &= 0b000000'11111'11111'111'00000'1111111;
*(ul32 *)loc |= bits(val, 11, 5) << 25 | bits(val, 4, 0) << 7;
}
static void write_btype(u8 *loc, u32 val) {
*(ul32 *)loc &= 0b000000'11111'11111'111'00000'1111111;
*(ul32 *)loc |= bit(val, 12) << 31 | bits(val, 10, 5) << 25 |
bits(val, 4, 1) << 8 | bit(val, 11) << 7;
}
static void write_utype(u8 *loc, u32 val) {
*(ul32 *)loc &= 0b000000'00000'00000'000'11111'1111111;
// U-type instructions are used in combination with I-type
// instructions. U-type insn sets an immediate to the upper 20-bits
// of a register. I-type insn sign-extends a 12-bits immediate and
// adds it to a register value to construct a complete value. 0x800
// is added here to compensate for the sign-extension.
*(ul32 *)loc |= (val + 0x800) & 0xffff'f000;
}
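// A hand-computed illustration of the 0x800 bias (not from the original
// source): materializing 0x12fff takes `lui rd, 0x13` plus
// `addi rd, rd, -1`, because the I-type immediate 0xfff sign-extends to
// -1 and 0x13000 - 1 == 0x12fff.
static_assert(((0x12fff + 0x800) & 0xffff'f000) == 0x13000);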
static void write_jtype(u8 *loc, u32 val) {
*(ul32 *)loc &= 0b000000'00000'00000'000'11111'1111111;
*(ul32 *)loc |= bit(val, 20) << 31 | bits(val, 10, 1) << 21 |
bit(val, 11) << 20 | bits(val, 19, 12) << 12;
}
static void write_cbtype(u8 *loc, u32 val) {
*(ul16 *)loc &= 0b111'000'111'00000'11;
*(ul16 *)loc |= bit(val, 8) << 12 | bit(val, 4) << 11 | bit(val, 3) << 10 |
bit(val, 7) << 6 | bit(val, 6) << 5 | bit(val, 2) << 4 |
bit(val, 1) << 3 | bit(val, 5) << 2;
}
static void write_cjtype(u8 *loc, u32 val) {
*(ul16 *)loc &= 0b111'00000000000'11;
*(ul16 *)loc |= bit(val, 11) << 12 | bit(val, 4) << 11 | bit(val, 9) << 10 |
bit(val, 8) << 9 | bit(val, 10) << 8 | bit(val, 6) << 7 |
bit(val, 7) << 6 | bit(val, 3) << 5 | bit(val, 2) << 4 |
bit(val, 1) << 3 | bit(val, 5) << 2;
}
// Rewrite an existing ULEB128 value in place, preserving the length of
// the original encoding.
static void overwrite_uleb(u8 *loc, u64 val) {
  while (*loc & 0b1000'0000) {
    *loc++ = 0b1000'0000 | (val & 0b0111'1111);
    val >>= 7;
  }
  // The last byte of the existing encoding has its continuation bit
  // clear; write the remaining bits there.
  *loc = val & 0b0111'1111;
}
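// For example, 624485 encodes in ULEB128 as the byte sequence e5 8e 26:
// low-order 7-bit groups first, with the high bit marking continuation
// (a hand-computed illustration). Note that overwrite_uleb() keeps the
// length of the existing encoding, so the new value must fit in the
// same number of bytes.
static_assert((624485 & 0x7f) == 0x65 && ((624485 >> 7) & 0x7f) == 0x0e &&
              (624485 >> 14) == 0x26);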
// Returns the rd register of an R/I/U/J-type instruction.
static u32 get_rd(u32 val) {
return bits(val, 11, 7);
}
static void set_rs1(u8 *loc, u32 rs1) {
assert(rs1 < 32);
*(ul32 *)loc &= 0b111111'11111'00000'111'11111'1111111;
*(ul32 *)loc |= rs1 << 15;
}
template <typename E>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static const ul32 insn_64[] = {
0x0000'0397, // auipc t2, %pcrel_hi(.got.plt)
0x41c3'0333, // sub t1, t1, t3 # .plt entry + hdr + 12
0x0003'be03, // ld t3, %pcrel_lo(1b)(t2) # _dl_runtime_resolve
0xfd43'0313, // addi t1, t1, -44 # .plt entry
0x0003'8293, // addi t0, t2, %pcrel_lo(1b) # &.got.plt
0x0013'5313, // srli t1, t1, 1 # .plt entry offset
0x0082'b283, // ld t0, 8(t0) # link map
0x000e'0067, // jr t3
};
static const ul32 insn_32[] = {
0x0000'0397, // auipc t2, %pcrel_hi(.got.plt)
0x41c3'0333, // sub t1, t1, t3 # .plt entry + hdr + 12
0x0003'ae03, // lw t3, %pcrel_lo(1b)(t2) # _dl_runtime_resolve
0xfd43'0313, // addi t1, t1, -44 # .plt entry
0x0003'8293, // addi t0, t2, %pcrel_lo(1b) # &.got.plt
0x0023'5313, // srli t1, t1, 2 # .plt entry offset
0x0042'a283, // lw t0, 4(t0) # link map
0x000e'0067, // jr t3
};
if constexpr (E::is_64)
memcpy(buf, insn_64, sizeof(insn_64));
else
memcpy(buf, insn_32, sizeof(insn_32));
u64 gotplt = ctx.gotplt->shdr.sh_addr;
u64 plt = ctx.plt->shdr.sh_addr;
write_utype(buf, gotplt - plt);
write_itype(buf + 8, gotplt - plt);
write_itype(buf + 16, gotplt - plt);
}
static const ul32 plt_entry_64[] = {
0x0000'0e17, // auipc t3, %pcrel_hi(function@.got.plt)
0x000e'3e03, // ld t3, %pcrel_lo(1b)(t3)
0x000e'0367, // jalr t1, t3
0x0000'0013, // nop
};
static const ul32 plt_entry_32[] = {
0x0000'0e17, // auipc t3, %pcrel_hi(function@.got.plt)
0x000e'2e03, // lw t3, %pcrel_lo(1b)(t3)
0x000e'0367, // jalr t1, t3
0x0000'0013, // nop
};
template <typename E>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
if constexpr (E::is_64)
memcpy(buf, plt_entry_64, sizeof(plt_entry_64));
else
memcpy(buf, plt_entry_32, sizeof(plt_entry_32));
u64 gotplt = sym.get_gotplt_addr(ctx);
u64 plt = sym.get_plt_addr(ctx);
write_utype(buf, gotplt - plt);
write_itype(buf + 4, gotplt - plt);
}
template <typename E>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
if constexpr (E::is_64)
memcpy(buf, plt_entry_64, sizeof(plt_entry_64));
else
memcpy(buf, plt_entry_32, sizeof(plt_entry_32));
u64 got = sym.get_got_addr(ctx);
u64 plt = sym.get_plt_addr(ctx);
write_utype(buf, got - plt);
write_itype(buf + 4, got - plt);
}
template <typename E>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_RISCV_ADD32:
*(U32<E> *)loc += val;
break;
case R_RISCV_SUB8:
*loc -= val;
break;
case R_RISCV_SUB16:
*(U16<E> *)loc -= val;
break;
case R_RISCV_SUB32:
*(U32<E> *)loc -= val;
break;
case R_RISCV_SUB6:
*loc = (*loc & 0b1100'0000) | ((*loc - val) & 0b0011'1111);
break;
case R_RISCV_SET6:
*loc = (*loc & 0b1100'0000) | (val & 0b0011'1111);
break;
case R_RISCV_SET8:
*loc = val;
break;
case R_RISCV_SET16:
*(U16<E> *)loc = val;
break;
case R_RISCV_SET32:
*(U32<E> *)loc = val;
break;
case R_RISCV_32_PCREL:
*(U32<E> *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <typename E>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
auto get_r_delta = [&](i64 idx) {
return extra.r_deltas.empty() ? 0 : extra.r_deltas[idx];
};
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || rel.r_type == R_RISCV_RELAX)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
i64 r_offset = rel.r_offset - get_r_delta(i);
i64 removed_bytes = get_r_delta(i + 1) - get_r_delta(i);
u8 *loc = base + r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
auto find_paired_reloc = [&] {
Symbol<E> &sym = *file.symbols[rels[i].r_sym];
assert(sym.get_input_section() == this);
if (sym.value < r_offset) {
for (i64 j = i - 1; j >= 0; j--)
if (u32 ty = rels[j].r_type;
ty == R_RISCV_GOT_HI20 || ty == R_RISCV_TLS_GOT_HI20 ||
ty == R_RISCV_TLS_GD_HI20 || ty == R_RISCV_PCREL_HI20)
if (sym.value == rels[j].r_offset - get_r_delta(j))
return j;
} else {
for (i64 j = i + 1; j < rels.size(); j++)
if (u32 ty = rels[j].r_type;
ty == R_RISCV_GOT_HI20 || ty == R_RISCV_TLS_GOT_HI20 ||
ty == R_RISCV_TLS_GD_HI20 || ty == R_RISCV_PCREL_HI20)
if (sym.value == rels[j].r_offset - get_r_delta(j))
return j;
}
Fatal(ctx) << *this << ": paired relocation is missing: " << i;
};
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_RISCV_32:
if constexpr (E::is_64)
*(U32<E> *)loc = S + A;
else
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_RISCV_64:
assert(E::is_64);
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_RISCV_BRANCH:
check(S + A - P, -(1 << 12), 1 << 12);
write_btype(loc, S + A - P);
break;
case R_RISCV_JAL:
check(S + A - P, -(1 << 20), 1 << 20);
write_jtype(loc, S + A - P);
break;
case R_RISCV_CALL:
case R_RISCV_CALL_PLT: {
u32 rd = get_rd(*(ul32 *)(contents.data() + rel.r_offset + 4));
if (removed_bytes == 4) {
// auipc + jalr -> jal
*(ul32 *)loc = (rd << 7) | 0b1101111;
write_jtype(loc, S + A - P);
} else if (removed_bytes == 6 && rd == 0) {
// auipc + jalr -> c.j
*(ul16 *)loc = 0b101'00000000000'01;
write_cjtype(loc, S + A - P);
} else if (removed_bytes == 6 && rd == 1) {
// auipc + jalr -> c.jal
assert(!E::is_64);
*(ul16 *)loc = 0b001'00000000000'01;
write_cjtype(loc, S + A - P);
} else {
assert(removed_bytes == 0);
// Calling an undefined weak symbol does not make sense.
// We turn such a call into an infinite loop. This should
// help with debugging a faulty program.
u64 val = sym.esym().is_undef_weak() ? 0 : S + A - P;
check(val, -(1LL << 31), 1LL << 31);
write_utype(loc, val);
write_itype(loc + 4, val);
}
break;
}
case R_RISCV_GOT_HI20:
write_utype(loc, G + GOT + A - P);
break;
case R_RISCV_TLS_GOT_HI20:
write_utype(loc, sym.get_gottp_addr(ctx) + A - P);
break;
case R_RISCV_TLS_GD_HI20:
write_utype(loc, sym.get_tlsgd_addr(ctx) + A - P);
break;
case R_RISCV_PCREL_HI20:
write_utype(loc, S + A - P);
break;
case R_RISCV_PCREL_LO12_I:
case R_RISCV_PCREL_LO12_S: {
i64 idx2 = find_paired_reloc();
const ElfRel<E> &rel2 = rels[idx2];
Symbol<E> &sym2 = *file.symbols[rel2.r_sym];
u64 S = sym2.get_addr(ctx);
u64 A = rel2.r_addend;
u64 P = get_addr() + rel2.r_offset - get_r_delta(idx2);
u64 G = sym2.get_got_idx(ctx) * sizeof(Word<E>);
u64 val;
switch (rel2.r_type) {
case R_RISCV_GOT_HI20:
val = G + GOT + A - P;
break;
case R_RISCV_TLS_GOT_HI20:
val = sym2.get_gottp_addr(ctx) + A - P;
break;
case R_RISCV_TLS_GD_HI20:
val = sym2.get_tlsgd_addr(ctx) + A - P;
break;
case R_RISCV_PCREL_HI20:
val = S + A - P;
break;
default:
unreachable();
}
if (rel.r_type == R_RISCV_PCREL_LO12_I)
write_itype(loc, val);
else
write_stype(loc, val);
break;
}
case R_RISCV_HI20:
assert(removed_bytes == 0 || removed_bytes == 4);
if (removed_bytes == 0) {
check(S + A, -(1LL << 31), 1LL << 31);
write_utype(loc, S + A);
}
break;
case R_RISCV_LO12_I:
case R_RISCV_LO12_S:
if (rel.r_type == R_RISCV_LO12_I)
write_itype(loc, S + A);
else
write_stype(loc, S + A);
// Rewrite `lw t1, 0(t0)` with `lw t1, 0(x0)` if the address is
// accessible relative to the zero register. If the upper 20 bits
// are all zero, the corresponding LUI might have been removed.
if (bits(S + A, 31, 12) == 0)
set_rs1(loc, 0);
break;
case R_RISCV_TPREL_HI20:
assert(removed_bytes == 0 || removed_bytes == 4);
if (removed_bytes == 0)
write_utype(loc, S + A - ctx.tp_addr);
break;
case R_RISCV_TPREL_ADD:
// This relocation just annotates an ADD instruction that can be
// removed when a TPREL is relaxed. No value is needed to be
// written.
assert(removed_bytes == 0 || removed_bytes == 4);
break;
case R_RISCV_TPREL_LO12_I:
case R_RISCV_TPREL_LO12_S: {
i64 val = S + A - ctx.tp_addr;
if (rel.r_type == R_RISCV_TPREL_LO12_I)
write_itype(loc, val);
else
write_stype(loc, val);
// Rewrite `lw t1, 0(t0)` with `lw t1, 0(tp)` if the address is
// directly accessible using tp. tp is x4.
if (sign_extend(val, 11) == val)
set_rs1(loc, 4);
break;
}
case R_RISCV_ADD8:
*loc += S + A;
break;
case R_RISCV_ADD16:
*(U16<E> *)loc += S + A;
break;
case R_RISCV_ADD32:
*(U32<E> *)loc += S + A;
break;
case R_RISCV_ADD64:
*(U64<E> *)loc += S + A;
break;
case R_RISCV_SUB8:
*loc -= S + A;
break;
case R_RISCV_SUB16:
*(U16<E> *)loc -= S + A;
break;
case R_RISCV_SUB32:
*(U32<E> *)loc -= S + A;
break;
case R_RISCV_SUB64:
*(U64<E> *)loc -= S + A;
break;
case R_RISCV_ALIGN: {
// A R_RISCV_ALIGN is followed by a NOP sequence. We need to remove
// zero or more bytes so that the instruction after R_RISCV_ALIGN is
// aligned to a given alignment boundary.
//
// We need to guarantee that the NOP sequence is valid after byte
// removal (e.g. we can't remove the first 2 bytes of a 4-byte NOP).
// For the sake of simplicity, we always rewrite the entire NOP sequence.
i64 padding_bytes = rel.r_addend - removed_bytes;
assert((padding_bytes & 1) == 0);
i64 i = 0;
for (; i <= padding_bytes - 4; i += 4)
*(ul32 *)(loc + i) = 0x0000'0013; // nop
if (i < padding_bytes)
*(ul16 *)(loc + i) = 0x0001; // c.nop
break;
}
case R_RISCV_RVC_BRANCH:
check(S + A - P, -(1 << 8), 1 << 8);
write_cbtype(loc, S + A - P);
break;
case R_RISCV_RVC_JUMP:
check(S + A - P, -(1 << 11), 1 << 11);
write_cjtype(loc, S + A - P);
break;
case R_RISCV_SUB6:
*loc = (*loc & 0b1100'0000) | ((*loc - (S + A)) & 0b0011'1111);
break;
case R_RISCV_SET6:
*loc = (*loc & 0b1100'0000) | ((S + A) & 0b0011'1111);
break;
case R_RISCV_SET8:
*loc = S + A;
break;
case R_RISCV_SET16:
*(U16<E> *)loc = S + A;
break;
case R_RISCV_SET32:
*(U32<E> *)loc = S + A;
break;
case R_RISCV_PLT32:
case R_RISCV_32_PCREL:
*(U32<E> *)loc = S + A - P;
break;
default:
unreachable();
}
}
}
template <typename E>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_RISCV_32:
*(U32<E> *)loc = S + A;
break;
case R_RISCV_64:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(U64<E> *)loc = *val;
else
*(U64<E> *)loc = S + A;
break;
case R_RISCV_ADD8:
*loc += S + A;
break;
case R_RISCV_ADD16:
*(U16<E> *)loc += S + A;
break;
case R_RISCV_ADD32:
*(U32<E> *)loc += S + A;
break;
case R_RISCV_ADD64:
*(U64<E> *)loc += S + A;
break;
case R_RISCV_SUB8:
*loc -= S + A;
break;
case R_RISCV_SUB16:
*(U16<E> *)loc -= S + A;
break;
case R_RISCV_SUB32:
*(U32<E> *)loc -= S + A;
break;
case R_RISCV_SUB64:
*(U64<E> *)loc -= S + A;
break;
case R_RISCV_SUB6:
*loc = (*loc & 0b1100'0000) | ((*loc - (S + A)) & 0b0011'1111);
break;
case R_RISCV_SET6:
*loc = (*loc & 0b1100'0000) | ((S + A) & 0b0011'1111);
break;
case R_RISCV_SET8:
*loc = S + A;
break;
case R_RISCV_SET16:
*(U16<E> *)loc = S + A;
break;
case R_RISCV_SET32:
*(U32<E> *)loc = S + A;
break;
case R_RISCV_SET_ULEB128:
overwrite_uleb(loc, S + A);
break;
case R_RISCV_SUB_ULEB128: {
u8 *p = loc;
u64 val = read_uleb(p);
overwrite_uleb(loc, val - S - A);
break;
}
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
break;
}
}
}
template <typename E>
void InputSection<E>::copy_contents_riscv(Context<E> &ctx, u8 *buf) {
// If a section is not relaxed, we can copy it as one big chunk.
if (extra.r_deltas.empty()) {
uncompress_to(ctx, buf);
return;
}
// A relaxed section is copied piece-wise.
std::span<const ElfRel<E>> rels = get_rels(ctx);
i64 pos = 0;
for (i64 i = 0; i < rels.size(); i++) {
i64 delta = extra.r_deltas[i + 1] - extra.r_deltas[i];
if (delta == 0)
continue;
assert(delta > 0);
const ElfRel<E> &r = rels[i];
memcpy(buf, contents.data() + pos, r.r_offset - pos);
buf += r.r_offset - pos;
pos = r.r_offset + delta;
}
memcpy(buf, contents.data() + pos, contents.size() - pos);
}
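// For example (hand-worked): if a 16-byte section has one relocation at
// offset 4 with 4 bytes deleted there (r_deltas = {0, 4}), we copy
// bytes [0, 4) and then bytes [8, 16), writing 12 contiguous bytes to
// the output.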
template <typename E>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_RISCV_32:
if constexpr (E::is_64)
scan_absrel(ctx, sym, rel);
else
scan_dyn_absrel(ctx, sym, rel);
break;
case R_RISCV_HI20:
scan_absrel(ctx, sym, rel);
break;
case R_RISCV_64:
if constexpr (!E::is_64)
Fatal(ctx) << *this << ": R_RISCV_64 cannot be used on RV32";
scan_dyn_absrel(ctx, sym, rel);
break;
case R_RISCV_CALL:
case R_RISCV_CALL_PLT:
case R_RISCV_PLT32:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_RISCV_GOT_HI20:
sym.flags |= NEEDS_GOT;
break;
case R_RISCV_TLS_GOT_HI20:
sym.flags |= NEEDS_GOTTP;
break;
case R_RISCV_TLS_GD_HI20:
sym.flags |= NEEDS_TLSGD;
break;
case R_RISCV_32_PCREL:
scan_pcrel(ctx, sym, rel);
break;
case R_RISCV_TPREL_HI20:
case R_RISCV_TPREL_LO12_I:
case R_RISCV_TPREL_LO12_S:
case R_RISCV_TPREL_ADD:
check_tlsle(ctx, sym, rel);
break;
case R_RISCV_BRANCH:
case R_RISCV_JAL:
case R_RISCV_PCREL_HI20:
case R_RISCV_PCREL_LO12_I:
case R_RISCV_PCREL_LO12_S:
case R_RISCV_LO12_I:
case R_RISCV_LO12_S:
case R_RISCV_ADD8:
case R_RISCV_ADD16:
case R_RISCV_ADD32:
case R_RISCV_ADD64:
case R_RISCV_SUB8:
case R_RISCV_SUB16:
case R_RISCV_SUB32:
case R_RISCV_SUB64:
case R_RISCV_ALIGN:
case R_RISCV_RVC_BRANCH:
case R_RISCV_RVC_JUMP:
case R_RISCV_RELAX:
case R_RISCV_SUB6:
case R_RISCV_SET6:
case R_RISCV_SET8:
case R_RISCV_SET16:
case R_RISCV_SET32:
break;
default:
Error(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
template <typename E>
static bool is_resizable(Context<E> &ctx, InputSection<E> *isec) {
return isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC) &&
(isec->shdr().sh_flags & SHF_EXECINSTR);
}
// Returns the distance between a relocated place and a symbol.
template <typename E>
static i64 compute_distance(Context<E> &ctx, Symbol<E> &sym,
InputSection<E> &isec, const ElfRel<E> &rel) {
// We handle absolute symbols as if they were infinitely far away
// because `shrink_section` may increase the distance between a branch
// instruction and an absolute symbol. Branching to an absolute
// location is extremely rare in real code, though.
if (sym.is_absolute())
return INT32_MAX;
// Likewise, relocations against weak undefined symbols won't be relaxed.
if (sym.esym().is_undef_weak())
return INT32_MAX;
// Compute a distance between the relocated place and the symbol.
i64 S = sym.get_addr(ctx);
i64 A = rel.r_addend;
i64 P = isec.get_addr() + rel.r_offset;
return S + A - P;
}
// Scan relocations to shrink sections.
template <typename E>
static void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {
std::span<const ElfRel<E>> rels = isec.get_rels(ctx);
isec.extra.r_deltas.resize(rels.size() + 1);
i64 delta = 0;
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &r = rels[i];
Symbol<E> &sym = *isec.file.symbols[r.r_sym];
isec.extra.r_deltas[i] = delta;
// Handling R_RISCV_ALIGN is mandatory.
//
// R_RISCV_ALIGN refers to NOP instructions. We need to eliminate some
// or all of the instructions so that the instruction that immediately
// follows the NOPs is aligned to a specified alignment boundary.
if (r.r_type == R_RISCV_ALIGN) {
// The total number of NOP bytes is stored in r_addend, so the next
// instruction is r_addend bytes away.
u64 loc = isec.get_addr() + r.r_offset - delta;
u64 next_loc = loc + r.r_addend;
u64 alignment = bit_ceil(r.r_addend + 1);
assert(alignment <= (1 << isec.p2align));
delta += next_loc - align_to(loc, alignment);
continue;
}
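// Hand-worked example: with r_addend = 6 and the shifted location at
// 0x1002, alignment = bit_ceil(7) = 8, next_loc = 0x1008 and
// align_to(0x1002, 8) = 0x1008, so delta grows by 0 and every NOP
// stays. If the location were 0x1000 instead, align_to(0x1000, 8) =
// 0x1000 and delta would grow by 6, deleting the whole NOP sequence.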
// Handling other relocations is optional.
if (!ctx.arg.relax || i == rels.size() - 1 ||
rels[i + 1].r_type != R_RISCV_RELAX)
continue;
// Linker-synthesized symbols haven't been assigned their final
// values when we are shrinking sections because actual values can
// be computed only after we fix the file layout. Therefore, we
// assume that relocations against such symbols are always
// non-relaxable.
if (sym.file == ctx.internal_obj)
continue;
switch (r.r_type) {
case R_RISCV_CALL:
case R_RISCV_CALL_PLT: {
// These relocations refer to an AUIPC + JALR instruction pair that
// allows jumping anywhere within PC ± 2 GiB. If the jump target is
// close enough to PC, we can use C.J, C.JAL or JAL instead.
i64 dist = compute_distance(ctx, sym, isec, r);
if (dist & 1)
break;
i64 rd = get_rd(*(ul32 *)(isec.contents.data() + r.r_offset + 4));
if (rd == 0 && sign_extend(dist, 11) == dist && use_rvc) {
// If rd is x0 and the jump target is within ±2 KiB, we can use
// C.J, saving 6 bytes.
delta += 6;
} else if (rd == 1 && sign_extend(dist, 11) == dist && use_rvc && !E::is_64) {
// If rd is x1 and the jump target is within ±2 KiB, we can use
// C.JAL. This is RV32-only because C.JAL is an RV32-only instruction.
delta += 6;
} else if (sign_extend(dist, 20) == dist) {
// If the jump target is within ±1 MiB, we can use JAL.
delta += 4;
}
break;
}
case R_RISCV_HI20:
// If the upper 20 bits are all zero, we can remove LUI.
// The corresponding instructions referred to by LO12_I/LO12_S
// relocations will use the zero register instead.
if (bits(sym.get_addr(ctx), 31, 12) == 0)
delta += 4;
break;
case R_RISCV_TPREL_HI20:
case R_RISCV_TPREL_ADD:
// These relocations are used to add a high 20-bit value to the
// thread pointer. The following two instructions materialize
// TP + HI20(foo) in a5, for example.
//
// lui a5,%tprel_hi(foo) # R_RISCV_TPREL_HI20 (symbol)
// add a5,a5,tp,%tprel_add(foo) # R_RISCV_TPREL_ADD (symbol)
//
// Then thread-local variable `foo` is accessed with a low 12-bit
// offset like this:
//
// sw t0,%tprel_lo(foo)(a5) # R_RISCV_TPREL_LO12_S (symbol)
//
// However, if the variable is at TP ±2 KiB, TP + HI20(foo) is the
// same as TP, so we can instead access the thread-local variable
// directly using TP like this:
//
// sw t0,%tprel_lo(foo)(tp)
//
// Here, we remove `lui` and `add` if the offset is within ±2 KiB.
if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr;
sign_extend(val, 11) == val)
delta += 4;
break;
}
}
isec.extra.r_deltas[rels.size()] = delta;
isec.sh_size -= delta;
}
// Shrink sections by interpreting relocations.
//
// This operation may seem optional because, by default, the longest
// instruction sequences are used. However, calling this function is actually
// mandatory because of R_RISCV_ALIGN. R_RISCV_ALIGN is a directive to the
// linker to align the location referred to by the relocation to a
// specified byte boundary. We at least have to interpret them to satisfy
// the alignment constraints.
template <typename E>
i64 riscv_resize_sections(Context<E> &ctx) {
Timer t(ctx, "riscv_resize_sections");
// True if we can use the 2-byte instructions. This is usually true on
// Unix because RV64GC is generally considered the baseline hardware.
bool use_rvc = get_eflags(ctx) & EF_RISCV_RVC;
// Find all the relocations that can be relaxed.
// This step should only shrink sections.
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (std::unique_ptr<InputSection<E>> &isec : file->sections)
if (is_resizable(ctx, isec.get()))
shrink_section(ctx, *isec, use_rvc);
});
// Fix symbol values.
tbb::parallel_for_each(ctx.objs, [&](ObjectFile<E> *file) {
for (Symbol<E> *sym : file->symbols) {
if (sym->file != file)
continue;
InputSection<E> *isec = sym->get_input_section();
if (!isec || isec->extra.r_deltas.empty())
continue;
std::span<const ElfRel<E>> rels = isec->get_rels(ctx);
auto it = std::lower_bound(rels.begin(), rels.end(), sym->value,
[&](const ElfRel<E> &r, u64 val) {
return r.r_offset < val;
});
sym->value -= isec->extra.r_deltas[it - rels.begin()];
}
});
// Re-compute section offsets to finalize them.
compute_section_sizes(ctx);
return set_osec_offsets(ctx);
}
#define INSTANTIATE(E) \
template void write_plt_header(Context<E> &, u8 *); \
template void write_plt_entry(Context<E> &, u8 *, Symbol<E> &); \
template void write_pltgot_entry(Context<E> &, u8 *, Symbol<E> &); \
template void \
EhFrameSection<E>::apply_reloc(Context<E> &, const ElfRel<E> &, u64, u64); \
template void InputSection<E>::apply_reloc_alloc(Context<E> &, u8 *); \
template void InputSection<E>::apply_reloc_nonalloc(Context<E> &, u8 *); \
template void InputSection<E>::copy_contents_riscv(Context<E> &, u8 *); \
template void InputSection<E>::scan_relocations(Context<E> &); \
template i64 riscv_resize_sections(Context<E> &);
INSTANTIATE(RV64LE);
INSTANTIATE(RV64BE);
INSTANTIATE(RV32LE);
INSTANTIATE(RV32BE);
} // namespace mold::elf


@@ -1,491 +0,0 @@
// clang-format off
// This file contains code for the IBM z/Architecture 64-bit ISA, which is
// commonly referred to as "s390x" on Linux.
//
// z/Architecture is a 64-bit CISC ISA developed by IBM around 2000 for
// IBM's "big iron" mainframe computers. The computers are direct
// descendants of the IBM System/360 from all the way back in 1966. I've
// never actually seen a mainframe, and you probably haven't either, but
// it looks like the mainframe market is still large enough to sustain
// its ecosystem. Ubuntu, for example, provides official support for
// s390x as of 2022.
// Since they are being actively maintained, we need to support them.
//
// As an instruction set, s390x isn't particularly odd. It has 16 general-
// purpose registers. Instructions are 2, 4 or 6 bytes long and always
// aligned to 2-byte boundaries. Despite the unfamiliarity, I found that
// it just feels like an x86-64 in a parallel universe.
//
// Here is the register usage in this ABI:
//
// r0-r1: reserved as scratch registers so we can use them in our PLT
// r2: parameter passing and return values
// r3-r6: parameter passing
// r12: address of GOT if position-independent code
// r14: return address
// r15: stack pointer
// a1: upper 32 bits of TP (thread pointer)
// a2: lower 32 bits of TP (thread pointer)
//
// Thread-local storage (TLS) is supported on s390x in the same way as
// it is on other targets, with one exception. On other targets,
// __tls_get_addr is used to get the address of a thread-local variable.
// On s390x, __tls_get_offset is used instead. The difference is that
// __tls_get_offset returns the address of a thread-local variable as an
// offset from TP, so we need to add TP to the return value before use.
// I don't know why it is different, but that is the way it is.
//
// https://github.com/rui314/psabi/blob/main/s390x.pdf
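//
// In other words (an illustrative sketch, not code from this file):
// where other targets compute `p = __tls_get_addr(arg)`, s390x
// computes `p = tp + __tls_get_offset(arg)`.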
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = S390X;
static void write_mid20(u8 *loc, u64 val) {
*(ub32 *)loc |= (bits(val, 11, 0) << 16) | (bits(val, 19, 12) << 8);
}
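// Hand-worked example of the split (not from the original source): for
// val = 0xabcde, the low 12 bits (the DL field, 0xcde) land in bits
// 27..16 of the big-endian word and the high 8 bits (the DH field,
// 0xab) in bits 15..8, matching the long-displacement DL/DH layout.
static_assert((((0xabcde & 0xfff) << 16) | (((0xabcde >> 12) & 0xff) << 8)) ==
              0x0cde'ab00);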
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
static u8 insn[] = {
0xe3, 0x00, 0xf0, 0x38, 0x00, 0x24, // stg %r0, 56(%r15)
0xc0, 0x10, 0, 0, 0, 0, // larl %r1, GOTPLT_OFFSET
0xd2, 0x07, 0xf0, 0x30, 0x10, 0x08, // mvc 48(8, %r15), 8(%r1)
0xe3, 0x10, 0x10, 0x10, 0x00, 0x04, // lg %r1, 16(%r1)
0x07, 0xf1, // br %r1
0x07, 0x00, 0x07, 0x00, 0x07, 0x00, // nopr; nopr; nopr
};
memcpy(buf, insn, sizeof(insn));
*(ub32 *)(buf + 8) = (ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 6) >> 1;
}
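// Note that LARL takes its operand as a signed number of halfwords
// relative to the instruction address, hence the `>> 1` above and in
// the PLT entries below; the `- 6` accounts for LARL being the second
// instruction, 6 bytes into the header.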
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static u8 insn[] = {
0xc0, 0x10, 0, 0, 0, 0, // larl %r1, GOTPLT_ENTRY_OFFSET
0xe3, 0x10, 0x10, 0x00, 0x00, 0x04, // lg %r1, (%r1)
0xc0, 0x01, 0, 0, 0, 0, // lgfi %r0, PLT_INDEX
0x07, 0xf1, // br %r1
0x07, 0x00, 0x07, 0x00, 0x07, 0x00, // nopr; nopr; nopr
0x07, 0x00, 0x07, 0x00, 0x07, 0x00, // nopr; nopr; nopr
};
memcpy(buf, insn, sizeof(insn));
*(ub32 *)(buf + 2) = (sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx)) >> 1;
*(ub32 *)(buf + 14) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static u8 insn[] = {
0xc0, 0x10, 0, 0, 0, 0, // larl %r1, GOT_ENTRY_OFFSET
0xe3, 0x10, 0x10, 0x00, 0x00, 0x04, // lg %r1, (%r1)
0x07, 0xf1, // br %r1
0x07, 0x00, // nopr
};
memcpy(buf, insn, sizeof(insn));
*(ub32 *)(buf + 2) = (sym.get_got_addr(ctx) - sym.get_plt_addr(ctx)) >> 1;
}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_390_PC32:
*(ub32 *)loc = val - this->shdr.sh_addr - offset;
break;
case R_390_64:
*(ub64 *)loc = val;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
auto check_dbl = [&](i64 val, i64 lo, i64 hi) {
check(val, lo, hi);
// R_390_*DBL relocs should never refer to a symbol at an odd address
if (val & 1)
Error(ctx) << *this << ": misaligned symbol " << sym
<< " for relocation " << rel;
};
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_390_64:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_390_8:
check(S + A, 0, 1 << 8);
*loc = S + A;
break;
case R_390_12:
check(S + A, 0, 1 << 12);
*(ul16 *)loc |= bits(S + A, 11, 0);
break;
case R_390_16:
check(S + A, 0, 1 << 16);
*(ub16 *)loc = S + A;
break;
case R_390_20:
check(S + A, 0, 1 << 20);
write_mid20(loc, S + A);
break;
case R_390_32:
case R_390_PLT32:
check(S + A, 0, 1LL << 32);
*(ub32 *)loc = S + A;
break;
case R_390_PLT64:
*(ub64 *)loc = S + A;
break;
case R_390_PC12DBL:
case R_390_PLT12DBL:
check_dbl(S + A - P, -(1 << 12), 1 << 12);
*(ul16 *)loc |= bits(S + A - P, 12, 1);
break;
case R_390_PC16:
check(S + A - P, -(1 << 15), 1 << 15);
*(ub16 *)loc = S + A - P;
break;
case R_390_PC32:
check(S + A - P, -(1LL << 31), 1LL << 31);
*(ub32 *)loc = S + A - P;
break;
case R_390_PC64:
*(ub64 *)loc = S + A - P;
break;
case R_390_PC16DBL:
case R_390_PLT16DBL:
check_dbl(S + A - P, -(1 << 16), 1 << 16);
*(ub16 *)loc = (S + A - P) >> 1;
break;
case R_390_PC24DBL:
case R_390_PLT24DBL:
check_dbl(S + A - P, -(1 << 24), 1 << 24);
*(ub32 *)loc |= bits(S + A - P, 24, 1);
break;
case R_390_PC32DBL:
case R_390_PLT32DBL:
check_dbl(S + A - P, -(1LL << 32), 1LL << 32);
*(ub32 *)loc = (S + A - P) >> 1;
break;
case R_390_GOT12:
case R_390_GOTPLT12:
check(G + A, 0, 1 << 12);
*(ul16 *)loc |= bits(G + A, 11, 0);
break;
case R_390_GOT16:
case R_390_GOTPLT16:
check(G + A, 0, 1 << 16);
*(ub16 *)loc = G + A;
break;
case R_390_GOT20:
case R_390_GOTPLT20:
check(G + A, 0, 1 << 20);
write_mid20(loc, G + A);
break;
case R_390_GOT32:
case R_390_GOTPLT32:
check(G + A, 0, 1LL << 32);
*(ub32 *)loc = G + A;
break;
case R_390_GOT64:
case R_390_GOTPLT64:
*(ub64 *)loc = G + A;
break;
case R_390_GOTOFF16:
case R_390_PLTOFF16:
check(S + A - GOT, -(1 << 15), 1 << 15);
*(ub16 *)loc = S + A - GOT;
break;
case R_390_GOTOFF32:
case R_390_PLTOFF32:
check(S + A - GOT, -(1LL << 31), 1LL << 31);
*(ub32 *)loc = S + A - GOT;
break;
case R_390_GOTOFF64:
case R_390_PLTOFF64:
*(ub64 *)loc = S + A - GOT;
break;
case R_390_GOTPC:
*(ub64 *)loc = GOT + A - P;
break;
case R_390_GOTPCDBL:
check_dbl(GOT + A - P, -(1LL << 32), 1LL << 32);
*(ub32 *)loc = (GOT + A - P) >> 1;
break;
case R_390_GOTENT:
check(GOT + G + A - P, -(1LL << 32), 1LL << 32);
*(ub32 *)loc = (GOT + G + A - P) >> 1;
break;
case R_390_TLS_LE32:
*(ub32 *)loc = S + A - ctx.tp_addr;
break;
case R_390_TLS_LE64:
*(ub64 *)loc = S + A - ctx.tp_addr;
break;
case R_390_TLS_GOTIE20:
write_mid20(loc, sym.get_gottp_addr(ctx) + A - GOT);
break;
case R_390_TLS_IEENT:
*(ub32 *)loc = (sym.get_gottp_addr(ctx) + A - P) >> 1;
break;
case R_390_TLS_GD32:
if (sym.has_tlsgd(ctx))
*(ub32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
else if (sym.has_gottp(ctx))
*(ub32 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
else
*(ub32 *)loc = S + A - ctx.tp_addr;
break;
case R_390_TLS_GD64:
if (sym.has_tlsgd(ctx))
*(ub64 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
else if (sym.has_gottp(ctx))
*(ub64 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
else
*(ub64 *)loc = S + A - ctx.tp_addr;
break;
case R_390_TLS_GDCALL:
if (sym.has_tlsgd(ctx)) {
// do nothing
} else if (sym.has_gottp(ctx)) {
// lg %r2, 0(%r2, %r12)
static u8 insn[] = { 0xe3, 0x22, 0xc0, 0x00, 0x00, 0x04 };
memcpy(loc, insn, sizeof(insn));
} else {
// nop
static u8 insn[] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 };
memcpy(loc, insn, sizeof(insn));
}
break;
case R_390_TLS_LDM32:
if (ctx.got->has_tlsld(ctx))
*(ub32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
break;
case R_390_TLS_LDM64:
if (ctx.got->has_tlsld(ctx))
*(ub64 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
break;
case R_390_TLS_LDO32:
if (ctx.got->has_tlsld(ctx))
*(ub32 *)loc = S + A - ctx.dtp_addr;
else
*(ub32 *)loc = S + A - ctx.tp_addr;
break;
case R_390_TLS_LDO64:
if (ctx.got->has_tlsld(ctx))
*(ub64 *)loc = S + A - ctx.dtp_addr;
else
*(ub64 *)loc = S + A - ctx.tp_addr;
break;
case R_390_TLS_LDCALL:
if (!ctx.got->has_tlsld(ctx)) {
// nop
static u8 insn[] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 };
memcpy(loc, insn, sizeof(insn));
}
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_390_32: {
i64 val = S + A;
check(val, 0, 1LL << 32);
*(ub32 *)loc = val;
break;
}
case R_390_64:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ub64 *)loc = *val;
else
*(ub64 *)loc = S + A;
break;
case R_390_TLS_LDO64:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ub64 *)loc = *val;
else
*(ub64 *)loc = S + A - ctx.dtp_addr;
break;
default:
Fatal(ctx) << *this << ": apply_reloc_nonalloc: " << rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_390_64:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_390_8:
case R_390_12:
case R_390_16:
case R_390_20:
case R_390_32:
scan_absrel(ctx, sym, rel);
break;
case R_390_PC16:
case R_390_PC16DBL:
case R_390_PC32:
case R_390_PC32DBL:
case R_390_PC64:
scan_pcrel(ctx, sym, rel);
break;
case R_390_GOT12:
case R_390_GOT16:
case R_390_GOT20:
case R_390_GOT32:
case R_390_GOT64:
case R_390_GOTOFF16:
case R_390_GOTOFF32:
case R_390_GOTOFF64:
case R_390_GOTPLT12:
case R_390_GOTPLT16:
case R_390_GOTPLT20:
case R_390_GOTPLT32:
case R_390_GOTPLT64:
case R_390_GOTPC:
case R_390_GOTPCDBL:
case R_390_GOTENT:
sym.flags |= NEEDS_GOT;
break;
case R_390_PLT12DBL:
case R_390_PLT16DBL:
case R_390_PLT24DBL:
case R_390_PLT32:
case R_390_PLT32DBL:
case R_390_PLT64:
case R_390_PLTOFF16:
case R_390_PLTOFF32:
case R_390_PLTOFF64:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_390_TLS_GOTIE20:
case R_390_TLS_IEENT:
sym.flags |= NEEDS_GOTTP;
break;
case R_390_TLS_GD32:
case R_390_TLS_GD64:
// We always want to relax calls to __tls_get_offset() in statically-
// linked executables because __tls_get_offset() in libc.a just calls
// abort().
if (ctx.arg.is_static ||
(ctx.arg.relax && !sym.is_imported && !ctx.arg.shared)) {
// do nothing
} else if (ctx.arg.relax && !sym.is_imported && ctx.arg.shared &&
!ctx.arg.z_dlopen) {
sym.flags |= NEEDS_GOTTP;
} else {
sym.flags |= NEEDS_TLSGD;
}
break;
case R_390_TLS_LDM32:
case R_390_TLS_LDM64: {
bool do_relax = ctx.arg.is_static || (ctx.arg.relax && !ctx.arg.shared);
if (!do_relax)
ctx.needs_tlsld = true;
break;
}
case R_390_TLS_LE32:
case R_390_TLS_LE64:
check_tlsle(ctx, sym, rel);
break;
case R_390_TLS_LDO32:
case R_390_TLS_LDO64:
case R_390_TLS_GDCALL:
case R_390_TLS_LDCALL:
break;
default:
Fatal(ctx) << *this << ": scan_relocations: " << rel;
}
}
}
} // namespace mold::elf


@@ -1,355 +0,0 @@
// clang-format off
// SH-4 (SuperH 4) is a 32-bit RISC ISA developed by Hitachi in the early
// '90s. Some relatively powerful systems were developed with SH-4.
// A notable example is Sega's Dreamcast game console which debuted in 1998.
// Hitachi later spun off its semiconductor division as an independent
// company, Renesas, and Renesas is still selling SH-4 processors for the
// embedded market. It has never been as popular as ARM, though, and
// its popularity continues to decline.
//
// SH-4's most distinctive feature compared to other RISC ISAs is that its
// instructions are 16 bits in length instead of more common 32 bits for
// better code density. This difference affects various aspects of its
// instruction set as shown below:
//
// - SH-4 has 16 general-purpose registers (GPRs) instead of the more
// common 32-GPR configuration, saving one bit when specifying a register.
//
// - Binary instructions such as ADD normally take three registers in
// RISC ISAs (e.g. x ← y ⊕ z where x, y and z are registers), but
// SH-4's instructions take only two registers. The result of an
// operation is written to one of the source registers (e.g. x ← x ⊕ y).
//
// - Usual RISC ISAs have "load high" and "load low" instructions to set
// an immediate to most significant and least significant bits in a
// register to construct a full 32-bit value in a register. This
// technique is hard to use in SH-4, as 16 bit instructions are too
// small to contain large immediates. On SH-4, large immediates are
// loaded from memory using `mov.l` PC-relative load instruction.
//
// - Many RISC ISAs are, despite their name, actually fairly complex.
// They tend to have hundreds if not thousands of different instructions.
// SH-4 doesn't really have that many instructions because its 16-bit
// machine code simply can't encode many different opcodes. As a
// result, the number of relocations the linker has to support is also
// small.
//
// Besides these, SH-4 has a branch delay slot just like contemporary
// MIPS and SPARC. That is, the instruction after a branch instruction
// is always executed even if the branch is taken. The delay slot allows
// a pipelined CPU to start and finish executing an instruction after a
// branch regardless of the branch's condition, simplifying the
// processor's implementation. It's considered a bad premature
// optimization nowadays, though. Modern RISC processors don't have it.
//
// Here are notes about the SH-4 psABI:
//
// - If a source file is compiled with -fPIC, each function starts
// with a piece of code to store the address of .got to %r12.
// We can use the register in our PLT for position-independent output.
//
// - Even though it uses RELA-type relocations, relocation addends are
// stored not in the r_addend field but in the relocated section
// contents for some reason. Therefore, it's effectively REL.
//
// - It looks like the ecosystem has bit-rotted. Some tests, especially
// ones using C++ exceptions, don't pass even with GNU ld.
//
// - GCC/SH4 tends to write dynamically-relocated data into .text, so the
// output from the linker contains lots of text relocations. That's not
// a problem with embedded programming, I guess.
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = SH4;
// Even though SH-4 uses RELA-type relocations, addends are stored in
// the relocated places for some reason.
template <>
i64 get_addend(u8 *loc, const ElfRel<E> &rel) {
switch (rel.r_type) {
case R_SH_DIR32:
case R_SH_REL32:
case R_SH_TLS_GD_32:
case R_SH_TLS_LD_32:
case R_SH_TLS_LDO_32:
case R_SH_TLS_IE_32:
case R_SH_TLS_LE_32:
case R_SH_TLS_DTPMOD32:
case R_SH_TLS_DTPOFF32:
case R_SH_TLS_TPOFF32:
case R_SH_GOT32:
case R_SH_PLT32:
case R_SH_GOTOFF:
case R_SH_GOTPC:
case R_SH_GOTPLT32:
return *(ul32 *)loc;
default:
return 0;
}
}
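// For example (illustrative, not from this file), for an assembly
// directive such as
//
//   .long foo + 16   # emits R_SH_DIR32 against foo
//
// the constant 16 is stored at the relocated place itself, and the
// relocation's r_addend field is left unused.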
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
if (ctx.arg.pic) {
static const u8 insn[] = {
0x02, 0xd2, // mov.l 1f, r2
0xcc, 0x32, // add r12, r2
0x22, 0x50, // mov.l @(8, r2), r0
0x21, 0x52, // mov.l @(4, r2), r2
0x2b, 0x40, // jmp @r0
0x00, 0xe0, // mov #0, r0
0, 0, 0, 0, // 1: .long GOTPLT
};
static_assert(sizeof(insn) == E::plt_hdr_size);
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 12) = ctx.gotplt->shdr.sh_addr - ctx.got->shdr.sh_addr;
} else {
static const u8 insn[] = {
0x02, 0xd2, // mov.l 1f, r2
0x22, 0x50, // mov.l @(8, r2), r0
0x21, 0x52, // mov.l @(4, r2), r2
0x2b, 0x40, // jmp @r0
0x00, 0xe0, // mov #0, r0
0x09, 0x00, // nop
0, 0, 0, 0, // 1: .long GOTPLT
};
static_assert(sizeof(insn) == E::plt_hdr_size);
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 12) = ctx.gotplt->shdr.sh_addr;
}
}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
if (ctx.arg.pic) {
static const u8 insn[] = {
0x01, 0xd0, // mov.l 1f, r0
0xce, 0x00, // mov.l @(r0, r12), r0
0x2b, 0x40, // jmp @r0
0x01, 0xd1, // mov.l 2f, r1
0, 0, 0, 0, // 1: .long GOTPLT_ENTRY
0, 0, 0, 0, // 2: .long INDEX_IN_RELPLT
};
static_assert(sizeof(insn) == E::plt_size);
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 8) = sym.get_gotplt_addr(ctx) - ctx.got->shdr.sh_addr;
*(ul32 *)(buf + 12) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
} else {
static const u8 insn[] = {
0x01, 0xd0, // mov.l 1f, r0
0x02, 0x60, // mov.l @r0, r0
0x2b, 0x40, // jmp @r0
0x01, 0xd1, // mov.l 2f, r1
0, 0, 0, 0, // 1: .long GOTPLT_ENTRY
0, 0, 0, 0, // 2: .long INDEX_IN_RELPLT
};
static_assert(sizeof(insn) == E::plt_size);
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 8) = sym.get_gotplt_addr(ctx);
*(ul32 *)(buf + 12) = sym.get_plt_idx(ctx) * sizeof(ElfRel<E>);
}
}
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
if (ctx.arg.pic) {
static const u8 insn[] = {
0x01, 0xd0, // mov.l 1f, r0
0xce, 0x00, // mov.l @(r0, r12), r0
0x2b, 0x40, // jmp @r0
0x09, 0x00, // nop
0, 0, 0, 0, // 1: .long GOT_ENTRY
};
static_assert(sizeof(insn) == E::pltgot_size);
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 8) = sym.get_got_addr(ctx) - ctx.got->shdr.sh_addr;
} else {
static const u8 insn[] = {
0x01, 0xd0, // mov.l 1f, r0
0x02, 0x60, // mov.l @r0, r0
0x2b, 0x40, // jmp @r0
0x09, 0x00, // nop
0, 0, 0, 0, // 1: .long GOT_ENTRY
};
static_assert(sizeof(insn) == E::pltgot_size);
memcpy(buf, insn, sizeof(insn));
*(ul32 *)(buf + 8) = sym.get_got_addr(ctx);
}
}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_SH_DIR32:
*(ul32 *)loc = val;
break;
case R_SH_REL32:
*(ul32 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
u64 S = sym.get_addr(ctx);
u64 A = get_addend(loc, rel);
u64 P = get_addr() + rel.r_offset;
u64 G = sym.get_got_idx(ctx) * sizeof(Word<E>);
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_SH_DIR32:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_SH_REL32:
case R_SH_PLT32:
*(ul32 *)loc = S + A - P;
break;
case R_SH_GOT32:
*(ul32 *)loc = G;
break;
case R_SH_GOTPC:
*(ul32 *)loc = GOT + A - P;
break;
case R_SH_GOTOFF:
*(ul32 *)loc = S + A - GOT;
break;
case R_SH_TLS_GD_32:
*(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT;
break;
case R_SH_TLS_LD_32:
*(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT;
break;
case R_SH_TLS_LDO_32:
*(ul32 *)loc = S + A - ctx.dtp_addr;
break;
case R_SH_TLS_IE_32:
*(ul32 *)loc = sym.get_gottp_addr(ctx) + A - GOT;
break;
case R_SH_TLS_LE_32:
*(ul32 *)loc = S + A - ctx.tp_addr;
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : get_addend(loc, rel);
switch (rel.r_type) {
case R_SH_DIR32:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ul32 *)loc = *val;
else
*(ul32 *)loc = S + A;
break;
default:
Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: "
<< rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
Error(ctx) << sym << ": GNU ifunc symbol is not supported on sh4";
switch (rel.r_type) {
case R_SH_DIR32:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_SH_REL32:
scan_pcrel(ctx, sym, rel);
break;
case R_SH_GOT32:
sym.flags |= NEEDS_GOT;
break;
case R_SH_PLT32:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_SH_TLS_GD_32:
sym.flags |= NEEDS_TLSGD;
break;
case R_SH_TLS_LD_32:
ctx.needs_tlsld = true;
break;
case R_SH_TLS_IE_32:
sym.flags |= NEEDS_GOTTP;
break;
case R_SH_TLS_LE_32:
check_tlsle(ctx, sym, rel);
break;
case R_SH_GOTPC:
case R_SH_GOTOFF:
case R_SH_TLS_LDO_32:
break;
default:
Fatal(ctx) << *this << ": unknown relocation: " << rel;
}
}
}
} // namespace mold::elf


@@ -1,622 +0,0 @@
// clang-format off
// SPARC is a RISC ISA developed by Sun Microsystems.
//
// The byte order of the processor is big-endian. Anything larger than a
// byte is stored in the "reverse" order compared to little-endian
// processors such as x86-64.
//
// All instructions are 4 bytes long and aligned to 4-byte boundaries.
//
// A notable feature of SPARC is that, unlike other RISC ISAs, it
// doesn't need range extension thunks. That is because SPARC's CALL
// instruction contains a whopping 30-bit immediate. The processor
// scales it by 4 to extend it to 32 bits (this is doable because all
// instructions are aligned to 4-byte boundaries, so the least
// significant two bits are always zero). That means CALL's reach is
// PC ± 2 GiB, eliminating the need for range extension thunks. It comes
// at the cost that the CALL instruction alone takes 1/4th of the
// instruction encoding space, though.
//
// SPARC has 32 general-purpose registers. The CALL instruction saves
// the return address in %o7, which is an alias for %r15. The thread
// pointer is stored in %g7, which is %r7.
//
// SPARC does not have PC-relative load/store instructions. To access data
// in the position-independent manner, we usually first set the address of
// .got to, for example, %l7, with the following piece of code
//
// sethi %hi(. - _GLOBAL_OFFSET_TABLE_), %l7
// add %l7, %lo(. - _GLOBAL_OFFSET_TABLE_), %l7
// call __sparc_get_pc_thunk.l7
// nop
//
// where __sparc_get_pc_thunk.l7 is defined as
//
// retl
// add %o7, %l7, %l7
//
// SETHI and the following ADD materialize a 32-bit offset to .got. The
// CALL instruction sets the return address in %o7, and the subsequent
// ADD adds it to the GOT offset to materialize the absolute address of
// .got.
//
// Note that we have a NOP after CALL and an ADD after RETL because of
// SPARC's branch delay slots. That is, the SPARC processor always
// executes one instruction after a branch even if the branch is taken.
// This may seem like an odd behavior, and indeed it is considered as such
// (that's a premature optimization for the early pipelined SPARC
// processors), but that's been a part of the ISA's spec so that's what it
// is.
//
// Note also that the .got address obtained this way is not shared between
// functions, so functions can use an arbitrary register to hold the .got
// address. That also means each function needs to execute the above piece
// of code to become position-independent.
//
// This scheme is very similar to i386. That may not be a coincidence
// because the i386 ELF psABI is created by Sun Microsystems too.
//
// https://github.com/rui314/psabi/blob/main/sparc.pdf
#include "third_party/mold/elf/mold.h"
namespace mold::elf {
using E = SPARC64;
// SPARC's PLT section is writable despite containing executable code.
// We don't need to write the PLT header entry because the dynamic loader
// will do that for us.
//
// We also don't need a .got.plt section to store the result of lazy PLT
// symbol resolution because the dynamic symbol resolver directly mutates
// instructions in PLT so that they jump to the right places next time.
// That's why each PLT entry contains lots of NOPs; they are a placeholder
// for the runtime to add more instructions.
//
// Self-modifying code is nowadays considered really bad from the security
// point of view, though.
template <>
void write_plt_header(Context<E> &ctx, u8 *buf) {
memset(buf, 0, E::plt_hdr_size);
}
template <>
void write_plt_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static ub32 insn[] = {
0x0300'0000, // sethi (. - .PLT0), %g1
0x3068'0000, // ba,a %xcc, .PLT1
0x0100'0000, // nop
0x0100'0000, // nop
0x0100'0000, // nop
0x0100'0000, // nop
0x0100'0000, // nop
0x0100'0000, // nop
};
u64 plt0 = ctx.plt->shdr.sh_addr;
u64 plt1 = ctx.plt->shdr.sh_addr + E::plt_size;
u64 entry = sym.get_plt_addr(ctx);
memcpy(buf, insn, sizeof(insn));
*(ub32 *)buf |= bits(entry - plt0, 21, 0);
*(ub32 *)(buf + 4) |= bits(plt1 - entry - 4, 20, 2);
}
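// Note that the branch displacement in `ba,a` is counted in 4-byte
// words, so bits(..., 20, 2) above drops the two always-zero low bits;
// the `- 4` is there because the displacement is relative to the branch
// instruction itself, which is the second word of the entry.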
template <>
void write_pltgot_entry(Context<E> &ctx, u8 *buf, Symbol<E> &sym) {
static ub32 entry[] = {
0x8a10'000f, // mov %o7, %g5
0x4000'0002, // call . + 8
0xc25b'e014, // ldx [ %o7 + 20 ], %g1
0xc25b'c001, // ldx [ %o7 + %g1 ], %g1
0x81c0'4000, // jmp %g1
0x9e10'0005, // mov %g5, %o7
0x0000'0000, // .quad $got_entry - $plt_entry - 4
0x0000'0000,
};
memcpy(buf, entry, sizeof(entry));
*(ub64 *)(buf + 24) = sym.get_got_addr(ctx) - sym.get_plt_addr(ctx) - 4;
}
template <>
void EhFrameSection<E>::apply_reloc(Context<E> &ctx, const ElfRel<E> &rel,
u64 offset, u64 val) {
u8 *loc = ctx.buf + this->shdr.sh_offset + offset;
switch (rel.r_type) {
case R_NONE:
break;
case R_SPARC_64:
case R_SPARC_UA64:
*(ub64 *)loc = val;
break;
case R_SPARC_DISP32:
*(ub32 *)loc = val - this->shdr.sh_addr - offset;
break;
default:
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel;
}
}
template <>
void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
ElfRel<E> *dynrel = nullptr;
if (ctx.reldyn)
dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
file.reldyn_offset + this->reldyn_offset);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE)
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
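// Standard psABI notation: S is the symbol value, A the addend, P the
// place (address) being relocated, G the symbol's offset within the
// GOT, and GOT the address of .got itself.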
u64 S = sym.get_addr(ctx);
u64 A = rel.r_addend;
u64 P = (get_addr() + rel.r_offset);
u64 G = (sym.get_got_idx(ctx) * sizeof(Word<E>));
u64 GOT = ctx.got->shdr.sh_addr;
switch (rel.r_type) {
case R_SPARC_64:
apply_dyn_absrel(ctx, sym, rel, loc, S, A, P, dynrel);
break;
case R_SPARC_5:
check(S + A, 0, 1 << 5);
*(ub32 *)loc |= bits(S + A, 4, 0);
break;
case R_SPARC_6:
check(S + A, 0, 1 << 6);
*(ub32 *)loc |= bits(S + A, 5, 0);
break;
case R_SPARC_7:
check(S + A, 0, 1 << 7);
*(ub32 *)loc |= bits(S + A, 6, 0);
break;
case R_SPARC_8:
check(S + A, 0, 1 << 8);
*(u8 *)loc = S + A;
break;
case R_SPARC_10:
check(S + A, 0, 1 << 10);
*(ub32 *)loc |= bits(S + A, 9, 0);
break;
case R_SPARC_LO10:
case R_SPARC_LOPLT10:
*(ub32 *)loc |= bits(S + A, 9, 0);
break;
case R_SPARC_11:
check(S + A, 0, 1 << 11);
*(ub32 *)loc |= bits(S + A, 10, 0);
break;
case R_SPARC_13:
check(S + A, 0, 1 << 13);
*(ub32 *)loc |= bits(S + A, 12, 0);
break;
case R_SPARC_16:
case R_SPARC_UA16:
check(S + A, 0, 1 << 16);
*(ub16 *)loc = S + A;
break;
case R_SPARC_22:
check(S + A, 0, 1 << 22);
*(ub32 *)loc |= bits(S + A, 21, 0);
break;
case R_SPARC_32:
case R_SPARC_UA32:
case R_SPARC_PLT32:
check(S + A, 0, 1LL << 32);
*(ub32 *)loc = S + A;
break;
case R_SPARC_PLT64:
case R_SPARC_UA64:
case R_SPARC_REGISTER:
*(ub64 *)loc = S + A;
break;
case R_SPARC_DISP8:
check(S + A - P, -(1 << 7), 1 << 7);
*(u8 *)loc = S + A - P;
break;
case R_SPARC_DISP16:
check(S + A - P, -(1 << 15), 1 << 15);
*(ub16 *)loc = S + A - P;
break;
case R_SPARC_DISP32:
case R_SPARC_PCPLT32:
check(S + A - P, -(1LL << 31), 1LL << 31);
*(ub32 *)loc = S + A - P;
break;
case R_SPARC_DISP64:
*(ub64 *)loc = S + A - P;
break;
case R_SPARC_WDISP16: {
i64 val = S + A - P;
check(val, -(1 << 16), 1 << 16);
*(ub32 *)loc |= (bit(val, 16) << 21) | bits(val, 15, 2);
break;
}
case R_SPARC_WDISP19:
check(S + A - P, -(1 << 20), 1 << 20);
*(ub32 *)loc |= bits(S + A - P, 20, 2);
break;
case R_SPARC_WDISP22:
check(S + A - P, -(1 << 23), 1 << 23);
*(ub32 *)loc |= bits(S + A - P, 23, 2);
break;
case R_SPARC_WDISP30:
case R_SPARC_WPLT30:
check(S + A - P, -(1LL << 31), 1LL << 31);
*(ub32 *)loc |= bits(S + A - P, 31, 2);
break;
case R_SPARC_HI22:
case R_SPARC_HIPLT22:
case R_SPARC_LM22:
*(ub32 *)loc |= bits(S + A, 31, 10);
break;
case R_SPARC_GOT10:
*(ub32 *)loc |= bits(G, 9, 0);
break;
case R_SPARC_GOT13:
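// The 13-bit immediate field is sign-extended, so only 12 bits are
// usable for a non-negative GOT offset.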
check(G, 0, 1 << 12);
*(ub32 *)loc |= bits(G, 12, 0);
break;
case R_SPARC_GOT22:
*(ub32 *)loc |= bits(G, 31, 10);
break;
case R_SPARC_GOTDATA_HIX22: {
i64 val = S + A - GOT;
*(ub32 *)loc |= bits(val < 0 ? ~val : val, 31, 10);
break;
}
case R_SPARC_GOTDATA_LOX10: {
i64 val = S + A - GOT;
*(ub32 *)loc |= bits(val, 9, 0) | (val < 0 ? 0b1'1100'0000'0000 : 0);
break;
}
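// For the HIX22/LOX10-style pairs here and below: if the value is
// negative, SETHI materializes the complemented upper bits, and the
// paired XOR's 13-bit immediate has its top three bits set (that's the
// 0b1'1100'0000'0000 constant), so sign extension fills all the bits
// above bit 9 with ones. XOR'ing the two thus reconstructs a correctly
// sign-extended value.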
case R_SPARC_GOTDATA_OP_HIX22:
// We always have to relax a GOT load to a load immediate if a
// symbol is local, because R_SPARC_GOTDATA_OP cannot represent
// an addend for a local symbol.
if (sym.is_imported || sym.is_ifunc()) {
*(ub32 *)loc |= bits(G, 31, 10);
} else if (sym.is_absolute()) {
i64 val = S + A;
*(ub32 *)loc |= bits(val < 0 ? ~val : val, 31, 10);
} else {
i64 val = S + A - GOT;
*(ub32 *)loc |= bits(val < 0 ? ~val : val, 31, 10);
}
break;
case R_SPARC_GOTDATA_OP_LOX10: {
if (sym.is_imported || sym.is_ifunc()) {
*(ub32 *)loc |= bits(G, 9, 0);
} else if (sym.is_absolute()) {
i64 val = S + A;
*(ub32 *)loc |= bits(val, 9, 0) | (val < 0 ? 0b1'1100'0000'0000 : 0);
} else {
i64 val = S + A - GOT;
*(ub32 *)loc |= bits(val, 9, 0) | (val < 0 ? 0b1'1100'0000'0000 : 0);
}
break;
}
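// The GOTDATA_OP* relocations annotate a three-instruction GOT load such
// as
//
//   sethi %gdop_hix22(x), %g1
//   xor %g1, %gdop_lox10(x), %g1
//   ldx [ %g2 + %g1 ], %g1 ! tagged with R_SPARC_GOTDATA_OP
//
// where %g2 holds the GOT address. For a non-imported symbol, the two
// cases above have already materialized the GOT-relative or absolute
// value directly, so the LDX can be relaxed to an ADD or a NOP below.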
case R_SPARC_GOTDATA_OP:
if (sym.is_imported || sym.is_ifunc())
break;
if (sym.is_absolute()) {
// ldx [ %g2 + %g1 ], %g1 → nop
*(ub32 *)loc = 0x0100'0000;
} else {
// ldx [ %g2 + %g1 ], %g1 → add %g2, %g1, %g1
*(ub32 *)loc &= 0b00'11111'000000'11111'1'11111111'11111;
*(ub32 *)loc |= 0b10'00000'000000'00000'0'00000000'00000;
}
break;
case R_SPARC_PC10:
case R_SPARC_PCPLT10:
*(ub32 *)loc |= bits(S + A - P, 9, 0);
break;
case R_SPARC_PC22:
case R_SPARC_PCPLT22:
case R_SPARC_PC_LM22:
*(ub32 *)loc |= bits(S + A - P, 31, 10);
break;
case R_SPARC_OLO10:
*(ub32 *)loc |= bits(bits(S + A, 9, 0) + rel.r_type_data, 12, 0);
break;
case R_SPARC_HH22:
*(ub32 *)loc |= bits(S + A, 63, 42);
break;
case R_SPARC_HM10:
*(ub32 *)loc |= bits(S + A, 41, 32);
break;
case R_SPARC_PC_HH22:
*(ub32 *)loc |= bits(S + A - P, 63, 42);
break;
case R_SPARC_PC_HM10:
*(ub32 *)loc |= bits(S + A - P, 41, 32);
break;
case R_SPARC_HIX22:
*(ub32 *)loc |= bits(~(S + A), 31, 10);
break;
case R_SPARC_LOX10:
*(ub32 *)loc |= bits(S + A, 9, 0) | 0b1'1100'0000'0000;
break;
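// H44/M44/L44 materialize a 44-bit absolute address in three pieces of
// 22, 10, and 12 bits, corresponding to SPARC's medium code model.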
case R_SPARC_H44:
*(ub32 *)loc |= bits(S + A, 43, 22);
break;
case R_SPARC_M44:
*(ub32 *)loc |= bits(S + A, 21, 12);
break;
case R_SPARC_L44:
*(ub32 *)loc |= bits(S + A, 11, 0);
break;
case R_SPARC_TLS_GD_HI22:
*(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 31, 10);
break;
case R_SPARC_TLS_GD_LO10:
*(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 9, 0);
break;
case R_SPARC_TLS_GD_CALL:
case R_SPARC_TLS_LDM_CALL: {
u64 addr;
if (ctx.arg.is_static)
addr = ctx.extra.tls_get_addr_sec->shdr.sh_addr;
else
addr = ctx.extra.tls_get_addr_sym->get_addr(ctx);
*(ub32 *)loc |= bits(addr + A - P, 31, 2);
break;
}
case R_SPARC_TLS_LDM_HI22:
*(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 31, 10);
break;
case R_SPARC_TLS_LDM_LO10:
*(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 9, 0);
break;
case R_SPARC_TLS_LDO_HIX22:
*(ub32 *)loc |= bits(S + A - ctx.dtp_addr, 31, 10);
break;
case R_SPARC_TLS_LDO_LOX10:
*(ub32 *)loc |= bits(S + A - ctx.dtp_addr, 9, 0);
break;
case R_SPARC_TLS_IE_HI22:
*(ub32 *)loc |= bits(sym.get_gottp_addr(ctx) + A - GOT, 31, 10);
break;
case R_SPARC_TLS_IE_LO10:
*(ub32 *)loc |= bits(sym.get_gottp_addr(ctx) + A - GOT, 9, 0);
break;
case R_SPARC_TLS_LE_HIX22:
*(ub32 *)loc |= bits(~(S + A - ctx.tp_addr), 31, 10);
break;
case R_SPARC_TLS_LE_LOX10:
*(ub32 *)loc |= bits(S + A - ctx.tp_addr, 9, 0) | 0b1'1100'0000'0000;
break;
case R_SPARC_SIZE32:
*(ub32 *)loc = sym.esym().st_size + A;
break;
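// The following relocation types merely tag instructions that a linker
// could rewrite under TLS relaxation. We don't rewrite them, so there's
// nothing to do here.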
case R_SPARC_TLS_GD_ADD:
case R_SPARC_TLS_LDM_ADD:
case R_SPARC_TLS_LDO_ADD:
case R_SPARC_TLS_IE_LD:
case R_SPARC_TLS_IE_LDX:
case R_SPARC_TLS_IE_ADD:
break;
default:
unreachable();
}
}
}
template <>
void InputSection<E>::apply_reloc_nonalloc(Context<E> &ctx, u8 *base) {
std::span<const ElfRel<E>> rels = get_rels(ctx);
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
u8 *loc = base + rel.r_offset;
auto check = [&](i64 val, i64 lo, i64 hi) {
if (val < lo || hi <= val)
Error(ctx) << *this << ": relocation " << rel << " against "
<< sym << " out of range: " << val << " is not in ["
<< lo << ", " << hi << ")";
};
SectionFragment<E> *frag;
i64 frag_addend;
std::tie(frag, frag_addend) = get_fragment(ctx, rel);
u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
u64 A = frag ? frag_addend : (i64)rel.r_addend;
switch (rel.r_type) {
case R_SPARC_64:
case R_SPARC_UA64:
if (std::optional<u64> val = get_tombstone(sym, frag))
*(ub64 *)loc = *val;
else
*(ub64 *)loc = S + A;
break;
case R_SPARC_32:
case R_SPARC_UA32: {
i64 val = S + A;
check(val, 0, 1LL << 32);
*(ub32 *)loc = val;
break;
}
case R_SPARC_TLS_DTPOFF32:
*(ub32 *)loc = S + A - ctx.dtp_addr;
break;
case R_SPARC_TLS_DTPOFF64:
*(ub64 *)loc = S + A - ctx.dtp_addr;
break;
default:
Fatal(ctx) << *this << ": apply_reloc_nonalloc: " << rel;
}
}
}
template <>
void InputSection<E>::scan_relocations(Context<E> &ctx) {
assert(shdr().sh_flags & SHF_ALLOC);
this->reldyn_offset = file.num_dynrel * sizeof(ElfRel<E>);
std::span<const ElfRel<E>> rels = get_rels(ctx);
// Scan relocations
for (i64 i = 0; i < rels.size(); i++) {
const ElfRel<E> &rel = rels[i];
if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
continue;
Symbol<E> &sym = *file.symbols[rel.r_sym];
if (sym.is_ifunc())
sym.flags |= NEEDS_GOT | NEEDS_PLT;
switch (rel.r_type) {
case R_SPARC_64:
scan_dyn_absrel(ctx, sym, rel);
break;
case R_SPARC_8:
case R_SPARC_5:
case R_SPARC_6:
case R_SPARC_7:
case R_SPARC_10:
case R_SPARC_11:
case R_SPARC_13:
case R_SPARC_16:
case R_SPARC_22:
case R_SPARC_32:
case R_SPARC_REGISTER:
case R_SPARC_UA16:
case R_SPARC_UA32:
case R_SPARC_UA64:
case R_SPARC_PC_HM10:
case R_SPARC_OLO10:
case R_SPARC_LOX10:
case R_SPARC_HM10:
case R_SPARC_M44:
case R_SPARC_HIX22:
case R_SPARC_LO10:
case R_SPARC_L44:
case R_SPARC_LM22:
case R_SPARC_HI22:
case R_SPARC_H44:
case R_SPARC_HH22:
scan_absrel(ctx, sym, rel);
break;
case R_SPARC_PLT32:
case R_SPARC_WPLT30:
case R_SPARC_WDISP30:
case R_SPARC_HIPLT22:
case R_SPARC_LOPLT10:
case R_SPARC_PCPLT32:
case R_SPARC_PCPLT22:
case R_SPARC_PCPLT10:
case R_SPARC_PLT64:
if (sym.is_imported)
sym.flags |= NEEDS_PLT;
break;
case R_SPARC_GOT13:
case R_SPARC_GOT10:
case R_SPARC_GOT22:
case R_SPARC_GOTDATA_HIX22:
sym.flags |= NEEDS_GOT;
break;
case R_SPARC_GOTDATA_OP_HIX22:
if (sym.is_imported)
sym.flags |= NEEDS_GOT;
break;
case R_SPARC_DISP16:
case R_SPARC_DISP32:
case R_SPARC_DISP64:
case R_SPARC_DISP8:
case R_SPARC_PC10:
case R_SPARC_PC22:
case R_SPARC_PC_LM22:
case R_SPARC_WDISP16:
case R_SPARC_WDISP19:
case R_SPARC_WDISP22:
case R_SPARC_PC_HH22:
scan_pcrel(ctx, sym, rel);
break;
case R_SPARC_TLS_GD_HI22:
sym.flags |= NEEDS_TLSGD;
break;
case R_SPARC_TLS_LDM_HI22:
ctx.needs_tlsld = true;
break;
case R_SPARC_TLS_IE_HI22:
sym.flags |= NEEDS_GOTTP;
break;
case R_SPARC_TLS_GD_CALL:
case R_SPARC_TLS_LDM_CALL:
if (!ctx.arg.is_static && ctx.extra.tls_get_addr_sym->is_imported)
ctx.extra.tls_get_addr_sym->flags |= NEEDS_PLT;
break;
case R_SPARC_TLS_LE_HIX22:
case R_SPARC_TLS_LE_LOX10:
check_tlsle(ctx, sym, rel);
break;
case R_SPARC_GOTDATA_OP_LOX10:
case R_SPARC_GOTDATA_OP:
case R_SPARC_GOTDATA_LOX10:
case R_SPARC_TLS_GD_LO10:
case R_SPARC_TLS_GD_ADD:
case R_SPARC_TLS_LDM_LO10:
case R_SPARC_TLS_LDM_ADD:
case R_SPARC_TLS_LDO_HIX22:
case R_SPARC_TLS_LDO_LOX10:
case R_SPARC_TLS_LDO_ADD:
case R_SPARC_TLS_IE_ADD:
case R_SPARC_TLS_IE_LD:
case R_SPARC_TLS_IE_LDX:
case R_SPARC_TLS_IE_LO10:
case R_SPARC_SIZE32:
break;
default:
Fatal(ctx) << *this << ": scan_relocations: " << rel;
}
}
}
// __tls_get_addr is not defined by libc.a, so we can't use that function
// in statically-linked executables. This section provides a replacement.
void SparcTlsGetAddrSection::copy_buf(Context<E> &ctx) {
ub32 *buf = (ub32 *)(ctx.buf + this->shdr.sh_offset);
static const ub32 insn[] = {
0x0300'0000, // sethi %hi(TP_SIZE), %g1
0x8210'6000, // or %g1, %lo(TP_SIZE), %g1
0x8221'c001, // sub %g7, %g1, %g1
0xd05a'2008, // ldx [ %o0 + 8 ], %o0
0x81c3'e008, // retl
0x9000'4008, // add %g1, %o0, %o0
};
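// %g7 holds the thread pointer. Subtracting TP_SIZE (patched in below as
// tp_addr - tls_begin) yields the start of the static TLS block, and the
// offset loaded from the second word of the tls_index object passed in
// %o0 is then added to form the variable's address.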
assert(this->shdr.sh_size == sizeof(insn));
memcpy(buf, insn, sizeof(insn));
buf[0] |= bits(ctx.tp_addr - ctx.tls_begin, 31, 10);
buf[1] |= bits(ctx.tp_addr - ctx.tls_begin, 9, 0);
}
} // namespace mold::elf

View file

@@ -1,6 +1,6 @@
// clang-format off
#include "third_party/mold/elf/mold.h"
// MISSING #include "../common/cmdline.h"
#include "third_party/mold/cmdline.h"
#include "third_party/libcxx/regex"
#include "third_party/libcxx/sstream"
@@ -36,7 +36,6 @@
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/ok.h"
#include "libc/time/time.h"
#include "third_party/getopt/getopt.internal.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/lockf.h"
#endif

View file

@@ -1,7 +1,7 @@
// clang-format off
#pragma once
// MISSING #include "../common/integers.h"
#include "third_party/mold/integers.h"
#include "third_party/libcxx/ostream"
#include "third_party/libcxx/string"

View file

@@ -1,9 +1,8 @@
// clang-format off
#include "third_party/mold/elf/mold.h"
// MISSING #include "../common/archive-file.h"
// MISSING #include "../common/cmdline.h"
// MISSING #include "../common/output-file.h"
#include "third_party/mold/archive-file.h"
#include "third_party/mold/cmdline.h"
#include "third_party/mold/output-file.h"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/functional"
#include "third_party/libcxx/iomanip"

View file

@@ -2,7 +2,7 @@
#pragma once
#include "third_party/mold/elf/elf.h"
// MISSING #include "../common/common.h"
#include "third_party/mold/common.h"
#include "third_party/libcxx/atomic"
#include "third_party/libcxx/bitset"
@@ -15,16 +15,19 @@
#include "third_party/libcxx/memory"
#include "third_party/libcxx/mutex"
#include "third_party/libcxx/optional"
// MISSING #include <span>
#include "third_party/libcxx/span"
#include "third_party/libcxx/sstream"
#include "third_party/libcxx/string"
#include "third_party/libcxx/string_view"
#include "third_party/mold/fake_tbb.h"
// MISSING #include <tbb/concurrent_hash_map.h>
// MISSING #include <tbb/concurrent_unordered_map.h>
// MISSING #include <tbb/concurrent_vector.h>
// MISSING #include <tbb/enumerable_thread_specific.h>
// MISSING #include <tbb/spin_mutex.h>
// MISSING #include <tbb/task_group.h>
#include "third_party/libcxx/type_traits"
#include "third_party/libcxx/unordered_map"
#include "third_party/libcxx/unordered_set"
@@ -42,7 +45,6 @@
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/ok.h"
#include "libc/time/time.h"
#include "third_party/getopt/getopt.internal.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/lockf.h"
#endif

View file

@@ -6,6 +6,16 @@ namespace tbb {
template <typename T>
using concurrent_vector = std::vector<T>;
template <
class Key,
class T,
class Hash = std::hash<Key>,
class KeyEqual = std::equal_to<Key>,
class Allocator = std::allocator< std::pair<const Key, T> > >
using concurrent_hash_map = std::unordered_map<Key, T, Hash, KeyEqual, Allocator>;
using spin_mutex = std::mutex;
template<typename InputIterator, typename Function>
void parallel_for_each(InputIterator first, InputIterator last, const Function& f) {
}
@@ -22,5 +32,35 @@ namespace tbb {
void parallel_for(Index first, Index last, const Function& f) {
}
enum task_group_status {
not_complete,
complete,
canceled
};
class task_group {
public:
task_group() {};
~task_group() {};
template<typename Func>
void run( Func&& f ) {
};
template<typename Func>
task_group_status run_and_wait( const Func& f ) {
return task_group_status::complete;
};
task_group_status wait() {
return task_group_status::complete;
};
void cancel() {
};
};
}
#endif

third_party/mold/git-hash.cc vendored Normal file
View file

@@ -0,0 +1,5 @@
#include "third_party/libcxx/string"
namespace mold {
std::string mold_git_hash = "d4d93d7fb72dd19c44aafa4dd5397e35787d33ad";
}

View file

@@ -5,9 +5,10 @@
// For more info, read
// https://engineering.fb.com/2018/12/13/data-infrastructure/hyperloglog
// TODO(fzakaria): changed from libcxx because pow symbol wasn't present.
#include "third_party/mold/common.h"
#include "third_party/libcxx/cmath"
#include "libc/math.h"
namespace mold {

View file

@@ -6,7 +6,7 @@ PKGS += THIRD_PARTY_MOLD
THIRD_PARTY_MOLD_ARTIFACTS += THIRD_PARTY_MOLD_A
THIRD_PARTY_MOLD = $(THIRD_PARTY_MOLD_A_DEPS) $(THIRD_PARTY_MOLD_A)
THIRD_PARTY_MOLD_A = o/$(MODE)/third_party/mold/mold.a
THIRD_PARTY_MOLD_FILES := $(wildcard third_party/mold/*)
THIRD_PARTY_MOLD_FILES := $(wildcard third_party/mold/*) $(wildcard third_party/mold/elf/*)
THIRD_PARTY_MOLD_HDRS = $(filter %.h,$(THIRD_PARTY_MOLD_FILES))
THIRD_PARTY_MOLD_SRCS = $(filter %.cc,$(THIRD_PARTY_MOLD_FILES))
THIRD_PARTY_MOLD_OBJS = $(THIRD_PARTY_MOLD_SRCS:%.cc=o/$(MODE)/%.o)
@@ -16,6 +16,9 @@ THIRD_PARTY_MOLD_A_DIRECTDEPS = \
LIBC_STR \
LIBC_INTRIN \
LIBC_STDIO \
LIBC_CALLS \
LIBC_TINYMATH \
LIBC_SYSV \
LIBC_RUNTIME \
THIRD_PARTY_ZSTD \
THIRD_PARTY_XXHASH \
@@ -35,6 +38,8 @@ $(THIRD_PARTY_MOLD_OBJS): private \
-fno-asynchronous-unwind-tables \
-Wno-sign-compare \
-Wno-unused-function \
-DMOLD_X86_64=1 \
-DMOLD_TARGET=X86_64
THIRD_PARTY_MOLD_CHECKS = \
$(THIRD_PARTY_MOLD_A).pkg \