cosmopolitan/tool/build/assimilate.c
Justine Tunney a6ecbb747d
Introduce libc/mem/tinymalloc.inc
This allocator shaves ~20kb off single-threaded tool programs and is
slightly faster than proper malloc for simple non-demanding programs
2024-05-07 03:19:49 -07:00

638 lines
20 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi │
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2022 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/calls/struct/stat.h"
#include "libc/dce.h"
#include "libc/elf/def.h"
#include "libc/elf/struct/ehdr.h"
#include "libc/errno.h"
#include "libc/fmt/conv.h"
#include "libc/limits.h"
#include "libc/macho.internal.h"
#include "libc/macros.internal.h"
#include "libc/runtime/runtime.h"
#include "libc/serialize.h"
#include "libc/stdckdint.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/msync.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/prot.h"
#include "third_party/getopt/getopt.internal.h"
#include "third_party/regex/regex.h"
// banner printed by -v and placed in front of the usage text
#define VERSION \
"actually portable executable assimilate v1.6\n" \
"copyright 2023 justine alexandra roberts tunney\n"
// option summary; printed right after "usage: " and the program name,
// which is why the text begins with a space
#define USAGE \
" [-xapembfchv] [-o PATH] FILE...\n" \
" -h show help\n" \
" -v show version\n" \
" -f ignore soft errors\n" \
" -b don't remove freebsd from elf os/abi\n" \
" -c clobber input path w/o making backup\n" \
" -e convert to elf regardless of host os\n" \
" -m convert to macho regardless of host os\n" \
" -x convert to amd64 regardless of host cpu\n" \
" -a convert to arm64 regardless of host cpu\n" \
" -p convert to ppc64 regardless of host cpu\n" \
" -o PATH write modified binary to different file\n"
// values of g_arch; ARCH_NATIVE is the zero default meaning no cpu flag
// was given, in which case GetOpts() replaces it with the architecture
// this program was compiled for
#define ARCH_NATIVE 0
#define ARCH_AMD64 1
#define ARCH_ARM64 2
#define ARCH_PPC64 3
// values of g_format; FORMAT_NATIVE is the zero default meaning neither
// -e nor -m was given, in which case GetOpts() replaces it with the
// format chosen from the host os checks
#define FORMAT_NATIVE 0
#define FORMAT_ELF 1
#define FORMAT_MACHO 2
#define FORMAT_PE 3
#include "libc/mem/tinymalloc.inc"
// target cpu, one of ARCH_* (set by -x, -a, -p or resolved in GetOpts)
static int g_arch;
// target executable format, one of FORMAT_* (set by -e, -m or resolved)
static int g_format;
// -f flag: existing backups are truncated rather than refused, and
// already-converted inputs are copied instead of being rejected
static bool g_force;
// -c flag: overwrite the input in place without writing a .bak copy
static bool g_clobber;
// true with -b or on a freebsd host: keep the freebsd elf os/abi byte
static bool g_freebsd;
// program name used in diagnostics and usage text
static const char *prog;
// FILE operand currently being processed
static const char *path;
// -o argument, or NULL when the input itself is to be modified
static const char *outpath;
// true once -e or -m was seen; silences the host format hint in Die()
static bool got_format_flag;
// buffer in which WriteOutput() builds the "<path>.bak" backup name
static char bakpath[PATH_MAX];
// Prints "thing: reason" on file descriptor 2 and exits with status 1.
//
// When the user did not pick the output format with a flag, a short
// parenthetical naming the host's native format is appended, so the
// message explains why that format was targeted.
static wontreturn void Die(const char *thing, const char *reason) {
  const char *hint = "";
  if (!got_format_flag) {
    if (IsXnu()) {
      hint = " (the host os uses macho natively)";
    } else if (IsLinux() || IsFreebsd() || IsNetbsd() || IsOpenbsd()) {
      hint = " (the host os uses elf natively)";
    } else {
      hint = " (the host os uses pe natively)";
    }
  }
  tinyprint(2, thing, ": ", reason, hint, "\n", NULL);
  exit(1);
}
// Reports the current errno for `thing` via perror(), then exits with
// status 1. Meant to be called right after a failed system call.
static wontreturn void DieSys(const char *thing) {
  perror(thing);
  exit(1);
}
// Parses the leading decimal integer of `s`, e.g. a dd argument.
//
// Parsing stops at the first character that isn't part of the number,
// so `s` may point into the middle of a longer script. The program
// exits through Die() if the value does not fit in an int.
//
// strtol() is used rather than atoi() because atoi() cannot report
// errors: an INT_MAX sentinel only works on a libc that saturates, and
// it also rejects the legitimate value INT_MAX.
static int Atoi(const char *s) {
  long v;
  errno = 0;
  v = strtol(s, NULL, 10);
  if (errno == ERANGE || v > INT_MAX || v < INT_MIN) {
    Die(path, "integer overflow parsing ape macho dd argument");
  }
  return (int)v;
}
// Parses command line flags into the g_* globals and resolves defaults.
//
// -h and -v print to fd 1 and exit 0; an unrecognized flag prints the
// usage to fd 2 and exits 1. Afterwards at least one FILE operand must
// remain, FORMAT_NATIVE / ARCH_NATIVE are replaced by concrete values,
// and the unsupported combination of pe with a non-amd64 cpu is refused.
static void GetOpts(int argc, char *argv[]) {
  int flag;

  // a freebsd host keeps the freebsd os/abi even without -b
  if (IsFreebsd()) {
    g_freebsd = true;
  }

  for (;;) {
    flag = getopt(argc, argv, "hvfemxapbco:");
    if (flag == -1) {
      break;
    }
    switch (flag) {
      case 'f':
        g_force = true;
        break;
      case 'c':
        g_clobber = true;
        break;
      case 'b':
        g_freebsd = true;
        break;
      case 'o':
        outpath = optarg;
        break;
      case 'e':
        g_format = FORMAT_ELF;
        got_format_flag = true;
        break;
      case 'm':
        g_format = FORMAT_MACHO;
        got_format_flag = true;
        break;
      case 'x':
        g_arch = ARCH_AMD64;
        break;
      case 'a':
        g_arch = ARCH_ARM64;
        break;
      case 'p':
        g_arch = ARCH_PPC64;
        break;
      case 'v':
        tinyprint(1, VERSION, NULL);
        exit(0);
      case 'h':
        tinyprint(1, VERSION, "usage: ", prog, USAGE, NULL);
        exit(0);
      default:
        tinyprint(2, VERSION, "usage: ", prog, USAGE, NULL);
        exit(1);
    }
  }

  if (optind == argc) {
    Die(prog, "missing operand");
  }

  // no -e / -m given: choose the format from the host os checks
  if (g_format == FORMAT_NATIVE) {
    g_format = IsXnu() ? FORMAT_MACHO
               : (IsLinux() || IsFreebsd() || IsNetbsd() || IsOpenbsd())
                   ? FORMAT_ELF
                   : FORMAT_PE;
  }

  // no -x / -a / -p given: use the cpu this program was compiled for
  if (g_arch == ARCH_NATIVE) {
#ifdef __aarch64__
    g_arch = ARCH_ARM64;
#elif defined(__powerpc64__)
    g_arch = ARCH_PPC64;
#else
    g_arch = ARCH_AMD64;
#endif
  }

  if (g_format == FORMAT_PE && g_arch != ARCH_AMD64) {
    Die(prog, "native non-x86 on windows not supported yet");
  }
}
// Returns the elf e_machine constant corresponding to g_arch.
//
// g_arch must already be one of the three concrete ARCH_* values;
// anything else trips the assertion.
static int GetElfArch(void) {
  if (g_arch == ARCH_AMD64) {
    return EM_NEXGEN32E;
  }
  if (g_arch == ARCH_ARM64) {
    return EM_AARCH64;
  }
  if (g_arch == ARCH_PPC64) {
    return EM_PPC64;
  }
  unassert(false);
}
// Recovers the elf header that the ape shell script carries as the
// argument of a printf statement.
//
// The first 8192 bytes of the image (fewer if the image is smaller) are
// scanned for the text "printf '". The single-quoted argument that
// follows is decoded into `ehdr`, turning a backslash followed by one
// to three octal digits into the byte it denotes. A candidate is
// accepted when it decodes to exactly sizeof(Elf64_Ehdr) bytes, starts
// with the elf magic, isn't ELFCLASS32, and names the machine selected
// by g_arch. Scanning resumes right after a rejected candidate. If no
// candidate qualifies the program exits through Die().
//
// @param ehdr receives the decoded header
// @param image points to the ape file contents
// @param n is the byte length of image
static void GetElfHeader(Elf64_Ehdr *ehdr, const char *image, size_t n) {
  int c, i;
  bool toolong;
  const char *p, *e;
  p = image;
  e = p + MIN(n, 8192);
  // READ64LE() touches p[0..7], so only probe while eight bytes remain;
  // a match starting any later could never yield a complete header
  while (e - p >= 8) {
    if (READ64LE(p) != READ64LE("printf '")) {
      ++p;
      continue;
    }
    toolong = false;
    // the three bytes of slack keep the octal lookahead inside [p,e)
    for (i = 0, p += 8; e - p > 3 && (c = *p++) != '\'';) {
      if (c == '\\') {
        if ('0' <= *p && *p <= '7') {
          c = *p++ - '0';
          if ('0' <= *p && *p <= '7') {
            c *= 8;
            c += *p++ - '0';
            if ('0' <= *p && *p <= '7') {
              c *= 8;
              c += *p++ - '0';
            }
          }
        }
      }
      if (i < (int)sizeof(*ehdr)) {
        ((char *)ehdr)[i++] = c;
      } else {
        toolong = true;  // more bytes than an ehdr can hold
        break;
      }
    }
    if (toolong ||                                              //
        i != sizeof(*ehdr) ||                                   //
        READ32LE(ehdr->e_ident) != READ32LE("\177ELF") ||       //
        ehdr->e_ident[EI_CLASS] == ELFCLASS32 ||                //
        ehdr->e_machine != GetElfArch()) {
      continue;  // not the header we want; keep scanning from here
    }
    return;
  }
  switch (g_arch) {
    case ARCH_AMD64:
      Die(path, "printf statement not found in first 8192 bytes of image "
                "containing elf64 ehdr for amd64");
    case ARCH_ARM64:
      Die(path, "printf statement not found in first 8192 bytes of image "
                "containing elf64 ehdr for arm64");
    case ARCH_PPC64:
      Die(path, "printf statement not found in first 8192 bytes of image "
                "containing elf64 ehdr for ppc64");
    default:
      unassert(false);
  }
}
// Returns the mach-o cpu type constant corresponding to g_arch.
//
// g_arch must already be one of the three concrete ARCH_* values;
// anything else trips the assertion.
static int GetMachoArch(void) {
  if (g_arch == ARCH_AMD64) {
    return MAC_CPU_NEXGEN32E;
  }
  if (g_arch == ARCH_ARM64) {
    return MAC_CPU_ARM64;
  }
  if (g_arch == ARCH_PPC64) {
    return MAC_CPU_POWERPC64;
  }
  unassert(false);
}
// Finds the mach-o payload that the ape shell script extracts with dd.
//
// The first 4096 bytes of the image are searched for the byte sequence
// '\n#'"\n (or, failing that, #'"\n); the text after it is treated as
// the shell script. Each `bs=… skip=… count=…` occurrence in the script
// is decoded into a byte offset (skip * bs) and byte size (count * bs),
// validated against the image bounds, and accepted when the image holds
// the mach-o magic and the cpu type for g_arch at that offset. The
// program exits through Die() when the script or a suitable dd command
// is missing, or when a decoded candidate is out of range.
//
// @param image points to the ape file contents
// @param imagesize is the byte length of image
// @param out_offset receives the byte offset of the mach-o header
// @param out_size receives the byte count the dd command copies
static void GetMachoPayload(const char *image, size_t imagesize,
                            int *out_offset, int *out_size) {
  regex_t rx;
  const char *script;
  const char *base;
  regmatch_t rm[1 + 13] = {0};
  int rc, skip, count, bs, offset, size;
  if ((script = memmem(image, MIN(imagesize, 4096), "'\n#'\"\n", 6))) {
    script += 6;
  } else if ((script = memmem(image, MIN(imagesize, 4096), "#'\"\n", 4))) {
    script += 4;
  } else {
    Die(path, "ape shell script not found");
  }
  // the ape shell script has always historically used `dd` to
  // assimilate binaries to the mach-o file format but we have
  // formatted the arguments in a variety of different ways eg
  //
  //   - `arg="  9293"` is how we originally had ape do it
  //   - `arg=$((  9293))` b/c busybox sh disliked quoted space
  //   - `arg=9293  ` is generated by modern apelink program
  //
  unassert(regcomp(&rx,
                   "bs="              // dd block size arg
                   "(['\"] *)?"       // #1 optional quote w/ space
                   "(\\$\\(\\( *)?"   // #2 optional math w/ space
                   "([[:digit:]]+)"   // #3
                   "( *\\)\\))?"      // #4 optional math w/ space
                   "( *['\"])?"       // #5 optional quote w/ space
                   " +"               //
                   "skip="            // dd skip arg
                   "(['\"] *)?"       // #6 optional quote w/ space
                   "(\\$\\(\\( *)?"   // #7 optional math w/ space
                   "([[:digit:]]+)"   // #8
                   "( *\\)\\))?"      // #9 optional math w/ space
                   "( *['\"])?"       // #10 optional quote w/ space
                   " +"               //
                   "count="           // dd count arg
                   "(['\"] *)?"       // #11 optional quote w/ space
                   "(\\$\\(\\( *)?"   // #12 optional math w/ space
                   "([[:digit:]]+)",  // #13
                   REG_EXTENDED) == REG_OK);
  int i = 0;
  for (;;) {
    // regexec() reports match offsets relative to the string it was
    // handed, so remember where this search began before advancing `i`
    base = script + i;
    rc = regexec(&rx, base, 1 + 13, rm, 0);
    if (rc != REG_OK) {
      unassert(rc == REG_NOMATCH);
      switch (g_arch) {
        case ARCH_AMD64:
          Die(path, "ape macho dd command for amd64 not found");
        case ARCH_ARM64:
          Die(path, "ape macho dd command for arm64 not found; by convention ape "
                    "executables are run on apple silicon only as elf binaries, "
                    "which are loaded by the ape-m1.c ape loader program; thus "
                    "consider passing the -ae flags to assimilate to arm64 elf");
        case ARCH_PPC64:
          Die(path, "ape macho dd command for ppc64 not found");
        default:
          unassert(false);
      }
    }
    i += rm[13].rm_eo;  // the next search starts after this dd command
    bs = Atoi(base + rm[3].rm_so);
    skip = Atoi(base + rm[8].rm_so);
    count = Atoi(base + rm[13].rm_so);
    if (ckd_mul(&offset, skip, bs)) {
      Die(path, "integer overflow computing ape macho dd offset");
    }
    if (ckd_mul(&size, count, bs)) {
      Die(path, "integer overflow computing ape macho dd size");
    }
    if (offset < 64) {
      Die(path, "ape macho dd offset must be at least 64");
    }
    // offset is known to be positive here, so the cast is lossless
    if ((size_t)offset >= imagesize) {
      Die(path, "ape macho dd offset points outside image");
    }
    if (size < 32) {
      Die(path, "ape macho dd size must be at least 32");
    }
    if ((size_t)size > imagesize - (size_t)offset) {
      Die(path, "ape macho dd size overlaps end of image");
    }
    if (READ32LE(image + offset) == 0xFEEDFACE + 1 &&
        READ32LE(image + offset + 4) == GetMachoArch()) {
      break;  // mach-o header for the requested cpu
    }
    // payload belongs to another cpu; look at the next dd command
  }
  *out_offset = offset;
  *out_size = size;
  regfree(&rx);
}
// Writes all `size` bytes of `data` to `fd`, calling write() again
// after each short write until nothing is left.
//
// @return size on success, or -1 as soon as a write() call returns -1
static ssize_t Write(int fd, const void *data, size_t size) {
  const char *cursor = data;
  size_t left = size;
  while (left) {
    ssize_t got = write(fd, cursor, left);
    if (got == -1) {
      return -1;
    }
    cursor += (size_t)got;
    left -= (size_t)got;
  }
  return size;
}
// Writes all `size` bytes of `data` to `fd` starting at file position
// `offset`, calling pwrite() again after each short write and moving
// the position forward by the amount already written.
//
// @return size on success, or -1 as soon as a pwrite() call returns -1
static ssize_t Pwrite(int fd, const void *data, size_t size, uint64_t offset) {
  const char *cursor = data;
  size_t left = size;
  while (left) {
    ssize_t got = pwrite(fd, cursor, left, offset);
    if (got == -1) {
      return -1;
    }
    cursor += (size_t)got;
    offset += (size_t)got;
    left -= (size_t)got;
  }
  return size;
}
// Returns the permission bits (mode & 0777) of the file open on `fd`.
// Exits through DieSys(), naming the current input path, if fstat fails.
static int GetMode(int fd) {
  struct stat info;
  if (fstat(fd, &info) != 0) {
    DieSys(path);
  }
  return info.st_mode & 0777;
}
// Creates `outpath` holding the image with its leading bytes replaced.
//
// Does nothing when no -o path was given. Otherwise the output file is
// created with the permission bits of `infd`, then `hdr` (if hdrsize is
// nonzero) is written, followed by the image contents from byte
// `hdrsize` onward. Any failure exits through DieSys().
//
// @param infd is the input file, consulted only for its mode
// @param map points to the input file contents
// @param size is the byte length of map
// @param hdr is the replacement for the first hdrsize bytes, may be
//     null when hdrsize is zero
// @param hdrsize is the byte length of hdr
static void CopyFile(int infd, const char *map, size_t size,  //
                     const void *hdr, size_t hdrsize) {
  int fd;
  if (outpath == NULL) {
    return;
  }
  fd = creat(outpath, GetMode(infd));
  if (fd == -1) {
    DieSys(outpath);
  }
  if (hdrsize != 0) {
    if (Write(fd, hdr, hdrsize) == -1) {
      DieSys(outpath);
    }
  }
  if (Write(fd, map + hdrsize, size - hdrsize) == -1) {
    DieSys(outpath);
  }
  if (close(fd) != 0) {
    DieSys(outpath);
  }
}
// Stores the converted executable, whose first `hdrsize` bytes become
// `hdr`, in one of three ways:
//
//   1. with -o, a new file is written by CopyFile() and the input is
//      left untouched
//   2. with -c, the new header is written over the start of the input
//   3. otherwise the original image is first saved to "<path>.bak"
//      and then the input is overwritten as in (2); an existing backup
//      is only replaced when -f was given
//
// Any failure exits through Die() or DieSys().
//
// @param infd is the open input file
// @param map points to the input file contents
// @param size is the byte length of map
// @param hdr is the replacement for the first hdrsize bytes
// @param hdrsize is the byte length of hdr
static void WriteOutput(int infd, const char *map, size_t size,  //
                        const void *hdr, size_t hdrsize) {
  int outfd, oflags, omode;
  if (outpath) {
    CopyFile(infd, map, size, hdr, hdrsize);
  } else if (g_clobber) {
    if (Pwrite(infd, hdr, hdrsize, 0) == -1)
      DieSys(path);
  } else {
    omode = GetMode(infd);
    oflags = O_WRONLY | O_CREAT | (g_force ? O_TRUNC : O_EXCL);
    // bakpath is a static buffer that is only ever appended to, so it
    // must be emptied first; otherwise the backup name of a second FILE
    // operand would be glued onto the name of the first one
    bakpath[0] = 0;
    strlcat(bakpath, path, sizeof(bakpath));
    // if either append was truncated, this result reaches the capacity
    if (strlcat(bakpath, ".bak", sizeof(bakpath)) >= sizeof(bakpath)) {
      Die(path, "filename too long");
    }
    if ((outfd = open(bakpath, oflags, omode)) == -1)
      DieSys(bakpath);
    if (Write(outfd, map, size) == -1)
      DieSys(bakpath);
    if (close(outfd))
      DieSys(bakpath);
    if (Pwrite(infd, hdr, hdrsize, 0) == -1)
      DieSys(path);
  }
}
static void AssimilateElf(int infd, const char *map, size_t size) {
Elf64_Ehdr ehdr;
GetElfHeader(&ehdr, map, size);
if (!g_freebsd && ehdr.e_ident[EI_OSABI] == ELFOSABI_FREEBSD) {
// none of the kernels except freebsd care about the osabi. however
// gdb does whine about it when trying to debug ape elf programs on
// other platforms like linux. so when assimilating, it makes sense
// to fall back to the more generic system v os abi.
ehdr.e_ident[EI_OSABI] = ELFOSABI_SYSV;
}
WriteOutput(infd, map, size, &ehdr, sizeof(ehdr));
}
// Converts an ape image to mach-o by copying the embedded mach-o
// payload, located by GetMachoPayload(), over the start of the image.
//
// @param infd is the open input file
// @param map points to the input file contents
// @param size is the byte length of map
static void AssimilateMacho(int infd, const char *map, size_t size) {
  int payload_off;
  int payload_len;
  GetMachoPayload(map, size, &payload_off, &payload_len);
  WriteOutput(infd, map, size, map + payload_off, payload_len);
}
// Converts the file named by the global `path` to the format and cpu
// selected by g_format and g_arch.
//
// The file is opened (read-only when -o names a separate output), its
// contents are mapped, and the leading bytes decide what happens:
//
//   - an elf or mach-o input can't be converted any further; it is
//     reported through Die(), except that with -f an input that already
//     has the requested format and cpu is passed to CopyFile() and the
//     program exits 0
//   - anything else must start with one of the three ape magic strings
//     and is handed to AssimilateElf() or AssimilateMacho()
//
// Every error exits the program through Die() or DieSys().
//
// NOTE(review): the exit(0) calls on the -f paths end the whole program,
// so FILE operands after such an input are never processed; confirm
// that this is intended.
static void Assimilate(void) {
int fd;
char *p;
int oflags;
ssize_t size;
// the input only needs to be writable when it is modified in place
oflags = outpath ? O_RDONLY : O_RDWR;
if ((fd = open(path, oflags)) == -1)
DieSys(path);
// seeking to the end yields the file size
if ((size = lseek(fd, 0, SEEK_END)) == -1)
DieSys(path);
if (size < 64)
Die(path, "ape executables must be at least 64 bytes");
p = mmap(0, size, PROT_READ, MAP_PRIVATE, fd, 0);
if (p == MAP_FAILED)
DieSys(path);
// case 1: the input is already an elf executable
// Die() and exit() never return, so the case labels below do not
// fall through even though they have no break statements
if (READ32LE(p) == READ32LE("\177ELF")) {
Elf64_Ehdr *ehdr;
switch (g_format) {
case FORMAT_ELF:
ehdr = (Elf64_Ehdr *)p;
if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) {
Die(path, "32-bit elf not supported");
}
switch (g_arch) {
case ARCH_AMD64:
switch (ehdr->e_machine) {
case EM_NEXGEN32E:
if (g_force) {
// nothing to convert; CopyFile() only acts when -o was given
CopyFile(fd, p, size, 0, 0);
exit(0);
} else {
Die(path, "already an elf amd64 executable");
}
case EM_AARCH64:
Die(path, "can't assimilate elf arm64 to elf amd64");
case EM_PPC64:
Die(path, "can't assimilate elf ppc64 to elf amd64");
default:
Die(path, "elf has unsupported architecture");
}
case ARCH_ARM64:
switch (ehdr->e_machine) {
case EM_AARCH64:
if (g_force) {
CopyFile(fd, p, size, 0, 0);
exit(0);
} else {
Die(path, "already an elf arm64 executable");
}
case EM_NEXGEN32E:
Die(path, "can't assimilate elf amd64 to elf arm64");
case EM_PPC64:
Die(path, "can't assimilate elf ppc64 to elf arm64");
default:
Die(path, "elf has unsupported architecture");
}
case ARCH_PPC64:
switch (ehdr->e_machine) {
case EM_PPC64:
if (g_force) {
CopyFile(fd, p, size, 0, 0);
exit(0);
} else {
Die(path, "already an elf ppc64 executable");
}
case EM_NEXGEN32E:
Die(path, "can't assimilate elf amd64 to elf ppc64");
case EM_AARCH64:
Die(path, "can't assimilate elf arm64 to elf ppc64");
default:
Die(path, "elf has unsupported architecture");
}
default:
unassert(false);
}
case FORMAT_MACHO:
Die(path, "can't assimilate elf to macho");
case FORMAT_PE:
Die(path, "can't assimilate elf to pe (try elf2pe)");
default:
unassert(false);
}
}
// case 2: the input is already a mach-o executable; this mirrors the
// elf handling above
if (READ32LE(p) == 0xFEEDFACE + 1) {
struct MachoHeader *macho;
switch (g_format) {
case FORMAT_MACHO:
macho = (struct MachoHeader *)p;
switch (g_arch) {
case ARCH_AMD64:
switch (macho->arch) {
case MAC_CPU_NEXGEN32E:
if (g_force) {
CopyFile(fd, p, size, 0, 0);
exit(0);
} else {
Die(path, "already a macho amd64 executable");
}
case MAC_CPU_ARM64:
Die(path, "can't assimilate macho arm64 to macho amd64");
case MAC_CPU_POWERPC64:
Die(path, "can't assimilate macho ppc64 to macho amd64");
default:
Die(path, "macho has unsupported architecture");
}
case ARCH_ARM64:
switch (macho->arch) {
case MAC_CPU_ARM64:
if (g_force) {
CopyFile(fd, p, size, 0, 0);
exit(0);
} else {
Die(path, "already a macho arm64 executable");
}
case MAC_CPU_NEXGEN32E:
Die(path, "can't assimilate macho amd64 to macho arm64");
case MAC_CPU_POWERPC64:
Die(path, "can't assimilate macho ppc64 to macho arm64");
default:
Die(path, "macho has unsupported architecture");
}
case ARCH_PPC64:
switch (macho->arch) {
case MAC_CPU_POWERPC64:
if (g_force) {
CopyFile(fd, p, size, 0, 0);
exit(0);
} else {
Die(path, "already a macho ppc64 executable");
}
case MAC_CPU_NEXGEN32E:
Die(path, "can't assimilate macho amd64 to macho ppc64");
case MAC_CPU_ARM64:
Die(path, "can't assimilate macho arm64 to macho ppc64");
default:
Die(path, "macho has unsupported architecture");
}
default:
unassert(false);
}
case FORMAT_ELF:
Die(path, "can't assimilate macho to elf");
case FORMAT_PE:
Die(path, "can't assimilate macho to pe");
default:
unassert(false);
}
}
// case 3: only an image starting with one of the three ape magic
// strings is accepted from here on
if (READ64LE(p) != READ64LE("MZqFpD='") && //
READ64LE(p) != READ64LE("jartsr='") && //
READ64LE(p) != READ64LE("APEDBG='")) {
Die(path, "not an actually portable executable");
}
// an ape file that starts with "MZ" is treated as already being a pe
// file, so targeting pe never rewrites anything
if (g_format == FORMAT_PE) {
if (READ16LE(p) == READ16LE("MZ")) {
if (g_force) {
CopyFile(fd, p, size, 0, 0);
exit(0);
} else {
Die(path, "this ape file is already a pe file");
}
} else {
Die(path, "this ape file was built without pe support");
}
}
if (g_format == FORMAT_ELF) {
AssimilateElf(fd, p, size);
} else if (g_format == FORMAT_MACHO) {
AssimilateMacho(fd, p, size);
}
// release the mapping and descriptor before the next operand
if (munmap(p, size))
DieSys(path);
if (close(fd))
DieSys(path);
}
// Entry point: parses the flags, then assimilates each FILE operand in
// the order given. The operand being worked on is published through
// the global `path`, which the diagnostics refer to.
int main(int argc, char *argv[]) {
  int i;
  prog = program_invocation_short_name;
  GetOpts(argc, argv);
  i = optind;
  while (i < argc) {
    path = argv[i++];
    Assimilate();
  }
}