Productionize new APE loader and more

The APE_NO_MODIFY_SELF loader payload has been moved out of the examples folder and improved so that it works on BSD systems, and permits general ELF program headers. This brings its quality up enough that it should be acceptable to use by default for many programs, e.g. Python, Lua, and SQLite. It's the responsibility of the user to define an appropriate TMPDIR if /tmp is considered an adversarial environment. Mac OS shall be supported by APE_NO_MODIFY_SELF soon. Fixes and improvements have been made to program_executable_name as it's now the one true way to get the absolute path of the executing image. This change fixes a memory leak in linenoise history loading, introduced by performance optimizations in 51904e2687. This change fixes a longstanding regression with Mach system calls, introduced by 23ae9dfceb back in February, which impacted our sched_yield() implementation, which is why no one noticed until now. The Blinkenlights PC emulator has been improved. We now fix rendering on XNU and BSD by not making the assumption that the kernel terminal driver understands UTF8 since that seems to break its internal modeling of \r\n which is now being addressed by using \e[𝑦H instead. The paneling is now more compact in real mode so you won't need to make your font as tiny if you're only emulating an 8086 program. The CLMUL ISA is now emulated too. This change also makes improvements to time. CLOCK_MONOTONIC now does the right thing on Windows NT. The nanosecond time module functions added in Python 3.7 have been backported. This change doubles the performance of Argon2 password stretching simply by not using its copy_block and xor_block helper functions, as they were trivial to inline thus resulting in us needing to iterate over each 1024 byte block four fewer times. This change makes code size improvements. _PyUnicode_ToNumeric() was 64k in size and now it's 10k.
The CJK codec lookup tables now use lazy delta zigzag deflate (δzd) encoding, which reduces their size from 600k to 200k. Plus, the code bloat caused by macro abuse in _decimal.c is now addressed, so our fully-loaded statically-linked hermetically-sealed Python virtual interpreter container is now 9.4 megs in the default build mode and 5.5m in MODE=tiny, which leaves plenty of room for chibicc. The pydoc web server now accommodates the use case of people who work by SSH'ing into a different machine w/ python.com -m pydoc -p8080 -h0.0.0.0. Finally, Python Capsulae delenda est and won't be supported in the future.
2025-09-10 10:43:48 +00:00 · 2021-10-02 08:17:04 -07:00 · 2021-10-02 08:17:04 -07:00 · 47a53e143b
commit 47a53e143b
parent 9cb54218ab
270 changed files with 214544 additions and 23331 deletions
--- a/tool/build/lib/clmul.c
+++ b/tool/build/lib/clmul.c
@ -0,0 +1,52 @@
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8                                :vi│
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2021 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/nexgen32e/bsr.h"
+#include "libc/nexgen32e/x86feature.h"
+#include "tool/build/lib/clmul.h"
+#include "tool/build/lib/endian.h"
+#include "tool/build/lib/modrm.h"
+
+/**
+ * @fileoverview Carryless Multiplication ISA
+ */
+
+struct clmul {  // pair of 64-bit words returned by clmul()
+  uint64_t x, y;  // x is the low word of the product, y the high word
+};
+
+static struct clmul clmul(uint64_t a, uint64_t b) {  // carry-less (XOR) multiply of a by b, returned as low word x and high word y
+  uint64_t t, x = 0, y = 0;
+  if (a && b) {  // a zero operand leaves the product at zero
+    if (bsrl(a) < bsrl(b)) t = a, a = b, b = t;  // the loop runs until b is exhausted, so keep the operand bsrl() ranks lower in b
+    for (t = 0; b; a <<= 1, b >>= 1) {
+      if (b & 1) x ^= a, y ^= t;  // for each set bit of b, XOR in the shifted a (low word) and its spilled bits (high word)
+      t = t << 1 | a >> 63;  // t collects the bit about to be shifted out of the top of a
+    }
+  }
+  return (struct clmul){x, y};
+}
+
+void OpSsePclmulqdq(struct Machine *m, uint32_t rde) {  // carry-less multiplies one 64-bit half of each operand and stores the 128-bit result in the XmmRexrReg() register
+  struct clmul res;
+  res = clmul(Read64(XmmRexrReg(m, rde) + ((m->xedd->op.uimm0 & 0x01) << 3)),  // immediate bit 0 selects byte offset 0 or 8
+              Read64(GetModrmRegisterXmmPointerRead16(m, rde) +
+                     ((m->xedd->op.uimm0 & 0x10) >> 1)));  // immediate bit 4 selects byte offset 0 or 8
+  Write64(XmmRexrReg(m, rde) + 0, res.x);  // low 64 bits of the product
+  Write64(XmmRexrReg(m, rde) + 8, res.y);  // high 64 bits of the product
+}
--- a/tool/build/lib/clmul.h
+++ b/tool/build/lib/clmul.h
@ -0,0 +1,11 @@
+#ifndef COSMOPOLITAN_TOOL_BUILD_LIB_CLMUL_H_
+#define COSMOPOLITAN_TOOL_BUILD_LIB_CLMUL_H_
+#include "tool/build/lib/machine.h"
+#if !(__ASSEMBLER__ + __LINKER__ + 0)
+COSMOPOLITAN_C_START_
+
+void OpSsePclmulqdq(struct Machine *, uint32_t);
+
+COSMOPOLITAN_C_END_
+#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
+#endif /* COSMOPOLITAN_TOOL_BUILD_LIB_CLMUL_H_ */
--- a/tool/build/lib/cpuid.c
+++ b/tool/build/lib/cpuid.c
@ -35,7 +35,7 @@ void OpCpuid(struct Machine *m, uint32_t rde) {
      break;
    case 1:
      cx |= 1 << 0;   // sse3
-      cx |= 0 << 1;   // pclmulqdq
+      cx |= 1 << 1;   // pclmulqdq
      cx |= 1 << 9;   // ssse3
      cx |= 1 << 23;  // popcnt
      cx |= 1 << 30;  // rdrnd
--- a/tool/build/lib/dis.c
+++ b/tool/build/lib/dis.c
@ -95,7 +95,9 @@ static char *DisError(struct Dis *d, char *p) {

 static char *DisAddr(struct Dis *d, char *p) {
  int64_t x = d->addr;
-  if (-2147483648 <= x && x <= 2147483647) {
+  if (0 <= x && x < 0x10fff0) {
+    return p + uint64toarray_fixed16(x, p, 24);
+  } else if (-2147483648 <= x && x <= 2147483647) {
    return p + uint64toarray_fixed16(x, p, 32);
  } else {
    return p + uint64toarray_fixed16(x, p, 48);
@ -104,7 +106,13 @@ static char *DisAddr(struct Dis *d, char *p) {

 static char *DisRaw(struct Dis *d, char *p) {
  long i;
-  for (i = 0; i < PFIXLEN - MIN(PFIXLEN, d->xedd->op.PIVOTOP); ++i) {
+  int plen;
+  if (0 <= d->addr && d->addr < 0x10fff0) {
+    plen = 2;
+  } else {
+    plen = PFIXLEN;
+  }
+  for (i = 0; i < plen - MIN(plen, d->xedd->op.PIVOTOP); ++i) {
    *p++ = ' ';
    *p++ = ' ';
  }
@ -127,8 +135,16 @@ static char *DisCode(struct Dis *d, char *p) {
 }

 static char *DisLineCode(struct Dis *d, char *p) {
+  int blen, plen;
+  if (0 <= d->addr && d->addr < 0x10fff0) {
+    plen = 2;
+    blen = 6;
+  } else {
+    blen = BYTELEN;
+    plen = PFIXLEN;
+  }
  p = DisColumn(DisAddr(d, p), p, ADDRLEN);
-  p = DisColumn(DisRaw(d, p), p, PFIXLEN * 2 + 1 + BYTELEN * 2);
+  p = DisColumn(DisRaw(d, p), p, plen * 2 + 1 + blen * 2);
  p = DisCode(d, p);
  return p;
 }
--- a/tool/build/lib/divmul.c
+++ b/tool/build/lib/divmul.c
@ -24,114 +24,214 @@
 #include "tool/build/lib/modrm.h"
 #include "tool/build/lib/throw.h"

-void OpDivAlAhAxEbSigned(struct Machine *m, uint32_t rde) {
-  int8_t y, rem;
-  int16_t x, quo;
-  x = Read16(m->ax);
-  y = Read8(GetModrmRegisterBytePointerRead(m, rde));
-  if (!y || (x == INT16_MIN && y == -1)) ThrowDivideError(m);
-  quo = x / y;
-  rem = x % y;
-  if (!(INT8_MIN <= quo && quo <= INT8_MAX)) ThrowDivideError(m);
-  m->ax[0] = quo & 0xff;
-  m->ax[1] = rem & 0xff;
+struct Dubble {  // 128-bit quantity held as two 64-bit halves
+  uint64_t lo;  // low 64 bits
+  uint64_t hi;  // high 64 bits
+};
+
+static inline struct Dubble DubbleNeg(struct Dubble x) {  // two's complement negation across both halves
+  struct Dubble d;
+  d.lo = -x.lo;
+  d.hi = ~(x.hi - (x.lo - 1 > x.lo));  // the comparison is 1 only when x.lo is 0, which is when negating lo carries into hi
+  return d;
 }

-void OpDivAlAhAxEbUnsigned(struct Machine *m, uint32_t rde) {
-  uint8_t y, rem;
-  uint16_t x, quo;
+static inline struct Dubble DubbleShl(struct Dubble x) {  // shifts the 128-bit value left by one bit
+  struct Dubble d;
+  d.lo = x.lo << 1;
+  d.hi = x.hi << 1 | x.lo >> 63;  // bit 63 of lo moves into bit 0 of hi
+  return d;
+}
+
+static inline struct Dubble DubbleShr(struct Dubble x) {  // shifts the 128-bit value right by one bit, filling with zero
+  struct Dubble d;
+  d.lo = x.lo >> 1 | x.hi << 63;  // bit 0 of hi moves into bit 63 of lo
+  d.hi = x.hi >> 1;
+  return d;
+}
+
+static inline unsigned DubbleLte(struct Dubble a, struct Dubble b) {  // unsigned 128-bit a <= b, yielding 1 or 0
+  return a.hi == b.hi ? a.lo <= b.lo : a.hi <= b.hi;  // the low halves only matter when the high halves tie
+}
+
+static struct Dubble DubbleMul(uint64_t a, uint64_t b) {  // unsigned 64x64 multiply giving the full 128-bit product, built from 32-bit partial products
+  struct Dubble d;
+  uint64_t x, y, t;
+  x = (a & 0xffffffff) * (b & 0xffffffff);  // low(a) * low(b)
+  t = x >> 32;  // upper half of that product feeds the next column
+  x &= 0xffffffff;
+  t += (a >> 32) * (b & 0xffffffff);  // add the first cross term, high(a) * low(b)
+  x += (t & 0xffffffff) << 32;
+  y = t >> 32;  // what spills past bit 63 starts the high word
+  t = x >> 32;
+  x &= 0xffffffff;
+  t += (b >> 32) * (a & 0xffffffff);  // add the second cross term, high(b) * low(a)
+  x += (t & 0xffffffff) << 32;
+  y += t >> 32;
+  y += (a >> 32) * (b >> 32);  // high(a) * high(b) lands entirely in the high word
+  d.lo = x;
+  d.hi = y;
+  return d;
+}
+
+static struct Dubble DubbleImul(uint64_t a, uint64_t b) {  // signed 64x64 multiply giving a 128-bit two's complement product
+  unsigned s, t;
+  struct Dubble p;
+  if ((s = a >> 63)) a = -a;  // s records a's sign bit; a becomes its magnitude
+  if ((t = b >> 63)) b = -b;  // t records b's sign bit; b becomes its magnitude
+  p = DubbleMul(a, b);
+  return s ^ t ? DubbleNeg(p) : p;  // negate the product when exactly one operand was negative
+}
+
+static struct Dubble DubbleDiv(struct Dubble a, uint64_t b, uint64_t *r) {  // unsigned shift-and-subtract division of a by b; returns the quotient and stores the low 64 bits of the remainder in *r
+  int n, c;
+  uint64_t s;
+  struct Dubble d, q, t;
+  d.lo = b, d.hi = 0;  // d is the divisor widened to 128 bits
+  q.lo = 0, q.hi = 0;
+  for (n = 0; DubbleLte(d, a) && n < 128; ++n) {  // shift the divisor left while it is <= a, at most 128 times; n counts the shifts
+    d = DubbleShl(d);
+  }
+  for (; n > 0; --n) {  // produce one quotient bit per counted shift
+    t = a;  // saved so a failed subtraction can be undone
+    d = DubbleShr(d);
+    q = DubbleShl(q);
+    s = a.lo, a.lo -= d.lo + 0, c = a.lo > s;  // subtract the low halves; c is the borrow
+    s = a.hi, a.hi -= d.hi + c, c = a.hi > s;  // subtract the high halves with that borrow; c becomes the final borrow
+    if (c) {
+      a = t;  // d was larger than a: restore a, quotient bit stays 0
+    } else {
+      q.lo++;  // subtraction held: quotient bit is 1
+    }
+  }
+  *r = a.lo;
+  return q;
+}
+
+static struct Dubble DubbleIdiv(struct Dubble a, uint64_t b, uint64_t *r) {  // signed division layered on DubbleDiv(); returns the quotient and stores the remainder in *r
+  unsigned s, t;
+  struct Dubble q;
+  if ((s = a.hi >> 63)) a = DubbleNeg(a);  // s records the dividend's sign bit; a becomes its magnitude
+  if ((t = b >> 63)) b = -b;  // t records the divisor's sign bit; b becomes its magnitude
+  q = DubbleDiv(a, b, r);
+  if (s ^ t) q = DubbleNeg(q);  // quotient is negative when the signs differ
+  if (s) *r = -*r;  // remainder takes the sign of the dividend
+  return q;
+}
+
+void OpDivAlAhAxEbSigned(struct Machine *m, uint32_t rde) {
+  int8_t y, r;
+  int16_t x, q;
  x = Read16(m->ax);
  y = Read8(GetModrmRegisterBytePointerRead(m, rde));
  if (!y) ThrowDivideError(m);
-  quo = x / y;
-  rem = x % y;
-  if (!(UINT8_MIN <= quo && quo <= UINT8_MAX)) ThrowDivideError(m);
-  m->ax[0] = quo & 0xff;
-  m->ax[1] = rem & 0xff;
+  if (x == INT16_MIN) ThrowDivideError(m);
+  q = x / y;
+  r = x % y;
+  if (q != (int8_t)q) ThrowDivideError(m);
+  m->ax[0] = q & 0xff;
+  m->ax[1] = r & 0xff;
+}
+
+void OpDivAlAhAxEbUnsigned(struct Machine *m, uint32_t rde) {  // unsigned divide of the 16-bit value in m->ax by the byte operand selected by rde
+  uint8_t y, r;
+  uint16_t x, q;
+  x = Read16(m->ax);
+  y = Read8(GetModrmRegisterBytePointerRead(m, rde));
+  if (!y) ThrowDivideError(m);  // division by zero
+  q = x / y;
+  r = x % y;
+  if (q > 255) ThrowDivideError(m);  // quotient does not fit in one byte
+  m->ax[0] = q & 0xff;  // quotient goes to byte 0 of ax
+  m->ax[1] = r & 0xff;  // remainder goes to byte 1 of ax
 }

 static void OpDivRdxRaxEvqpSigned64(struct Machine *m, uint32_t rde,
                                    uint8_t *p) {
-  int64_t y, rem;
-  int128_t x, quo;
-  x = (uint128_t)Read64(m->dx) << 64 | Read64(m->ax);
-  y = Read64(p);
-  if (!y || (x == INT128_MIN && y == -1)) ThrowDivideError(m);
-  quo = x / y;
-  rem = x % y;
-  if (!(INT64_MIN <= quo && quo <= INT64_MAX)) ThrowDivideError(m);
-  Write64(m->ax, quo);
-  Write64(m->dx, rem);
+  uint64_t d, r;
+  struct Dubble q;
+  q.lo = Read64(m->ax);
+  q.hi = Read64(m->dx);
+  d = Read64(p);
+  if (!d) ThrowDivideError(m);
+  if (!q.lo && q.hi == 0x8000000000000000) ThrowDivideError(m);
+  q = DubbleIdiv(q, d, &r);
+  if ((int64_t)q.lo < 0 && (int64_t)q.hi != -1) ThrowDivideError(m);
+  if ((int64_t)q.lo >= 0 && q.hi) ThrowDivideError(m);
+  Write64(m->ax, q.lo);
+  Write64(m->dx, r);
 }

 static void OpDivRdxRaxEvqpSigned32(struct Machine *m, uint32_t rde,
                                    uint8_t *p) {
-  int32_t y, rem;
-  int64_t x, quo;
+  int32_t y, r;
+  int64_t x, q;
  x = (uint64_t)Read32(m->dx) << 32 | Read32(m->ax);
  y = Read32(p);
-  if (!y || (x == INT64_MIN && y == -1)) ThrowDivideError(m);
-  quo = x / y;
-  rem = x % y;
-  if (!(INT32_MIN <= quo && quo <= INT32_MAX)) ThrowDivideError(m);
-  Write64(m->ax, quo & 0xffffffff);
-  Write64(m->dx, rem & 0xffffffff);
+  if (!y) ThrowDivideError(m);
+  if (x == INT64_MIN) ThrowDivideError(m);
+  q = x / y;
+  r = x % y;
+  if (q != (int32_t)q) ThrowDivideError(m);
+  Write64(m->ax, q & 0xffffffff);
+  Write64(m->dx, r & 0xffffffff);
 }

 static void OpDivRdxRaxEvqpSigned16(struct Machine *m, uint32_t rde,
                                    uint8_t *p) {
-  int16_t y, rem;
-  int32_t x, quo;
+  int16_t y, r;
+  int32_t x, q;
  x = (uint32_t)Read16(m->dx) << 16 | Read16(m->ax);
  y = Read16(p);
-  if (!y || (x == INT32_MIN && y == -1)) ThrowDivideError(m);
-  quo = x / y;
-  rem = x % y;
-  if (!(INT16_MIN <= quo && quo <= INT16_MAX)) ThrowDivideError(m);
-  Write16(m->ax, quo);
-  Write16(m->dx, rem);
+  if (!y) ThrowDivideError(m);
+  if (x == INT32_MIN) ThrowDivideError(m);
+  q = x / y;
+  r = x % y;
+  if (q != (int16_t)q) ThrowDivideError(m);
+  Write16(m->ax, q);
+  Write16(m->dx, r);
 }

 static void OpDivRdxRaxEvqpUnsigned16(struct Machine *m, uint32_t rde,
                                      uint8_t *p) {
-  uint16_t y, rem;
-  uint32_t x, quo;
+  uint16_t y, r;
+  uint32_t x, q;
  x = (uint32_t)Read16(m->dx) << 16 | Read16(m->ax);
  y = Read16(p);
  if (!y) ThrowDivideError(m);
-  quo = x / y;
-  rem = x % y;
-  if (!(UINT16_MIN <= quo && quo <= UINT16_MAX)) ThrowDivideError(m);
-  Write16(m->ax, quo);
-  Write16(m->dx, rem);
+  q = x / y;
+  r = x % y;
+  if (q > 65535) ThrowDivideError(m);
+  Write16(m->ax, q);
+  Write16(m->dx, r);
 }

 static void OpDivRdxRaxEvqpUnsigned32(struct Machine *m, uint32_t rde,
                                      uint8_t *p) {
-  uint32_t y, rem;
-  uint64_t x, quo;
+  uint32_t y, r;
+  uint64_t x, q;
  x = (uint64_t)Read32(m->dx) << 32 | Read32(m->ax);
  y = Read32(p);
  if (!y) ThrowDivideError(m);
-  quo = x / y;
-  rem = x % y;
-  if (!(UINT32_MIN <= quo && quo <= UINT32_MAX)) ThrowDivideError(m);
-  Write64(m->ax, quo & 0xffffffff);
-  Write64(m->dx, rem & 0xffffffff);
+  q = x / y;
+  r = x % y;
+  if (q > 4294967295) ThrowDivideError(m);
+  Write64(m->ax, q & 0xffffffff);
+  Write64(m->dx, r & 0xffffffff);
 }

 static void OpDivRdxRaxEvqpUnsigned64(struct Machine *m, uint32_t rde,
                                      uint8_t *p) {
-  uint64_t y, rem;
-  uint128_t x, quo;
-  x = (uint128_t)Read64(m->dx) << 64 | Read64(m->ax);
-  y = Read64(p);
-  if (!y) ThrowDivideError(m);
-  quo = x / y;
-  rem = x % y;
-  if (!(UINT64_MIN <= quo && quo <= UINT64_MAX)) ThrowDivideError(m);
-  Write64(m->ax, quo);
-  Write64(m->dx, rem);
+  uint64_t d, r;
+  struct Dubble q;
+  q.lo = Read64(m->ax);
+  q.hi = Read64(m->dx);
+  d = Read64(p);
+  if (!d) ThrowDivideError(m);
+  q = DubbleDiv(q, d, &r);
+  if (q.hi) ThrowDivideError(m);
+  Write64(m->ax, q.lo);
+  Write64(m->dx, r);
 }

 void OpDivRdxRaxEvqpSigned(struct Machine *m, uint32_t rde) {
@ -159,9 +259,9 @@ void OpDivRdxRaxEvqpUnsigned(struct Machine *m, uint32_t rde) {
 }

 void OpMulAxAlEbSigned(struct Machine *m, uint32_t rde) {
-  bool of;
  int16_t ax;
  uint8_t *p;
+  unsigned of;
  p = GetModrmRegisterBytePointerRead(m, rde);
  ax = (int8_t)Read8(m->ax) * (int8_t)Read8(p);
  of = ax != (int8_t)ax;
@ -172,8 +272,8 @@ void OpMulAxAlEbSigned(struct Machine *m, uint32_t rde) {

 void OpMulAxAlEbUnsigned(struct Machine *m, uint32_t rde) {
  int ax;
-  bool of;
  uint8_t *p;
+  unsigned of;
  p = GetModrmRegisterBytePointerRead(m, rde);
  ax = Read8(m->ax) * Read8(p);
  of = ax != (uint8_t)ax;
@ -183,28 +283,25 @@ void OpMulAxAlEbUnsigned(struct Machine *m, uint32_t rde) {
 }

 void OpMulRdxRaxEvqpSigned(struct Machine *m, uint32_t rde) {
-  bool of;
  uint8_t *p;
+  unsigned of;
  int32_t dxax;
  int64_t edxeax;
-  int128_t rdxrax;
+  struct Dubble rdxrax;
  p = GetModrmRegisterWordPointerReadOszRexw(m, rde);
  if (Rexw(rde)) {
-    __builtin_mul_overflow((int128_t)(int64_t)Read64(m->ax), (int64_t)Read64(p),
-                           &rdxrax);
-    of = (int128_t)rdxrax != (int64_t)rdxrax;
-    Write64(m->ax, rdxrax);
-    Write64(m->dx, rdxrax >> 64);
+    rdxrax = DubbleImul(Read64(m->ax), Read64(p));
+    of = !!(rdxrax.hi + (rdxrax.lo >> 63));
+    Write64(m->ax, rdxrax.lo);
+    Write64(m->dx, rdxrax.hi);
  } else if (!Osz(rde)) {
-    __builtin_mul_overflow((int64_t)(int32_t)Read32(m->ax), (int32_t)Read32(p),
-                           &edxeax);
-    of = (int64_t)edxeax != (int32_t)edxeax;
+    edxeax = (int64_t)(int32_t)Read32(m->ax) * (int32_t)Read32(p);
+    of = edxeax != (int32_t)edxeax;
    Write64(m->ax, edxeax);
    Write64(m->dx, edxeax >> 32);
  } else {
-    __builtin_mul_overflow((int32_t)(int16_t)Read16(m->ax), (int16_t)Read16(p),
-                           &dxax);
-    of = (int32_t)dxax != (int16_t)dxax;
+    dxax = (int32_t)(int16_t)Read16(m->ax) * (int16_t)Read16(p);
+    of = dxax != (int16_t)dxax;
    Write16(m->ax, dxax);
    Write16(m->dx, dxax >> 16);
  }
@ -213,25 +310,24 @@ void OpMulRdxRaxEvqpSigned(struct Machine *m, uint32_t rde) {
 }

 void OpMulRdxRaxEvqpUnsigned(struct Machine *m, uint32_t rde) {
-  bool of;
  uint8_t *p;
+  unsigned of;
  uint32_t dxax;
  uint64_t edxeax;
-  uint128_t rdxrax;
+  struct Dubble rdxrax;
  p = GetModrmRegisterWordPointerReadOszRexw(m, rde);
  if (Rexw(rde)) {
-    __builtin_mul_overflow((uint128_t)Read64(m->ax), Read64(p), &rdxrax);
-    of = (uint64_t)rdxrax != rdxrax;
-    Write64(m->ax, rdxrax);
-    Write64(m->dx, rdxrax >> 64);
+    rdxrax = DubbleMul(Read64(m->ax), Read64(p));
+    of = !!rdxrax.hi;
+    Write64(m->ax, rdxrax.lo);
+    Write64(m->dx, rdxrax.hi);
  } else if (!Osz(rde)) {
-    __builtin_mul_overflow((uint64_t)Read32(m->ax), Read32(p), &edxeax);
+    edxeax = (uint64_t)Read32(m->ax) * Read32(p);
    of = (uint32_t)edxeax != edxeax;
    Write64(m->ax, edxeax);
    Write64(m->dx, edxeax >> 32);
  } else {
-    __builtin_mul_overflow((uint32_t)(uint16_t)Read16(m->ax),
-                           (uint16_t)Read16(p), &dxax);
+    dxax = (uint32_t)(uint16_t)Read16(m->ax) * (uint16_t)Read16(p);
    of = (uint16_t)dxax != dxax;
    Write16(m->ax, dxax);
    Write16(m->dx, dxax >> 16);
@ -243,23 +339,18 @@ void OpMulRdxRaxEvqpUnsigned(struct Machine *m, uint32_t rde) {
 static void AluImul(struct Machine *m, uint32_t rde, uint8_t *a, uint8_t *b) {
  unsigned of;
  if (Rexw(rde)) {
-    int64_t x, y, z;
-    x = Read64(a);
-    y = Read64(b);
-    of = __builtin_mul_overflow(x, y, &z);
-    Write64(RegRexrReg(m, rde), z);
+    struct Dubble p;
+    p = DubbleImul(Read64(a), Read64(b));
+    of = !!(p.hi + (p.lo >> 63));
+    Write64(RegRexrReg(m, rde), p.lo);
  } else if (!Osz(rde)) {
-    int32_t x, y, z;
-    x = Read32(a);
-    y = Read32(b);
-    of = __builtin_mul_overflow(x, y, &z);
+    int64_t z;
+    z = (int64_t)(int32_t)Read32(a) * (int32_t)Read32(b);
+    of = z != (int32_t)z;
    Write64(RegRexrReg(m, rde), z & 0xffffffff);
  } else {
-    int z;
-    int16_t x, y;
-    x = Read16(a);
-    y = Read16(b);
-    z = x * y;
+    int32_t z;
+    z = (int32_t)(int16_t)Read16(a) * (int16_t)Read16(b);
    of = z != (int16_t)z;
    Write16(RegRexrReg(m, rde), z);
  }
--- a/tool/build/lib/endian.h
+++ b/tool/build/lib/endian.h
@ -1,53 +1,82 @@
 #ifndef COSMOPOLITAN_TOOL_BUILD_LIB_ENDIAN_H_
 #define COSMOPOLITAN_TOOL_BUILD_LIB_ENDIAN_H_
-#include "libc/bits/bits.h"
+#include "libc/str/str.h"

-#define Read8(P) (*(const uint8_t *)(P))
+static inline uint8_t Read8(const uint8_t *p) {  // returns the byte at p
+  return p[0];
+}

-#define Read16(P)                              \
-  ({                                           \
-    const uint8_t *Ptr = (const uint8_t *)(P); \
-    READ16LE(P);                               \
-  })
+static inline void Write8(uint8_t *p, uint8_t v) {  // stores the byte v at p
+  *p = v;
+}

-#define Read32(P)                              \
-  ({                                           \
-    const uint8_t *Ptr = (const uint8_t *)(P); \
-    READ32LE(P);                               \
-  })
+static inline uint16_t Read16(const uint8_t *p) {  // reads a 16-bit little-endian value from the 2 bytes at p
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  uint16_t v;
+  memcpy(&v, p, sizeof(v));  // host byte order already matches, so copy the bytes as they are
+  return v;
+#else
+  return p[1] << 8 | p[0];  // otherwise assemble it with p[0] as the least significant byte
+#endif
+}

-#define Read64(P)                              \
-  ({                                           \
-    const uint8_t *Ptr = (const uint8_t *)(P); \
-    READ64LE(P);                               \
-  })
+static inline void Write16(uint8_t *p, uint16_t v) {  // stores v as 2 little-endian bytes at p
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  memcpy(p, &v, sizeof(v));  // host byte order already matches, so copy the bytes as they are
+#else
+  p[0] = (0x00FF & v) >> 000;  // shift counts are octal: 000 is 0
+  p[1] = (0xFF00 & v) >> 010;  // octal 010 is 8
+#endif
+}

-#define Write8(P, V)    \
-  do {                  \
-    uint8_t Val = (V);  \
-    uint8_t *Ptr = (P); \
-    *Ptr = Val;         \
-  } while (0)
+static inline uint32_t Read32(const uint8_t *p) {  // reads a 32-bit little-endian value from the 4 bytes at p
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  uint32_t v;
+  memcpy(&v, p, sizeof(v));  // host byte order already matches, so copy the bytes as they are
+  return v;
+#else
+  return ((uint32_t)p[0] << 000 | (uint32_t)p[1] << 010 |  // shift counts are octal: 0, 8,
+          (uint32_t)p[2] << 020 | (uint32_t)p[3] << 030);  // 16 and 24
+#endif
+}

-#define Write16(P, V)    \
-  do {                   \
-    uint16_t Val = (V);  \
-    uint8_t *Ptr = (P);  \
-    WRITE16LE(Ptr, Val); \
-  } while (0)
+static inline void Write32(uint8_t *p, uint32_t v) {  // stores v as 4 little-endian bytes at p
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  memcpy(p, &v, sizeof(v));  // host byte order already matches, so copy the bytes as they are
+#else
+  p[0] = (0x000000FF & v) >> 000;  // shift counts are octal: 000 is 0
+  p[1] = (0x0000FF00 & v) >> 010;  // 8
+  p[2] = (0x00FF0000 & v) >> 020;  // 16
+  p[3] = (0xFF000000 & v) >> 030;  // 24
+#endif
+}

-#define Write32(P, V)    \
-  do {                   \
-    uint32_t Val = (V);  \
-    uint8_t *Ptr = (P);  \
-    WRITE32LE(Ptr, Val); \
-  } while (0)
+static inline uint64_t Read64(const uint8_t *p) {  // reads a 64-bit little-endian value from the 8 bytes at p
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  uint64_t v;
+  memcpy(&v, p, sizeof(v));  // host byte order already matches, so copy the bytes as they are
+  return v;
+#else
+  return ((uint64_t)p[0] << 000 | (uint64_t)p[1] << 010 |  // shift counts are octal: 0, 8,
+          (uint64_t)p[2] << 020 | (uint64_t)p[3] << 030 |  // 16, 24,
+          (uint64_t)p[4] << 040 | (uint64_t)p[5] << 050 |  // 32, 40,
+          (uint64_t)p[6] << 060 | (uint64_t)p[7] << 070);  // 48 and 56
+#endif
+}

-#define Write64(P, V)    \
-  do {                   \
-    uint64_t Val = (V);  \
-    uint8_t *Ptr = (P);  \
-    WRITE64LE(Ptr, Val); \
-  } while (0)
+static inline void Write64(uint8_t *p, uint64_t v) {  // stores v as 8 little-endian bytes at p
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  memcpy(p, &v, sizeof(v));  // host byte order already matches, so copy the bytes as they are
+#else
+  p[0] = (0x00000000000000FF & v) >> 000;  // shift counts are octal: 000 is 0
+  p[1] = (0x000000000000FF00 & v) >> 010;  // 8
+  p[2] = (0x0000000000FF0000 & v) >> 020;  // 16
+  p[3] = (0x00000000FF000000 & v) >> 030;  // 24
+  p[4] = (0x000000FF00000000 & v) >> 040;  // 32
+  p[5] = (0x0000FF0000000000 & v) >> 050;  // 40
+  p[6] = (0x00FF000000000000 & v) >> 060;  // 48
+  p[7] = (0xFF00000000000000 & v) >> 070;  // 56
+#endif
+}

 #endif /* COSMOPOLITAN_TOOL_BUILD_LIB_ENDIAN_H_ */
--- a/tool/build/lib/machine.c
+++ b/tool/build/lib/machine.c
@ -27,6 +27,7 @@
 #include "tool/build/lib/bcd.h"
 #include "tool/build/lib/bitscan.h"
 #include "tool/build/lib/case.h"
+#include "tool/build/lib/clmul.h"
 #include "tool/build/lib/cpuid.h"
 #include "tool/build/lib/cvt.h"
 #include "tool/build/lib/divmul.h"
@ -2215,6 +2216,7 @@ void ExecuteSparseInstruction(struct Machine *m, uint32_t rde, uint32_t d) {
    CASE(0x22a, OpMovntdqaVdqMdq(m, rde));
    CASE(0x240, OpSsePmulld(m, rde));
    CASE(0x30f, OpSsePalignr(m, rde));
+    CASE(0x344, OpSsePclmulqdq(m, rde));
    default:
      OpUd(m, rde);
  }
--- a/tool/build/lib/panel.c
+++ b/tool/build/lib/panel.c
@ -48,7 +48,7 @@ ssize_t PrintPanels(int fd, long pn, struct Panel *p, long tyn, long txn) {
  bzero(&b, sizeof(b));
  AppendStr(&b, "\e[H");
  for (y = 0; y < tyn; ++y) {
-    if (y) AppendStr(&b, "\r\n");
+    if (y) AppendFmt(&b, "\e[%dH", y + 1);
    for (x = i = 0; i < pn; ++i) {
      if (p[i].top <= y && y < p[i].bottom) {
        j = state = 0;