Make AARCH64 harder, better, faster, stronger

- Perform some housekeeping on scalar math function code
- Import ARM's Optimized Routines for SIMD string processing
- Upgrade to latest Chromium zlib and enable more SIMD optimizations
This commit is contained in:
Justine Tunney 2023-05-15 01:51:29 -07:00
parent 550b52abf6
commit cc1732bc42
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
143 changed files with 15661 additions and 1329 deletions

View file

@@ -73,6 +73,13 @@ IMAGE_BASE_VIRTUAL ?= 0x400000
IGNORE := $(shell $(ECHO) -2 ♥cosmo)
IGNORE := $(shell $(MKDIR) o/tmp)
ifeq ($(MODE), dbg)
# be generous about resources in debug mode
# let commands use 64 seconds cpu time max
# let commands use 300 seconds wall time max
QUOTA ?= -C64 -L300
endif
ifneq ($(findstring aarch64,$(MODE)),)
ARCH = aarch64
VM = o/third_party/qemu/qemu-aarch64

View file

@@ -12,8 +12,8 @@
#include "libc/errno.h"
#include "libc/fmt/conv.h"
#include "libc/log/check.h"
#include "libc/mem/mem.h"
#include "libc/mem/gc.internal.h"
#include "libc/mem/mem.h"
#include "libc/runtime/runtime.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
@@ -48,26 +48,62 @@ FLAGS\n\
// clang-format off
// make -j8 o//examples && dd if=/dev/urandom count=100 | tee a | o//examples/compress.com | o//examples/decompress.com >b && sha1sum a b
/*
#!/bin/bash
# data file is o/dbg/third_party/python/python.com
# level 0 147517 compress 495 MB/s decompress 1.4 GB/s
# level 1 80274 compress 29.2 MB/s decompress 303 MB/s
# level 2 79384 compress 33.8 MB/s decompress 212 MB/s
# level 3 78875 compress 28.9 MB/s decompress 224 MB/s
# level 4 78010 compress 27.1 MB/s decompress 319 MB/s <-- sweet spot?
# level 5 77107 compress 19.5 MB/s decompress 273 MB/s
# level 6 75081 compress 10.0 MB/s decompress 99.3 MB/s
# level 7 75022 compress 7.5 MB/s decompress 287 MB/s
# level 8 75016 compress 5.4 MB/s decompress 109 MB/s
# level 9 75016 compress 5.4 MB/s decompress 344 MB/s
# level 1 348739 compress 22.8 MB/s decompress 444 MB/s
# level 2 347549 compress 37.8 MB/s decompress 457 MB/s
# level 3 346902 compress 33.3 MB/s decompress 463 MB/s
# level 4 345671 compress 29.3 MB/s decompress 467 MB/s
# level 5 344392 compress 22.4 MB/s decompress 506 MB/s
# level 6 342105 compress 10.9 MB/s decompress 516 MB/s
# level 7 342046 compress 7.9 MB/s decompress 515 MB/s
# level 8 342009 compress 5.8 MB/s decompress 518 MB/s
# level 9 342001 compress 5.7 MB/s decompress 524 MB/s
# level F 1 362426 compress 48.2 MB/s decompress 488 MB/s
# level F 2 360875 compress 42.7 MB/s decompress 484 MB/s
# level F 3 359992 compress 37.1 MB/s decompress 499 MB/s
# level F 4 358460 compress 32.9 MB/s decompress 503 MB/s
# level F 5 356431 compress 24.0 MB/s decompress 547 MB/s
# level F 6 352274 compress 11.6 MB/s decompress 558 MB/s
# level F 7 352155 compress 8.7 MB/s decompress 554 MB/s
# level F 8 352065 compress 6.3 MB/s decompress 554 MB/s
# level F 9 352051 compress 6.2 MB/s decompress 556 MB/s
# level L 1 348739 compress 41.1 MB/s decompress 446 MB/s
# level L 2 347549 compress 37.4 MB/s decompress 443 MB/s
# level L 3 346902 compress 32.3 MB/s decompress 462 MB/s
# level L 4 351932 compress 28.8 MB/s decompress 511 MB/s
# level L 5 351384 compress 23.6 MB/s decompress 520 MB/s
# level L 6 351328 compress 12.1 MB/s decompress 522 MB/s
# level L 7 351230 compress 7.3 MB/s decompress 518 MB/s
# level L 8 351192 compress 5.7 MB/s decompress 522 MB/s
# level L 9 351182 compress 6.5 MB/s decompress 519 MB/s
# level R 1 388209 compress 83.1 MB/s decompress 371 MB/s
# level R 2 388209 compress 82.3 MB/s decompress 362 MB/s
# level R 3 388209 compress 81.8 MB/s decompress 361 MB/s
# level R 4 388209 compress 81.7 MB/s decompress 364 MB/s
# level R 5 388209 compress 81.7 MB/s decompress 363 MB/s
# level R 6 388209 compress 80.1 MB/s decompress 359 MB/s
# level R 7 388209 compress 80.3 MB/s decompress 354 MB/s
# level R 8 388209 compress 80.3 MB/s decompress 363 MB/s
# level R 9 388209 compress 81.3 MB/s decompress 364 MB/s
# level H 1 390207 compress 87.6 MB/s decompress 371 MB/s
# level H 2 390207 compress 87.5 MB/s decompress 372 MB/s
# level H 3 390207 compress 85.5 MB/s decompress 364 MB/s
# level H 4 390207 compress 87.3 MB/s decompress 375 MB/s
# level H 5 390207 compress 89.0 MB/s decompress 373 MB/s
# level H 6 390207 compress 87.3 MB/s decompress 372 MB/s
# level H 7 390207 compress 87.0 MB/s decompress 368 MB/s
# level H 8 390207 compress 86.2 MB/s decompress 367 MB/s
# level H 9 390207 compress 86.9 MB/s decompress 369 MB/s
m=
make -j8 MODE=$m o/$m/examples || exit
for strategy in ' ' F L R H; do
for level in $(seq 1 9); do
for strategy in F L R H; do
o/$m/examples/compress.com -$strategy$level <o/dbg/third_party/python/python.com | dd count=10000 2>/tmp/info >/tmp/comp
o/$m/examples/compress.com -$level$strategy <o/dbg/third_party/python/python.com | dd count=10000 2>/tmp/info >/tmp/comp
compspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info)
o/$m/examples/decompress.com </tmp/comp | dd count=10000 2>/tmp/info >/dev/null
decompspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info)
size=$(o/$m/examples/compress.com -$strategy$level <o/$m/examples/compress.com | wc -c)
size=$(o/$m/examples/compress.com -$level$strategy <o/$m/examples/compress.com | wc -c)
echo "level $strategy $level $size compress $compspeed decompress $decompspeed"
done
done

View file

@@ -10,43 +10,14 @@
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/errno.h"
#include "libc/mem/mem.h"
#include "libc/mem/gc.internal.h"
#include "libc/mem/mem.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "third_party/zlib/zlib.h"
#define CHUNK 32768
// clang-format off
// make -j8 o//examples && dd if=/dev/urandom count=100 | tee a | o//examples/compress.com | o//examples/decompress.com >b && sha1sum a b
/*
# data file is o/dbg/third_party/python/python.com
# level 0 147517 compress 495 MB/s decompress 1.4 GB/s
# level 1 80274 compress 29.2 MB/s decompress 303 MB/s
# level 2 79384 compress 33.8 MB/s decompress 212 MB/s
# level 3 78875 compress 28.9 MB/s decompress 224 MB/s
# level 4 78010 compress 27.1 MB/s decompress 319 MB/s <-- sweet spot?
# level 5 77107 compress 19.5 MB/s decompress 273 MB/s
# level 6 75081 compress 10.0 MB/s decompress 99.3 MB/s
# level 7 75022 compress 7.5 MB/s decompress 287 MB/s
# level 8 75016 compress 5.4 MB/s decompress 109 MB/s
# level 9 75016 compress 5.4 MB/s decompress 344 MB/s
m=
make -j8 MODE=$m o/$m/examples || exit
for level in $(seq 0 9); do
for strategy in F L R H; do
o/$m/examples/compress.com -$strategy$level <o/dbg/third_party/python/python.com | dd count=10000 2>/tmp/info >/tmp/comp
compspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info)
o/$m/examples/decompress.com </tmp/comp | dd count=10000 2>/tmp/info >/dev/null
decompspeed=$(grep -Po '[.\d]+ \w+/s' /tmp/info)
size=$(o/$m/examples/compress.com -$strategy$level <o/$m/examples/compress.com | wc -c)
echo "level $strategy $level $size compress $compspeed decompress $decompspeed"
done
done
*/
// clang-format on
int decompressor(int infd, int outfd) {
int rc;
unsigned have;

View file

@@ -0,0 +1,88 @@
#ifndef COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_
#define COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_
#ifdef __ASSEMBLER__
// clang-format off
/* Branch Target Identification support. */
#define BTI_C hint 34
#define BTI_J hint 36
/* Return address signing support (pac-ret). */
#define PACIASP hint 25; .cfi_window_save
#define AUTIASP hint 29; .cfi_window_save
/* GNU_PROPERTY_AARCH64_* macros from elf.h. */
#define FEATURE_1_AND 0xc0000000
#define FEATURE_1_BTI 1
#define FEATURE_1_PAC 2
/* Add a NT_GNU_PROPERTY_TYPE_0 note. */
#define GNU_PROPERTY(type, value) \
.section .note.gnu.property, "a"; \
.p2align 3; \
.word 4; \
.word 16; \
.word 5; \
.asciz "GNU"; \
.word type; \
.word 4; \
.word value; \
.word 0; \
.text
/* If set then the GNU Property Note section will be added to
mark objects to support BTI and PAC-RET. */
#ifndef WANT_GNU_PROPERTY
#define WANT_GNU_PROPERTY 1
#endif
#if WANT_GNU_PROPERTY
/* Add property note with supported features to all asm files. */
GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC)
#endif
#define ENTRY_ALIGN(name, alignment) \
.global name; \
.type name,%function; \
.align alignment; \
name: \
.cfi_startproc; \
BTI_C;
#define ENTRY(name) ENTRY_ALIGN(name, 6)
#define ENTRY_ALIAS(name) \
.global name; \
.type name,%function; \
name:
#define END(name) \
.cfi_endproc; \
.size name, .-name;
#define L(l) .L ## l
#ifdef __ILP32__
/* Sanitize padding bits of pointer arguments as per aapcs64 */
#define PTR_ARG(n) mov w##n, w##n
#else
#define PTR_ARG(n)
#endif
#ifdef __ILP32__
/* Sanitize padding bits of size arguments as per aapcs64 */
#define SIZE_ARG(n) mov w##n, w##n
#else
#define SIZE_ARG(n)
#endif
/* Compiler supports SVE instructions */
#ifndef HAVE_SVE
# if __aarch64__ && (__GNUC__ >= 8 || __clang_major__ >= 5)
# define HAVE_SVE 1
# else
# define HAVE_SVE 0
# endif
#endif
#endif /* __ASSEMBLER__ */
#endif /* COSMOPOLITAN_LIBC_INTRIN_AARCH64_ASMDEFS_H_ */

View file

@@ -0,0 +1,172 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __memchr_aarch64 memchr
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64
* Neon Available.
*/
/* Arguments and results. */
#define srcin x0
#define chrin w1
#define cntin x2
#define result x0
#define src x3
#define tmp x4
#define wtmp2 w5
#define synd x6
#define soff x9
#define cntrem x10
#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask v5
#define vend v6
/*
* Core algorithm:
*
* For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
* per byte. For each tuple, bit 0 is set if the relevant byte matched the
requested character and bit 1 is unused (this is faster than using a 32-bit
syndrome). Since the bits in the syndrome reflect exactly the order in which
things occur in the original string, counting trailing zeros identifies
exactly which byte matched.
*/
ENTRY (__memchr_aarch64)
PTR_ARG (0)
SIZE_ARG (2)
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(zero_length)
/*
* Magic constant 0x40100401 allows us to identify which lane matches
* the requested byte.
*/
mov wtmp2, #0x0401
movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
/* Work with aligned 32-byte chunks */
bic src, srcin, #31
dup vrepmask.4s, wtmp2
ands soff, srcin, #31
and cntrem, cntin, #31
b.eq L(loop)
/*
* Input string is not 32-byte aligned. We calculate the syndrome
* value for the aligned 32 bytes block containing the first bytes
* and mask the irrelevant part.
*/
ld1 {vdata1.16b, vdata2.16b}, [src], #32
sub tmp, soff, #32
adds cntin, cntin, tmp
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.d[0]
/* Clear the soff*2 lower bits */
lsl tmp, soff, #1
lsr synd, synd, tmp
lsl synd, synd, tmp
/* The first block can also be the last */
b.ls L(masklast)
/* Have we found something already? */
cbnz synd, L(tail)
L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
subs cntin, cntin, #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
/* If we're out of data we finish regardless of the result */
b.ls L(end)
/* Use a fast check for the termination condition */
orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
addp vend.2d, vend.2d, vend.2d
mov synd, vend.d[0]
/* We're not out of data, loop if we haven't found the character */
cbz synd, L(loop)
L(end):
/* Termination condition found, let's calculate the syndrome value */
and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
addp vend.16b, vend.16b, vend.16b /* 128->64 */
mov synd, vend.d[0]
/* Only do the clear for the last possible block */
b.hs L(tail)
L(masklast):
/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits */
add tmp, cntrem, soff
and tmp, tmp, #31
sub tmp, tmp, #32
neg tmp, tmp, lsl #1
lsl synd, synd, tmp
lsr synd, synd, tmp
L(tail):
/* Count the trailing zeros using bit reversing */
rbit synd, synd
/* Compensate the last post-increment */
sub src, src, #32
/* Check that we have found a character */
cmp synd, #0
/* And count the leading zeros */
clz synd, synd
/* Compute the potential result */
add result, src, synd, lsr #1
/* Select result or NULL */
csel result, xzr, result, eq
ret
L(zero_length):
mov result, #0
ret
END (__memchr_aarch64)
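
For reference, here is a scalar C sketch of the two-bit syndrome idea described
in the comment above. It is illustrative only (the names and the chunked
interface are assumptions, not part of this codebase): one 32-byte chunk is
reduced to a 64-bit word with two bits per byte, and counting trailing zeros
locates the first match.

#include <stddef.h>
#include <stdint.h>

/* Illustrative scalar model of the 2-bits-per-byte syndrome used above. */
static const char *chunk_memchr32(const char *p, unsigned char c) {
  uint64_t synd = 0;
  for (int i = 0; i < 32; ++i) {
    if ((unsigned char)p[i] == c)
      synd |= 1ull << (2 * i);        /* bit 0 of the byte's 2-bit lane */
  }
  if (!synd) return NULL;             /* no match in this chunk */
  int tz = __builtin_ctzll(synd);     /* trailing zeros -> first match */
  return p + tz / 2;                  /* two syndrome bits per byte */
}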

View file

@@ -0,0 +1,218 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __memcmp_aarch64 memcmp
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*/
#define src1 x0
#define src2 x1
#define limit x2
#define result w0
#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define data3 x5
#define data3w w5
#define data4 x6
#define data4w w6
#define tmp x6
#define src1end x7
#define src2end x8
ENTRY (__memcmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
cmp limit, 16
b.lo L(less16)
ldp data1, data3, [src1]
ldp data2, data4, [src2]
ccmp data1, data2, 0, ne
ccmp data3, data4, 0, eq
b.ne L(return2)
add src1end, src1, limit
add src2end, src2, limit
cmp limit, 32
b.ls L(last_bytes)
cmp limit, 160
b.hs L(loop_align)
sub limit, limit, 32
.p2align 4
L(loop32):
ldp data1, data3, [src1, 16]
ldp data2, data4, [src2, 16]
cmp data1, data2
ccmp data3, data4, 0, eq
b.ne L(return2)
cmp limit, 16
b.ls L(last_bytes)
ldp data1, data3, [src1, 32]
ldp data2, data4, [src2, 32]
cmp data1, data2
ccmp data3, data4, 0, eq
b.ne L(return2)
add src1, src1, 32
add src2, src2, 32
L(last64):
subs limit, limit, 32
b.hi L(loop32)
/* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
ldp data1, data3, [src1end, -16]
ldp data2, data4, [src2end, -16]
L(return2):
cmp data1, data2
csel data1, data1, data3, ne
csel data2, data2, data4, ne
/* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
#ifndef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
cmp data1, data2
cset result, ne
cneg result, result, lo
ret
.p2align 4
L(less16):
add src1end, src1, limit
add src2end, src2, limit
tbz limit, 3, L(less8)
ldr data1, [src1]
ldr data2, [src2]
ldr data3, [src1end, -8]
ldr data4, [src2end, -8]
b L(return2)
.p2align 4
L(less8):
tbz limit, 2, L(less4)
ldr data1w, [src1]
ldr data2w, [src2]
ldr data3w, [src1end, -4]
ldr data4w, [src2end, -4]
b L(return2)
L(less4):
tbz limit, 1, L(less2)
ldrh data1w, [src1]
ldrh data2w, [src2]
cmp data1w, data2w
b.ne L(return)
L(less2):
mov result, 0
tbz limit, 0, L(return_zero)
ldrb data1w, [src1end, -1]
ldrb data2w, [src2end, -1]
sub result, data1w, data2w
L(return_zero):
ret
L(loop_align):
ldp data1, data3, [src1, 16]
ldp data2, data4, [src2, 16]
cmp data1, data2
ccmp data3, data4, 0, eq
b.ne L(return2)
/* Align src2 and adjust src1, src2 and limit. */
and tmp, src2, 15
sub tmp, tmp, 16
sub src2, src2, tmp
add limit, limit, tmp
sub src1, src1, tmp
sub limit, limit, 64 + 16
.p2align 4
L(loop64):
ldr q0, [src1, 16]
ldr q1, [src2, 16]
subs limit, limit, 64
ldr q2, [src1, 32]
ldr q3, [src2, 32]
eor v0.16b, v0.16b, v1.16b
eor v1.16b, v2.16b, v3.16b
ldr q2, [src1, 48]
ldr q3, [src2, 48]
umaxp v0.16b, v0.16b, v1.16b
ldr q4, [src1, 64]!
ldr q5, [src2, 64]!
eor v1.16b, v2.16b, v3.16b
eor v2.16b, v4.16b, v5.16b
umaxp v1.16b, v1.16b, v2.16b
umaxp v0.16b, v0.16b, v1.16b
umaxp v0.16b, v0.16b, v0.16b
fmov tmp, d0
ccmp tmp, 0, 0, hi
b.eq L(loop64)
/* If equal, process last 1-64 bytes using scalar loop. */
add limit, limit, 64 + 16
cbz tmp, L(last64)
/* Determine the 8-byte aligned offset of the first difference. */
#ifdef __AARCH64EB__
rev16 tmp, tmp
#endif
rev tmp, tmp
clz tmp, tmp
bic tmp, tmp, 7
sub tmp, tmp, 48
ldr data1, [src1, tmp]
ldr data2, [src2, tmp]
#ifndef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
mov result, 1
cmp data1, data2
cneg result, result, lo
ret
END (__memcmp_aarch64)
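
The sign of the result at L(return) comes from byte-reversing the first
differing 8-byte words on little-endian so that an ordinary unsigned integer
compare matches memory (lexicographic) order. A hedged C equivalent of that
step, with illustrative names:

#include <stdint.h>

/* How one differing 8-byte word yields memcmp's sign on little-endian:
   byte-swap both words (like the rev instructions above), then an
   unsigned compare gives the memory-order result. */
static int word_diff_sign(uint64_t a, uint64_t b) {
  a = __builtin_bswap64(a);
  b = __builtin_bswap64(b);
  if (a == b) return 0;
  return a < b ? -1 : 1;
}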

View file

@@ -0,0 +1,233 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __memcpy_aarch64_simd memcpy
#define __memmove_aarch64_simd memmove
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
#define dstin x0
#define src x1
#define count x2
#define dst x3
#define srcend x4
#define dstend x5
#define A_l x6
#define A_lw w6
#define A_h x7
#define B_l x8
#define B_lw w8
#define B_h x9
#define C_lw w10
#define tmp1 x14
#define A_q q0
#define B_q q1
#define C_q q2
#define D_q q3
#define E_q q4
#define F_q q5
#define G_q q6
#define H_q q7
/* This implementation handles overlaps and supports both memcpy and memmove
from a single entry point. It uses unaligned accesses and branchless
sequences to keep the code small and simple and to improve performance.
Copies are split into 3 main cases: small copies of up to 32 bytes, medium
copies of up to 128 bytes, and large copies. The overhead of the overlap
check is negligible since it is only required for large copies.
Large copies use a software pipelined loop processing 64 bytes per iteration.
The source pointer is 16-byte aligned to minimize unaligned accesses.
The loop tail is handled by always copying 64 bytes from the end.
*/
ENTRY_ALIAS (__memmove_aarch64_simd)
ENTRY (__memcpy_aarch64_simd)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
add srcend, src, count
add dstend, dstin, count
cmp count, 128
b.hi L(copy_long)
cmp count, 32
b.hi L(copy32_128)
/* Small copies: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
ldr A_q, [src]
ldr B_q, [srcend, -16]
str A_q, [dstin]
str B_q, [dstend, -16]
ret
/* Copy 8-15 bytes. */
L(copy16):
tbz count, 3, L(copy8)
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
.p2align 3
/* Copy 4-7 bytes. */
L(copy8):
tbz count, 2, L(copy4)
ldr A_lw, [src]
ldr B_lw, [srcend, -4]
str A_lw, [dstin]
str B_lw, [dstend, -4]
ret
/* Copy 0..3 bytes using a branchless sequence. */
L(copy4):
cbz count, L(copy0)
lsr tmp1, count, 1
ldrb A_lw, [src]
ldrb C_lw, [srcend, -1]
ldrb B_lw, [src, tmp1]
strb A_lw, [dstin]
strb B_lw, [dstin, tmp1]
strb C_lw, [dstend, -1]
L(copy0):
ret
.p2align 4
/* Medium copies: 33..128 bytes. */
L(copy32_128):
ldp A_q, B_q, [src]
ldp C_q, D_q, [srcend, -32]
cmp count, 64
b.hi L(copy128)
stp A_q, B_q, [dstin]
stp C_q, D_q, [dstend, -32]
ret
.p2align 4
/* Copy 65..128 bytes. */
L(copy128):
ldp E_q, F_q, [src, 32]
cmp count, 96
b.ls L(copy96)
ldp G_q, H_q, [srcend, -64]
stp G_q, H_q, [dstend, -64]
L(copy96):
stp A_q, B_q, [dstin]
stp E_q, F_q, [dstin, 32]
stp C_q, D_q, [dstend, -32]
ret
/* Copy more than 128 bytes. */
L(copy_long):
/* Use backwards copy if there is an overlap. */
sub tmp1, dstin, src
cmp tmp1, count
b.lo L(copy_long_backwards)
/* Copy 16 bytes and then align src to 16-byte alignment. */
ldr D_q, [src]
and tmp1, src, 15
bic src, src, 15
sub dst, dstin, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
ldp A_q, B_q, [src, 16]
str D_q, [dstin]
ldp C_q, D_q, [src, 48]
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls L(copy64_from_end)
L(loop64):
stp A_q, B_q, [dst, 16]
ldp A_q, B_q, [src, 80]
stp C_q, D_q, [dst, 48]
ldp C_q, D_q, [src, 112]
add src, src, 64
add dst, dst, 64
subs count, count, 64
b.hi L(loop64)
/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
ldp E_q, F_q, [srcend, -64]
stp A_q, B_q, [dst, 16]
ldp A_q, B_q, [srcend, -32]
stp C_q, D_q, [dst, 48]
stp E_q, F_q, [dstend, -64]
stp A_q, B_q, [dstend, -32]
ret
/* Large backwards copy for overlapping copies.
Copy 16 bytes and then align srcend to 16-byte alignment. */
L(copy_long_backwards):
cbz tmp1, L(copy0)
ldr D_q, [srcend, -16]
and tmp1, srcend, 15
bic srcend, srcend, 15
sub count, count, tmp1
ldp A_q, B_q, [srcend, -32]
str D_q, [dstend, -16]
ldp C_q, D_q, [srcend, -64]
sub dstend, dstend, tmp1
subs count, count, 128
b.ls L(copy64_from_start)
L(loop64_backwards):
str B_q, [dstend, -16]
str A_q, [dstend, -32]
ldp A_q, B_q, [srcend, -96]
str D_q, [dstend, -48]
str C_q, [dstend, -64]!
ldp C_q, D_q, [srcend, -128]
sub srcend, srcend, 64
subs count, count, 64
b.hi L(loop64_backwards)
/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
ldp E_q, F_q, [src, 32]
stp A_q, B_q, [dstend, -32]
ldp A_q, B_q, [src]
stp C_q, D_q, [dstend, -64]
stp E_q, F_q, [dstin, 32]
stp A_q, B_q, [dstin]
ret
END (__memcpy_aarch64_simd)
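
The loop tail trick mentioned in the header comment, always copying 64 bytes
from the end, can be modeled in C as below. This is a simplified sketch under
assumed conditions (non-overlapping forward copy, n >= 64), not the routine
itself:

#include <stddef.h>
#include <string.h>

/* Copy 64-byte blocks until at most 64 bytes remain, then unconditionally
   copy the last 64 bytes from the end.  The final store may overlap the
   block written just before it, so no 1..63-byte tail code is needed.
   Assumes non-overlapping buffers and n >= 64. */
static void copy_large_forward(unsigned char *d, const unsigned char *s,
                               size_t n) {
  size_t i = 0;
  while (n - i > 64) {
    memcpy(d + i, s + i, 64);
    i += 64;
  }
  memcpy(d + n - 64, s + n - 64, 64);
}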

View file

@@ -0,0 +1,138 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __memrchr_aarch64 memrchr
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD.
* MTE compatible.
*/
#define srcin x0
#define chrin w1
#define cntin x2
#define result x0
#define src x3
#define cntrem x4
#define synd x5
#define shift x6
#define tmp x7
#define end x8
#define endm1 x9
#define vrepchr v0
#define qdata q1
#define vdata v1
#define vhas_chr v2
#define vend v3
#define dend d3
/*
Core algorithm:
For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
per byte. We take 4 bits of every comparison byte using the shift-right-and-narrow-by-4
instruction. Since the bits in the nibble mask reflect the order in
which things occur in the original string, counting leading zeros identifies
exactly which byte matched. */
ENTRY (__memrchr_aarch64)
PTR_ARG (0)
add end, srcin, cntin
sub endm1, end, 1
bic src, endm1, 15
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
dup vrepchr.16b, chrin
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
neg shift, end, lsl 2
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsl synd, synd, shift
cbz synd, L(start_loop)
clz synd, synd
sub result, endm1, synd, lsr 2
cmp cntin, synd, lsr 2
csel result, result, xzr, hi
ret
nop
L(start_loop):
subs cntrem, src, srcin
b.ls L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
sub cntrem, cntrem, 1
tbz cntrem, 4, L(loop32_2)
add src, src, 16
.p2align 5
L(loop32):
ldr qdata, [src, -32]!
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
ldr qdata, [src, -16]
subs cntrem, cntrem, 32
cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
b.lo L(end_2)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
L(end_2):
sub src, src, 16
L(end):
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
add tmp, src, 15
#ifdef __AARCH64EB__
rbit synd, synd
#endif
clz synd, synd
sub tmp, tmp, synd, lsr 2
cmp tmp, srcin
csel result, tmp, xzr, hs
ret
L(nomatch):
mov result, 0
ret
END (__memrchr_aarch64)
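
A scalar C sketch of the nibble-mask scheme described above (illustrative
names, not this codebase): each byte of a 16-byte chunk contributes a 4-bit
nibble, and counting leading zeros locates the last match, since memrchr scans
from the end.

#include <stddef.h>
#include <stdint.h>

/* Illustrative scalar model of the 4-bits-per-byte nibble mask above. */
static const char *chunk_memrchr16(const char *p, unsigned char c) {
  uint64_t mask = 0;
  for (int i = 0; i < 16; ++i) {
    if ((unsigned char)p[i] == c)
      mask |= (uint64_t)0xf << (4 * i);  /* set the byte's whole nibble */
  }
  if (!mask) return NULL;                /* no match in this chunk */
  int lz = __builtin_clzll(mask);        /* leading zeros -> last match */
  return p + (63 - lz) / 4;              /* four mask bits per byte */
}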

View file

@@ -0,0 +1,143 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __memset_aarch64 memset
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
#define dstin x0
#define val x1
#define valw w1
#define count x2
#define dst x3
#define dstend x4
#define zva_val x5
ENTRY (__memset_aarch64)
PTR_ARG (0)
SIZE_ARG (2)
dup v0.16B, valw
add dstend, dstin, count
cmp count, 96
b.hi L(set_long)
cmp count, 16
b.hs L(set_medium)
mov val, v0.D[0]
/* Set 0..15 bytes. */
tbz count, 3, 1f
str val, [dstin]
str val, [dstend, -8]
ret
.p2align 4
1: tbz count, 2, 2f
str valw, [dstin]
str valw, [dstend, -4]
ret
2: cbz count, 3f
strb valw, [dstin]
tbz count, 1, 3f
strh valw, [dstend, -2]
3: ret
/* Set 17..96 bytes. */
L(set_medium):
str q0, [dstin]
tbnz count, 6, L(set96)
str q0, [dstend, -16]
tbz count, 5, 1f
str q0, [dstin, 16]
str q0, [dstend, -32]
1: ret
.p2align 4
/* Set 64..96 bytes. Write 64 bytes from the start and
32 bytes from the end. */
L(set96):
str q0, [dstin, 16]
stp q0, q0, [dstin, 32]
stp q0, q0, [dstend, -32]
ret
.p2align 4
L(set_long):
and valw, valw, 255
bic dst, dstin, 15
str q0, [dstin]
cmp count, 160
ccmp valw, 0, 0, hs
b.ne L(no_zva)
#ifndef SKIP_ZVA_CHECK
mrs zva_val, dczid_el0
and zva_val, zva_val, 31
cmp zva_val, 4 /* ZVA size is 64 bytes. */
b.ne L(no_zva)
#endif
str q0, [dst, 16]
stp q0, q0, [dst, 32]
bic dst, dst, 63
sub count, dstend, dst /* Count is now 64 too large. */
sub count, count, 128 /* Adjust count and bias for loop. */
.p2align 4
L(zva_loop):
add dst, dst, 64
dc zva, dst
subs count, count, 64
b.hi L(zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
L(no_zva):
sub count, dstend, dst /* Count is 16 too large. */
sub dst, dst, 16 /* Dst is biased by -32. */
sub count, count, 64 + 16 /* Adjust count and bias for loop. */
L(no_zva_loop):
stp q0, q0, [dst, 32]
stp q0, q0, [dst, 64]!
subs count, count, 64
b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
END (__memset_aarch64)
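
The small and medium paths above avoid per-size branching by writing a
fixed-size block at the start and another at the end, letting the two stores
overlap. A hedged C model of that idea for the 16..32-byte case (names are
illustrative):

#include <stddef.h>
#include <string.h>

/* Overlapping-store model of "str q0, [dstin]; str q0, [dstend, -16]":
   valid for 16 <= n <= 32, where the two 16-byte stores cover the whole
   range and simply overlap in the middle. */
static void set16_32(unsigned char *d, int c, size_t n) {
  unsigned char block[16];
  memset(block, c, sizeof(block));
  memcpy(d, block, 16);
  memcpy(d + n - 16, block, 16);
}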

View file

@@ -0,0 +1,175 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __stpcpy_aarch64 stpcpy
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD.
* MTE compatible.
*/
#define dstin x0
#define srcin x1
#define result x0
#define src x2
#define dst x3
#define len x4
#define synd x4
#define tmp x5
#define shift x5
#define data1 x6
#define dataw1 w6
#define data2 x7
#define dataw2 w7
#define dataq q0
#define vdata v0
#define vhas_nul v1
#define vend v2
#define dend d2
#define dataq2 q1
/*
Core algorithm:
For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
per byte. We take 4 bits of every comparison byte using the shift-right-and-narrow-by-4
instruction. Since the bits in the nibble mask reflect the order in
which things occur in the original string, counting leading zeros identifies
exactly which byte matched. */
ENTRY (__stpcpy_aarch64)
PTR_ARG (0)
PTR_ARG (1)
bic src, srcin, 15
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
lsr synd, synd, shift
cbnz synd, L(tail)
ldr dataq, [src, 16]!
cmeq vhas_nul.16b, vdata.16b, 0
shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
cbz synd, L(start_loop)
#ifndef __AARCH64EB__
rbit synd, synd
#endif
sub tmp, src, srcin
clz len, synd
add len, tmp, len, lsr 2
tbz len, 4, L(less16)
sub tmp, len, 15
ldr dataq, [srcin]
ldr dataq2, [srcin, tmp]
str dataq, [dstin]
str dataq2, [dstin, tmp]
add result, dstin, len
ret
L(tail):
rbit synd, synd
clz len, synd
lsr len, len, 2
L(less16):
tbz len, 3, L(less8)
sub tmp, len, 7
ldr data1, [srcin]
ldr data2, [srcin, tmp]
str data1, [dstin]
str data2, [dstin, tmp]
add result, dstin, len
ret
.p2align 4
L(less8):
subs tmp, len, 3
b.lo L(less4)
ldr dataw1, [srcin]
ldr dataw2, [srcin, tmp]
str dataw1, [dstin]
str dataw2, [dstin, tmp]
add result, dstin, len
ret
L(less4):
cbz len, L(zerobyte)
ldrh dataw1, [srcin]
strh dataw1, [dstin]
L(zerobyte):
strb wzr, [dstin, len]
add result, dstin, len
ret
.p2align 4
L(start_loop):
sub tmp, srcin, dstin
ldr dataq2, [srcin]
sub dst, src, tmp
str dataq2, [dstin]
L(loop):
str dataq, [dst], 32
ldr dataq, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbnz synd, L(loopend)
str dataq, [dst, -16]
ldr dataq, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop)
add dst, dst, 16
L(loopend):
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
sub dst, dst, 31
#ifndef __AARCH64EB__
rbit synd, synd
#endif
clz len, synd
lsr len, len, 2
add dst, dst, len
ldr dataq, [dst, tmp]
str dataq, [dst]
add result, dst, 15
ret
END (__stpcpy_aarch64)
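
Once the length of a short string is known, the code above (and the strcpy
variant later in this commit) copies it with two possibly overlapping 16-byte
blocks instead of a byte loop. A C sketch of that step under the assumption
16 <= len <= 31, with illustrative names:

#include <stddef.h>
#include <string.h>

/* Two overlapping 16-byte copies cover bytes 0..len (the second block
   ends exactly at the NUL terminator).  stpcpy returns &dst[len]. */
static char *copy_short(char *dst, const char *src, size_t len) {
  char head[16], tail[16];
  memcpy(head, src, 16);
  memcpy(tail, src + len - 15, 16);
  memcpy(dst, head, 16);
  memcpy(dst + len - 15, tail, 16);
  return dst + len;
}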

View file

@@ -0,0 +1,152 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strchr_aarch64 strchr
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64
* Neon Available.
*/
/* Arguments and results. */
#define srcin x0
#define chrin w1
#define result x0
#define src x2
#define tmp1 x3
#define wtmp2 w4
#define tmp3 x5
#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_nul1 v3
#define vhas_nul2 v4
#define vhas_chr1 v5
#define vhas_chr2 v6
#define vrepmask_0 v7
#define vrepmask_c v16
#define vend1 v17
#define vend2 v18
/* Core algorithm.
For each 32-byte hunk we calculate a 64-bit syndrome value, with
two bits per byte (LSB is always in bits 0 and 1, for both big
and little-endian systems). For each tuple, bit 0 is set iff
the relevant byte matched the requested character; bit 1 is set
iff the relevant byte matched the NUL end of string (we trigger
off bit0 for the special case of looking for NUL). Since the bits
in the syndrome reflect exactly the order in which things occur
in the original string a count_trailing_zeros() operation will
identify exactly which byte is causing the termination, and why. */
/* Locals and temporaries. */
ENTRY (__strchr_aarch64)
PTR_ARG (0)
/* Magic constant 0xc0300c03 to allow us to identify which lane
matches the requested byte. Even bits are set if the character
matches, odd bits if either the char is NUL or matches. */
mov wtmp2, 0x0c03
movk wtmp2, 0xc030, lsl 16
dup vrepchr.16b, chrin
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask_c.4s, wtmp2
ands tmp1, srcin, #31
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
b.eq L(loop)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
for all the bytes, but then mask off those bits of the
syndrome that are related to the padding. */
ld1 {vdata1.16b, vdata2.16b}, [src], #32
neg tmp1, tmp1
cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
lsl tmp1, tmp1, #1
addp vend1.16b, vend1.16b, vend2.16b // 256->128
mov tmp3, #~0
addp vend1.16b, vend1.16b, vend2.16b // 128->64
lsr tmp1, tmp3, tmp1
mov tmp3, vend1.d[0]
bic tmp1, tmp3, tmp1 // Mask padding bits.
cbnz tmp1, L(tail)
.p2align 4
L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
umaxp vend1.16b, vend1.16b, vend1.16b
mov tmp1, vend1.d[0]
cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
bif vhas_nul1.16b, vhas_chr1.16b, vrepmask_0.16b
bif vhas_nul2.16b, vhas_chr2.16b, vrepmask_0.16b
and vend1.16b, vhas_nul1.16b, vrepmask_c.16b
and vend2.16b, vhas_nul2.16b, vrepmask_c.16b
addp vend1.16b, vend1.16b, vend2.16b // 256->128
addp vend1.16b, vend1.16b, vend2.16b // 128->64
mov tmp1, vend1.d[0]
L(tail):
/* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
/* Re-bias source. */
sub src, src, #32
clz tmp1, tmp1 /* And counting the leading zeros. */
/* Tmp1 is even if the target character was found first. Otherwise
we've found the end of string and we weren't looking for NUL. */
tst tmp1, #1
add result, src, tmp1, lsr #1
csel result, result, xzr, eq
ret
END (__strchr_aarch64)
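
A scalar C sketch of the two-bit syndrome used above, showing how the parity
of the trailing-zero count distinguishes "character found" from "hit the end
of the string" (names and the chunked interface are illustrative):

#include <stddef.h>
#include <stdint.h>

/* Returns 1 and sets *out when this 32-byte chunk is decisive (character
   found, or NUL reached first); returns 0 if the caller should move on to
   the next chunk. */
static int chunk_strchr32(const char *p, unsigned char c, const char **out) {
  uint64_t synd = 0;
  for (int i = 0; i < 32; ++i) {
    unsigned char b = (unsigned char)p[i];
    if (b == c) synd |= 1ull << (2 * i);   /* bit 0: character match */
    if (b == 0) synd |= 2ull << (2 * i);   /* bit 1: end of string   */
  }
  if (!synd) return 0;
  int tz = __builtin_ctzll(synd);          /* first event in the chunk */
  *out = (tz & 1) ? NULL : p + tz / 2;     /* odd bit -> NUL came first */
  return 1;
}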

View file

@@ -0,0 +1,140 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strchrnul_aarch64 strchrnul
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64
* Neon Available.
*/
/* Arguments and results. */
#define srcin x0
#define chrin w1
#define result x0
#define src x2
#define tmp1 x3
#define wtmp2 w4
#define tmp3 x5
#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_nul1 v3
#define vhas_nul2 v4
#define vhas_chr1 v5
#define vhas_chr2 v6
#define vrepmask v7
#define vend1 v16
/* Core algorithm.
For each 32-byte hunk we calculate a 64-bit syndrome value, with
two bits per byte (LSB is always in bits 0 and 1, for both big
and little-endian systems). For each tuple, bit 0 is set iff
the relevant byte matched the requested character or nul. Since the
bits in the syndrome reflect exactly the order in which things occur
in the original string a count_trailing_zeros() operation will
identify exactly which byte is causing the termination. */
/* Locals and temporaries. */
ENTRY (__strchrnul_aarch64)
PTR_ARG (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the termination condition. */
mov wtmp2, #0x0401
movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask.4s, wtmp2
ands tmp1, srcin, #31
b.eq L(loop)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
for all the bytes, but then mask off those bits of the
syndrome that are related to the padding. */
ld1 {vdata1.16b, vdata2.16b}, [src], #32
neg tmp1, tmp1
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
lsl tmp1, tmp1, #1
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
mov tmp3, #~0
addp vend1.16b, vend1.16b, vend1.16b // 128->64
lsr tmp1, tmp3, tmp1
mov tmp3, vend1.d[0]
bic tmp1, tmp3, tmp1 // Mask padding bits.
cbnz tmp1, L(tail)
.p2align 4
L(loop):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
cmhs vhas_nul1.16b, vhas_chr1.16b, vdata1.16b
cmhs vhas_nul2.16b, vhas_chr2.16b, vdata2.16b
orr vend1.16b, vhas_nul1.16b, vhas_nul2.16b
umaxp vend1.16b, vend1.16b, vend1.16b
mov tmp1, vend1.d[0]
cbz tmp1, L(loop)
/* Termination condition found. Now need to establish exactly why
we terminated. */
and vhas_chr1.16b, vhas_nul1.16b, vrepmask.16b
and vhas_chr2.16b, vhas_nul2.16b, vrepmask.16b
addp vend1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
addp vend1.16b, vend1.16b, vend1.16b // 128->64
mov tmp1, vend1.d[0]
L(tail):
/* Count the trailing zeros, by bit reversing... */
rbit tmp1, tmp1
/* Re-bias source. */
sub src, src, #32
clz tmp1, tmp1 /* ... and counting the leading zeros. */
/* tmp1 is twice the offset into the fragment. */
add result, src, tmp1, lsr #1
ret
END (__strchrnul_aarch64)
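
For comparison with the SIMD routine, the reference semantics of strchrnul (a
GNU extension) fit in a few lines of C: on a miss it returns a pointer to the
terminating NUL rather than NULL.

/* Naive scalar reference for strchrnul's semantics. */
static char *strchrnul_ref(const char *s, int c) {
  while (*s && (unsigned char)*s != (unsigned char)c) s++;
  return (char *)s;
}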

View file

@@ -0,0 +1,214 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strcmp_aarch64 strcmp
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64.
* MTE compatible.
*/
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define src1 x0
#define src2 x1
#define result x0
#define data1 x2
#define data1w w2
#define data2 x3
#define data2w w3
#define has_nul x4
#define diff x5
#define off1 x5
#define syndrome x6
#define tmp x6
#define data3 x7
#define zeroones x8
#define shift x9
#define off2 x10
/* On big-endian early bytes are at MSB and on little-endian LSB.
LS_FW means shifting towards early bytes. */
#ifdef __AARCH64EB__
# define LS_FW lsl
#else
# define LS_FW lsr
#endif
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word.
Since carry propagation makes 0x1 bytes before a NUL byte appear
NUL too in big-endian, byte-reverse the data before the NUL check. */
ENTRY (__strcmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
sub off2, src2, src1
mov zeroones, REP8_01
and tmp, src1, 7
tst off2, 7
b.ne L(misaligned8)
cbnz tmp, L(mutual_align)
.p2align 4
L(loop_aligned):
ldr data2, [src1, off2]
ldr data1, [src1], 8
L(start_realigned):
#ifdef __AARCH64EB__
rev tmp, data1
sub has_nul, tmp, zeroones
orr tmp, tmp, REP8_7f
#else
sub has_nul, data1, zeroones
orr tmp, data1, REP8_7f
#endif
bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
ccmp data1, data2, 0, eq
b.eq L(loop_aligned)
#ifdef __AARCH64EB__
rev has_nul, has_nul
#endif
eor diff, data1, data2
orr syndrome, diff, has_nul
L(end):
#ifndef __AARCH64EB__
rev syndrome, syndrome
rev data1, data1
rev data2, data2
#endif
clz shift, syndrome
/* The most-significant-non-zero bit of the syndrome marks either the
first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
lsl data1, data1, shift
lsl data2, data2, shift
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, 56
sub result, data1, data2, lsr 56
ret
.p2align 4
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that precede the start point. */
bic src1, src1, 7
ldr data2, [src1, off2]
ldr data1, [src1], 8
neg shift, src2, lsl 3 /* Bits to alignment -64. */
mov tmp, -1
LS_FW tmp, tmp, shift
orr data1, data1, tmp
orr data2, data2, tmp
b L(start_realigned)
L(misaligned8):
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
checking to make sure that we don't access beyond the end of SRC2. */
cbz tmp, L(src1_aligned)
L(do_misaligned):
ldrb data1w, [src1], 1
ldrb data2w, [src2], 1
cmp data1w, 0
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.ne L(done)
tst src1, 7
b.ne L(do_misaligned)
L(src1_aligned):
neg shift, src2, lsl 3
bic src2, src2, 7
ldr data3, [src2], 8
#ifdef __AARCH64EB__
rev data3, data3
#endif
lsr tmp, zeroones, shift
orr data3, data3, tmp
sub has_nul, data3, zeroones
orr tmp, data3, REP8_7f
bics has_nul, has_nul, tmp
b.ne L(tail)
sub off1, src2, src1
.p2align 4
L(loop_unaligned):
ldr data3, [src1, off1]
ldr data2, [src1, off2]
#ifdef __AARCH64EB__
rev data3, data3
#endif
sub has_nul, data3, zeroones
orr tmp, data3, REP8_7f
ldr data1, [src1], 8
bics has_nul, has_nul, tmp
ccmp data1, data2, 0, eq
b.eq L(loop_unaligned)
lsl tmp, has_nul, shift
#ifdef __AARCH64EB__
rev tmp, tmp
#endif
eor diff, data1, data2
orr syndrome, diff, tmp
cbnz syndrome, L(end)
L(tail):
ldr data1, [src1]
neg shift, shift
lsr data2, data3, shift
lsr has_nul, has_nul, shift
#ifdef __AARCH64EB__
rev data2, data2
rev has_nul, has_nul
#endif
eor diff, data1, data2
orr syndrome, diff, has_nul
b L(end)
L(done):
sub result, data1, data2
ret
END (__strcmp_aarch64)
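
The NUL-detection identity quoted in the header comment can be checked with a
small C helper (a sketch; the macro names mirror the ones above):

#include <stdint.h>

#define REP8_01 0x0101010101010101ull
#define REP8_7f 0x7f7f7f7f7f7f7f7full

/* Non-zero iff some byte of x is zero.  As the header comment notes,
   borrow propagation can also flag 0x01 bytes next to a real NUL, which
   is why the code byte-reverses the data on big-endian before locating
   the terminator. */
static uint64_t has_zero_byte(uint64_t x) {
  return (x - REP8_01) & ~(x | REP8_7f);
}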

View file

@@ -0,0 +1,170 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strcpy_aarch64 strcpy
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD.
* MTE compatible.
*/
#define dstin x0
#define srcin x1
#define result x0
#define src x2
#define dst x3
#define len x4
#define synd x4
#define tmp x5
#define shift x5
#define data1 x6
#define dataw1 w6
#define data2 x7
#define dataw2 w7
#define dataq q0
#define vdata v0
#define vhas_nul v1
#define vend v2
#define dend d2
#define dataq2 q1
/*
Core algorithm:
For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
per byte. We take 4 bits of every comparison byte using the shift-right-and-narrow-by-4
instruction. Since the bits in the nibble mask reflect the order in
which things occur in the original string, counting leading zeros identifies
exactly which byte matched. */
ENTRY (__strcpy_aarch64)
PTR_ARG (0)
PTR_ARG (1)
bic src, srcin, 15
ld1 {vdata.16b}, [src]
cmeq vhas_nul.16b, vdata.16b, 0
lsl shift, srcin, 2
shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
lsr synd, synd, shift
cbnz synd, L(tail)
ldr dataq, [src, 16]!
cmeq vhas_nul.16b, vdata.16b, 0
shrn vend.8b, vhas_nul.8h, 4
fmov synd, dend
cbz synd, L(start_loop)
#ifndef __AARCH64EB__
rbit synd, synd
#endif
sub tmp, src, srcin
clz len, synd
add len, tmp, len, lsr 2
tbz len, 4, L(less16)
sub tmp, len, 15
ldr dataq, [srcin]
ldr dataq2, [srcin, tmp]
str dataq, [dstin]
str dataq2, [dstin, tmp]
ret
L(tail):
rbit synd, synd
clz len, synd
lsr len, len, 2
L(less16):
tbz len, 3, L(less8)
sub tmp, len, 7
ldr data1, [srcin]
ldr data2, [srcin, tmp]
str data1, [dstin]
str data2, [dstin, tmp]
ret
.p2align 4
L(less8):
subs tmp, len, 3
b.lo L(less4)
ldr dataw1, [srcin]
ldr dataw2, [srcin, tmp]
str dataw1, [dstin]
str dataw2, [dstin, tmp]
ret
L(less4):
cbz len, L(zerobyte)
ldrh dataw1, [srcin]
strh dataw1, [dstin]
L(zerobyte):
strb wzr, [dstin, len]
ret
.p2align 4
L(start_loop):
sub tmp, srcin, dstin
ldr dataq2, [srcin]
sub dst, src, tmp
str dataq2, [dstin]
L(loop):
str dataq, [dst], 32
ldr dataq, [src, 16]
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbnz synd, L(loopend)
str dataq, [dst, -16]
ldr dataq, [src, 32]!
cmeq vhas_nul.16b, vdata.16b, 0
umaxp vend.16b, vhas_nul.16b, vhas_nul.16b
fmov synd, dend
cbz synd, L(loop)
add dst, dst, 16
L(loopend):
shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
fmov synd, dend
sub dst, dst, 31
#ifndef __AARCH64EB__
rbit synd, synd
#endif
clz len, synd
lsr len, len, 2
add dst, dst, len
ldr dataq, [dst, tmp]
str dataq, [dst]
ret
END (__strcpy_aarch64)

View file

@@ -0,0 +1,220 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strlen_aarch64 strlen
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
* Not MTE compatible.
*/
#define srcin x0
#define len x0
#define src x1
#define data1 x2
#define data2 x3
#define has_nul1 x4
#define has_nul2 x5
#define tmp1 x4
#define tmp2 x5
#define tmp3 x6
#define tmp4 x7
#define zeroones x8
#define maskv v0
#define maskd d0
#define dataq1 q1
#define dataq2 q2
#define datav1 v1
#define datav2 v2
#define tmp x2
#define tmpw w2
#define synd x3
#define syndw w3
#define shift x4
/* For the first 32 bytes, NUL detection works on the principle that
(X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
byte is zero, and can be done in parallel across the entire word. */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
/* To test the page crossing code path more thoroughly, compile with
-DTEST_PAGE_CROSS - this will force all calls through the slower
entry path. This option is not intended for production use. */
#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif
/* Core algorithm:
Since strings are short on average, we check the first 32 bytes of the
string for a NUL character without aligning the string. In order to use
unaligned loads safely we must do a page cross check first.
If there is a NUL byte we calculate the length from the 2 8-byte words
using conditional select to reduce branch mispredictions (it is unlikely
strlen will be repeatedly called on strings with the same length).
If the string is longer than 32 bytes, align src so we don't need further
page cross checks, and process 32 bytes per iteration using a fast SIMD
loop.
If the page cross check fails, we read 32 bytes from an aligned address,
and ignore any characters before the string. If it contains a NUL
character, return the length; if not, continue in the main loop. */
ENTRY (__strlen_aarch64)
PTR_ARG (0)
and tmp1, srcin, MIN_PAGE_SIZE - 1
cmp tmp1, MIN_PAGE_SIZE - 32
b.hi L(page_cross)
/* Look for a NUL byte in the first 16 bytes. */
ldp data1, data2, [srcin]
mov zeroones, REP8_01
#ifdef __AARCH64EB__
/* For big-endian, carry propagation (if the final byte in the
string is 0x01) means we cannot use has_nul1/2 directly.
Since we expect strings to be small and early-exit,
byte-swap the data now so has_nul1/2 will be correct. */
rev data1, data1
rev data2, data2
#endif
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
b.eq L(bytes16_31)
/* Find the exact offset of the first NUL byte in the first 16 bytes
from the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 8
rev has_nul1, has_nul1
csel len, xzr, len, cc
clz tmp1, has_nul1
add len, len, tmp1, lsr 3
ret
/* Look for a NUL byte at offset 16..31 in the string. */
L(bytes16_31):
ldp data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
b.eq L(loop_entry)
/* Find the exact offset of the first NUL byte at offset 16..31 from
the string start. Enter with C = has_nul1 == 0. */
csel has_nul1, has_nul1, has_nul2, cc
mov len, 24
rev has_nul1, has_nul1
mov tmp3, 16
clz tmp1, has_nul1
csel len, tmp3, len, cc
add len, len, tmp1, lsr 3
ret
nop
L(loop_entry):
bic src, srcin, 31
.p2align 5
L(loop):
ldp dataq1, dataq2, [src, 32]!
uminp maskv.16b, datav1.16b, datav2.16b
uminp maskv.16b, maskv.16b, maskv.16b
cmeq maskv.8b, maskv.8b, 0
fmov synd, maskd
cbz synd, L(loop)
/* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
cmeq maskv.16b, datav1.16b, 0
sub len, src, srcin
cbnz syndw, 1f
cmeq maskv.16b, datav2.16b, 0
add len, len, 16
1:
/* Generate a bitmask and compute correct byte offset. */
shrn maskv.8b, maskv.8h, 4
fmov synd, maskd
#ifndef __AARCH64EB__
rbit synd, synd
#endif
clz tmp, synd
add len, len, tmp, lsr 2
ret
L(page_cross):
bic src, srcin, 31
mov tmpw, 0x0c03
movk tmpw, 0xc030, lsl 16
ld1 {datav1.16b, datav2.16b}, [src]
dup maskv.4s, tmpw
cmeq datav1.16b, datav1.16b, 0
cmeq datav2.16b, datav2.16b, 0
and datav1.16b, datav1.16b, maskv.16b
and datav2.16b, datav2.16b, maskv.16b
addp maskv.16b, datav1.16b, datav2.16b
addp maskv.16b, maskv.16b, maskv.16b
fmov synd, maskd
lsl shift, srcin, 1
lsr synd, synd, shift
cbz synd, L(loop)
rbit synd, synd
clz len, synd
lsr len, len, 1
ret
END (__strlen_aarch64)
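
The word-at-a-time NUL detection described in the comments above can be illustrated with a short C sketch (not part of this commit; it assumes a little-endian machine and that the unaligned 8-byte load is safe, which is why the assembly performs a page-cross check first):

  #include <stdint.h>
  #include <string.h>

  /* Returns the index of the first NUL in the 8 bytes at p, or -1 if none. */
  static int first_nul_index(const char *p) {
    uint64_t x, zeroones = 0x0101010101010101ull;
    memcpy(&x, p, 8);                        /* one unaligned 8-byte load */
    uint64_t has_nul = (x - zeroones) & ~x & 0x8080808080808080ull;
    if (!has_nul) return -1;                 /* no NUL in this word */
    return __builtin_ctzll(has_nul) >> 3;    /* byte index of the first NUL */
  }

On big-endian targets the loaded word is byte-reversed first (the rev instructions above), so the same trailing-zero count still walks the string in memory order.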

View file

@ -0,0 +1,334 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strncmp_aarch64 strncmp
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64.
* MTE compatible.
*/
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
/* Parameters and result. */
#define src1 x0
#define src2 x1
#define limit x2
#define result x0
/* Internal variables. */
#define data1 x3
#define data1w w3
#define data2 x4
#define data2w w4
#define has_nul x5
#define diff x6
#define syndrome x7
#define tmp1 x8
#define tmp2 x9
#define tmp3 x10
#define zeroones x11
#define pos x12
#define mask x13
#define endloop x14
#define count mask
#define offset pos
#define neg_offset x15
/* Define endian dependent shift operations.
On big-endian early bytes are at MSB and on little-endian LSB.
LS_FW means shifting towards early bytes.
LS_BK means shifting towards later bytes.
*/
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif
ENTRY (__strncmp_aarch64)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
cbz limit, L(ret0)
eor tmp1, src1, src2
mov zeroones, #REP8_01
tst tmp1, #7
and count, src1, #7
b.ne L(misaligned8)
cbnz count, L(mutual_align)
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
can be done in parallel across the entire word. */
.p2align 4
L(loop_aligned):
ldr data1, [src1], #8
ldr data2, [src2], #8
L(start_realigned):
subs limit, limit, #8
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
eor diff, data1, data2 /* Non-zero if differences found. */
csinv endloop, diff, xzr, hi /* Last Dword or differences. */
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
ccmp endloop, #0, #0, eq
b.eq L(loop_aligned)
/* End of main loop */
L(full_check):
#ifndef __AARCH64EB__
orr syndrome, diff, has_nul
add limit, limit, 8 /* Rewind limit to before last subs. */
L(syndrome_check):
/* Limit was reached. Check if the NUL byte or the difference
is before the limit. */
rev syndrome, syndrome
rev data1, data1
clz pos, syndrome
rev data2, data2
lsl data1, data1, pos
cmp limit, pos, lsr #3
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
csel result, result, xzr, hi
ret
#else
/* Not reached the limit, must have found the end or a diff. */
tbz limit, #63, L(not_limit)
add tmp1, limit, 8
cbz limit, L(not_limit)
lsl limit, tmp1, #3 /* Bits -> bytes. */
mov mask, #~0
lsr mask, mask, limit
bic data1, data1, mask
bic data2, data2, mask
/* Make sure that the NUL byte is marked in the syndrome. */
orr has_nul, has_nul, mask
L(not_limit):
/* For big-endian we cannot use the trick with the syndrome value
as carry-propagation can corrupt the upper bits if the trailing
bytes in the string contain 0x01. */
/* However, if there is no NUL byte in the dword, we can generate
the result directly. We can't just subtract the bytes as the
MSB might be significant. */
cbnz has_nul, 1f
cmp data1, data2
cset result, ne
cneg result, result, lo
ret
1:
/* Re-compute the NUL-byte detection, using a byte-reversed value. */
rev tmp3, data1
sub tmp1, tmp3, zeroones
orr tmp2, tmp3, #REP8_7f
bic has_nul, tmp1, tmp2
rev has_nul, has_nul
orr syndrome, diff, has_nul
clz pos, syndrome
/* The most-significant-non-zero bit of the syndrome marks either the
first bit that is different, or the top bit of the first zero byte.
Shifting left now will bring the critical information into the
top bits. */
L(end_quick):
lsl data1, data1, pos
lsl data2, data2, pos
/* But we need to zero-extend (char is unsigned) the value and then
perform a signed 32-bit subtraction. */
lsr data1, data1, #56
sub result, data1, data2, lsr #56
ret
#endif
L(mutual_align):
/* Sources are mutually aligned, but are not currently at an
alignment boundary. Round down the addresses and then mask off
the bytes that precede the start point.
We also need to adjust the limit calculations, but without
overflowing if the limit is near ULONG_MAX. */
bic src1, src1, #7
bic src2, src2, #7
ldr data1, [src1], #8
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
ldr data2, [src2], #8
mov tmp2, #~0
LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
/* Adjust the limit and ensure it doesn't overflow. */
adds limit, limit, count
csinv limit, limit, xzr, lo
orr data1, data1, tmp2
orr data2, data2, tmp2
b L(start_realigned)
.p2align 4
/* Don't bother with dwords for up to 16 bytes. */
L(misaligned8):
cmp limit, #16
b.hs L(try_misaligned_words)
L(byte_loop):
/* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
b.eq L(byte_loop)
L(done):
sub result, data1, data2
ret
/* Align the SRC1 to a dword by doing a bytewise compare and then do
the dword loop. */
L(try_misaligned_words):
cbz count, L(src1_aligned)
neg count, count
and count, count, #7
sub limit, limit, count
L(page_end_loop):
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
cmp data1w, #1
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
b.ne L(done)
subs count, count, #1
b.hi L(page_end_loop)
/* The following diagram explains the comparison of misaligned strings.
The bytes are shown in natural order. For little-endian, it is
reversed in the registers. The "x" bytes are before the string.
The "|" separates data that is loaded at one time.
src1 | a a a a a a a a | b b b c c c c c | . . .
src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
After shifting in each step, the data looks like this:
STEP_A STEP_B STEP_C
data1 a a a a a a a a b b b c c c c c b b b c c c c c
data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
The bytes with "0" are eliminated from the syndrome via mask.
Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
time from SRC2. The comparison happens in 3 steps. After each step
the loop can exit, or read from SRC1 or SRC2. */
L(src1_aligned):
/* Calculate the offset from 8-byte alignment to the string start, in bits.
No need to mask the offset since shifts ignore the upper bits. */
lsl offset, src2, #3
bic src2, src2, #0xf
mov mask, -1
neg neg_offset, offset
ldr data1, [src1], #8
ldp tmp1, tmp2, [src2], #16
LS_BK mask, mask, neg_offset
and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
/* Skip the first compare if data in tmp1 is irrelevant. */
tbnz offset, 6, L(misaligned_mid_loop)
L(loop_misaligned):
/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
LS_FW data2, tmp1, offset
LS_BK tmp1, tmp2, neg_offset
subs limit, limit, #8
orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
sub has_nul, data1, zeroones
eor diff, data1, data2 /* Non-zero if differences found. */
orr tmp3, data1, #REP8_7f
csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
orr tmp3, endloop, has_nul
cbnz tmp3, L(full_check)
ldr data1, [src1], #8
L(misaligned_mid_loop):
/* STEP_B: Compare first part of data1 to second part of tmp2. */
LS_FW data2, tmp2, offset
#ifdef __AARCH64EB__
/* For big-endian we do a byte reverse to avoid carry-propagation
problem described above. This way we can reuse the has_nul in the
next step and also use syndrome value trick at the end. */
rev tmp3, data1
#define data1_fixed tmp3
#else
#define data1_fixed data1
#endif
sub has_nul, data1_fixed, zeroones
orr tmp3, data1_fixed, #REP8_7f
eor diff, data2, data1 /* Non-zero if differences found. */
bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
#ifdef __AARCH64EB__
rev has_nul, has_nul
#endif
cmp limit, neg_offset, lsr #3
orr syndrome, diff, has_nul
bic syndrome, syndrome, mask /* Ignore later bytes. */
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
cbnz tmp3, L(syndrome_check)
/* STEP_C: Compare second part of data1 to first part of tmp1. */
ldp tmp1, tmp2, [src2], #16
cmp limit, #8
LS_BK data2, tmp1, neg_offset
eor diff, data2, data1 /* Non-zero if differences found. */
orr syndrome, diff, has_nul
and syndrome, syndrome, mask /* Ignore earlier bytes. */
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
cbnz tmp3, L(syndrome_check)
ldr data1, [src1], #8
sub limit, limit, #8
b L(loop_misaligned)
#ifdef __AARCH64EB__
L(syndrome_check):
clz pos, syndrome
cmp pos, limit, lsl #3
b.lo L(end_quick)
#endif
L(ret0):
mov result, #0
ret
END(__strncmp_aarch64)
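
A hedged C sketch of the little-endian syndrome logic reached via L(full_check) above (illustrative only, not part of this commit): given two 8-byte words loaded from each string, it finds the first byte that differs or is NUL and produces the strncmp-style result, ignoring the limit handling for brevity.

  #include <stdint.h>

  /* data1/data2 hold 8 bytes from each string, little-endian byte order. */
  static int compare_words_le(uint64_t data1, uint64_t data2) {
    uint64_t zeroones = 0x0101010101010101ull;
    uint64_t has_nul = (data1 - zeroones) & ~data1 & 0x8080808080808080ull;
    uint64_t diff = data1 ^ data2;              /* non-zero if any byte differs */
    uint64_t syndrome = diff | has_nul;
    if (!syndrome) return 0;                    /* equal so far and no NUL yet */
    int shift = __builtin_ctzll(syndrome) & ~7; /* bit offset of first event byte */
    unsigned char a = data1 >> shift;           /* zero-extend: char is unsigned */
    unsigned char b = data2 >> shift;
    return a - b;
  }

The assembly uses rev followed by clz to reach the same byte that the count-trailing-zeros above identifies.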

View file

@ -0,0 +1,128 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strnlen_aarch64 strnlen
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64, Advanced SIMD.
* MTE compatible.
*/
#define srcin x0
#define cntin x1
#define result x0
#define src x2
#define synd x3
#define shift x4
#define tmp x4
#define cntrem x5
#define qdata q0
#define vdata v0
#define vhas_chr v1
#define vend v2
#define dend d2
/*
Core algorithm:
Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
four bits per byte using the shrn instruction. A count-trailing-zeros operation
then identifies the first zero byte. */
ENTRY (__strnlen_aarch64)
PTR_ARG (0)
SIZE_ARG (1)
bic src, srcin, 15
cbz cntin, L(nomatch)
ld1 {vdata.16b}, [src]
cmeq vhas_chr.16b, vdata.16b, 0
lsl shift, srcin, 2
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
fmov synd, dend
lsr synd, synd, shift
cbz synd, L(start_loop)
L(finish):
rbit synd, synd
clz synd, synd
lsr result, synd, 2
cmp cntin, result
csel result, cntin, result, ls
ret
L(nomatch):
mov result, cntin
ret
L(start_loop):
sub tmp, src, srcin
add tmp, tmp, 17
subs cntrem, cntin, tmp
b.lo L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
tbz cntrem, 4, L(loop32_2)
sub src, src, 16
.p2align 5
L(loop32):
ldr qdata, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, 0
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
ldr qdata, [src, 16]
subs cntrem, cntrem, 32
cmeq vhas_chr.16b, vdata.16b, 0
b.lo L(end_2)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
L(end_2):
add src, src, 16
L(end):
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
sub result, src, srcin
fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
#endif
clz synd, synd
add result, result, synd, lsr 2
cmp cntin, result
csel result, cntin, result, ls
ret
END (__strnlen_aarch64)
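
The "four bits per byte" mask built with shrn, as described in the core-algorithm comment above, can be sketched with NEON intrinsics. This is illustrative and hedged: it assumes a little-endian target and the <arm_neon.h> intrinsics; the helper name is not part of this commit.

  #include <arm_neon.h>
  #include <stdint.h>

  /* Returns the index of the first NUL among the 16 bytes at p, or -1 if none. */
  static int first_nul_in_16(const uint8_t *p) {
    uint8x16_t data = vld1q_u8(p);
    uint8x16_t eq0  = vceqq_u8(data, vdupq_n_u8(0));   /* 0xff for each zero byte */
    /* Narrowing shift: 128-bit compare result -> 64-bit mask, 4 bits per byte. */
    uint8x8_t nibbles = vshrn_n_u16(vreinterpretq_u16_u8(eq0), 4);
    uint64_t synd = vget_lane_u64(vreinterpret_u64_u8(nibbles), 0);
    if (!synd) return -1;
    return __builtin_ctzll(synd) >> 2;                 /* 4 mask bits per input byte */
  }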

View file

@ -0,0 +1,175 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Optimized Routines
Copyright (c) 1999-2022, Arm Limited.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "libc/intrin/aarch64/asmdefs.h"
#define __strrchr_aarch64 strrchr
.ident "\n\
Optimized Routines (MIT License)\n\
Copyright 2022 ARM Limited\n"
.include "libc/disclaimer.inc"
/* Assumptions:
*
* ARMv8-a, AArch64
* Neon Available.
*/
/* Arguments and results. */
#define srcin x0
#define chrin w1
#define result x0
#define src x2
#define tmp1 x3
#define wtmp2 w4
#define tmp3 x5
#define src_match x6
#define src_offset x7
#define const_m1 x8
#define tmp4 x9
#define nul_match x10
#define chr_match x11
#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_nul1 v3
#define vhas_nul2 v4
#define vhas_chr1 v5
#define vhas_chr2 v6
#define vrepmask_0 v7
#define vrepmask_c v16
#define vend1 v17
#define vend2 v18
/* Core algorithm.
For each 32-byte hunk we calculate a 64-bit syndrome value, with
two bits per byte (LSB is always in bits 0 and 1, for both big
and little-endian systems). For each tuple, bit 0 is set iff
the relevant byte matched the requested character; bit 1 is set
iff the relevant byte matched the NUL end of string (we trigger
off bit 0 for the special case of looking for NUL). Since the bits
in the syndrome reflect exactly the order in which things occur
in the original string, a count_trailing_zeros() operation will
identify exactly which byte is causing the termination, and why. */
ENTRY (__strrchr_aarch64)
PTR_ARG (0)
/* Magic constant 0x40100401 to allow us to identify which lane
matches the requested byte. Magic constant 0x80200802 used
similarly for NUL termination. */
mov wtmp2, #0x0401
movk wtmp2, #0x4010, lsl #16
dup vrepchr.16b, chrin
bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
dup vrepmask_c.4s, wtmp2
mov src_offset, #0
ands tmp1, srcin, #31
add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
b.eq L(aligned)
/* Input string is not 32-byte aligned. Rather than forcing
the padding bytes to a safe value, we calculate the syndrome
for all the bytes, but then mask off those bits of the
syndrome that are related to the padding. */
ld1 {vdata1.16b, vdata2.16b}, [src], #32
neg tmp1, tmp1
cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_nul2.16b, vdata2.16b, #0
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b // 256->128
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
addp vend1.16b, vhas_nul1.16b, vhas_chr1.16b // 128->64
mov nul_match, vend1.d[0]
lsl tmp1, tmp1, #1
mov const_m1, #~0
lsr tmp3, const_m1, tmp1
mov chr_match, vend1.d[1]
bic nul_match, nul_match, tmp3 // Mask padding bits.
bic chr_match, chr_match, tmp3 // Mask padding bits.
cbnz nul_match, L(tail)
.p2align 4
L(loop):
cmp chr_match, #0
csel src_match, src, src_match, ne
csel src_offset, chr_match, src_offset, ne
L(aligned):
ld1 {vdata1.16b, vdata2.16b}, [src], #32
cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
uminp vend1.16b, vdata1.16b, vdata2.16b
and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
cmeq vend1.16b, vend1.16b, 0
addp vhas_chr1.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128
addp vend1.16b, vend1.16b, vhas_chr1.16b // 128->64
mov nul_match, vend1.d[0]
mov chr_match, vend1.d[1]
cbz nul_match, L(loop)
cmeq vhas_nul1.16b, vdata1.16b, #0
cmeq vhas_nul2.16b, vdata2.16b, #0
and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul2.16b
addp vhas_nul1.16b, vhas_nul1.16b, vhas_nul1.16b
mov nul_match, vhas_nul1.d[0]
L(tail):
/* Work out exactly where the string ends. */
sub tmp4, nul_match, #1
eor tmp4, tmp4, nul_match
ands chr_match, chr_match, tmp4
/* And pick the values corresponding to the last match. */
csel src_match, src, src_match, ne
csel src_offset, chr_match, src_offset, ne
/* Count down from the top of the syndrome to find the last match. */
clz tmp3, src_offset
/* Src_match points beyond the word containing the match, so we can
simply subtract half the bit-offset into the syndrome. Because
we are counting down, we need to go back one more character. */
add tmp3, tmp3, #2
sub result, src_match, tmp3, lsr #1
/* But if the syndrome shows no match was found, then return NULL. */
cmp src_offset, #0
csel result, result, xzr, ne
ret
END (__strrchr_aarch64)
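
Stripped of the SIMD syndrome machinery, the strategy above is the familiar forward scan that remembers the most recent match and resolves the answer at the terminator. A minimal scalar sketch (illustrative only, not part of this commit):

  /* ISO semantics: a NUL search argument matches the terminator itself. */
  static char *strrchr_sketch(const char *s, int c) {
    const char *last = 0;
    for (;; ++s) {
      if (*s == (char)c) last = s;
      if (!*s) return (char *)last;
    }
  }

The vector code does the same thing 32 bytes at a time: chr_match and src_offset remember the last hunk containing the character, and once the NUL hunk is found the tail code masks off matches that lie beyond the terminator before counting down from the top of the syndrome.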

View file

@ -6,6 +6,7 @@ PKGS += LIBC_INTRIN
LIBC_INTRIN_ARTIFACTS += LIBC_INTRIN_A
LIBC_INTRIN = $(LIBC_INTRIN_A_DEPS) $(LIBC_INTRIN_A)
LIBC_INTRIN_A = o/$(MODE)/libc/intrin/intrin.a
LIBC_INTRIN_A_FILES := $(wildcard libc/intrin/*)
LIBC_INTRIN_A_HDRS = $(filter %.h,$(LIBC_INTRIN_A_FILES))
LIBC_INTRIN_A_INCS = $(filter %.inc,$(LIBC_INTRIN_A_FILES))
LIBC_INTRIN_A_SRCS_S = $(filter %.S,$(LIBC_INTRIN_A_FILES))
@ -13,8 +14,9 @@ LIBC_INTRIN_A_SRCS_C = $(filter %.c,$(LIBC_INTRIN_A_FILES))
LIBC_INTRIN_A_SRCS = $(LIBC_INTRIN_A_SRCS_S) $(LIBC_INTRIN_A_SRCS_C)
LIBC_INTRIN_A_CHECKS = $(LIBC_INTRIN_A).pkg
LIBC_INTRIN_A_FILES := \
$(wildcard libc/intrin/*)
ifeq ($(ARCH), aarch64)
LIBC_INTRIN_A_SRCS_S += $(wildcard libc/intrin/aarch64/*.S)
endif
LIBC_INTRIN_A_OBJS = \
$(LIBC_INTRIN_A_SRCS_S:%.S=o/$(MODE)/%.o) \
@ -203,6 +205,8 @@ o/$(MODE)/libc/intrin/memmove.o: private \
-fpie
# these assembly files are safe to build on aarch64
o/$(MODE)/libc/intrin/aarch64/%.o: libc/intrin/aarch64/%.S
@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
o/$(MODE)/libc/intrin/fenv.o: libc/intrin/fenv.S
@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
o/$(MODE)/libc/intrin/futex.o: libc/intrin/futex.S

View file

@ -20,6 +20,7 @@
#include "libc/intrin/asan.internal.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
@ -83,3 +84,5 @@ void *memchr(const void *s, int c, size_t n) {
return memchr_pure(s, c, n);
#endif
}
#endif /* __aarch64__ */

View file

@ -20,6 +20,7 @@
#include "libc/intrin/likely.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
#define PMOVMSKB(x) __builtin_ia32_pmovmskb128(x)
@ -129,7 +130,9 @@ microarchitecture("avx") static int memcmp_avx(const unsigned char *p,
* memcmp n=32768 29 ps/byte 32,851 mb/s
* memcmp n=131072 33 ps/byte 28,983 mb/s
*
* @return unsigned char subtraction at stop index
* @return an integer that's (1) equal to zero if `a` is equal to `b`,
* (2) less than zero if `a` is less than `b`, or (3) greater than
* zero if `a` is greater than `b`
* @asyncsignalsafe
*/
int memcmp(const void *a, const void *b, size_t n) {
@ -200,3 +203,5 @@ int memcmp(const void *a, const void *b, size_t n) {
}
return 0;
}
#endif /* __aarch64__ */
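
For example, under the contract documented above (a minimal sketch, not from this commit):

  #include <assert.h>
  #include <string.h>

  int main(void) {
    assert(memcmp("abc", "abc", 3) == 0);
    assert(memcmp("abc", "abd", 3) < 0);   /* 'c' < 'd' at index 2 */
    assert(memcmp("abd", "abc", 3) > 0);
    return 0;
  }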

View file

@ -22,6 +22,7 @@
#include "libc/nexgen32e/nexgen32e.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
typedef long long xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long xmm_a __attribute__((__vector_size__(16), __aligned__(16)));
@ -343,3 +344,5 @@ void *memmove(void *dst, const void *src, size_t n) {
asm("memcpy = memmove\n\t"
".globl\tmemcpy");
#endif /* __aarch64__ */

View file

@ -20,6 +20,7 @@
#include "libc/intrin/asan.internal.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
@ -81,3 +82,5 @@ void *memrchr(const void *s, int c, size_t n) {
return memrchr_pure(s, c, n);
#endif
}
#endif /* __aarch64__ */

View file

@ -22,6 +22,7 @@
#include "libc/nexgen32e/nexgen32e.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long xmm_a __attribute__((__vector_size__(16), __aligned__(16)));
@ -168,3 +169,5 @@ void *memset(void *p, int c, size_t n) {
return memset_sse(b, c, n);
}
}
#endif /* __aarch64__ */

View file

@ -17,6 +17,9 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/str.h"
#ifndef __aarch64__
// TODO(jart): ASAN support here is important.
typedef char xmm_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));
@ -63,3 +66,5 @@ char *stpcpy(char *d, const char *s) {
++i;
}
}
#endif /* __aarch64__ */

View file

@ -21,6 +21,7 @@
#include "libc/intrin/asan.internal.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
static inline const char *strchr_pure(const char *s, int c) {
for (;; ++s) {
@ -115,3 +116,5 @@ char *strchr(const char *s, int c) {
return r;
#endif
}
#endif /* __aarch64__ */

View file

@ -21,6 +21,7 @@
#include "libc/intrin/asan.internal.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#ifndef __aarch64__
static inline const char *strchrnul_pure(const char *s, int c) {
for (;; ++s) {
@ -113,3 +114,5 @@ char *strchrnul(const char *s, int c) {
return r;
#endif
}
#endif /* __aarch64__ */

View file

@ -17,6 +17,9 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/str.h"
#ifndef __aarch64__
// TODO(jart): ASAN support here is important.
typedef char xmm_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef char xmm_t __attribute__((__vector_size__(16), __aligned__(16)));
@ -63,3 +66,5 @@ char *strcpy(char *d, const char *s) {
++i;
}
}
#endif /* __aarch64__ */

View file

@ -19,6 +19,7 @@
#include "libc/dce.h"
#include "libc/intrin/asan.internal.h"
#include "libc/str/str.h"
#ifndef __aarch64__
/**
* Returns length of NUL-terminated string.
@ -61,3 +62,5 @@ noasan size_t strlen(const char *s) {
return n;
#endif
}
#endif /* __aarch64__ */

View file

@ -17,6 +17,7 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/str.h"
#ifndef __aarch64__
/**
* Compares NUL-terminated strings w/ limit.
@ -32,3 +33,5 @@ int strncmp(const char *a, const char *b, size_t n) {
while (i < n && a[i] == b[i] && b[i]) ++i;
return (a[i] & 0xff) - (b[i] & 0xff);
}
#endif /* __aarch64__ */

View file

@ -21,6 +21,7 @@
#include "libc/intrin/asan.internal.h"
#include "libc/intrin/bits.h"
#include "libc/str/str.h"
#ifndef __aarch64__
static noasan size_t strnlen_x64(const char *s, size_t n, size_t i) {
uint64_t w;
@ -56,3 +57,5 @@ noasan size_t strnlen(const char *s, size_t n) {
if (IsAsan()) __asan_verify(s, i);
return i;
}
#endif /* __aarch64__ */

View file

@ -17,6 +17,7 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/str/str.h"
#ifndef __aarch64__
/**
* Searches for last instance of character in string.
@ -29,3 +30,5 @@
char *strrchr(const char *s, int c) {
return memrchr(s, c, strlen(s));
}
#endif /* __aarch64__ */

View file

@ -36,10 +36,16 @@ STATIC_YOINK("strerror_wr");
/**
* Handles failure of CHECK_xx() macros.
*/
relegated void __check_fail(const char *suffix, const char *opstr,
uint64_t want, const char *wantstr, uint64_t got,
const char *gotstr, const char *file, int line,
const char *fmt, ...) {
relegated void __check_fail(const char *suffix, //
const char *opstr, //
uint64_t want, //
const char *wantstr, //
uint64_t got, //
const char *gotstr, //
const char *file, //
int line, //
const char *fmt, //
...) {
int e;
char *p;
size_t i;

View file

@ -33,21 +33,69 @@
*
* @see libc/log/thunks/__check_fail_ndebug.S
*/
relegated wontreturn void __check_fail_ndebug(uint64_t want, uint64_t got,
const char *file, int line,
const char *opchar,
const char *fmt, ...) {
va_list va;
static relegated wontreturn void __check_fail_ndebug(uint64_t want, //
uint64_t got, //
const char *file, //
int line, //
const char *opchar, //
const char *fmt, //
va_list va) {
__restore_tty();
kprintf("%rerror:%s:%d: check failed: %'ld %s %'ld% m", file, line, want,
opchar, got);
if (*fmt) {
if (fmt && *fmt) {
kprintf(": ");
va_start(va, fmt);
kvprintf(fmt, va);
va_end(va);
}
kprintf("\n");
if (_weaken(__die)) _weaken(__die)();
_Exitr(68);
}
void __check_fail_eq(uint64_t want, uint64_t got, const char *file, int line,
const char *opchar, const char *fmt, ...) {
va_list va;
va_start(va, fmt);
__check_fail_ndebug(want, got, file, line, opchar, fmt, va);
va_end(va);
}
void __check_fail_ne(uint64_t want, uint64_t got, const char *file, int line,
const char *opchar, const char *fmt, ...) {
va_list va;
va_start(va, fmt);
__check_fail_ndebug(want, got, file, line, opchar, fmt, va);
va_end(va);
}
void __check_fail_le(uint64_t want, uint64_t got, const char *file, int line,
const char *opchar, const char *fmt, ...) {
va_list va;
va_start(va, fmt);
__check_fail_ndebug(want, got, file, line, opchar, fmt, va);
va_end(va);
}
void __check_fail_lt(uint64_t want, uint64_t got, const char *file, int line,
const char *opchar, const char *fmt, ...) {
va_list va;
va_start(va, fmt);
__check_fail_ndebug(want, got, file, line, opchar, fmt, va);
va_end(va);
}
void __check_fail_ge(uint64_t want, uint64_t got, const char *file, int line,
const char *opchar, const char *fmt, ...) {
va_list va;
va_start(va, fmt);
__check_fail_ndebug(want, got, file, line, opchar, fmt, va);
va_end(va);
}
void __check_fail_gt(uint64_t want, uint64_t got, const char *file, int line,
const char *opchar, const char *fmt, ...) {
va_list va;
va_start(va, fmt);
__check_fail_ndebug(want, got, file, line, opchar, fmt, va);
va_end(va);
}

View file

@ -6,9 +6,7 @@ PKGS += LIBC_LOG
LIBC_LOG_ARTIFACTS += LIBC_LOG_A
LIBC_LOG = $(LIBC_LOG_A_DEPS) $(LIBC_LOG_A)
LIBC_LOG_A = o/$(MODE)/libc/log/log.a
LIBC_LOG_A_FILES := \
$(wildcard libc/log/thunks/*) \
$(wildcard libc/log/*)
LIBC_LOG_A_FILES := $(wildcard libc/log/*)
LIBC_LOG_A_HDRS = $(filter %.h,$(LIBC_LOG_A_FILES))
LIBC_LOG_A_SRCS_C = $(filter %.c,$(LIBC_LOG_A_FILES))
LIBC_LOG_A_SRCS_S = $(filter %.S,$(LIBC_LOG_A_FILES))

View file

@ -1,30 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.text.unlikely
// Code-size saving thunk for CHECK_EQ() in NDEBUG mode.
__check_fail_eq:
lea .Lop(%rip),%r8
jmp __check_fail_ndebug
.endfn __check_fail_eq,globl
.rodata.str1.1
.Lop: .asciz "=="
.previous

View file

@ -1,30 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.text.unlikely
// Code-size saving thunk for CHECK_GE() in NDEBUG mode.
__check_fail_ge:
lea .Lop(%rip),%r8
jmp __check_fail_ndebug
.endfn __check_fail_ge,globl
.rodata.str1.1
.Lop: .asciz ">="
.previous

View file

@ -1,30 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.text.unlikely
// Code-size saving thunk for CHECK_GT() in NDEBUG mode.
__check_fail_gt:
lea .Lop(%rip),%r8
jmp __check_fail_ndebug
.endfn __check_fail_gt,globl
.rodata.str1.1
.Lop: .asciz ">"
.previous

View file

@ -1,30 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.text.unlikely
// Code-size saving thunk for CHECK_LE() in NDEBUG mode.
__check_fail_le:
lea .Lop(%rip),%r8
jmp __check_fail_ndebug
.endfn __check_fail_le,globl
.rodata.str1.1
.Lop: .asciz "<="
.previous

View file

@ -1,30 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.text.unlikely
// Code-size saving thunk for CHECK_LT() in NDEBUG mode.
__check_fail_lt:
lea .Lop(%rip),%r8
jmp __check_fail_ndebug
.endfn __check_fail_lt,globl
.rodata.str1.1
.Lop: .asciz "<"
.previous

View file

@ -1,30 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.text.unlikely
// Code-size saving thunk for CHECK_NE() in NDEBUG mode.
__check_fail_ne:
lea .Lop(%rip),%r8
jmp __check_fail_ndebug
.endfn __check_fail_ne,globl
.rodata.str1.1
.Lop: .asciz "!="
.previous

View file

@ -43,8 +43,22 @@
Ticks; \
})
#else
#define __startbench() rdtsc()
#define __endbench() rdtsc()
#define __startbench() \
({ \
uint64_t _ts; \
asm volatile("isb" ::: "memory"); \
_ts = rdtsc(); \
asm volatile("isb" ::: "memory"); \
_ts; \
})
#define __endbench() \
({ \
uint64_t _ts; \
asm volatile("isb" ::: "memory"); \
_ts = rdtsc(); \
asm volatile("isb" ::: "memory"); \
_ts; \
})
#endif
#define __startbench_m() mfence_lfence_rdtsc_lfence()

View file

@ -1,262 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Computes Phil Katz CRC-32 w/ carryless multiply isa.
//
// This is support code that's abstracted by crc32_z().
//
// @param edi is initial value
// @param rsi points to buffer
// @param rdx is bytes in buffer that's >=64 and %16==0
// @return eax is crc32
// @note needs Westmere (c.2010) or Bulldozer (c.2011)
// @see “Fast CRC Computation for Generic Polynomials Using
// PCLMULQDQ Instruction V. Gopal, E. Ozturk, et al.,
// 2009, intel.ly/2ySEwL0
crc32_pclmul:
.leafprologue
.profilable
movdqu (%rsi),%xmm7
movd %edi,%xmm1
movdqu 16(%rsi),%xmm9
movdqu 32(%rsi),%xmm4
movdqu 48(%rsi),%xmm0
lea -64(%rdx),%rdi
lea 64(%rsi),%rcx
pxor %xmm7,%xmm1
movdqa .Lk1k2(%rip),%xmm8
cmp $63,%rdi
jbe 2f
lea -128(%rdx),%rdi
mov %rdi,%rdx
shr $6,%rdx
lea 2(%rdx),%rax
sal $6,%rax
add %rax,%rsi
mov %rcx,%rax
3: add $64,%rax
movdqa %xmm1,%xmm7
movdqa %xmm4,%xmm5
movdqa %xmm0,%xmm3
movdqa %xmm9,%xmm6
movdqa %xmm9,%xmm2
movdqu -48(%rax),%xmm9
pclmullqlqdq %xmm8,%xmm7
pclmullqlqdq %xmm8,%xmm6
pclmullqlqdq %xmm8,%xmm5
pclmulhqhqdq %xmm8,%xmm1
pclmulhqhqdq %xmm8,%xmm2
pclmulhqhqdq %xmm8,%xmm4
pxor %xmm7,%xmm1
movdqu -64(%rax),%xmm7
pxor %xmm6,%xmm2
pxor %xmm5,%xmm4
movdqu -32(%rax),%xmm6
movdqu -16(%rax),%xmm5
pclmullqlqdq %xmm8,%xmm3
pclmulhqhqdq %xmm8,%xmm0
pxor %xmm7,%xmm1
pxor %xmm3,%xmm0
pxor %xmm2,%xmm9
pxor %xmm6,%xmm4
pxor %xmm5,%xmm0
cmp %rsi,%rax
jne 3b
lea 1(%rdx),%rax
sal $6,%rdx
sal $6,%rax
sub %rdx,%rdi
add %rax,%rcx
2: movdqa .Lk3k4(%rip),%xmm3
movdqa %xmm1,%xmm2
movdqa %xmm1,%xmm5
pclmulhqhqdq %xmm3,%xmm2
pclmullqlqdq %xmm3,%xmm5
pxor %xmm9,%xmm2
pxor %xmm5,%xmm2
movdqa %xmm2,%xmm5
pclmulhqhqdq %xmm3,%xmm2
movdqa %xmm2,%xmm1
pclmullqlqdq %xmm3,%xmm5
pxor %xmm4,%xmm1
pxor %xmm5,%xmm1
movdqa %xmm1,%xmm2
pclmulhqhqdq %xmm3,%xmm1
pclmullqlqdq %xmm3,%xmm2
pxor %xmm1,%xmm0
pxor %xmm2,%xmm0
cmp $15,%rdi
jbe 4f
sub $16,%rdi
mov %rcx,%rax
and $-16,%rdi
lea 16(%rcx,%rdi),%rdx
5: movdqa %xmm0,%xmm1
movdqu (%rax),%xmm6
pclmulhqhqdq %xmm3,%xmm0
add $16,%rax
pclmullqlqdq %xmm3,%xmm1
pxor %xmm1,%xmm0
pxor %xmm6,%xmm0
cmp %rax,%rdx
jne 5b
4: movdqa %xmm0,%xmm1
movdqa .Lboop(%rip),%xmm2
psrldq $8,%xmm0
pclmullqhqdq %xmm3,%xmm1
movdqa .Lpoly(%rip),%xmm3
pxor %xmm1,%xmm0
movdqa %xmm0,%xmm1
pand %xmm2,%xmm0
pclmullqlqdq .Lk5k0(%rip),%xmm0
psrldq $4,%xmm1
pxor %xmm0,%xmm1
movdqa %xmm1,%xmm0
pand %xmm2,%xmm0
pclmullqhqdq %xmm3,%xmm0
pand %xmm2,%xmm0
pclmullqlqdq %xmm3,%xmm0
pxor %xmm1,%xmm0
movq %xmm0,%rax
shr $32,%rax
.leafepilogue
.endfn crc32_pclmul,globl,hidden
// Definitions of the bit-reflected domain constants k1,k2,k3, etc.
// and the CRC32+Barrett polynomials given at the end of the paper.
.rodata.cst16
.Lk1k2: .quad 0x0000000154442bd4
.quad 0x00000001c6e41596
.endobj .Lk1k2
.Lk3k4: .quad 0x00000001751997d0
.quad 0x00000000ccaa009e
.endobj .Lk3k4
.Lk5k0: .quad 0x0000000163cd6124
.quad 0x0000000000000000
.endobj .Lk5k0
.Lboop: .quad 0x00000000ffffffff
.quad 0x00000000ffffffff
.endobj .Lboop
.Lpoly: .quad 0x00000001db710641
.quad 0x00000001f7011641
.endobj .Lpoly
.previous
/* crc32() w/ pclmul for #c per n where c 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 4437.000 42.375 38.141 85
1 45.000 39.375 38.234 85
2 31.500 25.312 23.102 141
3 25.667 19.792 17.911 181
4 21.250 16.219 15.035 216
7 18.429 12.946 11.712 277
8 16.125 12.578 10.998 296
15 12.867 9.925 9.161 355
16 12.438 9.836 9.114 357
31 11.194 8.528 8.149 399
32 10.781 8.418 8.098 401
63 9.063 7.780 7.647 425
64 3.109 1.604 1.414 2299
127 2.260 1.824 1.729 1880
128 1.305 0.860 0.806 4033
255 1.290 1.001 0.948 3428
256 0.574 0.491 0.476 6822
511 0.773 0.571 0.546 5956
512 0.354 0.320 0.306 10613
1023 0.425 0.365 0.347 9375
1024 0.237 0.229 0.231 14097
2047 0.278 0.251 0.246 13236
2048 0.187 0.187 0.188 17306
4095 0.229 0.200 0.194 16761
4096 0.162 0.170 0.167 19438
8191 0.182 0.173 0.178 18266
8192 0.162 0.155 0.158 20560
16383 0.156 0.162 0.154 21136
16384 0.156 0.156 0.148 22005
32767 0.163 0.149 0.149 21768
32768 0.150 0.146 0.145 22491
65535 0.158 0.141 0.141 23102
65536 0.149 0.140 0.138 23478
131071 0.150 0.145 0.141 23011
131072 0.148 0.141 0.148 21892
262143 0.151 0.148 0.147 22136
262144 0.149 0.146 0.146 22298
524287 0.150 0.149 0.149 21832
524288 0.148 0.148 0.147 22043
1048575 0.148 0.158 0.163 19913
1048576 0.156 0.179 0.153 21186
2097151 0.153 0.149 0.148 21979
2097152 0.147 0.148 0.147 22040
4194303 0.148 0.148 0.151 21482
4194304 0.148 0.148 0.147 22061
8388607 0.185 0.183 0.185 17536
8388608 0.193 0.180 0.183 17769
crc32() w/ 10+ year old cpus for #c per n where c 0.293ns
N x1 x8 x64 mBps
------------------------------------------------------------
1 4447.000 43.625 37.641 86
1 41.000 37.125 37.609 86
2 31.500 26.562 22.477 145
3 25.000 20.125 17.422 187
4 21.250 16.594 15.230 213
7 16.714 13.089 11.717 277
8 16.875 12.609 11.174 291
15 12.733 9.958 9.339 348
16 12.438 9.852 9.208 353
31 10.935 8.617 8.164 398
32 10.906 8.496 8.155 399
63 9.095 7.819 7.692 423
64 9.172 7.807 7.692 423
127 8.165 7.531 7.438 437
128 8.133 7.503 7.437 437
255 7.714 7.329 7.293 446
256 7.723 7.348 7.293 446
511 7.434 7.253 7.223 450
512 7.412 7.237 7.218 450
1023 7.274 7.214 7.201 451
1024 7.292 7.203 7.189 452
2047 7.232 7.185 7.178 453
2048 7.239 7.189 7.186 452
4095 7.189 7.175 7.172 453
4096 7.192 7.173 7.172 453
8191 7.187 7.173 7.172 453
8192 7.183 7.174 7.181 453
16383 7.175 7.170 7.169 453
16384 7.176 7.169 7.169 453
32767 7.169 7.182 7.170 453
32768 7.173 7.172 7.172 453
65535 7.170 7.170 7.171 453
65536 7.172 7.171 7.204 451
131071 7.170 7.354 7.260 448
131072 7.172 7.172 7.182 453
262143 7.037 7.178 7.182 453
262144 7.169 7.343 7.205 451
524287 7.438 7.170 7.206 451
524288 7.169 7.164 7.209 451
1048575 6.995 7.119 7.158 454
1048576 7.168 7.110 7.157 454
2097151 7.057 7.058 7.065 460
2097152 6.977 7.047 7.089 458
4194303 7.017 7.504 7.030 462
4194304 7.025 7.059 7.030 462
8388607 7.082 6.980 6.997 464
8388608 7.051 6.985 6.999 464 */

View file

@ -8,12 +8,6 @@ extern const uint32_t kCrc32cTab[256];
void crc32init(uint32_t[hasatleast 256], uint32_t);
uint32_t crc32a(uint32_t, const void *, size_t);
uint32_t crc32c(uint32_t, const void *, size_t);
uint32_t crc32_z(uint32_t, const void *, size_t);
uint32_t crc32c_pure(uint32_t, const void *, size_t)
strlenesque _Hide;
uint32_t crc32c_sse42(uint32_t, const void *, size_t)
strlenesque _Hide;
uint32_t crc32_pclmul(uint32_t, const void *, size_t) _Hide;
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */

View file

@ -1,63 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/dce.h"
#include "libc/intrin/asan.internal.h"
#include "libc/macros.internal.h"
#include "libc/nexgen32e/crc32.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
/**
* Computes Phil Katz CRC-32 used by zip/zlib/gzip/etc.
*
* x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1
* 0b100000100110000010001110110110111
* _bitreverse32(0x104c11db7)
*
* This implementation takes 32 picoseconds per byte or 30 gibibyte/s.
*
* @param h is initial value
*/
uint32_t crc32_z(uint32_t h, const void *data, size_t size) {
size_t n;
static bool once;
const unsigned char *p, *e;
static uint32_t kCrc32Tab[256];
if (!once) {
crc32init(kCrc32Tab, 0xedb88320);
once = 0;
}
if (size == -1) {
size = data ? strlen(data) : 0;
}
p = data;
e = p + size;
h ^= 0xffffffff;
if (X86_HAVE(PCLMUL)) {
while (((intptr_t)p & 15) && p < e)
h = h >> 8 ^ kCrc32Tab[(h & 0xff) ^ *p++];
if ((n = ROUNDDOWN(e - p, 16)) >= 64) {
if (IsAsan()) __asan_verify(p, n);
h = crc32_pclmul(h, p, n); /* 51x faster */
p += n;
}
}
while (p < e) h = h >> 8 ^ kCrc32Tab[(h & 0xff) ^ *p++];
return h ^ 0xffffffff;
}

61
libc/sysv/consts/hwap.h Normal file
View file

@ -0,0 +1,61 @@
#ifndef COSMOPOLITAN_LIBC_SYSV_CONSTS_HWAP_H_
#define COSMOPOLITAN_LIBC_SYSV_CONSTS_HWAP_H_
#ifdef __aarch64__
// Feature bits for getauxval(AT_HWCAP) on AARCH64 GNU/SystemD.
#define HWCAP_FP (1 << 0)
#define HWCAP_ASIMD (1 << 1)
#define HWCAP_EVTSTRM (1 << 2)
#define HWCAP_AES (1 << 3)
#define HWCAP_PMULL (1 << 4)
#define HWCAP_SHA1 (1 << 5)
#define HWCAP_SHA2 (1 << 6)
#define HWCAP_CRC32 (1 << 7)
#define HWCAP_ATOMICS (1 << 8)
#define HWCAP_FPHP (1 << 9)
#define HWCAP_ASIMDHP (1 << 10)
#define HWCAP_CPUID (1 << 11)
#define HWCAP_ASIMDRDM (1 << 12)
#define HWCAP_JSCVT (1 << 13)
#define HWCAP_FCMA (1 << 14)
#define HWCAP_LRCPC (1 << 15)
#define HWCAP_DCPOP (1 << 16)
#define HWCAP_SHA3 (1 << 17)
#define HWCAP_SM3 (1 << 18)
#define HWCAP_SM4 (1 << 19)
#define HWCAP_ASIMDDP (1 << 20)
#define HWCAP_SHA512 (1 << 21)
#define HWCAP_SVE (1 << 22)
#define HWCAP_ASIMDFHM (1 << 23)
#define HWCAP_DIT (1 << 24)
#define HWCAP_USCAT (1 << 25)
#define HWCAP_ILRCPC (1 << 26)
#define HWCAP_FLAGM (1 << 27)
#define HWCAP_SSBS (1 << 28)
#define HWCAP_SB (1 << 29)
#define HWCAP_PACA (1 << 30)
#define HWCAP_PACG (1UL << 31)
#define HWCAP2_DCPODP (1 << 0)
#define HWCAP2_SVE2 (1 << 1)
#define HWCAP2_SVEAES (1 << 2)
#define HWCAP2_SVEPMULL (1 << 3)
#define HWCAP2_SVEBITPERM (1 << 4)
#define HWCAP2_SVESHA3 (1 << 5)
#define HWCAP2_SVESM4 (1 << 6)
#define HWCAP2_FLAGM2 (1 << 7)
#define HWCAP2_FRINT (1 << 8)
#define HWCAP2_SVEI8MM (1 << 9)
#define HWCAP2_SVEF32MM (1 << 10)
#define HWCAP2_SVEF64MM (1 << 11)
#define HWCAP2_SVEBF16 (1 << 12)
#define HWCAP2_I8MM (1 << 13)
#define HWCAP2_BF16 (1 << 14)
#define HWCAP2_DGH (1 << 15)
#define HWCAP2_RNG (1 << 16)
#define HWCAP2_BTI (1 << 17)
#define HWCAP2_MTE (1 << 18)
#endif /* __aarch64__ */
#endif /* COSMOPOLITAN_LIBC_SYSV_CONSTS_HWAP_H_ */
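
A hedged usage sketch for these constants (assumes a getauxval() that understands AT_HWCAP, e.g. the glibc-style <sys/auxv.h> interface; not part of this commit):

  #include <stdio.h>
  #include <sys/auxv.h>                 /* getauxval, AT_HWCAP (assumption) */
  #include "libc/sysv/consts/hwap.h"

  int main(void) {
  #ifdef __aarch64__
    unsigned long hw = getauxval(AT_HWCAP);
    if (hw & HWCAP_ASIMD) puts("Advanced SIMD available");
    if (hw & HWCAP_PMULL) puts("carry-less multiply (PMULL) available");
    if (hw & HWCAP_CRC32) puts("CRC32 instructions available");
  #endif
    return 0;
  }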

View file

@ -5,6 +5,13 @@
FreeBSD lib/msun/src/e_acoshl.c
Converted to ldbl by David Schultz <das@FreeBSD.ORG> and Bruce D. Evans.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -28,12 +35,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"

View file

@ -27,6 +27,7 @@
*/
#include "libc/math.h"
#include "libc/tinymath/feval.internal.h"
#include "libc/tinymath/freebsd.internal.h"
asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\

View file

@ -5,6 +5,13 @@
FreeBSD lib/msun/src/s_asinhl.c
Converted to ldbl by David Schultz <das@FreeBSD.ORG> and Bruce D. Evans.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -28,12 +35,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"

View file

@ -4,6 +4,13 @@
FreeBSD lib/msun/src/e_atan2.c
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -27,12 +34,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"

View file

@ -79,7 +79,7 @@ long double atan2l(long double y, long double x)
long double z;
int m, ex, ey;
if (isnan(x) || isnan(y))
if (isunordered(x, y))
return x+y;
if (x == 1)
return atanl(y);

View file

@ -5,6 +5,13 @@
FreeBSD lib/msun/src/s_tanhf.c
Converted to long double by Bruce D. Evans.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -28,12 +35,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"

View file

@ -36,7 +36,11 @@ Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
long double cosl(long double x) {
/**
* Returns cosine of 𝑥.
*/
long double cosl(long double x)
{
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
return cos(x);
#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384

View file

@ -4,6 +4,13 @@
FreeBSD lib/msun/src/s_expm1f.c
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -27,12 +34,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"

View file

@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
#define asdouble(i) ((union{uint64_t _i; double _f;}){i})._f
#define INSERT_WORDS(d,hi,lo) \

View file

@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
#define asfloat(i) ((union{uint32_t _i; float _f;}){i})._f
#define SET_FLOAT_WORD(d,w) \

View file

@ -22,6 +22,6 @@
* Returns positive difference.
*/
double fdim(double x, double y) {
if (isnan(x) || isnan(y)) return NAN;
if (isunordered(x, y)) return NAN;
return x > y ? x - y : 0;
}

View file

@ -22,6 +22,6 @@
* Returns positive difference.
*/
float fdimf(float x, float y) {
if (isnan(x) || isnan(y)) return NAN;
if (isunordered(x, y)) return NAN;
return x > y ? x - y : 0;
}

View file

@ -25,7 +25,7 @@ long double fdiml(long double x, long double y) {
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
return fdim(x, y);
#else
if (isnan(x) || isnan(y)) return NAN;
if (isunordered(x, y)) return NAN;
return x > y ? x - y : 0;
#endif
}

View file

@ -906,67 +906,6 @@ irintl(long double x)
__x + __y; \
})
/*
* ieee style elementary functions
*
* We rename functions here to improve other sources' diffability
* against fdlibm.
*/
#define __ieee754_sqrt sqrt
#define __ieee754_acos acos
#define __ieee754_acosh acosh
#define __ieee754_log log
#define __ieee754_log2 log2
#define __ieee754_atanh atanh
#define __ieee754_asin asin
#define __ieee754_atan2 atan2
#define __ieee754_exp exp
#define __ieee754_cosh cosh
#define __ieee754_fmod fmod
#define __ieee754_pow pow
#define __ieee754_lgamma lgamma
#define __ieee754_gamma gamma
#define __ieee754_lgamma_r lgamma_r
#define __ieee754_gamma_r gamma_r
#define __ieee754_log10 log10
#define __ieee754_sinh sinh
#define __ieee754_hypot hypot
#define __ieee754_j0 j0
#define __ieee754_j1 j1
#define __ieee754_y0 y0
#define __ieee754_y1 y1
#define __ieee754_jn jn
#define __ieee754_yn yn
#define __ieee754_remainder remainder
#define __ieee754_scalb scalb
#define __ieee754_sqrtf sqrtf
#define __ieee754_acosf acosf
#define __ieee754_acoshf acoshf
#define __ieee754_logf logf
#define __ieee754_atanhf atanhf
#define __ieee754_asinf asinf
#define __ieee754_atan2f atan2f
#define __ieee754_expf expf
#define __ieee754_coshf coshf
#define __ieee754_fmodf fmodf
#define __ieee754_powf powf
#define __ieee754_lgammaf lgammaf
#define __ieee754_gammaf gammaf
#define __ieee754_lgammaf_r lgammaf_r
#define __ieee754_gammaf_r gammaf_r
#define __ieee754_log10f log10f
#define __ieee754_log2f log2f
#define __ieee754_sinhf sinhf
#define __ieee754_hypotf hypotf
#define __ieee754_j0f j0f
#define __ieee754_j1f j1f
#define __ieee754_y0f y0f
#define __ieee754_y1f y1f
#define __ieee754_jnf jnf
#define __ieee754_ynf ynf
#define __ieee754_remainderf remainderf
#define __ieee754_scalbf scalbf
/* fdlibm kernel function */
int __kernel_rem_pio2(double*,double*,int,int,int);

View file

@ -16,7 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/tinymath/tinymath.h"
#include "libc/math.h"
/**
* Rounds to nearest integer.

View file

@ -83,7 +83,8 @@ static dontinline long lrint_slow(double x) {
/**
* Rounds to nearest integer.
*/
long lrint(double x) {
long lrint(double x)
{
#ifdef __x86_64__
long res;
asm("cvtsd2si\t%1,%0" : "=r"(res) : "x"(x));

View file

@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
double modf(double x, double *iptr)
{

View file

@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
float modff(float x, float *iptr)
{

View file

@ -32,7 +32,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
double nextafter(double x, double y)
{
@ -40,7 +40,7 @@ double nextafter(double x, double y)
uint64_t ax, ay;
int e;
if (isnan(x) || isnan(y))
if (isunordered(x, y))
return x + y;
if (ux.i == uy.i)
return y;

View file

@ -32,14 +32,14 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
float nextafterf(float x, float y)
{
union {float f; uint32_t i;} ux={x}, uy={y};
uint32_t ax, ay, e;
if (isnan(x) || isnan(y))
if (isunordered(x, y))
return x + y;
if (ux.i == uy.i)
return y;

View file

@ -36,13 +36,14 @@ Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
long double nextafterl(long double x, long double y) {
long double nextafterl(long double x, long double y)
{
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
return nextafter(x, y);
#elif LDBL_MANT_DIG == 64 && LDBL_MAX_EXP == 16384
union ldshape ux, uy;
if (isnan(x) || isnan(y))
if (isunordered(x, y))
return x + y;
if (x == y)
return y;
@ -75,7 +76,7 @@ long double nextafterl(long double x, long double y) {
#elif LDBL_MANT_DIG == 113 && LDBL_MAX_EXP == 16384
union ldshape ux, uy;
if (isnan(x) || isnan(y))
if (isunordered(x, y))
return x + y;
if (x == y)
return y;

View file

@ -32,14 +32,14 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
double nexttoward(double x, long double y)
{
union {double f; uint64_t i;} ux = {x};
int e;
if (isnan(x) || isnan(y))
if (isunordered(x, y))
return x + y;
if (x == y)
return y;

View file

@ -32,14 +32,14 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
float nexttowardf(float x, long double y)
{
union {float f; uint32_t i;} ux = {x};
uint32_t e;
if (isnan(x) || isnan(y))
if (isunordered(x, y))
return x + y;
if (x == y)
return y;

View file

@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
long double nexttowardl(long double x, long double y)
{

View file

@ -34,7 +34,7 @@ asm(".ident\t\"\\n\\n\
OpenBSD libm (ISC License)\\n\
Copyright (c) 2008 Stephen L. Moshier <steve@moshier.net>\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/* origin: OpenBSD /usr/src/lib/libm/src/polevll.c */
/*

View file

@ -32,10 +32,10 @@
#include "libc/tinymath/pow_data.internal.h"
asm(".ident\t\"\\n\\n\
Double-precision math functions (MIT License)\\n\
Copyright 2018 ARM Limited\"");
Optimized Routines (MIT License)\\n\
Copyright 2022 ARM Limited\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/*
* Double-precision x^y function.

View file

@ -121,9 +121,9 @@ double pochisq(
e = (even ? 0.0 : LOG_SQRT_PI);
c = log(a);
while (z <= x) {
e = log(z) + e;
s += ex(c * z - a - e);
z += 1.0;
e = log(z) + e;
s += ex(c * z - a - e);
z += 1.0;
}
return (s);
} else {

View file

@ -35,8 +35,8 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* clang-format off */
/* origin: FreeBSD /usr/src/lib/msun/src/k_rem_pio2.c */
/*
* ====================================================

View file

@ -58,7 +58,7 @@ asm(".include \"libc/disclaimer.inc\"");
*/
double scalb(double x, double fn)
{
if (isnan(x) || isnan(fn))
if (isunordered(x, fn))
return x*fn;
if (!isfinite(fn)) {
if (fn > 0.0)

View file

@ -38,7 +38,8 @@ asm(".include \"libc/disclaimer.inc\"");
float scalbf(float x, float fn)
{
if (isnan(x) || isnan(fn)) return x*fn;
if (isunordered(x, fn))
return x*fn;
if (!isfinite(fn)) {
if (fn > 0.0f)
return x*fn;

View file

@ -3,7 +3,7 @@
#include "libc/tinymath/internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
/* clang-format off */
// clang-format off
/*
* Header for sinf, cosf and sincosf.

View file

@ -35,7 +35,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/* origin: FreeBSD /usr/src/lib/msun/src/k_sinf.c */
/*

View file

@ -37,7 +37,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/* origin: FreeBSD /usr/src/lib/msun/src/s_sinf.c */
/*

View file

@ -27,12 +27,13 @@
*/
#include "libc/math.h"
#include "libc/tinymath/expo.internal.h"
#include "libc/tinymath/freebsd.internal.h"
asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/**
* Returns hyperbolic sine of 𝑥.

View file

@ -32,7 +32,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/**
* Returns hyperbolic sine of 𝑥.

View file

@ -5,6 +5,13 @@
FreeBSD lib/msun/src/e_sinhl.c
Converted to long double by Bruce D. Evans
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -28,12 +35,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/intrin/likely.h"
#include "libc/math.h"

View file

@ -36,7 +36,11 @@ Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
long double sinl(long double x) {
/**
* Returns sine of 𝑥.
*/
long double sinl(long double x)
{
#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
return sin(x);
#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384

View file

@ -36,7 +36,7 @@ asm(".ident\t\"\\n\\n\
Musl libc (MIT License)\\n\
Copyright 2005-2014 Rich Felker, et. al.\"");
asm(".include \"libc/disclaimer.inc\"");
/* clang-format off */
// clang-format off
/* origin: FreeBSD /usr/src/lib/msun/src/s_tan.c */
/*

View file

@ -5,6 +5,13 @@
FreeBSD lib/msun/src/s_tanhf.c
Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -28,12 +35,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/math.h"
#include "libc/tinymath/freebsd.internal.h"

View file

@ -5,6 +5,13 @@
FreeBSD lib/msun/src/s_tanhl.c
Converted to long double by Bruce D. Evans
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
Copyright (c) 1992-2023 The FreeBSD Project.
Redistribution and use in source and binary forms, with or without
@ -28,12 +35,6 @@
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
Developed at SunPro, a Sun Microsystems, Inc. business.
Permission to use, copy, modify, and distribute this
software is freely granted, provided that this notice
is preserved.
*/
#include "libc/intrin/likely.h"
#include "libc/math.h"

View file

@ -31,7 +31,6 @@
#include "libc/intrin/directmap.internal.h"
#include "libc/intrin/extend.internal.h"
#include "libc/intrin/weaken.h"
#include "libc/nexgen32e/crc32.h"
#include "libc/runtime/internal.h"
#include "libc/runtime/memtrack.internal.h"
#include "libc/sysv/consts/f.h"
@ -159,11 +158,6 @@ static int __zipos_load(struct Zipos *zipos, size_t cf, unsigned flags,
h->pos = 0;
h->cfile = cf;
h->size = size;
if (!IsTiny() && h->mem &&
crc32_z(0, h->mem, h->size) != ZIP_LFILE_CRC32(zipos->map + lf)) {
h->mem = 0;
eio();
}
if (h->mem) {
minfd = 3;
__fds_lock();

View file

@ -61,6 +61,17 @@ TEST(memcmp, hug) {
}
}
static int coerce(int result) {
#ifdef __aarch64__
// arm's strcmp assembly is nuts and unpredictable, but it's legal
if (result < 0) return -1;
if (result > 0) return +1;
return 0;
#else
return result;
#endif
}
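Since memcmp and strcmp are only specified to return a value whose sign conveys the ordering, the ARM routines are free to return any negative or positive magnitude, so the test folds both sides to -1/0/+1 before comparing. The golden() reference used below is not shown in this hunk; it is assumed to be a plain byte-wise comparator along these lines:
#include <stddef.h>
// hypothetical byte-wise reference comparator; any sign-correct
// return value is legal, this one returns the byte difference
static int golden_ref(const void *p, const void *q, size_t n) {
  const unsigned char *a = p, *b = q;
  for (size_t i = 0; i < n; ++i) {
    if (a[i] != b[i]) return (int)a[i] - (int)b[i];
  }
  return 0;
}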
TEST(memcmp, fuzz) {
int i, o, n, g;
char a[256], b[256];
@ -79,8 +90,18 @@ TEST(memcmp, fuzz) {
}
o = rand() & 31;
n = rand() % (sizeof(a) - o);
g = golden(a + o, b + o, n);
ASSERT_EQ(g, memcmp(a + o, b + o, n), "n=%d o=%d", n, o);
g = coerce(golden(a + o, b + o, n));
#if 0
if (memcmp(a + o, b + o, n) != g) {
kprintf("const size_t g = %d;\n", g);
kprintf("const size_t n = %d;\n", n);
kprintf("const char a[] = unbingstr(%#.*hhhs); /* %p */\n", n, a + o,
a + o);
kprintf("const char b[] = unbingstr(%#.*hhhs); /* %p */\n", n, b + o,
b + o);
}
#endif
ASSERT_EQ(g, coerce(memcmp(a + o, b + o, n)), "n=%d o=%d", n, o);
ASSERT_EQ(!!g, !!bcmp(a + o, b + o, n), "n=%d o=%d", n, o);
ASSERT_EQ(!!g, !!timingsafe_bcmp(a + o, b + o, n), "n=%d o=%d", n, o);
ASSERT_EQ(MAX(-1, MIN(1, g)), timingsafe_memcmp(a + o, b + o, n),

View file

@ -190,9 +190,11 @@ BENCH(strchr, bench2) {
char *strlen_(const char *) asm("strlen");
char *rawmemchr_(const char *, int) asm("rawmemchr");
EZBENCH2("strchr z", donothing, strchr_(kHyperion, 'z'));
EZBENCH2("rawmemchr z", donothing, rawmemchr_(kHyperion, 'z'));
EZBENCH2("memchr z", donothing, memchr_(kHyperion, 'z', kHyperionSize));
EZBENCH2("strchr Z", donothing, strchr_(kHyperion, 'Z'));
EZBENCH2("memchr z", donothing, memchr_(kHyperion, 'z', kHyperionSize));
EZBENCH2("memchr Z", donothing, memchr_(kHyperion, 'Z', kHyperionSize));
EZBENCH2("rawmemchr z", donothing, rawmemchr_(kHyperion, 'z'));
EZBENCH2("rawmemchr Z", donothing, rawmemchr_(kHyperion, 'z'));
EZBENCH2("rawmemchr \\0", donothing, rawmemchr_(kHyperion, 0));
EZBENCH2("strlen", donothing, strlen_(kHyperion));
EZBENCH2("memchr Z", donothing, memchr_(kHyperion, 'Z', kHyperionSize));

View file

@ -1,49 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
#include "libc/nexgen32e/crc32.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/hyperion.h"
#include "libc/testlib/testlib.h"
#include "third_party/zlib/zlib.h"
TEST(crc32, testBigText) {
size_t size;
void *hyperion;
size = kHyperionSize;
hyperion = kHyperion;
EXPECT_EQ(0xe9ded8e6, crc32(0, hyperion, size));
EXPECT_EQ(0xe9ded8e6, crc32_z(0, hyperion, size));
if (X86_HAVE(PCLMUL)) {
size = ROUNDDOWN(size, 64);
EXPECT_EQ(0xc7adc04f, crc32(0, hyperion, size));
EXPECT_EQ(0xc7adc04f, crc32_z(0, hyperion, size));
EXPECT_EQ(0xc7adc04f,
0xffffffffu ^ crc32_pclmul(0 ^ 0xffffffffu, hyperion, size));
}
}
#define TESTSTR "libc/calls/typedef/sighandler_t.h"
BENCH(crc32c, bench) {
EZBENCH2("crc32c", donothing,
EXPROPRIATE(crc32c(0, VEIL("r", TESTSTR), sizeof(TESTSTR) - 1)));
}

View file

@ -16,17 +16,18 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/intrin/bits.h"
#include "libc/dce.h"
#include "libc/intrin/bits.h"
#include "libc/mem/gc.internal.h"
#include "libc/mem/mem.h"
#include "libc/nexgen32e/crc32.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/mem/gc.internal.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/hyperion.h"
#include "libc/testlib/testlib.h"
#include "third_party/zlib/zlib.h"
#define FANATICS "Fanatics"

View file

@ -17,18 +17,19 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/assert.h"
#include "libc/intrin/bits.h"
#include "libc/dce.h"
#include "libc/intrin/asan.internal.h"
#include "libc/intrin/bits.h"
#include "libc/mem/gc.internal.h"
#include "libc/mem/mem.h"
#include "libc/nexgen32e/crc32.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/mem/gc.internal.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/hyperion.h"
#include "libc/testlib/testlib.h"
#include "third_party/zlib/zlib.h"
#define FANATICS "Fanatics"

View file

@ -15,15 +15,16 @@
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "libc/str/highwayhash64.h"
#include "libc/inttypes.h"
#include "libc/nexgen32e/crc32.h"
#include "libc/stdio/rand.h"
#include "libc/stdio/stdio.h"
#include "libc/str/highwayhash64.h"
#include "libc/str/str.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/hyperion.h"
#include "libc/testlib/testlib.h"
#include "third_party/zlib/zlib.h"
#define kMaxSize 64

View file

@ -472,8 +472,6 @@ TEST(wcscmp, testTwosComplementBane) {
TEST(wcsncmp, testTwosComplementBane) {
wchar_t *B1 = malloc(4);
wchar_t *B2 = malloc(4);
B1[1] = L'\0';
B2[1] = L'\0';
EXPECT_EQ(wcsncmp(memcpy(B1, "\x00\x00\x00\x80", 4),
memcpy(B2, "\x00\x00\x00\x80", 4), 1),
0);

View file

@ -18,6 +18,7 @@
*/
#include "libc/math.h"
#include "libc/mem/gc.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/testlib.h"
#include "libc/x/x.h"
#include "libc/x/xasprintf.h"
@ -51,3 +52,9 @@ TEST(asinhl, test) {
EXPECT_STREQ("NAN", _gc(xdtoal(_asinhl(NAN))));
EXPECT_STREQ("INFINITY", _gc(xdtoal(_asinhl(INFINITY))));
}
BENCH(asinh, bench) {
EZBENCH2("asinh", donothing, _asinh(.7)); // ~26ns
EZBENCH2("asinhf", donothing, _asinhf(.7)); // ~17ns
EZBENCH2("asinhl", donothing, _asinhl(.7)); // ~48ns
}

View file

@ -18,6 +18,7 @@
*/
#include "libc/math.h"
#include "libc/mem/gc.internal.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/testlib.h"
#include "libc/x/x.h"
@ -60,3 +61,9 @@ TEST(sinhf, test) {
EXPECT_STREQ("INFINITY", gc(xdtoaf(_sinhf(INFINITY))));
EXPECT_STREQ("-INFINITY", gc(xdtoaf(_sinhf(-INFINITY))));
}
BENCH(sinh, bench) {
EZBENCH2("sinh", donothing, _sinh(.7)); // ~24ns
EZBENCH2("sinhf", donothing, _sinhf(.7)); // ~19ns
EZBENCH2("sinhl", donothing, _sinhl(.7)); // ~15ns
}

Some files were not shown because too many files have changed in this diff.