Make considerably more progress on AARCH64

- Utilities like pledge.com now build
- kprintf() will no longer balk at 48-bit addresses
- There's a new aarch64-dbg build mode that should work
- gc() and defer() are mostly pacified; avoid using them on aarch64
- THIRD_PARTY_STB now has Arm Neon intrinsics for fast image handling
Justine Tunney 2023-05-12 22:42:57 -07:00
parent 1bfb3aab1b
commit fd34ef732d
GPG key ID: BE714B4575D6E328
91 changed files with 1288 additions and 1192 deletions

View file

@@ -18,5 +18,5 @@
*/
void bz_internal_error(int err) {
asm("hlt");
__builtin_trap();
}

View file

@@ -34,7 +34,7 @@ static textstartup void BZ2_crc32Table_init() {
BZ2_crc32Table[i] = u;
}
if (BZ2_crc32Table[0] || BZ2_crc32Table[255] != 0xb1f740b4) {
asm("hlt");
__builtin_trap();
}
}
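
Both hunks above replace the x86-only asm("hlt") with the compiler builtin, so the same source also builds for aarch64. A minimal sketch of the pattern (standalone example, not part of the diff; die_fast is a made-up name):

#include <stdlib.h>

// __builtin_trap() emits a trapping instruction on every GCC/Clang target
// (e.g. ud2 on x86-64, brk on aarch64), whereas inline asm("hlt") only
// assembles for x86.
static void die_fast(void) {
#if defined(__GNUC__) || defined(__clang__)
  __builtin_trap();
#else
  abort();  // fallback for compilers without the builtin
#endif
}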

View file

@@ -10,6 +10,8 @@
# This makefile compiles and runs each test twice. The first with
# GCC-built chibicc, and a second time with chibicc-built chibicc
ifeq ($(ARCH), x86_64)
CHIBICC = o/$(MODE)/third_party/chibicc/chibicc.com
CHIBICC_FLAGS = \
-fno-common \
@@ -111,6 +113,8 @@ THIRD_PARTY_CHIBICC_CHECKS = $(foreach x,$(THIRD_PARTY_CHIBICC_ARTIFACTS),$($(x)
THIRD_PARTY_CHIBICC_OBJS = $(foreach x,$(THIRD_PARTY_CHIBICC_ARTIFACTS),$($(x)_OBJS))
$(THIRD_PARTY_CHIBICC_OBJS): $(BUILD_FILES) third_party/chibicc/chibicc.mk
endif
.PHONY: o/$(MODE)/third_party/chibicc
o/$(MODE)/third_party/chibicc: \
o/$(MODE)/third_party/chibicc/test \

View file

@@ -10,6 +10,8 @@
# This makefile compiles and runs each test twice. The first with
# GCC-built chibicc, and a second time with chibicc-built chibicc
ifeq ($(ARCH), x86_64)
PKGS += THIRD_PARTY_CHIBICC_TEST
THIRD_PARTY_CHIBICC_TEST_A = o/$(MODE)/third_party/chibicc/test/test.a
@@ -75,6 +77,8 @@ o/$(MODE)/third_party/chibicc/test/%.o: \
o/$(MODE)/third_party/chibicc/test/int128_test.o: private QUOTA = -M1024m
endif
.PHONY: o/$(MODE)/third_party/chibicc/test
o/$(MODE)/third_party/chibicc/test: \
$(THIRD_PARTY_CHIBICC_TEST_BINS) \

View file

@@ -1,48 +0,0 @@
/* clang-format off */
/* ===-- lshrti3.c - Implement __lshrti3 -----------------------------------===
*
* The LLVM Compiler Infrastructure
*
* This file is dual licensed under the MIT and the University of Illinois Open
* Source Licenses. See LICENSE.TXT for details.
*
* ===----------------------------------------------------------------------===
*
* This file implements __lshrti3 for the compiler_rt library.
*
* ===----------------------------------------------------------------------===
*/
STATIC_YOINK("huge_compiler_rt_license");
#include "third_party/compiler_rt/int_lib.h"
#ifdef CRT_HAS_128BIT
/* Returns: logical a >> b */
/* Precondition: 0 <= b < bits_in_tword */
COMPILER_RT_ABI ti_int
__lshrti3(ti_int a, si_int b)
{
const int bits_in_dword = (int)(sizeof(di_int) * CHAR_BIT);
utwords input;
utwords result;
input.all = a;
if (b & bits_in_dword) /* bits_in_dword <= b < bits_in_tword */
{
result.s.high = 0;
result.s.low = input.s.high >> (b - bits_in_dword);
}
else /* 0 <= b < bits_in_dword */
{
if (b == 0)
return a;
result.s.high = input.s.high >> b;
result.s.low = (input.s.high << (bits_in_dword - b)) | (input.s.low >> b);
}
return result.all;
}
#endif /* CRT_HAS_128BIT */
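
The deleted __lshrti3 above splits its 128-bit operand into two 64-bit halves and handles the b >= 64 and b < 64 cases separately. On compilers with native __int128 the same operation is a single shift; a small reference check (a sketch, assuming GCC or Clang on a 64-bit target; lshr128 is a made-up name):

#include <assert.h>

// Logical right shift of a 128-bit value, the operation __lshrti3 provides
// for the compiler. Precondition mirrors the original: 0 <= b < 128.
static unsigned __int128 lshr128(unsigned __int128 a, int b) {
  return a >> b;
}

int main(void) {
  unsigned __int128 x =
      ((unsigned __int128)0x0123456789abcdefULL << 64) | 0xfedcba9876543210ULL;
  assert((unsigned long long)lshr128(x, 68) == (0x0123456789abcdefULL >> 4));
  assert(lshr128(x, 0) == x);
  return 0;
}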

View file

@@ -0,0 +1 @@
../../../../x86_64-linux-musl/bin/ld.bfd

View file

@@ -68,7 +68,7 @@ static __inline int regexec_e(regex_t *, const char *, int, int, size_t);
static void regsub(SPACE *, char *, char *);
static int substitute(struct s_command *);
struct s_appends *appends; /* Array of pointers to strings to append. */
struct s_appends *appends_; /* Array of pointers to strings to append. */
static size_t appendx; /* Index into appends array. */
size_t appendnum; /* Size of appends array. */
@@ -111,12 +111,12 @@ redirect:
goto redirect;
case 'a':
if (appendx >= appendnum)
appends = xrealloc(appends,
appends_ = xrealloc(appends_,
sizeof(struct s_appends) *
(appendnum *= 2));
appends[appendx].type = AP_STRING;
appends[appendx].s = cp->t;
appends[appendx].len = strlen(cp->t);
appends_[appendx].type = AP_STRING;
appends_[appendx].s = cp->t;
appends_[appendx].len = strlen(cp->t);
appendx++;
break;
case 'b':
@@ -204,12 +204,12 @@ redirect:
exit(0);
case 'r':
if (appendx >= appendnum)
appends = xrealloc(appends,
appends_ = xrealloc(appends_,
sizeof(struct s_appends) *
(appendnum *= 2));
appends[appendx].type = AP_FILE;
appends[appendx].s = cp->t;
appends[appendx].len = strlen(cp->t);
appends_[appendx].type = AP_FILE;
appends_[appendx].s = cp->t;
appends_[appendx].len = strlen(cp->t);
appendx++;
break;
case 's':
@@ -541,9 +541,9 @@ flush_appends(void)
char *buf = gc(malloc(8 * 1024));
for (i = 0; i < appendx; i++)
switch (appends[i].type) {
switch (appends_[i].type) {
case AP_STRING:
fwrite(appends[i].s, sizeof(char), appends[i].len,
fwrite(appends_[i].s, sizeof(char), appends_[i].len,
outfile);
break;
case AP_FILE:
@@ -555,7 +555,7 @@ flush_appends(void)
* would be truly bizarre, but possible. It's probably
* not that big a performance win, anyhow.
*/
if ((f = fopen(appends[i].s, "r")) == NULL)
if ((f = fopen(appends_[i].s, "r")) == NULL)
break;
while ((count = fread(buf, sizeof(char), sizeof(buf), f)))
(void)fwrite(buf, sizeof(char), count, outfile);

View file

@@ -58,6 +58,7 @@ THIRD_PARTY_SQLITE3_A_DIRECTDEPS = \
LIBC_TIME \
LIBC_TINYMATH \
LIBC_ZIPOS \
THIRD_PARTY_COMPILER_RT \
THIRD_PARTY_GDTOA \
THIRD_PARTY_LINENOISE \
THIRD_PARTY_MUSL \
@@ -79,7 +80,8 @@ o/$(MODE)/third_party/sqlite3/sqlite3.com.dbg: \
o/$(MODE)/third_party/sqlite3/sqlite3.com: \
o/$(MODE)/third_party/sqlite3/sqlite3.com.dbg \
o/$(MODE)/third_party/zip/zip.com \
o/$(MODE)/tool/build/symtab.com
o/$(MODE)/tool/build/symtab.com \
$(VM)
@$(MAKE_OBJCOPY)
@$(MAKE_SYMTAB_CREATE)
@$(MAKE_SYMTAB_ZIP)

View file

@@ -1,426 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
// Computes inverse discrete cosine transform.
//
// @note used to decode jpeg
.p2align 4
stbi__idct_simd$sse:
push %rbp
mov %rsp,%rbp
movslq %esi,%rsi
lea (%rdi,%rsi),%rax
sub $96,%rsp
movdqa 32(%rdx),%xmm0
movdqa 112(%rdx),%xmm9
movdqa 48(%rdx),%xmm1
movdqa 80(%rdx),%xmm7
movdqa %xmm0,%xmm2
punpcklwd 96(%rdx),%xmm2
punpckhwd 96(%rdx),%xmm0
movdqa %xmm9,%xmm8
movdqa 16(%rdx),%xmm5
movdqa %xmm2,%xmm3
movdqa %xmm2,%xmm6
movdqa %xmm0,%xmm2
pmaddwd .LC1(%rip),%xmm3
movdqa %xmm0,%xmm4
pmaddwd .LC1(%rip),%xmm2
pmaddwd .LC0(%rip),%xmm4
punpckhwd %xmm1,%xmm8
pmaddwd .LC0(%rip),%xmm6
movaps %xmm3,-48(%rbp)
movdqa (%rdx),%xmm3
movaps %xmm2,-64(%rbp)
movdqa 64(%rdx),%xmm2
movdqa %xmm3,%xmm0
movaps %xmm4,-32(%rbp)
paddw %xmm2,%xmm0
psubw %xmm2,%xmm3
movaps %xmm6,-16(%rbp)
movdqa %xmm0,%xmm4
pxor %xmm0,%xmm0
movdqa %xmm0,%xmm11
movdqa %xmm0,%xmm12
movdqa %xmm0,%xmm2
punpcklwd %xmm4,%xmm11
punpckhwd %xmm3,%xmm12
punpcklwd %xmm3,%xmm2
movdqa %xmm11,%xmm13
movdqa %xmm0,%xmm11
movdqa %xmm12,%xmm3
punpckhwd %xmm4,%xmm11
movdqa %xmm8,%xmm12
movdqa %xmm8,%xmm4
movdqa %xmm11,%xmm14
movdqa %xmm7,%xmm8
movdqa %xmm9,%xmm11
punpckhwd %xmm5,%xmm8
psrad $4,%xmm3
punpcklwd %xmm1,%xmm11
psrad $4,%xmm13
psrad $4,%xmm14
movdqa %xmm11,%xmm15
movaps %xmm13,-80(%rbp)
movdqa %xmm8,%xmm6
paddw %xmm7,%xmm1
pmaddwd .LC3(%rip),%xmm15
movaps %xmm14,-96(%rbp)
movdqa %xmm8,%xmm14
movdqa %xmm5,%xmm8
pmaddwd .LC2(%rip),%xmm11
pmaddwd .LC2(%rip),%xmm12
paddw %xmm9,%xmm8
psrad $4,%xmm2
pmaddwd .LC3(%rip),%xmm4
pmaddwd .LC5(%rip),%xmm6
pmaddwd .LC4(%rip),%xmm14
movdqa %xmm4,%xmm10
movdqa %xmm7,%xmm4
movdqa %xmm8,%xmm7
punpcklwd %xmm5,%xmm4
punpcklwd %xmm1,%xmm7
punpckhwd %xmm1,%xmm8
movdqa %xmm4,%xmm13
movdqa %xmm7,%xmm9
pmaddwd .LC5(%rip),%xmm4
pmaddwd .LC6(%rip),%xmm9
movdqa %xmm8,%xmm5
movdqa %xmm7,%xmm1
pmaddwd .LC7(%rip),%xmm8
pmaddwd .LC6(%rip),%xmm5
movdqa %xmm15,%xmm7
paddd %xmm9,%xmm11
paddd %xmm9,%xmm4
movdqa .LC8(%rip),%xmm9
paddd %xmm8,%xmm14
paddd %xmm10,%xmm8
movdqa -96(%rbp),%xmm10
paddd -64(%rbp),%xmm10
pmaddwd .LC7(%rip),%xmm1
pmaddwd .LC4(%rip),%xmm13
paddd %xmm5,%xmm12
paddd %xmm5,%xmm6
paddd %xmm9,%xmm10
movdqa -80(%rbp),%xmm5
paddd -48(%rbp),%xmm5
paddd %xmm1,%xmm13
paddd %xmm1,%xmm7
movdqa %xmm10,%xmm1
psubd %xmm6,%xmm10
paddd %xmm9,%xmm5
paddd %xmm6,%xmm1
psrad $10,%xmm10
movdqa -16(%rbp),%xmm6
movdqa %xmm1,%xmm15
movdqa %xmm5,%xmm1
psubd %xmm4,%xmm5
psrad $10,%xmm5
paddd %xmm4,%xmm1
paddd %xmm2,%xmm6
packssdw %xmm10,%xmm5
movdqa -32(%rbp),%xmm10
paddd %xmm9,%xmm6
paddd %xmm9,%xmm2
psrad $10,%xmm15
psrad $10,%xmm1
psubd -16(%rbp),%xmm2
paddd %xmm3,%xmm10
paddd %xmm9,%xmm3
packssdw %xmm15,%xmm1
paddd %xmm9,%xmm10
psubd -32(%rbp),%xmm3
movdqa %xmm10,%xmm4
psubd %xmm8,%xmm10
paddd %xmm8,%xmm4
psrad $10,%xmm10
movdqa %xmm4,%xmm15
movdqa %xmm6,%xmm4
psubd %xmm7,%xmm6
psrad $10,%xmm6
psrad $10,%xmm15
paddd %xmm7,%xmm4
movdqa %xmm3,%xmm7
psubd %xmm14,%xmm3
packssdw %xmm10,%xmm6
psrad $10,%xmm3
psrad $10,%xmm4
paddd %xmm14,%xmm7
movdqa %xmm7,%xmm8
movdqa %xmm2,%xmm7
psubd %xmm13,%xmm2
paddd %xmm13,%xmm7
psrad $10,%xmm8
packssdw %xmm15,%xmm4
psrad $10,%xmm7
psrad $10,%xmm2
packssdw %xmm8,%xmm7
movdqa -80(%rbp),%xmm8
packssdw %xmm3,%xmm2
paddd %xmm9,%xmm8
paddd -96(%rbp),%xmm9
psubd -48(%rbp),%xmm8
psubd -64(%rbp),%xmm9
movdqa %xmm8,%xmm3
movdqa %xmm9,%xmm10
psubd %xmm11,%xmm8
paddd %xmm12,%xmm10
paddd %xmm11,%xmm3
psrad $10,%xmm8
psrad $10,%xmm10
psrad $10,%xmm3
psubd %xmm12,%xmm9
psrad $10,%xmm9
packssdw %xmm10,%xmm3
movdqa %xmm1,%xmm10
packssdw %xmm9,%xmm8
movdqa %xmm7,%xmm9
punpckhwd %xmm6,%xmm7
punpcklwd %xmm6,%xmm9
punpcklwd %xmm8,%xmm10
punpckhwd %xmm8,%xmm1
movdqa %xmm3,%xmm6
movdqa %xmm4,%xmm8
punpckhwd %xmm5,%xmm3
punpcklwd %xmm5,%xmm6
punpcklwd %xmm2,%xmm8
movdqa %xmm3,%xmm5
punpckhwd %xmm2,%xmm4
movdqa %xmm8,%xmm3
movdqa %xmm10,%xmm2
punpckhwd %xmm6,%xmm8
punpcklwd %xmm6,%xmm3
punpcklwd %xmm9,%xmm2
movdqa %xmm8,%xmm6
movdqa %xmm4,%xmm8
punpckhwd %xmm9,%xmm10
punpcklwd %xmm5,%xmm8
punpckhwd %xmm5,%xmm4
movdqa %xmm2,%xmm5
punpcklwd %xmm3,%xmm5
punpckhwd %xmm3,%xmm2
movdqa %xmm1,%xmm15
movdqa %xmm10,%xmm3
punpckhwd %xmm7,%xmm1
punpckhwd %xmm6,%xmm10
punpcklwd %xmm6,%xmm3
movdqa %xmm1,%xmm6
punpckhwd %xmm4,%xmm1
punpcklwd %xmm4,%xmm6
movdqa %xmm3,%xmm4
punpcklwd %xmm7,%xmm15
punpcklwd %xmm6,%xmm4
punpckhwd %xmm6,%xmm3
movdqa %xmm15,%xmm7
movdqa %xmm4,%xmm6
punpcklwd %xmm8,%xmm7
movdqa %xmm3,%xmm11
movdqa %xmm4,%xmm12
movdqa %xmm3,%xmm4
movdqa %xmm5,%xmm3
paddw %xmm7,%xmm3
movdqa %xmm1,%xmm9
punpckhwd %xmm8,%xmm15
punpcklwd %xmm10,%xmm9
psubw %xmm7,%xmm5
movdqa %xmm15,%xmm7
movdqa %xmm9,%xmm14
punpcklwd %xmm2,%xmm7
movdqa %xmm1,%xmm8
pmaddwd .LC0(%rip),%xmm6
punpckhwd %xmm10,%xmm8
paddw %xmm15,%xmm10
movaps %xmm6,-16(%rbp)
pmaddwd .LC1(%rip),%xmm4
movdqa %xmm0,%xmm6
pmaddwd .LC0(%rip),%xmm11
pmaddwd .LC2(%rip),%xmm14
pmaddwd .LC1(%rip),%xmm12
pmaddwd .LC3(%rip),%xmm9
movaps %xmm4,-64(%rbp)
movdqa %xmm3,%xmm4
movdqa %xmm0,%xmm3
punpckhwd %xmm4,%xmm6
punpcklwd %xmm4,%xmm3
movdqa %xmm0,%xmm4
movaps %xmm11,-32(%rbp)
movdqa %xmm6,%xmm13
movdqa %xmm15,%xmm6
punpcklwd %xmm5,%xmm4
movaps %xmm12,-48(%rbp)
punpckhwd %xmm2,%xmm6
paddw %xmm1,%xmm2
punpckhwd %xmm5,%xmm0
movdqa %xmm14,%xmm11
movdqa %xmm2,%xmm5
movdqa %xmm7,%xmm14
punpckhwd %xmm10,%xmm2
psrad $4,%xmm13
punpcklwd %xmm10,%xmm5
movaps %xmm13,-80(%rbp)
movdqa %xmm8,%xmm12
movdqa %xmm5,%xmm10
pmaddwd .LC4(%rip),%xmm14
pmaddwd .LC6(%rip),%xmm10
movdqa %xmm2,%xmm15
pmaddwd .LC7(%rip),%xmm5
pmaddwd .LC3(%rip),%xmm8
pmaddwd .LC5(%rip),%xmm7
movdqa %xmm14,%xmm13
movdqa %xmm6,%xmm14
paddd %xmm5,%xmm13
paddd %xmm5,%xmm9
pmaddwd .LC5(%rip),%xmm6
psrad $4,%xmm3
pmaddwd .LC6(%rip),%xmm15
paddd %xmm10,%xmm7
paddd %xmm10,%xmm11
psrad $4,%xmm4
pmaddwd .LC2(%rip),%xmm12
psrad $4,%xmm0
pmaddwd .LC4(%rip),%xmm14
pmaddwd .LC7(%rip),%xmm2
movdqa -80(%rbp),%xmm5
paddd %xmm15,%xmm12
paddd -64(%rbp),%xmm5
paddd %xmm2,%xmm14
paddd %xmm8,%xmm2
movdqa -48(%rbp),%xmm8
paddd %xmm6,%xmm15
movdqa .LC9(%rip),%xmm6
paddd %xmm3,%xmm8
paddd %xmm6,%xmm8
paddd %xmm6,%xmm5
movdqa %xmm5,%xmm10
movdqa %xmm8,%xmm1
psubd %xmm15,%xmm5
psubd %xmm7,%xmm8
psrad $17,%xmm5
paddd %xmm7,%xmm1
movdqa -32(%rbp),%xmm7
psrad $17,%xmm8
paddd %xmm15,%xmm10
paddd %xmm6,%xmm3
packssdw %xmm5,%xmm8
movdqa -16(%rbp),%xmm5
paddd %xmm0,%xmm7
paddd %xmm6,%xmm0
paddd %xmm6,%xmm7
psrad $17,%xmm10
psubd -32(%rbp),%xmm0
paddd %xmm4,%xmm5
psrad $17,%xmm1
movdqa %xmm7,%xmm15
paddd %xmm6,%xmm5
packssdw %xmm10,%xmm1
psubd %xmm2,%xmm7
movdqa %xmm5,%xmm10
paddd %xmm6,%xmm4
psubd %xmm9,%xmm5
psubd -16(%rbp),%xmm4
psrad $17,%xmm7
paddd %xmm2,%xmm15
psrad $17,%xmm5
psubd -48(%rbp),%xmm3
paddd -80(%rbp),%xmm6
packssdw %xmm7,%xmm5
movdqa %xmm4,%xmm2
movdqa %xmm0,%xmm7
psubd -64(%rbp),%xmm6
paddd %xmm14,%xmm7
psrad $17,%xmm15
paddd %xmm13,%xmm2
psubd %xmm14,%xmm0
psrad $17,%xmm7
psubd %xmm13,%xmm4
psrad $17,%xmm0
paddd %xmm9,%xmm10
psrad $17,%xmm2
psrad $17,%xmm4
packuswb %xmm8,%xmm5
packssdw %xmm0,%xmm4
packssdw %xmm7,%xmm2
movdqa %xmm3,%xmm0
movdqa %xmm6,%xmm7
psrad $17,%xmm10
paddd %xmm11,%xmm0
paddd %xmm12,%xmm7
psubd %xmm12,%xmm6
packssdw %xmm15,%xmm10
psubd %xmm11,%xmm3
psrad $17,%xmm7
packuswb %xmm10,%xmm1
psrad $17,%xmm0
psrad $17,%xmm6
psrad $17,%xmm3
packssdw %xmm7,%xmm0
packssdw %xmm6,%xmm3
packuswb %xmm0,%xmm2
movdqa %xmm1,%xmm0
packuswb %xmm4,%xmm3
movdqa %xmm2,%xmm4
punpckhbw %xmm5,%xmm2
punpcklbw %xmm3,%xmm0
punpcklbw %xmm5,%xmm4
punpckhbw %xmm3,%xmm1
movdqa %xmm2,%xmm3
movdqa %xmm0,%xmm2
movdqa %xmm1,%xmm5
punpcklbw %xmm4,%xmm2
punpckhbw %xmm4,%xmm0
punpcklbw %xmm3,%xmm5
movdqa %xmm2,%xmm4
punpckhbw %xmm5,%xmm2
punpckhbw %xmm3,%xmm1
punpcklbw %xmm5,%xmm4
movdqa %xmm0,%xmm3
punpckhbw %xmm1,%xmm0
movq %xmm4,(%rdi)
pshufd $78,%xmm4,%xmm4
punpcklbw %xmm1,%xmm3
movq %xmm4,(%rax)
add %rsi,%rax
movq %xmm2,(%rax)
add %rsi,%rax
pshufd $78,%xmm2,%xmm2
movq %xmm2,(%rax)
add %rsi,%rax
movq %xmm3,(%rax)
add %rsi,%rax
pshufd $78,%xmm3,%xmm3
movq %xmm3,(%rax)
movq %xmm0,(%rax,%rsi)
pshufd $78,%xmm0,%xmm0
movq %xmm0,(%rax,%rsi,2)
leave
ret
.endfn stbi__idct_simd$sse,globl
.rodata.cst16
.LC0: .value 2217,-5350,2217,-5350,2217,-5350,2217,-5350
.LC1: .value 5352,2217,5352,2217,5352,2217,5352,2217
.LC2: .value -6811,-8034,-6811,-8034,-6811,-8034,-6811,-8034
.LC3: .value -8034,4552,-8034,4552,-8034,4552,-8034,4552
.LC4: .value 6813,-1597,6813,-1597,6813,-1597,6813,-1597
.LC5: .value -1597,4552,-1597,4552,-1597,4552,-1597,4552
.LC6: .value 1131,4816,1131,4816,1131,4816,1131,4816
.LC7: .value 4816,-5681,4816,-5681,4816,-5681,4816,-5681
.LC8: .long 0x200,0x200,0x200,0x200
.LC9: .long 0x1010000,0x1010000,0x1010000,0x1010000

View file

@@ -1,19 +0,0 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_STB_INTERNAL_H_
#define COSMOPOLITAN_THIRD_PARTY_STB_INTERNAL_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
void stbi__YCbCr_to_RGB_row(unsigned char *, const unsigned char *,
const unsigned char *, const unsigned char *,
unsigned, unsigned) _Hide;
int stbi__YCbCr_to_RGB_row$sse2(unsigned char *, const unsigned char *,
const unsigned char *, const unsigned char *,
unsigned) _Hide;
void stbi__idct_simd$sse(unsigned char *out, int out_stride,
short data[64]) _Hide;
void stbi__idct_simd$avx(unsigned char *out, int out_stride,
short data[64]) _Hide;
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_STB_INTERNAL_H_ */

View file

@@ -16,6 +16,7 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "third_party/stb/stb_image.h"
#include "libc/assert.h"
#include "libc/calls/calls.h"
#include "libc/fmt/conv.h"
@@ -31,20 +32,33 @@
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "libc/x/x.h"
#include "third_party/stb/internal.h"
#include "third_party/stb/stb_image.h"
#define ROL(w, k) ((w) << (k) | CheckUnsigned(w) >> (sizeof(w) * 8 - (k)))
#include "third_party/aarch64/arm_neon.h"
#include "third_party/intel/ammintrin.internal.h"
asm(".ident\t\"\\n\\n\
stb_image (Public Domain)\\n\
Credit: Sean Barrett, et al.\\n\
http://nothings.org/stb\"");
#ifdef __x86_64__
#define STBI_SSE2
#define idct_block_kernel stbi__idct_simd
#elif defined(__aarch64__)
#define STBI_NEON
#define idct_block_kernel stbi__idct_simd
#else
#define idct_block_kernel stbi__idct_block
#endif
#define ROL(w, k) ((w) << (k) | CheckUnsigned(w) >> (sizeof(w) * 8 - (k)))
#ifndef STBI_REALLOC_SIZED
#define STBI_REALLOC_SIZED(p, oldsz, newsz) realloc(p, newsz)
#endif
typedef unsigned char stbi_uc;
typedef unsigned short stbi_us;
// stbi__context structure is our basic context used by all images, so it
// contains all the IO context, plus some basic image information
typedef struct {
@@ -1219,6 +1233,556 @@ forceinline unsigned char stbi__clamp(int x) {
return (unsigned char)x;
}
#define stbi__f2f(x) ((int)(((x)*4096 + 0.5)))
#define stbi__fsh(x) ((x)*4096)
// derived from jidctint -- DCT_ISLOW
#define STBI__IDCT_1D(s0, s1, s2, s3, s4, s5, s6, s7) \
int t0, t1, t2, t3, p1, p2, p3, p4, p5, x0, x1, x2, x3; \
p2 = s2; \
p3 = s6; \
p1 = (p2 + p3) * stbi__f2f(0.5411961f); \
t2 = p1 + p3 * stbi__f2f(-1.847759065f); \
t3 = p1 + p2 * stbi__f2f(0.765366865f); \
p2 = s0; \
p3 = s4; \
t0 = stbi__fsh(p2 + p3); \
t1 = stbi__fsh(p2 - p3); \
x0 = t0 + t3; \
x3 = t0 - t3; \
x1 = t1 + t2; \
x2 = t1 - t2; \
t0 = s7; \
t1 = s5; \
t2 = s3; \
t3 = s1; \
p3 = t0 + t2; \
p4 = t1 + t3; \
p1 = t0 + t3; \
p2 = t1 + t2; \
p5 = (p3 + p4) * stbi__f2f(1.175875602f); \
t0 = t0 * stbi__f2f(0.298631336f); \
t1 = t1 * stbi__f2f(2.053119869f); \
t2 = t2 * stbi__f2f(3.072711026f); \
t3 = t3 * stbi__f2f(1.501321110f); \
p1 = p5 + p1 * stbi__f2f(-0.899976223f); \
p2 = p5 + p2 * stbi__f2f(-2.562915447f); \
p3 = p3 * stbi__f2f(-1.961570560f); \
p4 = p4 * stbi__f2f(-0.390180644f); \
t3 += p1 + p4; \
t2 += p2 + p3; \
t1 += p2 + p4; \
t0 += p1 + p3;
static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) {
int i, val[64], *v = val;
stbi_uc *o;
short *d = data;
// columns
for (i = 0; i < 8; ++i, ++d, ++v) {
// if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0 && d[40] == 0 &&
d[48] == 0 && d[56] == 0) {
// no shortcut 0 seconds
// (1|2|3|4|5|6|7)==0 0 seconds
// all separate -0.047 seconds
// 1 && 2|3 && 4|5 && 6|7: -0.047 seconds
int dcterm = d[0] * 4;
v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
} else {
STBI__IDCT_1D(d[0], d[8], d[16], d[24], d[32], d[40], d[48], d[56])
// constants scaled things up by 1<<12; let's bring them back
// down, but keep 2 extra bits of precision
x0 += 512;
x1 += 512;
x2 += 512;
x3 += 512;
v[0] = (x0 + t3) >> 10;
v[56] = (x0 - t3) >> 10;
v[8] = (x1 + t2) >> 10;
v[48] = (x1 - t2) >> 10;
v[16] = (x2 + t1) >> 10;
v[40] = (x2 - t1) >> 10;
v[24] = (x3 + t0) >> 10;
v[32] = (x3 - t0) >> 10;
}
}
for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) {
// no fast case since the first 1D IDCT spread components out
STBI__IDCT_1D(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7])
// constants scaled things up by 1<<12, plus we had 1<<2 from first
// loop, plus horizontal and vertical each scale by sqrt(8) so together
// we've got an extra 1<<3, so 1<<17 total we need to remove.
// so we want to round that, which means adding 0.5 * 1<<17,
// aka 65536. Also, we'll end up with -128 to 127 that we want
// to encode as 0..255 by adding 128, so we'll add that before the shift
x0 += 65536 + (128 << 17);
x1 += 65536 + (128 << 17);
x2 += 65536 + (128 << 17);
x3 += 65536 + (128 << 17);
// tried computing the shifts into temps, or'ing the temps to see
// if any were out of range, but that was slower
o[0] = stbi__clamp((x0 + t3) >> 17);
o[7] = stbi__clamp((x0 - t3) >> 17);
o[1] = stbi__clamp((x1 + t2) >> 17);
o[6] = stbi__clamp((x1 - t2) >> 17);
o[2] = stbi__clamp((x2 + t1) >> 17);
o[5] = stbi__clamp((x2 - t1) >> 17);
o[3] = stbi__clamp((x3 + t0) >> 17);
o[4] = stbi__clamp((x3 - t0) >> 17);
}
}
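
The shift amounts in stbi__idct_block follow the comments above: the stbi__f2f constants carry 12 fraction bits, the column pass keeps 2 extra bits of precision (hence >> 10 with a bias of 512), and the row pass additionally removes the 1<<3 scale left by the two sqrt(8) factors (hence >> 17 with a 65536 rounding term). A standalone arithmetic check of that bookkeeping (a sketch, not part of the diff):

#include <assert.h>

#define stbi__f2f(x) ((int)((x)*4096 + 0.5))  /* 4.12 fixed point, as above */

int main(void) {
  assert(stbi__f2f(0.5411961f) == 2217);  /* same value as .LC0 in the deleted asm */
  assert(12 - 2 == 10);            /* column pass: drop 12 bits, keep 2 -> shift by 10 */
  assert(12 + 2 + 3 == 17);        /* row pass: 12 + 2 kept + log2(8) -> shift by 17 */
  assert((1 << 17) / 2 == 65536);  /* 0.5 rounding bias added before >> 17 */
  return 0;
}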
#ifdef STBI_SSE2
// sse2 integer IDCT. not the fastest possible implementation but it
// produces bit-identical results to the generic C version so it's
// fully "transparent".
static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
// This is constructed to match our regular (generic) integer IDCT exactly.
__m128i row0, row1, row2, row3, row4, row5, row6, row7;
__m128i tmp;
// dot product constant: even elems=x, odd elems=y
#define dct_const(x, y) _mm_setr_epi16((x), (y), (x), (y), (x), (y), (x), (y))
// out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit)
// out(1) = c1[even]*x + c1[odd]*y
#define dct_rot(out0, out1, x, y, c0, c1) \
__m128i c0##lo = _mm_unpacklo_epi16((x), (y)); \
__m128i c0##hi = _mm_unpackhi_epi16((x), (y)); \
__m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
__m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
__m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
__m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
// out = in << 12 (in 16-bit, out 32-bit)
#define dct_widen(out, in) \
__m128i out##_l = \
_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
__m128i out##_h = \
_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
// wide add
#define dct_wadd(out, a, b) \
__m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
__m128i out##_h = _mm_add_epi32(a##_h, b##_h)
// wide sub
#define dct_wsub(out, a, b) \
__m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
__m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
// butterfly a/b, add bias, then shift by "s" and pack
#define dct_bfly32o(out0, out1, a, b, bias, s) \
{ \
__m128i abiased_l = _mm_add_epi32(a##_l, bias); \
__m128i abiased_h = _mm_add_epi32(a##_h, bias); \
dct_wadd(sum, abiased, b); \
dct_wsub(dif, abiased, b); \
out0 = \
_mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
out1 = \
_mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
}
// 8-bit interleave step (for transposes)
#define dct_interleave8(a, b) \
tmp = a; \
a = _mm_unpacklo_epi8(a, b); \
b = _mm_unpackhi_epi8(tmp, b)
// 16-bit interleave step (for transposes)
#define dct_interleave16(a, b) \
tmp = a; \
a = _mm_unpacklo_epi16(a, b); \
b = _mm_unpackhi_epi16(tmp, b)
#define dct_pass(bias, shift) \
{ \
/* even part */ \
dct_rot(t2e, t3e, row2, row6, rot0_0, rot0_1); \
__m128i sum04 = _mm_add_epi16(row0, row4); \
__m128i dif04 = _mm_sub_epi16(row0, row4); \
dct_widen(t0e, sum04); \
dct_widen(t1e, dif04); \
dct_wadd(x0, t0e, t3e); \
dct_wsub(x3, t0e, t3e); \
dct_wadd(x1, t1e, t2e); \
dct_wsub(x2, t1e, t2e); \
/* odd part */ \
dct_rot(y0o, y2o, row7, row3, rot2_0, rot2_1); \
dct_rot(y1o, y3o, row5, row1, rot3_0, rot3_1); \
__m128i sum17 = _mm_add_epi16(row1, row7); \
__m128i sum35 = _mm_add_epi16(row3, row5); \
dct_rot(y4o, y5o, sum17, sum35, rot1_0, rot1_1); \
dct_wadd(x4, y0o, y4o); \
dct_wadd(x5, y1o, y5o); \
dct_wadd(x6, y2o, y5o); \
dct_wadd(x7, y3o, y4o); \
dct_bfly32o(row0, row7, x0, x7, bias, shift); \
dct_bfly32o(row1, row6, x1, x6, bias, shift); \
dct_bfly32o(row2, row5, x2, x5, bias, shift); \
dct_bfly32o(row3, row4, x3, x4, bias, shift); \
}
__m128i rot0_0 = dct_const(stbi__f2f(0.5411961f),
stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
__m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f(0.765366865f),
stbi__f2f(0.5411961f));
__m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f),
stbi__f2f(1.175875602f));
__m128i rot1_1 =
dct_const(stbi__f2f(1.175875602f),
stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
__m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f(0.298631336f),
stbi__f2f(-1.961570560f));
__m128i rot2_1 =
dct_const(stbi__f2f(-1.961570560f),
stbi__f2f(-1.961570560f) + stbi__f2f(3.072711026f));
__m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f(2.053119869f),
stbi__f2f(-0.390180644f));
__m128i rot3_1 =
dct_const(stbi__f2f(-0.390180644f),
stbi__f2f(-0.390180644f) + stbi__f2f(1.501321110f));
// rounding biases in column/row passes, see stbi__idct_block for explanation.
__m128i bias_0 = _mm_set1_epi32(512);
__m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17));
// load
row0 = _mm_load_si128((const __m128i *)(data + 0 * 8));
row1 = _mm_load_si128((const __m128i *)(data + 1 * 8));
row2 = _mm_load_si128((const __m128i *)(data + 2 * 8));
row3 = _mm_load_si128((const __m128i *)(data + 3 * 8));
row4 = _mm_load_si128((const __m128i *)(data + 4 * 8));
row5 = _mm_load_si128((const __m128i *)(data + 5 * 8));
row6 = _mm_load_si128((const __m128i *)(data + 6 * 8));
row7 = _mm_load_si128((const __m128i *)(data + 7 * 8));
// column pass
dct_pass(bias_0, 10);
{
// 16bit 8x8 transpose pass 1
dct_interleave16(row0, row4);
dct_interleave16(row1, row5);
dct_interleave16(row2, row6);
dct_interleave16(row3, row7);
// transpose pass 2
dct_interleave16(row0, row2);
dct_interleave16(row1, row3);
dct_interleave16(row4, row6);
dct_interleave16(row5, row7);
// transpose pass 3
dct_interleave16(row0, row1);
dct_interleave16(row2, row3);
dct_interleave16(row4, row5);
dct_interleave16(row6, row7);
}
// row pass
dct_pass(bias_1, 17);
{
// pack
__m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
__m128i p1 = _mm_packus_epi16(row2, row3);
__m128i p2 = _mm_packus_epi16(row4, row5);
__m128i p3 = _mm_packus_epi16(row6, row7);
// 8bit 8x8 transpose pass 1
dct_interleave8(p0, p2); // a0e0a1e1...
dct_interleave8(p1, p3); // c0g0c1g1...
// transpose pass 2
dct_interleave8(p0, p1); // a0c0e0g0...
dct_interleave8(p2, p3); // b0d0f0h0...
// transpose pass 3
dct_interleave8(p0, p2); // a0b0c0d0...
dct_interleave8(p1, p3); // a4b4c4d4...
// store
_mm_storel_epi64((__m128i *)out, p0);
out += out_stride;
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p0, 0x4e));
out += out_stride;
_mm_storel_epi64((__m128i *)out, p2);
out += out_stride;
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p2, 0x4e));
out += out_stride;
_mm_storel_epi64((__m128i *)out, p1);
out += out_stride;
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p1, 0x4e));
out += out_stride;
_mm_storel_epi64((__m128i *)out, p3);
out += out_stride;
_mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p3, 0x4e));
}
#undef dct_const
#undef dct_rot
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_interleave8
#undef dct_interleave16
#undef dct_pass
}
#endif // STBI_SSE2
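
As the dct_rot comments above note, _mm_madd_epi16 on interleaved (x, y) pairs against a constant whose even/odd lanes hold (c_even, c_odd) yields c_even*x + c_odd*y in each 32-bit lane. A scalar model of one lane, using the 2217/-5350 pair from the deleted .LC0 table (a sketch, not part of the diff; dct_rot_lane is a made-up name):

#include <assert.h>
#include <stdint.h>

// One 32-bit output lane of dct_rot: what the unpack + madd pair computes.
static int32_t dct_rot_lane(int16_t x, int16_t y, int16_t c_even, int16_t c_odd) {
  return (int32_t)c_even * x + (int32_t)c_odd * y;
}

int main(void) {
  assert(dct_rot_lane(3, 2, 2217, -5350) == 3 * 2217 - 2 * 5350);
  return 0;
}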
#ifdef STBI_NEON
// NEON integer IDCT. should produce bit-identical
// results to the generic C version.
static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
int16x4_t rot0_2 = vdup_n_s16(stbi__f2f(0.765366865f));
int16x4_t rot1_0 = vdup_n_s16(stbi__f2f(1.175875602f));
int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
int16x4_t rot3_0 = vdup_n_s16(stbi__f2f(0.298631336f));
int16x4_t rot3_1 = vdup_n_s16(stbi__f2f(2.053119869f));
int16x4_t rot3_2 = vdup_n_s16(stbi__f2f(3.072711026f));
int16x4_t rot3_3 = vdup_n_s16(stbi__f2f(1.501321110f));
#define dct_long_mul(out, inq, coeff) \
int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
#define dct_long_mac(out, acc, inq, coeff) \
int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
#define dct_widen(out, inq) \
int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
// wide add
#define dct_wadd(out, a, b) \
int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
// wide sub
#define dct_wsub(out, a, b) \
int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
// butterfly a/b, then shift using "shiftop" by "s" and pack
#define dct_bfly32o(out0, out1, a, b, shiftop, s) \
{ \
dct_wadd(sum, a, b); \
dct_wsub(dif, a, b); \
out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
}
#define dct_pass(shiftop, shift) \
{ \
/* even part */ \
int16x8_t sum26 = vaddq_s16(row2, row6); \
dct_long_mul(p1e, sum26, rot0_0); \
dct_long_mac(t2e, p1e, row6, rot0_1); \
dct_long_mac(t3e, p1e, row2, rot0_2); \
int16x8_t sum04 = vaddq_s16(row0, row4); \
int16x8_t dif04 = vsubq_s16(row0, row4); \
dct_widen(t0e, sum04); \
dct_widen(t1e, dif04); \
dct_wadd(x0, t0e, t3e); \
dct_wsub(x3, t0e, t3e); \
dct_wadd(x1, t1e, t2e); \
dct_wsub(x2, t1e, t2e); \
/* odd part */ \
int16x8_t sum15 = vaddq_s16(row1, row5); \
int16x8_t sum17 = vaddq_s16(row1, row7); \
int16x8_t sum35 = vaddq_s16(row3, row5); \
int16x8_t sum37 = vaddq_s16(row3, row7); \
int16x8_t sumodd = vaddq_s16(sum17, sum35); \
dct_long_mul(p5o, sumodd, rot1_0); \
dct_long_mac(p1o, p5o, sum17, rot1_1); \
dct_long_mac(p2o, p5o, sum35, rot1_2); \
dct_long_mul(p3o, sum37, rot2_0); \
dct_long_mul(p4o, sum15, rot2_1); \
dct_wadd(sump13o, p1o, p3o); \
dct_wadd(sump24o, p2o, p4o); \
dct_wadd(sump23o, p2o, p3o); \
dct_wadd(sump14o, p1o, p4o); \
dct_long_mac(x4, sump13o, row7, rot3_0); \
dct_long_mac(x5, sump24o, row5, rot3_1); \
dct_long_mac(x6, sump23o, row3, rot3_2); \
dct_long_mac(x7, sump14o, row1, rot3_3); \
dct_bfly32o(row0, row7, x0, x7, shiftop, shift); \
dct_bfly32o(row1, row6, x1, x6, shiftop, shift); \
dct_bfly32o(row2, row5, x2, x5, shiftop, shift); \
dct_bfly32o(row3, row4, x3, x4, shiftop, shift); \
}
// load
row0 = vld1q_s16(data + 0 * 8);
row1 = vld1q_s16(data + 1 * 8);
row2 = vld1q_s16(data + 2 * 8);
row3 = vld1q_s16(data + 3 * 8);
row4 = vld1q_s16(data + 4 * 8);
row5 = vld1q_s16(data + 5 * 8);
row6 = vld1q_s16(data + 6 * 8);
row7 = vld1q_s16(data + 7 * 8);
// add DC bias
row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
// column pass
dct_pass(vrshrn_n_s32, 10);
// 16bit 8x8 transpose
{
// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
// whether compilers actually get this is another story, sadly.
#define dct_trn16(x, y) \
{ \
int16x8x2_t t = vtrnq_s16(x, y); \
x = t.val[0]; \
y = t.val[1]; \
}
#define dct_trn32(x, y) \
{ \
int32x4x2_t t = \
vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); \
x = vreinterpretq_s16_s32(t.val[0]); \
y = vreinterpretq_s16_s32(t.val[1]); \
}
#define dct_trn64(x, y) \
{ \
int16x8_t x0 = x; \
int16x8_t y0 = y; \
x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); \
y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); \
}
// pass 1
dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
dct_trn16(row2, row3);
dct_trn16(row4, row5);
dct_trn16(row6, row7);
// pass 2
dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
dct_trn32(row1, row3);
dct_trn32(row4, row6);
dct_trn32(row5, row7);
// pass 3
dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
dct_trn64(row1, row5);
dct_trn64(row2, row6);
dct_trn64(row3, row7);
#undef dct_trn16
#undef dct_trn32
#undef dct_trn64
}
// row pass
// vrshrn_n_s32 only supports shifts up to 16, we need
// 17. so do a non-rounding shift of 16 first then follow
// up with a rounding shift by 1.
dct_pass(vshrn_n_s32, 16);
{
// pack and round
uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
// again, these can translate into one instruction, but often don't.
#define dct_trn8_8(x, y) \
{ \
uint8x8x2_t t = vtrn_u8(x, y); \
x = t.val[0]; \
y = t.val[1]; \
}
#define dct_trn8_16(x, y) \
{ \
uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); \
x = vreinterpret_u8_u16(t.val[0]); \
y = vreinterpret_u8_u16(t.val[1]); \
}
#define dct_trn8_32(x, y) \
{ \
uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); \
x = vreinterpret_u8_u32(t.val[0]); \
y = vreinterpret_u8_u32(t.val[1]); \
}
// sadly can't use interleaved stores here since we only write
// 8 bytes to each scan line!
// 8x8 8-bit transpose pass 1
dct_trn8_8(p0, p1);
dct_trn8_8(p2, p3);
dct_trn8_8(p4, p5);
dct_trn8_8(p6, p7);
// pass 2
dct_trn8_16(p0, p2);
dct_trn8_16(p1, p3);
dct_trn8_16(p4, p6);
dct_trn8_16(p5, p7);
// pass 3
dct_trn8_32(p0, p4);
dct_trn8_32(p1, p5);
dct_trn8_32(p2, p6);
dct_trn8_32(p3, p7);
// store
vst1_u8(out, p0);
out += out_stride;
vst1_u8(out, p1);
out += out_stride;
vst1_u8(out, p2);
out += out_stride;
vst1_u8(out, p3);
out += out_stride;
vst1_u8(out, p4);
out += out_stride;
vst1_u8(out, p5);
out += out_stride;
vst1_u8(out, p6);
out += out_stride;
vst1_u8(out, p7);
#undef dct_trn8_8
#undef dct_trn8_16
#undef dct_trn8_32
}
#undef dct_long_mul
#undef dct_long_mac
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_pass
}
#endif // STBI_NEON
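
The comment before the NEON row pass explains that vrshrn_n_s32 cannot shift by 17, so that pass shifts by 16 without rounding and lets the later vqrshrun_n_s16(..., 1) supply the rounding. In scalar terms, ((x >> 16) + 1) >> 1 equals the single rounding shift (x + 65536) >> 17; a quick check of that identity in plain C (a sketch, no NEON required, assuming the usual arithmetic right shift for signed values):

#include <assert.h>
#include <stdint.h>

int main(void) {
  for (int64_t x = -(1 << 20); x <= (1 << 20); x += 7) {
    int64_t two_step = ((x >> 16) + 1) >> 1;   /* shift by 16, then rounding shift by 1 */
    int64_t one_step = (x + (1 << 16)) >> 17;  /* one rounding shift by 17 */
    assert(two_step == one_step);
  }
  return 0;
}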
#define STBI__MARKER_none 0xff
// if there's a pending marker from the entropy stream, return that
// otherwise, fetch from the stream and get a marker. if there's no
@@ -1275,7 +1839,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg *z) {
z->huff_ac + ha, z->fast_ac[ha], n,
z->dequant[z->img_comp[n].tq]))
return 0;
stbi__idct_simd$sse(
idct_block_kernel(
z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8,
z->img_comp[n].w2, data);
// every data block is an MCU, so countdown the restart interval
@@ -1309,7 +1873,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg *z) {
z->huff_ac + ha, z->fast_ac[ha], n,
z->dequant[z->img_comp[n].tq]))
return 0;
stbi__idct_simd$sse(
idct_block_kernel(
z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2,
z->img_comp[n].w2, data);
}
@@ -1411,7 +1975,7 @@ static void stbi__jpeg_finish(stbi__jpeg *z) {
short *data =
z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
stbi__idct_simd$sse(
idct_block_kernel(
z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8,
z->img_comp[n].w2, data);
}
@@ -1905,6 +2469,203 @@ static unsigned char *stbi__resample_row_nearest(unsigned char *out,
return out;
}
// this is a reduced-precision calculation of YCbCr-to-RGB introduced
// to make sure the code produces the same results in both SIMD and scalar
#define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8)
static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y,
const stbi_uc *pcb, const stbi_uc *pcr,
int count, int step) {
int i;
for (i = 0; i < count; ++i) {
int y_fixed = (y[i] << 20) + (1 << 19); // rounding
int r, g, b;
int cr = pcr[i] - 128;
int cb = pcb[i] - 128;
r = y_fixed + cr * stbi__float2fixed(1.40200f);
g = y_fixed + (cr * -stbi__float2fixed(0.71414f)) +
((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
b = y_fixed + cb * stbi__float2fixed(1.77200f);
r >>= 20;
g >>= 20;
b >>= 20;
if ((unsigned)r > 255) {
if (r < 0)
r = 0;
else
r = 255;
}
if ((unsigned)g > 255) {
if (g < 0)
g = 0;
else
g = 255;
}
if ((unsigned)b > 255) {
if (b < 0)
b = 0;
else
b = 255;
}
out[0] = (stbi_uc)r;
out[1] = (stbi_uc)g;
out[2] = (stbi_uc)b;
out[3] = 255;
out += step;
}
}
#if defined(STBI_SSE2) || defined(STBI_NEON)
static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y,
stbi_uc const *pcb, stbi_uc const *pcr,
int count, int step) {
int i = 0;
#ifdef STBI_SSE2
// step == 3 is pretty ugly on the final interleave, and i'm not convinced
// it's useful in practice (you wouldn't use it for textures, for example).
// so just accelerate step == 4 case.
if (step == 4) {
// this is a fairly straightforward implementation and not super-optimized.
__m128i signflip = _mm_set1_epi8(-0x80);
__m128i cr_const0 = _mm_set1_epi16((short)(1.40200f * 4096.0f + 0.5f));
__m128i cr_const1 = _mm_set1_epi16(-(short)(0.71414f * 4096.0f + 0.5f));
__m128i cb_const0 = _mm_set1_epi16(-(short)(0.34414f * 4096.0f + 0.5f));
__m128i cb_const1 = _mm_set1_epi16((short)(1.77200f * 4096.0f + 0.5f));
__m128i y_bias = _mm_set1_epi8((char)(unsigned char)128);
__m128i xw = _mm_set1_epi16(255); // alpha channel
for (; i + 7 < count; i += 8) {
// load
__m128i y_bytes = _mm_loadl_epi64((__m128i *)(y + i));
__m128i cr_bytes = _mm_loadl_epi64((__m128i *)(pcr + i));
__m128i cb_bytes = _mm_loadl_epi64((__m128i *)(pcb + i));
__m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
__m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
// unpack to short (and left-shift cr, cb by 8)
__m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes);
__m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
__m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
// color transform
__m128i yws = _mm_srli_epi16(yw, 4);
__m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
__m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
__m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
__m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
__m128i rws = _mm_add_epi16(cr0, yws);
__m128i gwt = _mm_add_epi16(cb0, yws);
__m128i bws = _mm_add_epi16(yws, cb1);
__m128i gws = _mm_add_epi16(gwt, cr1);
// descale
__m128i rw = _mm_srai_epi16(rws, 4);
__m128i bw = _mm_srai_epi16(bws, 4);
__m128i gw = _mm_srai_epi16(gws, 4);
// back to byte, set up for transpose
__m128i brb = _mm_packus_epi16(rw, bw);
__m128i gxb = _mm_packus_epi16(gw, xw);
// transpose to interleave channels
__m128i t0 = _mm_unpacklo_epi8(brb, gxb);
__m128i t1 = _mm_unpackhi_epi8(brb, gxb);
__m128i o0 = _mm_unpacklo_epi16(t0, t1);
__m128i o1 = _mm_unpackhi_epi16(t0, t1);
// store
_mm_storeu_si128((__m128i *)(out + 0), o0);
_mm_storeu_si128((__m128i *)(out + 16), o1);
out += 32;
}
}
#endif
#ifdef STBI_NEON
// in this version, step=3 support would be easy to add. but is there demand?
if (step == 4) {
// this is a fairly straightforward implementation and not super-optimized.
uint8x8_t signflip = vdup_n_u8(0x80);
int16x8_t cr_const0 = vdupq_n_s16((short)(1.40200f * 4096.0f + 0.5f));
int16x8_t cr_const1 = vdupq_n_s16(-(short)(0.71414f * 4096.0f + 0.5f));
int16x8_t cb_const0 = vdupq_n_s16(-(short)(0.34414f * 4096.0f + 0.5f));
int16x8_t cb_const1 = vdupq_n_s16((short)(1.77200f * 4096.0f + 0.5f));
for (; i + 7 < count; i += 8) {
// load
uint8x8_t y_bytes = vld1_u8(y + i);
uint8x8_t cr_bytes = vld1_u8(pcr + i);
uint8x8_t cb_bytes = vld1_u8(pcb + i);
int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
// expand to s16
int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
int16x8_t crw = vshll_n_s8(cr_biased, 7);
int16x8_t cbw = vshll_n_s8(cb_biased, 7);
// color transform
int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
int16x8_t rws = vaddq_s16(yws, cr0);
int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
int16x8_t bws = vaddq_s16(yws, cb1);
// undo scaling, round, convert to byte
uint8x8x4_t o;
o.val[0] = vqrshrun_n_s16(rws, 4);
o.val[1] = vqrshrun_n_s16(gws, 4);
o.val[2] = vqrshrun_n_s16(bws, 4);
o.val[3] = vdup_n_u8(255);
// store, interleaving r/g/b/a
vst4_u8(out, o);
out += 8 * 4;
}
}
#endif
for (; i < count; ++i) {
int y_fixed = (y[i] << 20) + (1 << 19); // rounding
int r, g, b;
int cr = pcr[i] - 128;
int cb = pcb[i] - 128;
r = y_fixed + cr * stbi__float2fixed(1.40200f);
g = y_fixed + cr * -stbi__float2fixed(0.71414f) +
((cb * -stbi__float2fixed(0.34414f)) & 0xffff0000);
b = y_fixed + cb * stbi__float2fixed(1.77200f);
r >>= 20;
g >>= 20;
b >>= 20;
if ((unsigned)r > 255) {
if (r < 0)
r = 0;
else
r = 255;
}
if ((unsigned)g > 255) {
if (g < 0)
g = 0;
else
g = 255;
}
if ((unsigned)b > 255) {
if (b < 0)
b = 0;
else
b = 255;
}
out[0] = (stbi_uc)r;
out[1] = (stbi_uc)g;
out[2] = (stbi_uc)b;
out[3] = 255;
out += step;
}
}
#endif
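
Both the scalar and SIMD YCbCr paths above use the same 4.12 fixed-point coefficients, which evaluate to the 5743/62611/64126/7258 values that appear literally in the constant tables of the deleted stbi__YCbCr_to_RGB_row$sse2 assembly. A quick check (a sketch, not part of the diff):

#include <assert.h>

int main(void) {
  assert((short)(1.40200f * 4096.0f + 0.5f) == 5743);    /* Cr -> R */
  assert(-(short)(0.71414f * 4096.0f + 0.5f) == -2925);  /* Cr -> G, 62611 as uint16 */
  assert(-(short)(0.34414f * 4096.0f + 0.5f) == -1410);  /* Cb -> G, 64126 as uint16 */
  assert((short)(1.77200f * 4096.0f + 0.5f) == 7258);    /* Cb -> B */
  return 0;
}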
// set up the kernels
static void stbi__setup_jpeg(stbi__jpeg *j) {
j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;

View file

@@ -1,93 +0,0 @@
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"
.balign 16
stbi__YCbCr_to_RGB_row$sse2:
.leafprologue
.profilable
xor %eax,%eax
cmp $8,%r8d
jl 1f
xor %eax,%eax
movdqa 2f(%rip),%xmm2
movdqa 3f(%rip),%xmm8
movdqa 4f(%rip),%xmm9
movdqa 5f(%rip),%xmm10
movdqa 6f(%rip),%xmm4
movdqa 7f(%rip),%xmm5
.balign 16
0: movq (%rsi,%rax),%xmm6
movq (%rcx,%rax),%xmm7
movq (%rdx,%rax),%xmm1
movdqa %xmm2,%xmm0
punpcklbw %xmm6,%xmm0
pxor %xmm2,%xmm7
pxor %xmm6,%xmm6
punpcklbw %xmm7,%xmm6
pxor %xmm2,%xmm1
pxor %xmm3,%xmm3
punpcklbw %xmm1,%xmm3
psrlw $4,%xmm0
movdqa %xmm6,%xmm7
pmulhw %xmm8,%xmm7
movdqa %xmm3,%xmm1
pmulhw %xmm9,%xmm1
pmulhw %xmm10,%xmm3
pmulhw %xmm4,%xmm6
paddw %xmm1,%xmm6
paddw %xmm0,%xmm7
paddw %xmm0,%xmm3
paddw %xmm0,%xmm6
psraw $4,%xmm7
psraw $4,%xmm3
packuswb %xmm3,%xmm7
psraw $4,%xmm6
packuswb %xmm5,%xmm6
movdqa %xmm7,%xmm0
punpcklbw %xmm6,%xmm0
punpckhbw %xmm6,%xmm7
movdqa %xmm0,%xmm1
punpcklwd %xmm7,%xmm1
punpckhwd %xmm7,%xmm0
movdqu %xmm1,(%rdi,%rax,4)
movdqu %xmm0,16(%rdi,%rax,4)
add $8,%rax
lea 7(%rax),%r9d
cmp %r8d,%r9d
jl 0b
1: .leafepilogue
.endfn stbi__YCbCr_to_RGB_row$sse2,globl
.rodata.cst16
2: .byte 128,128,128,128,128,128,128,128
.zero 8
3: .short 5743,5743,5743,5743,5743,5743,5743,5743
4: .short 64126,64126,64126,64126,64126,64126,64126,64126
5: .short 7258,7258,7258,7258,7258,7258,7258,7258
6: .short 62611,62611,62611,62611,62611,62611,62611,62611
7: .short 255,255,255,255,255,255,255,255
.end
// These should be better but need to get them to work
3: .short 11485,11485,11485,11485,11485,11485,11485,11485 # JR m=13 99.964387%
4: .short -11277,-11277,-11277,-11277,-11277,-11277,-11277,-11277 # JG m=15 99.935941%
5: .short 14516,14516,14516,14516,14516,14516,14516,14516 # JB m=13 99.947219%
6: .short -23401,-23401,-23401,-23401,-23401,-23401,-23401,-23401 # JG m=15 99.935941%
7: .short 255,255,255,255,255,255,255,255

View file

@@ -1,56 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/limits.h"
#include "libc/log/check.h"
#include "libc/log/log.h"
#include "libc/macros.internal.h"
#include "libc/str/str.h"
#include "third_party/stb/internal.h"
/* this is a reduced-precision calculation of YCbCr-to-RGB introduced
to make sure the code produces the same results in both SIMD and scalar */
#define FLOAT2FIXED(x) (((int)((x)*4096.0f + .5f)) << 8)
void stbi__YCbCr_to_RGB_row(unsigned char *out, const unsigned char *y,
const unsigned char *pcb, const unsigned char *pcr,
unsigned count, unsigned step) {
unsigned i;
unsigned char b4[4];
int y_fixed, r, g, b, cr, cb;
CHECK(step == 3 || step == 4);
CHECK_LE(count, INT_MAX / 4 - 4);
for (i = step == 4 ? stbi__YCbCr_to_RGB_row$sse2(out, y, pcb, pcr, count) : 0;
i < count; ++i) {
y_fixed = (y[i] << 20) + (1 << 19); /* rounding */
cr = pcr[i] - 128;
cb = pcb[i] - 128;
r = y_fixed + cr * FLOAT2FIXED(1.40200f);
g = y_fixed + (cr * -FLOAT2FIXED(0.71414f)) +
((cb * -FLOAT2FIXED(0.34414f)) & 0xffff0000);
b = y_fixed + cb * FLOAT2FIXED(1.77200f);
r >>= 20;
g >>= 20;
b >>= 20;
b4[0] = MIN(255, MAX(0, r));
b4[1] = MIN(255, MAX(0, g));
b4[2] = MIN(255, MAX(0, b));
b4[3] = 255;
memcpy(out + i * step, b4, 4);
}
}