From 1bc48bc8e4c212c73c3f64b3ed6179c4171463a7 Mon Sep 17 00:00:00 2001
From: mataha <mataha@users.noreply.github.com>
Date: Sat, 23 Dec 2023 06:39:27 +0100
Subject: [PATCH] Update stb (#885)

This commit (and, by extension, the PR) attempts to update `stb` in the most
straightforward way possible, and includes fixes from the main repo's
unmerged PRs for cases rearing their ugly heads during everyday usage:

 - stb#1299: stb_rect_pack: Make rect_height_compare a stable sort
 - stb#1402: stb_image: Fix "unused invalid_chunk" with STBI_FAILURE_USERMSG
 - stb#1404: stb_image: Fix gif two_back memory address
 - stb#1420: stb_image: Improve error reporting if file operations fail
   within *_from_file functions
 - stb#1445: stb_vorbis: Few static analyzers fixes
 - stb#1487: stb_vorbis: Fix residue classdata bounding for
   f->temp_memory_required
 - stb#1490: stb_vorbis: Fix broken clamp in codebook_decode_deinterleave_repeat
 - stb#1496: stb_image: Fix pnm only build
 - stb#1497: stb_image: Fix memory leaks if stbi__convert failed
 - stb#1498: stb_vorbis: Fix memory leaks in stb_vorbis
 - stb#1499: stb_vorbis: Minor change to prevent the undefined behavior -
   left shift of a negative value
 - stb#1500: stb_vorbis: Fix signed integer overflow

Includes additional small fixes that I felt didn't warrant a separate PR.
---
 dsp/core/core.h                       |    5 +-
 dsp/core/dct.c                        |   79 +-
 third_party/stb/README.cosmo          |   29 +-
 third_party/stb/README.txt            |  143 ++--
 third_party/stb/stb_image.c           | 1014 +++++++++++++++----------
 third_party/stb/stb_image.h           |   15 +-
 third_party/stb/stb_image_resize.c    |   30 +-
 third_party/stb/stb_image_write.c     |  621 ++++++++++-----
 third_party/stb/stb_image_write.h     |    1 -
 third_party/stb/stb_image_write_png.c |  379 ---------
 third_party/stb/stb_rect_pack.c       |   45 +-
 third_party/stb/stb_truetype.c        |    4 +-
 third_party/stb/stb_vorbis.c          |  358 +++++----
 third_party/stb/stb_vorbis.h          |   15 +
 tool/viz/derasterize.c                |   11 +-
 tool/viz/memzoom.c                    |   23 +-
 tool/viz/od16.c                       |   19 +-
 tool/viz/printansi.c                  |   15 +-
 tool/viz/printimage.c                 |   17 +-
 tool/viz/printvideo.c                 |   19 +-
 20 files changed, 1560 insertions(+), 1282 deletions(-)
 delete mode 100644 third_party/stb/stb_image_write_png.c

diff --git a/dsp/core/core.h b/dsp/core/core.h
index 714f3a392..eadf040f9 100644
--- a/dsp/core/core.h
+++ b/dsp/core/core.h
@@ -9,8 +9,9 @@ int mulaw(int);
 int unmulaw(int);
 void *double2byte(long, const void *, double, double) vallocesque;
 void *byte2double(long, const void *, double, double) vallocesque;
-void *dct(float[8][8], float, float, float, float, float);
-void *dctjpeg(float[8][8]);
+void *dct(float[restrict hasatleast 8][8], unsigned,
+          float, float, float, float, float);
+void *dctjpeg(float[restrict hasatleast 8][8], unsigned);
 double det3(const double[3][3]) nosideeffect;
 void *inv3(double[restrict 3][3], const double[restrict 3][3], double);
 void *matmul3(double[restrict 3][3], const double[3][3], const double[3][3]);
diff --git a/dsp/core/dct.c b/dsp/core/dct.c
index 506c96f86..cae19d596 100644
--- a/dsp/core/dct.c
+++ b/dsp/core/dct.c
@@ -18,40 +18,40 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "dsp/core/core.h"
 
-#define DCT(A, B, C, D, E, F, G, H, T, C0, C1, C2, C3, C4)    \
-  do {                                                        \
-    T z1, z2, z3, z4, z5, z11, z13;                           \
-    T t0, t1, t2, t3, t4, t5, t6, t7, t10, t11, t12, t13;     \
-    t0 = A + H;                                               \
-    t7 = A - H;                                               \
-    t1 = B + G;                                               \
-    t6 = B - G;                                               \
-    t2 = C + F;                                               \
-    t5 = C - F;                                               \
-    t3 = D + E;                                               \
-    t4 = D - E;                                               \
-    t10 = t0 + t3;                                            \
-    t13 = t0 - t3;                                            \
-    t11 = t1 + t2;                                            \
-    t12 = t1 - t2;                                            \
-    A = t10 + t11;                                            \
-    E = t10 - t11;                                            \
-    z1 = (t12 + t13) * C0;                                    \
-    C = t13 + z1;                                             \
-    G = t13 - z1;                                             \
-    t10 = t4 + t5;                                            \
-    t11 = t5 + t6;                                            \
-    t12 = t6 + t7;                                            \
-    z5 = (t10 - t12) * C1;                                    \
-    z2 = t10 * C2 + z5;                                       \
-    z4 = t12 * C3 + z5;                                       \
-    z3 = t11 * C4;                                            \
-    z11 = t7 + z3;                                            \
-    z13 = t7 - z3;                                            \
-    F = z13 + z2;                                             \
-    D = z13 - z2;                                             \
-    B = z11 + z4;                                             \
-    H = z11 - z4;                                             \
+#define DCT(A, B, C, D, E, F, G, H, T, C0, C1, C2, C3, C4) \
+  do {                                                     \
+    T z1, z2, z3, z4, z5, z11, z13;                        \
+    T t0, t1, t2, t3, t4, t5, t6, t7, t10, t11, t12, t13;  \
+    t0 = A + H;                                            \
+    t7 = A - H;                                            \
+    t1 = B + G;                                            \
+    t6 = B - G;                                            \
+    t2 = C + F;                                            \
+    t5 = C - F;                                            \
+    t3 = D + E;                                            \
+    t4 = D - E;                                            \
+    t10 = t0 + t3;                                         \
+    t13 = t0 - t3;                                         \
+    t11 = t1 + t2;                                         \
+    t12 = t1 - t2;                                         \
+    A = t10 + t11;                                         \
+    E = t10 - t11;                                         \
+    z1 = (t12 + t13) * C0;                                 \
+    C = t13 + z1;                                          \
+    G = t13 - z1;                                          \
+    t10 = t4 + t5;                                         \
+    t11 = t5 + t6;                                         \
+    t12 = t6 + t7;                                         \
+    z5 = (t10 - t12) * C1;                                 \
+    z2 = t10 * C2 + z5;                                    \
+    z4 = t12 * C3 + z5;                                    \
+    z3 = t11 * C4;                                         \
+    z11 = t7 + z3;                                         \
+    z13 = t7 - z3;                                         \
+    F = z13 + z2;                                          \
+    D = z13 - z2;                                          \
+    B = z11 + z4;                                          \
+    H = z11 - z4;                                          \
   } while (0)
 
 /**
@@ -65,20 +65,21 @@
  *
  * @cost ~100ns
  */
-void *dct(float M[8][8], float c0, float c1, float c2, float c3, float c4) {
+void *dct(float M[restrict hasatleast 8][8], unsigned stride,
+          float c0, float c1, float c2, float c3, float c4) {
   unsigned y, x;
-  for (y = 0; y < 8; ++y) {
+  for (y = 0; y < stride * 8; y += stride) {  // rows 0, stride, ..., 7 * stride
     DCT(M[y][0], M[y][1], M[y][2], M[y][3], M[y][4], M[y][5], M[y][6], M[y][7],
         float, c0, c1, c2, c3, c4);
   }
-  for (x = 0; x < 8; ++x) {
+  for (x = 0; x < stride * 8; x += stride) {  // NOTE(review): x > 7 when stride > 1, yet rows 0..7 are used below; confirm intent
     DCT(M[0][x], M[1][x], M[2][x], M[3][x], M[4][x], M[5][x], M[6][x], M[7][x],
         float, c0, c1, c2, c3, c4);
   }
   return M;
 }
 
-void *dctjpeg(float M[8][8]) {
-  return dct(M, .707106781f, .382683433f, .541196100f, 1.306562965f,
+void *dctjpeg(float M[restrict hasatleast 8][8], unsigned stride) {
+  return dct(M, stride, .707106781f, .382683433f, .541196100f, 1.306562965f,
              .707106781f);
 }
diff --git a/third_party/stb/README.cosmo b/third_party/stb/README.cosmo
index ca3b785c9..b2c7717ad 100644
--- a/third_party/stb/README.cosmo
+++ b/third_party/stb/README.cosmo
@@ -5,8 +5,8 @@ LOCAL CHANGES
   - Removed undefined behavior
   - Removed BMP [endian code made it 100x slower than PNG/JPEG]
   - Removed PIC [never heard of it]
-  - Removed TGA [consider imaagemagick convert command]
-  - Removed PSD [consider imaagemagick convert command]
+  - Removed TGA [consider imagemagick convert command]
+  - Removed PSD [consider imagemagick convert command]
   - Removed HDR [mine eyes and wikipedia agree stb gamma math is off]
   - Patched PNG loading edge case
   - Fixed code C standard says is undefined
@@ -14,10 +14,25 @@ LOCAL CHANGES
   - Removed unnecessary ifdefs
   - Removed MSVC torture code
 
-SYNCHRONIZATION POINT
+SYNCHRONIZATION POINT (`--date=format:"%a %b %d %H:%M:%S %Y %z"`)
 
-  commit f67165c2bb2af3060ecae7d20d6f731173485ad0
-  Author: Sean Barrett <sean2@nothings.org>
-  Date:   Mon Oct 28 09:30:02 2019 -0700
+  commit 5736b15f7ea0ffb08dd38af21067c314d6a3aae9
+  Author: Sean Barrett <seanb@radgametools.com>
+  Date:   Sun Jan 29 10:46:04 2023 -0800
 
-      Update README.md
+      re-add perlin noise again
+
+ADDITIONAL CHANGES/FIXES:
+
+  - https://github.com/nothings/stb/pull/1299
+  - https://github.com/nothings/stb/pull/1402
+  - https://github.com/nothings/stb/pull/1404
+  - https://github.com/nothings/stb/pull/1420
+  - https://github.com/nothings/stb/pull/1445
+  - https://github.com/nothings/stb/pull/1487
+  - https://github.com/nothings/stb/pull/1490
+  - https://github.com/nothings/stb/pull/1496
+  - https://github.com/nothings/stb/pull/1497
+  - https://github.com/nothings/stb/pull/1498
+  - https://github.com/nothings/stb/pull/1499
+  - https://github.com/nothings/stb/pull/1500
diff --git a/third_party/stb/README.txt b/third_party/stb/README.txt
index 4c6915271..5c2434237 100644
--- a/third_party/stb/README.txt
+++ b/third_party/stb/README.txt
@@ -1,13 +1,12 @@
-/*
- * stb_image - v2.23 - public domain image loader - http://nothings.org/stb
+/* stb_image - v2.29 - public domain image loader - http://nothings.org/stb
  *                                no warranty implied; use at your own risk
  *
  * [heavily modified by justine tunney]
  *
  *    JPEG baseline & progressive (12 bpc/arithmetic not supported, same
- *                      as stock IJG lib) PNG 1/2/4/8/16-bit-per-channel
+ *                      as stock IJG lib)
+ *    PNG 1/2/4/8/16-bit-per-channel
  *    GIF (*comp always reports as 4-channel)
- *    HDR (radiance rgbE format)
  *    PNM (PPM and PGM binary only)
  *
  *    Animated GIF still needs a proper API, but here's one way to do it:
@@ -18,45 +17,53 @@
  *
  * ============================    Contributors    =========================
  *
- * Image formats                          Extensions, features
- *  Sean Barrett (jpeg, png, bmp)          Jetro Lauha (stbi_info)
- *  Nicolas Schulz (hdr, psd)              Martin "SpartanJ" Golini (stbi_info)
- *  Jonathan Dummer (tga)                  James "moose2000" Brown (iPhone PNG)
- *  Jean-Marc Lienher (gif)                Ben "Disch" Wenger (io callbacks)
- *  Tom Seddon (pic)                       Omar Cornut (1/2/4-bit PNG)
- *  Thatcher Ulrich (psd)                  Nicolas Guillemot (vertical flip)
- *  Ken Miller (pgm, ppm)                  Richard Mitton (16-bit PSD)
- *  github:urraka (animated gif)           Junggon Kim (PNM comments)
- *  Christopher Forseth (animated gif)     Daniel Gibson (16-bit TGA)
- *                                         socks-the-fox (16-bit PNG)
- *                                         Jeremy Sawicki (ImageNet JPGs)
- *                                         Mikhail Morozov (1-bit BMP)
- * Optimizations & bugfixes                Anael Seghezzi (is-16-bit query)
- *  Fabian "ryg" Giesen
- *  Arseny Kapoulkine
- *  John-Mark Allen
+ * Image formats                         Extensions, features
+ *  Sean Barrett (jpeg, png, bmp)         Jetro Lauha (stbi_info)
+ *  Nicolas Schulz (hdr, psd)             Martin "SpartanJ" Golini (stbi_info)
+ *  Jonathan Dummer (tga)                 James "moose2000" Brown (iPhone PNG)
+ *  Jean-Marc Lienher (gif)               Ben "Disch" Wenger (io callbacks)
+ *  Tom Seddon (pic)                      Omar Cornut (1/2/4-bit PNG)
+ *  Thatcher Ulrich (psd)                 Nicolas Guillemot (vertical flip)
+ *  Ken Miller (pgm, ppm)                 Richard Mitton (16-bit PSD)
+ *  github:urraka (animated gif)          Junggon Kim (PNM comments)
+ *  Christopher Forseth (animated gif)    Daniel Gibson (16-bit TGA)
+ *                                        socks-the-fox (16-bit PNG)
+ * Optimizations & bugfixes               Jeremy Sawicki (ImageNet JPGs)
+ *  Fabian "ryg" Giesen                   Mikhail Morozov (1-bit BMP)
+ *  Arseny Kapoulkine                     Anael Seghezzi (is-16-bit query)
+ *  John-Mark Allen                       Simon Breuss (16-bit PNM)
  *  Carmelo J Fdez-Aguera
  *
  * Bug & warning fixes
- * Marc LeBlanc            David Woo          Guillaume George   Martins Mozeiko
- * Christpher Lloyd        Jerry Jansson      Joseph Thomson     Phil Jordan
- * Dave Moore              Roy Eltham         Hayaki Saito       Nathan Reed
- * Won Chun                Luke Graham        Johan Duparc       Nick Verigakis
- * the Horde3D community   Thomas Ruf         Ronny Chevalier    github:rlyeh
- * Janez Zemva             John Bartholomew   Michal Cichon github:romigrou
- * Jonathan Blow           Ken Hamada         Tero Hanninen      github:svdijk
- * Laurent Gomila          Cort Stratton      Sergio Gonzalez    github:snagar
- * Aruelien Pocheville     Thibault Reuille   Cass Everitt       github:Zelex
- * Ryamond Barbiero        Paul Du Bois       Engin Manap        github:grim210
- * Aldo Culquicondor       Philipp Wiesemann  Dale Weiler        github:sammyhw
- * Oriol Ferrer Mesia      Josh Tobin         Matthew Gregan     github:phprus
- * Julian Raschke          Gregory Mullen     Baldur Karlsson
- * github:poppolopoppo Christian Floisand      Kevin Schmidt      JR Smith
- * github:darealshinji Blazej Dariusz Roszkowski github:Michaelangel007
- */
-
-/*
- * DOCUMENTATION
+ *  Marc LeBlanc            Laurent Gomila         JR Smith
+ *  Christpher Lloyd        Sergio Gonzalez        Matvey Cherevko
+ *  Phil Jordan             Ryamond Barbiero       Zack Middleton
+ *  Hayaki Saito            Engin Manap
+ *  Luke Graham             Dale Weiler            Martins Mozeiko
+ *  Thomas Ruf              Neil Bickford          Blazej Dariusz Roszkowski
+ *  Janez Zemva             Gregory Mullen         Roy Eltham
+ *  Jonathan Blow           Kevin Schmidt
+ *  Eugene Golushkov        Brad Weinberger        the Horde3D community
+ *  Aruelien Pocheville     Alexander Veselov      github:rlyeh
+ *  Cass Everitt            [reserved]             github:romigrou
+ *  Paul Du Bois                                   github:svdijk
+ *  Philipp Wiesemann       Guillaume George       github:snagar
+ *  Josh Tobin              Joseph Thomson         github:Zelex
+ *  Julian Raschke          Dave Moore             github:grim210
+ *  Baldur Karlsson         Won Chun               github:sammyhw
+ *                          Nick Verigakis         github:phprus
+ *  Luca Sas                                       github:poppolopoppo
+ *  Ryan C. Gordon          Michal Cichon          github:darealshinji
+ *  David Woo               Tero Hanninen          github:Michaelangel007
+ *  Jerry Jansson           Cort Stratton          github:mosra
+ *                          Thibault Reuille       [reserved]
+ *  Nathan Reed                                    [reserved]
+ *  Johan Duparc            Aldo Culquicondor
+ *  Ronny Chevalier         Oriol Ferrer           Jacko Dirks
+ *  John Bartholomew        Matthew Gregan
+ *  Ken Hamada              Christian Floisand
+ *
+ * ============================    Documentation   =========================
  *
  * Limitations:
  *    - no 12-bit-per-channel JPEG
@@ -70,14 +77,15 @@
  *    // ... x = width, y = height, n = # 8-bit components per pixel ...
  *    // ... replace '0' with '1'..'4' to force that many components per pixel
  *    // ... but 'n' will always be the number that it would have been if you
- *    said 0 stbi_image_free(data)
+ *    // ... said 0
+ *    stbi_image_free(data);
  *
  * Standard parameters:
  *    int *x                 -- outputs image width in pixels
  *    int *y                 -- outputs image height in pixels
  *    int *channels_in_file  -- outputs # of image components in image file
  *    int desired_channels   -- if non-zero, # of image components requested in
- *    result
+ *                              result
  *
  * The return value from an image loader is an 'unsigned char *' which points
  * to the pixel data, or NULL on an allocation failure or if the image is
@@ -110,6 +118,32 @@
  *
  * Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
  *
+ * To query the width, height and component count of an image without having to
+ * decode the full file, you can use the stbi_info family of functions:
+ *
+ *   int x,y,n,ok;
+ *   ok = stbi_info(filename, &x, &y, &n);
+ *   // returns ok=1 and sets x, y, n if image is a supported format,
+ *   // 0 otherwise.
+ *
+ * Note that stb_image pervasively uses ints in its public API for sizes,
+ * including sizes of memory buffers. This is now part of the API and thus
+ * hard to change without causing breakage. As a result, the various image
+ * loaders all have certain limits on image size; these differ somewhat
+ * by format but generally boil down to either just under 2GB or just under
+ * 1GB. When the decoded image would be larger than this, stb_image decoding
+ * will fail.
+ *
+ * Additionally, stb_image will reject image files that have any of their
+ * dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS,
+ * which defaults to 2**24 = 16777216 pixels. Due to the above memory limit,
+ * the only way to have an image with such dimensions load correctly
+ * is for it to have a rather extreme aspect ratio. Either way, the
+ * assumption here is that such larger images are likely to be malformed
+ * or malicious. If you do need to load an image with individual dimensions
+ * larger than that, and it still fits in the overall size limit, you can
+ * #define STBI_MAX_DIMENSIONS on your own to be something larger.
+ *
  * ===========================================================================
  *
  * I/O callbacks
@@ -163,11 +197,10 @@
  *
  * iPhone PNG support:
  *
- * By default we convert iphone-formatted PNGs back to RGB, even though
- * they are internally encoded differently. You can disable this conversion
- * by calling stbi_convert_iphone_png_to_rgb(0), in which case
- * you will always just get the native iphone "format" through (which
- * is BGR stored in RGB).
+ * We optionally support converting iPhone-formatted PNGs (which store
+ * premultiplied BGRA) back to RGB, even though they're internally encoded
+ * differently. To enable this conversion, call
+ * stbi_convert_iphone_png_to_rgb(1).
  *
  * Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
  * pixel to remove any premultiplied alpha *only* if the image file explicitly
@@ -191,9 +224,18 @@
  *   - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
  *     want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
  *
+ *   - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater
+ *     than that size (in either width or height) without further processing.
+ *     This is to let programs in the wild set an upper bound to prevent
+ *     denial-of-service attacks on untrusted data, as one could generate a
+ *     valid image of gigantic dimensions and force stb_image to allocate a
+ *     huge block of memory and spend disproportionate time decoding it. By
+ *     default this is set to (1 << 24), which is 16777216, but that's still
+ *     very big.
+ *
  */
 
-/* stb_image_resize - v0.96 - public domain image resizing
+/* stb_image_resize - v0.97 - public domain image resizing
  * by Jorge L Rodriguez (@VinoBS) - 2014
  * http://github.com/nothings/stb
  *
@@ -214,9 +256,7 @@
  *                             output_pixels, out_w, out_h, 0,
  *                             num_channels , alpha_chan  , 0, STBIR_EDGE_CLAMP)
  *                                                          // WRAP/REFLECT/ZERO
- */
-
-/*
+ *
  * DOCUMENTATION
  *
  *    SRGB & FLOATING POINT REPRESENTATION
@@ -348,6 +388,7 @@
  *    Nathan Reed: warning fixes
  *
  * REVISIONS
+ *    0.97 (2020-02-02) fixed warning
  *    0.96 (2019-03-04) fixed warnings
  *    0.95 (2017-07-23) fixed warnings
  *    0.94 (2017-03-18) fixed warnings
diff --git a/third_party/stb/stb_image.c b/third_party/stb/stb_image.c
index 97b560a33..6852c3adc 100644
--- a/third_party/stb/stb_image.c
+++ b/third_party/stb/stb_image.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -51,15 +51,12 @@ http://nothings.org/stb\"");
 #define idct_block_kernel stbi__idct_block
 #endif
 
-#define ROL(w, k) ((w) << (k) | (w) >> (sizeof(w) * CHAR_BIT - (k)))
+#define ROL(w, k) (((w) << (k)) | ((w) >> (-(k) & (sizeof(w) * CHAR_BIT - 1))))
 
-#ifndef STBI_REALLOC_SIZED
-#define STBI_REALLOC_SIZED(p, oldsz, newsz) realloc(p, newsz)
+#ifndef STBI_MAX_DIMENSIONS
+#define STBI_MAX_DIMENSIONS (1 << 24)
 #endif
 
-typedef unsigned char stbi_uc;
-typedef unsigned short stbi_us;
-
 // stbi__context structure is our basic context used by all images, so it
 // contains all the IO context, plus some basic image information
 typedef struct {
@@ -70,6 +67,7 @@ typedef struct {
   int read_from_callbacks;
   int buflen;
   unsigned char buffer_start[128];
+  int callback_already_read;
   unsigned char *img_buffer, *img_buffer_end;
   unsigned char *img_buffer_original, *img_buffer_original_end;
 } stbi__context;
@@ -83,6 +81,7 @@ static void stbi__start_mem(stbi__context *s, unsigned char const *buffer,
                             int len) {
   s->io.read = NULL;
   s->read_from_callbacks = 0;
+  s->callback_already_read = 0;
   s->img_buffer = s->img_buffer_original = (unsigned char *)buffer;
   s->img_buffer_end = s->img_buffer_original_end =
       (unsigned char *)buffer + len;
@@ -95,7 +94,8 @@ static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c,
   s->io_user_data = user;
   s->buflen = sizeof(s->buffer_start);
   s->read_from_callbacks = 1;
-  s->img_buffer_original = s->buffer_start;
+  s->callback_already_read = 0;
+  s->img_buffer = s->img_buffer_original = s->buffer_start;
   stbi__refill_buffer(s);
   s->img_buffer_original_end = s->img_buffer_end;
 }
@@ -105,11 +105,16 @@ static int stbi__stdio_read(void *user, char *data, int size) {
 }
 
 static void stbi__stdio_skip(void *user, int n) {
+  int ch;  // one byte of lookahead, pushed back below
   fseek(user, n, SEEK_CUR);
+  ch = fgetc(user);  // fseek clears the EOF flag; reading sets it again at EOF
+  if (ch != EOF) {
+    ungetc(ch, user);  // not at the end: return the byte to the stream
+  }
 }
 
 static int stbi__stdio_eof(void *user) {
-  return feof(user);
+  return feof(user) || ferror(user);  // a stream in error state counts as ended
 }
 
 static stbi_io_callbacks stbi__stdio_callbacks = {
@@ -168,8 +173,8 @@ const char *stbi_failure_reason(void) {
 
 static int stbi__err(const char *specific_details,
                      const char *general_details) {
-  /* DebugBreak(); */
-  /* WARNF("%s: %s", general_details, specific_details); */
+  // DebugBreak();
+  // WARNF("%s: %s", general_details, specific_details);
   stbi__g_failure_reason = general_details;
   return 0;
 }
@@ -204,17 +209,27 @@ static int stbi__mul2sizes_valid(int a, int b) {
   return a <= INT_MAX / b;
 }
 
-// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
+// returns 1 if "a * b + add" has no negative terms/factors
+// and doesn't overflow
 static int stbi__mad2sizes_valid(int a, int b, int add) {
   return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a * b, add);
 }
 
-// returns 1 if "a*b*c + add" has no negaive terms/factors and doesn't overflow
+// returns 1 if "a * b * c + add" has no negative terms/factors
+// and doesn't overflow
 static int stbi__mad3sizes_valid(int a, int b, int c, int add) {
   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) &&
          stbi__addsizes_valid(a * b * c, add);
 }
 
+// returns 1 if "a * b * c * d + add" has no negative terms/factors and
+// doesn't overflow, else 0; && checks each product before it is reused
+static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) {
+  return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a * b, c) &&
+         stbi__mul2sizes_valid(a * b * c, d) &&
+         stbi__addsizes_valid(a * b * c * d, add);
+}
+
 // mallocs with size overflow checking
 static void *stbi__malloc_mad2(int a, int b, int add) {
   if (!stbi__mad2sizes_valid(a, b, add)) return NULL;
@@ -226,6 +241,44 @@ static void *stbi__malloc_mad3(int a, int b, int c, int add) {
   return xmalloc(a * b * c + add);
 }
 
+static void *stbi__malloc_mad4(int a, int b, int c, int d, int add) {
+  if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;  // size rejected
+  return xmalloc(a * b * c * d + add);  // size expression validated above
+}
+
+// returns 1 if a + b is representable as an int (INT_MIN..INT_MAX
+// inclusive), 0 if the addition would overflow.
+static int stbi__addints_valid(int a, int b) {
+  if ((a >= 0) != (b >= 0)) {
+    // a and b have different signs, so no overflow
+    return 1;
+  }
+  if (a < 0 && b < 0) {
+    // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0.
+    return a >= INT_MIN - b;
+  }
+  return a <= INT_MAX - b;  // both >= 0: same as a + b <= INT_MAX
+}
+
+// returns 1 if the product of two ints fits in a signed short
+// (SHRT_MIN..SHRT_MAX inclusive), 0 on overflow.
+//
+// the product is formed in a wider type rather than bounded with
+// divisions: truncating division mis-bounds the case where both
+// factors are negative (-32768 * -2 would pass), and b == -1 would
+// need a special case of its own (40000 * -1 must be rejected).
+static int stbi__mul2shorts_valid(int a, int b) {
+  long long product;
+  // long long must hold any int * int product without overflowing
+  _Static_assert(sizeof(long long) >= 2 * sizeof(int),
+                 "stbi__mul2shorts_valid needs a wider long long");
+  if (a == 0 || b == 0) {
+    return 1;  // multiplication by 0 is always 0
+  }
+  product = (long long)a * b;
+  return product >= SHRT_MIN && product <= SHRT_MAX;
+}
+
 #define stbi__errpf(x, y) \
   ({                      \
     stbi__err(x, y);      \
@@ -247,22 +300,22 @@ void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) {
 }
 
 static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp,
-                             int req_comp, stbi__result_info *ri, int bpc) {
+                             int req_comp, stbi__result_info *ri) {
   bzero(ri, sizeof(*ri));
   ri->bits_per_channel = 8;
   ri->num_channels = 0;
-#ifndef STBI_NO_JPEG
-  if (stbi__jpeg_test(s)) return stbi__jpeg_load(s, x, y, comp, req_comp, ri);
-#endif
-#ifndef STBI_NO_PNG
+
+  // test the formats with a very explicit header first (at least a FOURCC
+  // or distinctive magic number)
   if (stbi__png_test(s)) return stbi__png_load(s, x, y, comp, req_comp, ri);
-#endif
-#ifndef STBI_NO_GIF
   if (stbi__gif_test(s)) return stbi__gif_load(s, x, y, comp, req_comp, ri);
-#endif
-#ifndef STBI_NO_PNM
+
+  // then the formats that can end up attempting to load with just 1 or 2
+  // bytes matching expectations; these are prone to false positives, so
+  // try them later
+  if (stbi__jpeg_test(s)) return stbi__jpeg_load(s, x, y, comp, req_comp, ri);
   if (stbi__pnm_test(s)) return stbi__pnm_load(s, x, y, comp, req_comp, ri);
-#endif
+
   return stbi__errpuc("unknown image type",
                       "Image not of any known type, or corrupt");
 }
@@ -335,12 +388,18 @@ static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x,
                                                       int req_comp) {
   void *result;
   stbi__result_info ri;
-  result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
+  result = stbi__load_main(s, x, y, comp, req_comp, &ri);
   if (result == NULL) return NULL;
+  assert(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
   if (ri.bits_per_channel != 8) {
-    assert(ri.bits_per_channel == 16);
-    result =
-        stbi__convert_16_to_8(result, *x, *y, req_comp == 0 ? *comp : req_comp);
+    // https://github.com/nothings/stb/pull/1497
+    unsigned char *converted = stbi__convert_16_to_8(
+        (uint16_t *)result, *x, *y, req_comp == 0 ? *comp : req_comp);
+    if (converted == NULL) {
+      free(result);
+      return NULL;
+    }
+    result = converted;
     ri.bits_per_channel = 8;
   }
   // @TODO: move stbi__convert_format to here
@@ -354,13 +413,20 @@ static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x,
 static uint16_t *stbi__load_and_postprocess_16bit(stbi__context *s, int *x,
                                                   int *y, int *comp,
                                                   int req_comp) {
+  void *result;
   stbi__result_info ri;
-  void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
+  result = stbi__load_main(s, x, y, comp, req_comp, &ri);
   if (result == NULL) return NULL;
+  assert(ri.bits_per_channel == 8 || ri.bits_per_channel == 16);
   if (ri.bits_per_channel != 16) {
-    assert(ri.bits_per_channel == 8);
-    result = stbi__convert_8_to_16((unsigned char *)result, *x, *y,
-                                   req_comp == 0 ? *comp : req_comp);
+    // https://github.com/nothings/stb/pull/1497
+    uint16_t *converted = stbi__convert_8_to_16(
+        (unsigned char *)result, *x, *y, req_comp == 0 ? *comp : req_comp);
+    if (converted == NULL) {
+      free(result);
+      return NULL;
+    }
+    result = converted;
     ri.bits_per_channel = 16;
   }
   // @TODO: move stbi__convert_format16 to here
@@ -396,7 +462,14 @@ unsigned char *stbi_load_from_file(FILE *f, int *x, int *y, int *comp,
   result = stbi__load_and_postprocess_8bit(&s, x, y, comp, req_comp);
   if (result) {
     // need to 'unget' all the characters in the IO buffer
-    fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR);
+    // https://github.com/nothings/stb/pull/1420
+    if (fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR)) {
+      // fseek() failed; we can no longer maintain the file cursor position
+      // guarantee of this function, so return null.
+      free(result);
+      return stbi__errpuc("bad file",
+                          "fseek() failed; seek position unreliable");
+    }
   }
   return result;
 }
@@ -409,7 +482,14 @@ uint16_t *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp,
   result = stbi__load_and_postprocess_16bit(&s, x, y, comp, req_comp);
   if (result) {
     // need to 'unget' all the characters in the IO buffer
-    fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR);
+    // https://github.com/nothings/stb/pull/1420
+    if (fseek(f, -(int)(s.img_buffer_end - s.img_buffer), SEEK_CUR)) {
+      // fseek() failed; we can no longer maintain the file cursor position
+      // guarantee of this function, so return null.
+      free(result);
+      return (uint16_t *)stbi__errpuc(
+          "bad file", "fseek() failed; seek position unreliable");
+    }
   }
   return result;
 }
@@ -476,6 +556,7 @@ enum { STBI__SCAN_load = 0, STBI__SCAN_type, STBI__SCAN_header };
 
 static void stbi__refill_buffer(stbi__context *s) {
   int n = (s->io.read)(s->io_user_data, (char *)s->buffer_start, s->buflen);
+  s->callback_already_read += (int)(s->img_buffer - s->img_buffer_original);
   if (n == 0) {
     // at end of file, treat same as if from memory, but need to handle case
     // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
@@ -510,6 +591,7 @@ forceinline int stbi__at_eof(stbi__context *s) {
 }
 
 static void stbi__skip(stbi__context *s, int n) {
+  if (n == 0) return;  // already there!
   if (n < 0) {
     s->img_buffer = s->img_buffer_end;
     return;
@@ -569,7 +651,7 @@ static uint32_t stbi__get32be(stbi__context *s) {
 //  generic converter from built-in img_n to req_comp
 //    individual types do this automatically as much as possible (e.g. jpeg
 //    does all cases internally since it needs to colorspace convert anyway,
-//    and it never has alpha, so very few cases ). png can automatically
+//    and it never has alpha, so very few cases). png can automatically
 //    interleave an alpha=255 channel, but falls back to this for other cases
 //
 //  assume data buffer is malloced, so malloc a new one and free that one
@@ -658,6 +740,9 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n,
       break;
       default:
         assert(0);
+        free(data);
+        free(good);
+        return stbi__errpuc("unsupported", "Unsupported format conversion");
     }
 #undef STBI__CASE
   }
@@ -751,6 +836,10 @@ static uint16_t *stbi__convert_format16(uint16_t *data, int img_n, int req_comp,
       break;
       default:
         assert(0);
+        free(data);
+        free(good);
+        return (uint16_t *)stbi__errpuc("unsupported",
+                                        "Unsupported format conversion");
     }
 #undef STBI__CASE
   }
@@ -849,8 +938,14 @@ static int stbi__build_huffman(stbi__huffman *h, int *count) {
   int i, j, k = 0;
   unsigned int code;
   // build size list for each symbol (from JPEG spec)
-  for (i = 0; i < 16; ++i)
-    for (j = 0; j < count[i]; ++j) h->size[k++] = (unsigned char)(i + 1);
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < count[i]; ++j) {
+      h->size[k++] = (unsigned char)(i + 1);
+      if (k >= 257) {
+        return stbi__err("bad size list", "Corrupt JPEG");
+      }
+    }
+  }
   h->size[k] = 0;
 
   // compute actual symbols (from jpeg spec)
@@ -962,8 +1057,10 @@ forceinline int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) {
   for (k = FAST_BITS + 1;; ++k)
     if (temp < h->maxcode[k]) break;
   if (k == 17) {
+    WARNF("j->code_bits: %d", j->code_bits);
     // error! code not found
     j->code_bits -= 16;
+    WARNF("Symbol: %d", k);
     return -1;
   }
 
@@ -971,6 +1068,10 @@ forceinline int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) {
 
   // convert the huffman code to the symbol id
   c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
+  if (c < 0 || c >= 256) {
+    // symbol id out of bounds!
+    return -1;
+  }
   assert((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) ==
          h->code[c]);
 
@@ -980,7 +1081,7 @@ forceinline int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) {
   return h->values[c];
 }
 
-// bias[n] = (-1<<n) + 1
+// bias[n] = (-1 << n) + 1
 static const int stbi__jbias[16] = {0,     -1,    -3,     -7,    -15,   -31,
                                     -63,   -127,  -255,   -511,  -1023, -2047,
                                     -4095, -8191, -16383, -32767};
@@ -990,21 +1091,29 @@ static const int stbi__jbias[16] = {0,     -1,    -3,     -7,    -15,   -31,
 forceinline int stbi__extend_receive(stbi__jpeg *j, int n) {
   int sgn;
   unsigned int k;
-  // TODO(jart): what is this
   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
-  sgn = (int32_t)j->code_buffer >> 31;  // sign bit is always in MSB
+  if (j->code_bits < n) {
+    // ran out of bits from stream, return 0s instead of continuing
+    return 0;
+  }
+  // sign bit is always in MSB;
+  // 0 if MSB clear (positive), 1 if MSB set (negative)
+  sgn = j->code_buffer >> 31;
   k = ROL(j->code_buffer, n);
-  assert(n >= 0 && n < (int)(sizeof(stbi__bmask) / sizeof(*stbi__bmask)));
   j->code_buffer = k & ~stbi__bmask[n];
   k &= stbi__bmask[n];
   j->code_bits -= n;
-  return k + (stbi__jbias[n] & ~sgn);
+  return k + (stbi__jbias[n] & (sgn - 1));
 }
 
 // get some unsigned bits
 forceinline int stbi__jpeg_get_bits(stbi__jpeg *j, int n) {
   unsigned int k;
   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+  if (j->code_bits < n) {
+    // ran out of bits from stream, return 0s instead of continuing
+    return 0;
+  }
   k = ROL(j->code_buffer, n);
   j->code_buffer = k & ~stbi__bmask[n];
   k &= stbi__bmask[n];
@@ -1015,6 +1124,10 @@ forceinline int stbi__jpeg_get_bits(stbi__jpeg *j, int n) {
 forceinline int stbi__jpeg_get_bit(stbi__jpeg *j) {
   unsigned int k;
   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
+  if (j->code_bits < 1) {
+    // ran out of bits from stream, return 0s instead of continuing
+    return 0;
+  }
   k = j->code_buffer;
   j->code_buffer <<= 1;
   --j->code_bits;
@@ -1039,12 +1152,18 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64],
   int diff, dc, k, t, c, r, s, rs;
   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
   t = stbi__jpeg_huff_decode(j, hdc);
-  if (t < 0) return stbi__err("bad huffman code", "Corrupt JPEG");
+  if (t < 0 || t > 15) return stbi__err("bad huffman code", "Corrupt JPEG");
   // 0 all the ac values now so we can do it 32-bits at a time
   bzero(data, 64 * sizeof(data[0]));
   diff = t ? stbi__extend_receive(j, t) : 0;
+  if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) {
+    return stbi__err("bad delta", "Corrupt JPEG");
+  }
   dc = j->img_comp[b].dc_pred + diff;
   j->img_comp[b].dc_pred = dc;
+  if (!stbi__mul2shorts_valid(dc, dequant[0])) {
+    return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+  }
   data[0] = (short)(dc * dequant[0]);
   // decode AC components, see JPEG spec
   k = 1;
@@ -1055,6 +1174,10 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64],
     if (r) {               // fast-AC path
       k += (r >> 4) & 15;  // run
       s = r & 15;          // combined length
+      if (s > j->code_bits) {
+        return stbi__err("bad huffman code",
+                         "Combined length longer than code bits available");
+      }
       j->code_buffer <<= s;
       j->code_bits -= s;
       // decode into unzigzag'd location
@@ -1082,7 +1205,6 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64],
 static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64],
                                            stbi__huffman *hdc, int b) {
   int t;
-  short s;
   int diff, dc;
   if (j->spec_end != 0) {
     return stbi__err("can't merge dc and ac", "Corrupt JPEG");
@@ -1092,15 +1214,22 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64],
     // first scan for DC coefficient, must be first
     bzero(data, 64 * sizeof(data[0]));  // 0 all the ac values now
     t = stbi__jpeg_huff_decode(j, hdc);
+    if (t < 0 || t > 15) {
+      return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+    }
     diff = t ? stbi__extend_receive(j, t) : 0;
+    if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) {
+      return stbi__err("bad delta", "Corrupt JPEG");
+    }
     dc = j->img_comp[b].dc_pred + diff;
     j->img_comp[b].dc_pred = dc;
-    s = dc;
-    s *= 1u << j->succ_low;
-    data[0] = s; /* (short)(dc << j->succ_low); */
+    if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) {
+      return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+    }
+    data[0] = (short)(dc * (1u << j->succ_low));
   } else {
     // refinement scan for DC coefficient
-    if (stbi__jpeg_get_bit(j)) data[0] += (short)(1 << j->succ_low);
+    if (stbi__jpeg_get_bit(j)) data[0] += (short)(1u << j->succ_low);
   }
   return 1;
 }
@@ -1129,10 +1258,14 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64],
       if (r) {               // fast-AC path
         k += (r >> 4) & 15;  // run
         s = r & 15;          // combined length
+        if (s > j->code_bits) {
+          return stbi__err("bad huffman code",
+                           "Combined length longer than code bits available");
+        }
         j->code_buffer <<= s;
         j->code_bits -= s;
         zig = stbi__jpeg_dezigzag[k++];
-        data[zig] = (r / 256) * (1u << shift);
+        data[zig] = (short)((r >> 8) * (1u << shift));
       } else {
         rs = stbi__jpeg_huff_decode(j, hac);
         if (rs < 0) return stbi__err("bad huffman code", "Corrupt JPEG");
@@ -1149,15 +1282,13 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64],
         } else {
           k += r;
           zig = stbi__jpeg_dezigzag[k++];
-          data[zig] = stbi__extend_receive(j, s) * (1u << shift);
+          data[zig] = (short)(stbi__extend_receive(j, s) * (1u << shift));
         }
       }
     } while (k <= j->spec_end);
   } else {
     // refinement scan for these AC coefficients
-
-    bit = (short)(1 << j->succ_low);
-
+    bit = (short)(1u << j->succ_low);
     if (j->eob_run) {
       --j->eob_run;
       for (k = j->spec_start; k <= j->spec_end; ++k) {
@@ -1273,9 +1404,10 @@ forceinline unsigned char stbi__clamp(int x) {
   t1 += p2 + p4;                                          \
   t0 += p1 + p3;
 
-static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) {
+static void stbi__idct_block(unsigned char *out, int out_stride,
+                             short data[64]) {
   int i, val[64], *v = val;
-  stbi_uc *o;
+  unsigned char *o;
   short *d = data;
 
   // columns
@@ -1338,7 +1470,8 @@ static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) {
 // sse2 integer IDCT. not the fastest possible implementation but it
 // produces bit-identical results to the generic C version so it's
 // fully "transparent".
-static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
+static void stbi__idct_simd(unsigned char *out, int out_stride,
+                            short data[64]) {
   // This is constructed to match our regular (generic) integer IDCT exactly.
   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
   __m128i tmp;
@@ -1540,7 +1673,8 @@ static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
 
 // NEON integer IDCT. should produce bit-identical
 // results to the generic C version.
-static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) {
+static void stbi__idct_simd(unsigned char *out, int out_stride,
+                            short data[64]) {
   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
 
   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f));
@@ -2025,6 +2159,10 @@ static int stbi__process_marker(stbi__jpeg *z, int m) {
           sizes[i] = stbi__get8(z->s);
           n += sizes[i];
         }
+        if (n > 256) {
+          // Loop over i < n would write past end of values!
+          return stbi__err("bad DHT header", "Corrupt JPEG");
+        }
         L -= 17;
         if (tc == 0) {
           if (!stbi__build_huffman(z->huff_dc + th, sizes)) return 0;
@@ -2058,7 +2196,7 @@ static int stbi__process_marker(stbi__jpeg *z, int m) {
       for (i = 0; i < 5; ++i)
         if (stbi__get8(z->s) != tag[i]) ok = 0;
       L -= 5;
-      if (ok) z->jfif = 1;
+      if (ok) z->jfif = m;
     } else if (m == 0xEE && L >= 12) {  // Adobe APP14 segment
       static const unsigned char tag[6] = {'A', 'd', 'o', 'b', 'e', '\0'};
       int ok = 1;
@@ -2150,24 +2288,36 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan) {
   stbi__context *s = z->s;
   int Lf, p, i, q, h_max = 1, v_max = 1, c;
   Lf = stbi__get16be(s);
-  if (Lf < 11) return stbi__err("bad SOF len", "Corrupt JPEG");  // JPEG
+  if (Lf < 11) {
+    // JPEG
+    return stbi__err("bad SOF len", "Corrupt JPEG");
+  }
   p = stbi__get8(s);
-  if (p != 8)
-    return stbi__err("only 8-bit",
-                     "JPEG format not supported: 8-bit only");  // JPEG baseline
+  if (p != 8) {
+    // JPEG baseline
+    return stbi__err("only 8-bit", "JPEG format not supported: 8-bit only");
+  }
   s->img_y = stbi__get16be(s);
-  if (s->img_y == 0)
-    return stbi__err(
-        "no header height",
-        "JPEG format not supported: delayed height");  // Legal, but we don't
-  // handle it--but neither
-  // does IJG
+  if (s->img_y == 0) {
+    // Legal, but we don't handle it--but neither does IJG
+    return stbi__err("no header height",
+                     "JPEG format not supported: delayed height");
+  }
   s->img_x = stbi__get16be(s);
-  if (s->img_x == 0)
-    return stbi__err("0 width", "Corrupt JPEG");  // JPEG requires
+  if (s->img_x == 0) {
+    // JPEG requires a nonzero width
+    return stbi__err("0 width", "Corrupt JPEG");
+  }
+  if (s->img_y > STBI_MAX_DIMENSIONS) {
+    return stbi__err("too large", "Very large image (corrupt?)");
+  }
+  if (s->img_x > STBI_MAX_DIMENSIONS) {
+    return stbi__err("too large", "Very large image (corrupt?)");
+  }
   c = stbi__get8(s);
-  if (c != 3 && c != 1 && c != 4)
+  if (c != 3 && c != 1 && c != 4) {
     return stbi__err("bad component count", "Corrupt JPEG");
+  }
   s->img_n = c;
   for (i = 0; i < c; ++i) {
     z->img_comp[i].data = NULL;
@@ -2202,6 +2352,18 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan) {
     if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
   }
 
+  // check that plane subsampling factors are integer ratios;
+  // our resamplers can't deal with fractional ratios
+  // and I've never seen a non-corrupted JPEG file actually use them
+  for (i = 0; i < s->img_n; ++i) {
+    if (h_max % z->img_comp[i].h != 0) {
+      return stbi__err("bad H", "Corrupt JPEG");
+    }
+    if (v_max % z->img_comp[i].v != 0) {
+      return stbi__err("bad V", "Corrupt JPEG");
+    }
+  }
+
   // compute interleaved mcu info
   z->img_h_max = h_max;
   z->img_v_max = v_max;
@@ -2272,6 +2434,27 @@ static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) {
   return 1;
 }
 
+static unsigned char stbi__skip_jpeg_junk_at_end(stbi__jpeg *j) {
+  // some JPEGs have junk at end, skip over it but if we find what looks
+  // like a valid marker, resume there
+  while (!stbi__at_eof(j->s)) {
+    unsigned char x = stbi__get8(j->s);
+    while (x == 0xff) {  // might be a marker
+      if (stbi__at_eof(j->s)) return STBI__MARKER_none;
+      x = stbi__get8(j->s);
+      if (x != 0x00 && x != 0xff) {
+        // not a stuffed zero or lead-in to another marker, looks
+        // like an actual marker, return it
+        return x;
+      }
+      // stuffed zero has x=0 now which ends the loop, meaning we go
+      // back to regular scan loop.
+      // repeated 0xff keeps trying to read the next byte of the marker.
+    }
+  }
+  return STBI__MARKER_none;
+}
+
 // decode image to YCbCr format
 static int stbi__decode_jpeg_image(stbi__jpeg *j) {
   int m;
@@ -2287,26 +2470,22 @@ static int stbi__decode_jpeg_image(stbi__jpeg *j) {
       if (!stbi__process_scan_header(j)) return 0;
       if (!stbi__parse_entropy_coded_data(j)) return 0;
       if (j->marker == STBI__MARKER_none) {
-        // handle 0s at the end of image data from IP Kamera 9060
-        while (!stbi__at_eof(j->s)) {
-          int x = stbi__get8(j->s);
-          if (x == 255) {
-            j->marker = stbi__get8(j->s);
-            break;
-          }
-        }
+        j->marker = stbi__skip_jpeg_junk_at_end(j);
         // if we reach eof without hitting a marker, stbi__get_marker() below
         // will fail and we'll eventually return 0
       }
+      m = stbi__get_marker(j);
+      if (STBI__RESTART(m)) m = stbi__get_marker(j);
     } else if (stbi__DNL(m)) {
       int Ld = stbi__get16be(j->s);
       uint32_t NL = stbi__get16be(j->s);
       if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
       if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
+      m = stbi__get_marker(j);
     } else {
-      if (!stbi__process_marker(j, m)) return 0;
+      if (!stbi__process_marker(j, m)) return 1;
+      m = stbi__get_marker(j);
     }
-    m = stbi__get_marker(j);
   }
   if (j->progressive) stbi__jpeg_finish(j);
   return 1;
@@ -2471,9 +2650,10 @@ static unsigned char *stbi__resample_row_nearest(unsigned char *out,
 // this is a reduced-precision calculation of YCbCr-to-RGB introduced
 // to make sure the code produces the same results in both SIMD and scalar
 #define stbi__float2fixed(x) (((int)((x)*4096.0f + 0.5f)) << 8)
-static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y,
-                                   const stbi_uc *pcb, const stbi_uc *pcr,
-                                   int count, int step) {
+static void stbi__YCbCr_to_RGB_row(unsigned char *out, const unsigned char *y,
+                                   const unsigned char *pcb,
+                                   const unsigned char *pcr, int count,
+                                   int step) {
   int i;
   for (i = 0; i < count; ++i) {
     int y_fixed = (y[i] << 20) + (1 << 19);  // rounding
@@ -2505,18 +2685,19 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y,
       else
         b = 255;
     }
-    out[0] = (stbi_uc)r;
-    out[1] = (stbi_uc)g;
-    out[2] = (stbi_uc)b;
+    out[0] = (unsigned char)r;
+    out[1] = (unsigned char)g;
+    out[2] = (unsigned char)b;
     out[3] = 255;
     out += step;
   }
 }
 
 #if defined(STBI_SSE2) || defined(STBI_NEON)
-static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y,
-                                    stbi_uc const *pcb, stbi_uc const *pcr,
-                                    int count, int step) {
+static void stbi__YCbCr_to_RGB_simd(unsigned char *out, unsigned char const *y,
+                                    unsigned char const *pcb,
+                                    unsigned char const *pcr, int count,
+                                    int step) {
   int i = 0;
 
 #ifdef STBI_SSE2
@@ -2656,9 +2837,9 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y,
       else
         b = 255;
     }
-    out[0] = (stbi_uc)r;
-    out[1] = (stbi_uc)g;
-    out[2] = (stbi_uc)b;
+    out[0] = (unsigned char)r;
+    out[1] = (unsigned char)g;
+    out[2] = (unsigned char)b;
     out[3] = 255;
     out += step;
   }
@@ -2722,6 +2903,13 @@ static unsigned char *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y,
     decode_n = z->s->img_n;
   }
 
+  // nothing to do if no components requested; check this now to avoid
+  // accessing uninitialized coutput[0] later
+  if (decode_n <= 0) {
+    stbi__cleanup_jpeg(z);
+    return NULL;
+  }
+
   // resample and color-convert
   {
     int k;
@@ -2876,7 +3064,10 @@ static dontinline void *stbi__jpeg_load(stbi__context *s, int *x, int *y,
                                         int *comp, int req_comp,
                                         stbi__result_info *ri) {
   unsigned char *result;
-  stbi__jpeg *j = (stbi__jpeg *)malloc(sizeof(stbi__jpeg));
+  stbi__jpeg *j;
+  j = malloc(sizeof(stbi__jpeg));
+  if (!j) return stbi__errpuc("outofmem", "Out of memory");
+  bzero(j, sizeof(stbi__jpeg));
   j->s = s;
   stbi__setup_jpeg(j);
   result = load_jpeg_image(j, x, y, comp, req_comp);
@@ -2888,6 +3079,8 @@ static int stbi__jpeg_test(stbi__context *s) {
   int r;
   stbi__jpeg *j;
   j = malloc(sizeof(stbi__jpeg));
+  if (!j) return stbi__err("outofmem", "Out of memory");
+  bzero(j, sizeof(stbi__jpeg));
   j->s = s;
   stbi__setup_jpeg(j);
   r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
@@ -2910,6 +3103,8 @@ static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp) {
 static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) {
   int result;
   stbi__jpeg *j = (stbi__jpeg *)(malloc(sizeof(stbi__jpeg)));
+  if (!j) return stbi__err("outofmem", "Out of memory");
+  bzero(j, sizeof(stbi__jpeg));
   j->s = s;
   result = stbi__jpeg_info_raw(j, x, y, comp);
   free(j);
@@ -2926,6 +3121,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) {
 // fast-way is faster to check than jpeg huffman, but slow way is slower
 #define STBI__ZFAST_BITS 9  // accelerate all cases in default tables
 #define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1)
+#define STBI__ZNSYMS     288  // number of symbols in literal/length alphabet
 
 // zlib-style huffman encoding
 // (jpegs packs from left, zlib from right, so can't share code)
@@ -2934,8 +3130,8 @@ typedef struct {
   uint16_t firstcode[16];
   int maxcode[17];
   uint16_t firstsymbol[16];
-  unsigned char size[288];
-  uint16_t value[288];
+  unsigned char size[STBI__ZNSYMS];
+  uint16_t value[STBI__ZNSYMS];
 } stbi__zhuffman;
 
 static uint32_t ReverseBits32(uint32_t x) {
@@ -3010,6 +3206,7 @@ static int stbi__zbuild_huffman(stbi__zhuffman *z,
 typedef struct {
   unsigned char *zbuffer, *zbuffer_end;
   int num_bits;
+  int hit_zeof_once;
   uint32_t code_buffer;
   char *zout;
   char *zout_start;
@@ -3018,14 +3215,20 @@ typedef struct {
   stbi__zhuffman z_length, z_distance;
 } stbi__zbuf;
 
+forceinline int stbi__zeof(stbi__zbuf *z) {
+  return (z->zbuffer >= z->zbuffer_end);
+}
+
 forceinline unsigned char stbi__zget8(stbi__zbuf *z) {
-  if (z->zbuffer >= z->zbuffer_end) return 0;
-  return *z->zbuffer++;
+  return stbi__zeof(z) ? 0 : *z->zbuffer++;
 }
 
 static void stbi__fill_bits(stbi__zbuf *z) {
   do {
-    assert(z->code_buffer < (1u << z->num_bits));
+    if (z->code_buffer >= (1u << z->num_bits)) {
+      z->zbuffer = z->zbuffer_end;  // treat this as EOF so we fail.
+      return;
+    }
     z->code_buffer |= (unsigned int)stbi__zget8(z) << z->num_bits;
     z->num_bits += 8;
   } while (z->num_bits <= 24);
@@ -3047,10 +3250,17 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) {
   k = stbi__bit_reverse(a->code_buffer, 16);
   for (s = STBI__ZFAST_BITS + 1;; ++s)
     if (k < z->maxcode[s]) break;
-  if (s == 16) return -1;  // invalid code!
+  if (s >= 16) return -1;  // invalid code!
   // code size is s, so:
   b = (k >> (16 - s)) - z->firstcode[s] + z->firstsymbol[s];
-  assert(z->size[b] == s);
+  if (b >= STBI__ZNSYMS) {
+    // some data was corrupt somewhere!
+    return -1;
+  }
+  if (z->size[b] != s) {
+    // was originally an assert, but report failure instead.
+    return -1;
+  }
   a->code_buffer >>= s;
   a->num_bits -= s;
   return z->value[b];
@@ -3058,7 +3268,23 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) {
 
 forceinline int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) {
   int b, s;
-  if (a->num_bits < 16) stbi__fill_bits(a);
+  if (a->num_bits < 16) {
+    if (stbi__zeof(a)) {
+      if (!a->hit_zeof_once) {
+        // This is the first time we hit eof, insert 16 extra padding bits
+        // to allow us to keep going; if we actually consume any of them
+        // though, that is invalid data. This is caught later.
+        a->hit_zeof_once = 1;
+        a->num_bits += 16;  // add 16 implicit zero bits
+      } else {
+        // We already inserted our extra 16 padding bits and are again
+        // out, this stream is actually prematurely terminated.
+        return -1;
+      }
+    } else {
+      stbi__fill_bits(a);
+    }
+  }
   b = z->fast[a->code_buffer & STBI__ZFAST_MASK];
   if (b) {
     s = b >> 9;
@@ -3071,13 +3297,19 @@ forceinline int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) {
 
 static int stbi__zexpand(stbi__zbuf *z, char *zout, int n) {
   char *q;
-  int cur, limit, old_limit;
+  unsigned int cur, limit;
   z->zout = zout;
   if (!z->z_expandable) return stbi__err("output buffer limit", "Corrupt PNG");
-  cur = (int)(z->zout - z->zout_start);
-  limit = old_limit = (int)(z->zout_end - z->zout_start);
-  while (cur + n > limit) limit *= 2;
-  q = (char *)STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
+  cur = (unsigned int)(z->zout - z->zout_start);
+  limit = (unsigned)(z->zout_end - z->zout_start);
+  if (UINT_MAX - cur < (unsigned)n) {
+    return stbi__err("outofmem", "Out of memory");
+  }
+  while (cur + n > limit) {
+    if (limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory");
+    limit *= 2;
+  }
+  q = (char *)realloc(z->zout_start, limit);
   if (q == NULL) return stbi__err("outofmem", "Out of memory");
   z->zout_start = q;
   z->zout = q + cur;
@@ -3122,19 +3354,36 @@ static int stbi__parse_huffman_block(stbi__zbuf *a) {
       int len, dist;
       if (z == 256) {
         a->zout = zout;
+        if (a->hit_zeof_once && a->num_bits < 16) {
+          // The first time we hit zeof, we inserted 16 extra zero bits into our
+          // bit buffer so the decoder can just do its speculative decoding. But
+          // if we actually consumed any of those bits (which is the case when
+          // num_bits < 16), the stream actually read past the end so it is
+          // malformed.
+          return stbi__err("unexpected end", "Corrupt PNG");
+        }
         return 1;
       }
+      if (z >= 286) {
+        // per DEFLATE, length codes 286 and 287
+        // must not appear in compressed data
+        return stbi__err("bad huffman code", "Corrupt PNG");
+      }
       z -= 257;
       len = stbi__zlength_base[z];
       if (stbi__zlength_extra[z])
         len += stbi__zreceive(a, stbi__zlength_extra[z]);
       z = stbi__zhuffman_decode(a, &a->z_distance);
-      if (z < 0) return stbi__err("bad huffman code", "Corrupt PNG");
+      if (z < 0 || z >= 30) {
+        // per DEFLATE, distance codes 30 and 31
+        // must not appear in compressed data
+        return stbi__err("bad huffman code", "Corrupt PNG");
+      }
       dist = stbi__zdist_base[z];
       if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
       if (zout - a->zout_start < dist)
         return stbi__err("bad dist", "Corrupt PNG");
-      if (zout + len > a->zout_end) {
+      if (len > a->zout_end - zout) {
         if (!stbi__zexpand(a, zout, len)) return 0;
         zout = a->zout;
       }
@@ -3184,11 +3433,12 @@ static int stbi__compute_huffman_codes(stbi__zbuf *a) {
         c = stbi__zreceive(a, 2) + 3;
         if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
         fill = lencodes[n - 1];
-      } else if (c == 17)
+      } else if (c == 17) {
         c = stbi__zreceive(a, 3) + 3;
-      else {
-        assert(c == 18);
+      } else if (c == 18) {
         c = stbi__zreceive(a, 7) + 11;
+      } else {
+        return stbi__err("bad codelengths", "Corrupt PNG");
       }
       if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG");
       memset(lencodes + n, fill, c);
@@ -3213,7 +3463,7 @@ static int stbi__parse_uncompressed_block(stbi__zbuf *a) {
     a->code_buffer >>= 8;
     a->num_bits -= 8;
   }
-  assert(a->num_bits == 0);
+  if (a->num_bits < 0) return stbi__err("zlib corrupt", "Corrupt PNG");
   // now fill header the normal way
   while (k < 4) header[k++] = stbi__zget8(a);
   len = header[1] * 256 + header[0];
@@ -3234,6 +3484,8 @@ static int stbi__parse_zlib_header(stbi__zbuf *a) {
   int cm = cmf & 15;
   /* int cinfo = cmf >> 4; */
   int flg = stbi__zget8(a);
+  if (stbi__zeof(a))
+    return stbi__err("bad zlib header", "Corrupt PNG");  // zlib spec
   if ((cmf * 256 + flg) % 31 != 0)
     return stbi__err("bad zlib header", "Corrupt PNG");  // zlib spec
   if (flg & 32)
@@ -3246,7 +3498,7 @@ static int stbi__parse_zlib_header(stbi__zbuf *a) {
   return 1;
 }
 
-static const unsigned char stbi__zdefault_length[288] = {
+static const unsigned char stbi__zdefault_length[STBI__ZNSYMS] = {
     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
     8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
@@ -3282,6 +3534,7 @@ static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) {
     if (!stbi__parse_zlib_header(a)) return 0;
   a->num_bits = 0;
   a->code_buffer = 0;
+  a->hit_zeof_once = 0;
   do {
     final = stbi__zreceive(a, 1);
     type = stbi__zreceive(a, 2);
@@ -3292,7 +3545,8 @@ static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) {
     } else {
       if (type == 1) {
         // use fixed code lengths
-        if (!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length, 288))
+        if (!stbi__zbuild_huffman(&a->z_length, stbi__zdefault_length,
+                                  STBI__ZNSYMS))
           return 0;
         if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32))
           return 0;
@@ -3443,31 +3697,57 @@ enum {
   STBI__F_up = 2,
   STBI__F_avg = 3,
   STBI__F_paeth = 4,
-  // synthetic filters used for first scanline to avoid needing a dummy row of
-  // 0s
-  STBI__F_avg_first,
-  STBI__F_paeth_first
+  // synthetic filter used for first scanline to avoid needing a dummy row of 0s
+  STBI__F_avg_first
 };
 
 static int stbi__de_iphone_flag = 0;
 static int stbi__unpremultiply_on_load = 0;
-static unsigned char first_row_filter[5] = {STBI__F_none, STBI__F_sub,
-                                            STBI__F_none, STBI__F_avg_first,
-                                            STBI__F_paeth_first};
+static unsigned char first_row_filter[5] = {
+    STBI__F_none, STBI__F_sub, STBI__F_none, STBI__F_avg_first,
+    STBI__F_sub  // Paeth with b = c = 0 turns out to be equivalent to sub
+};
 
 static int stbi__paeth(int a, int b, int c) {
-  int p = a + b - c;
-  int pa = abs(p - a);
-  int pb = abs(p - b);
-  int pc = abs(p - c);
-  if (pa <= pb && pa <= pc) return a;
-  if (pb <= pc) return b;
-  return c;
+  // This formulation looks very different from the reference in the PNG spec,
+  // but is actually equivalent and has favorable data dependencies and admits
+  // straightforward generation of branch-free code, which helps performance
+  // significantly.
+  int thresh = c * 3 - (a + b);
+  int lo = a < b ? a : b;
+  int hi = a < b ? b : a;
+  int t0 = (hi <= thresh) ? lo : c;
+  int t1 = (thresh <= lo) ? hi : t0;
+  return t1;
 }
 
 static const unsigned char stbi__depth_scale_table[9] = {
     0, 0xff, 0x55, 0, 0x11, 0, 0, 0, 0x01};
 
+// adds an extra all-255 alpha channel
+// dest == src is legal
+// img_n must be 1 or 3
+static void stbi__create_png_alpha_expand8(unsigned char *dest,
+                                           unsigned char *src, uint32_t x,
+                                           int img_n) {
+  int i;
+  // must process data backwards since we allow dest==src
+  if (img_n == 1) {
+    for (i = x - 1; i >= 0; --i) {
+      dest[i * 2 + 1] = 255;
+      dest[i * 2 + 0] = src[i];
+    }
+  } else {
+    assert(img_n == 3);
+    for (i = x - 1; i >= 0; --i) {
+      dest[i * 4 + 3] = 255;
+      dest[i * 4 + 2] = src[i * 3 + 2];
+      dest[i * 4 + 1] = src[i * 3 + 1];
+      dest[i * 4 + 0] = src[i * 3 + 0];
+    }
+  }
+}
+
 // create the png data from post-deflated data
 static int stbi__create_png_image_raw(stbi__png *a, unsigned char *raw,
                                       uint32_t raw_len, int out_n, uint32_t x,
@@ -3476,6 +3756,8 @@ static int stbi__create_png_image_raw(stbi__png *a, unsigned char *raw,
   stbi__context *s = a->s;
   uint32_t i, j, stride = x * out_n * bytes;
   uint32_t img_len, img_width_bytes;
+  unsigned char *filler_buf;
+  int all_ok = 1;
   int k;
   int img_n = s->img_n;  // copy it into a local for later
 
@@ -3487,9 +3769,13 @@ static int stbi__create_png_image_raw(stbi__png *a, unsigned char *raw,
   a->out = stbi__malloc_mad3(x, y, output_bytes,
                              0);  // extra bytes to write off the end into
 
+  // note: error exits here don't need to clean up a->out individually,
+  // stbi__do_png always does on error.
   if (!stbi__mad3sizes_valid(img_n, x, depth, 7))
     return stbi__err("too large", "Corrupt PNG");
   img_width_bytes = (((img_n * x * depth) + 7) >> 3);
+  if (!stbi__mad2sizes_valid(img_width_bytes, y, img_width_bytes))
+    return stbi__err("too large", "Corrupt PNG");
   img_len = (img_width_bytes + 1) * y;
 
   // we used to check for exact match between raw_len and img_len on
@@ -3498,260 +3784,143 @@ static int stbi__create_png_image_raw(stbi__png *a, unsigned char *raw,
   // always.
   if (raw_len < img_len) return stbi__err("not enough pixels", "Corrupt PNG");
 
+  // Allocate two scan lines worth of filter workspace buffer.
+  filler_buf = stbi__malloc_mad2(img_width_bytes, 2, 0);
+  if (!filler_buf) return stbi__err("outofmem", "Out of memory");
+
+  // Filtering for low-bit-depth images
+  if (depth < 8) {
+    filter_bytes = 1;
+    width = img_width_bytes;
+  }
+
   for (j = 0; j < y; ++j) {
-    unsigned char *cur = a->out + stride * j;
-    unsigned char *prior;
+    // cur/prior filter buffers alternate
+    unsigned char *cur = filler_buf + (j & 1) * img_width_bytes;
+    unsigned char *prior = filler_buf + (~j & 1) * img_width_bytes;
+    unsigned char *dest = a->out + stride * j;
+    int nk = width * filter_bytes;
     int filter = *raw++;
 
-    if (filter > 4) return stbi__err("invalid filter", "Corrupt PNG");
-
-    if (depth < 8) {
-      assert(img_width_bytes <= x);
-      cur +=
-          x * out_n - img_width_bytes;  // store output to the rightmost img_len
-      // bytes, so we can decode in place
-      filter_bytes = 1;
-      width = img_width_bytes;
+    // check filter type
+    if (filter > 4) {
+      all_ok = stbi__err("invalid filter", "Corrupt PNG");
+      break;
     }
-    prior = cur - stride;  // bugfix: need to compute this after 'cur +='
-    // computation above
 
     // if first row, use special filter that doesn't sample previous row
     if (j == 0) filter = first_row_filter[filter];
 
-    // handle first byte explicitly
-    for (k = 0; k < filter_bytes; ++k) {
-      switch (filter) {
-        case STBI__F_none:
-          cur[k] = raw[k];
-          break;
-        case STBI__F_sub:
-          cur[k] = raw[k];
-          break;
-        case STBI__F_up:
-          cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
-          break;
-        case STBI__F_avg:
+    // perform actual filtering
+    switch (filter) {
+      case STBI__F_none:
+        memcpy(cur, raw, nk);
+        break;
+      case STBI__F_sub:
+        memcpy(cur, raw, filter_bytes);
+        for (k = filter_bytes; k < nk; ++k)
+          cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]);
+        break;
+      case STBI__F_up:
+        for (k = 0; k < nk; ++k) cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
+        break;
+      case STBI__F_avg:
+        for (k = 0; k < filter_bytes; ++k)
           cur[k] = STBI__BYTECAST(raw[k] + (prior[k] >> 1));
-          break;
-        case STBI__F_paeth:
-          cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0, prior[k], 0));
-          break;
-        case STBI__F_avg_first:
-          cur[k] = raw[k];
-          break;
-        case STBI__F_paeth_first:
-          cur[k] = raw[k];
-          break;
-      }
-    }
-
-    if (depth == 8) {
-      if (img_n != out_n) cur[img_n] = 255;  // first pixel
-      raw += img_n;
-      cur += out_n;
-      prior += out_n;
-    } else if (depth == 16) {
-      if (img_n != out_n) {
-        cur[filter_bytes] = 255;      // first pixel top byte
-        cur[filter_bytes + 1] = 255;  // first pixel bottom byte
-      }
-      raw += filter_bytes;
-      cur += output_bytes;
-      prior += output_bytes;
-    } else {
-      raw += 1;
-      cur += 1;
-      prior += 1;
-    }
-
-    // this is a little gross, so that we don't switch per-pixel or
-    // per-component
-    if (depth < 8 || img_n == out_n) {
-      int nk = (width - 1) * filter_bytes;
-#define STBI__CASE(f) \
-  case f:             \
-    for (k = 0; k < nk; ++k)
-      switch (filter) {
-        // "none" filter turns into a memcpy here; make that explicit.
-        case STBI__F_none:
-          memcpy(cur, raw, nk);
-          break;
-          STBI__CASE(STBI__F_sub) {
-            cur[k] = STBI__BYTECAST(raw[k] + cur[k - filter_bytes]);
-          }
-          break;
-          STBI__CASE(STBI__F_up) {
-            cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
-          }
-          break;
-          STBI__CASE(STBI__F_avg) {
-            cur[k] = STBI__BYTECAST(raw[k] +
-                                    ((prior[k] + cur[k - filter_bytes]) >> 1));
-          }
-          break;
-          STBI__CASE(STBI__F_paeth) {
-            cur[k] = STBI__BYTECAST(raw[k] +
-                                    stbi__paeth(cur[k - filter_bytes], prior[k],
-                                                prior[k - filter_bytes]));
-          }
-          break;
-          STBI__CASE(STBI__F_avg_first) {
-            cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1));
-          }
-          break;
-          STBI__CASE(STBI__F_paeth_first) {
-            cur[k] = STBI__BYTECAST(raw[k] +
-                                    stbi__paeth(cur[k - filter_bytes], 0, 0));
-          }
-          break;
-      }
-#undef STBI__CASE
-      raw += nk;
-    } else {
-      assert(img_n + 1 == out_n);
-#define STBI__CASE(f)                                                          \
-  case f:                                                                      \
-    for (i = x - 1; i >= 1; --i, cur[filter_bytes] = 255, raw += filter_bytes, \
-        cur += output_bytes, prior += output_bytes)                            \
-      for (k = 0; k < filter_bytes; ++k)
-      switch (filter) {
-        STBI__CASE(STBI__F_none) {
-          cur[k] = raw[k];
-        }
+        for (k = filter_bytes; k < nk; ++k)
+          cur[k] = STBI__BYTECAST(raw[k] +
+                                  ((prior[k] + cur[k - filter_bytes]) >> 1));
         break;
-        STBI__CASE(STBI__F_sub) {
-          cur[k] = STBI__BYTECAST(raw[k] + cur[k - output_bytes]);
-        }
-        break;
-        STBI__CASE(STBI__F_up) {
+      case STBI__F_paeth:
+        for (k = 0; k < filter_bytes; ++k)
+          // prior[k] == stbi__paeth(0, prior[k], 0)
           cur[k] = STBI__BYTECAST(raw[k] + prior[k]);
-        }
-        break;
-        STBI__CASE(STBI__F_avg) {
+        for (k = filter_bytes; k < nk; ++k)
           cur[k] = STBI__BYTECAST(raw[k] +
-                                  ((prior[k] + cur[k - output_bytes]) >> 1));
-        }
+                                  stbi__paeth(cur[k - filter_bytes], prior[k],
+                                              prior[k - filter_bytes]));
         break;
-        STBI__CASE(STBI__F_paeth) {
-          cur[k] = STBI__BYTECAST(raw[k] +
-                                  stbi__paeth(cur[k - output_bytes], prior[k],
-                                              prior[k - output_bytes]));
-        }
+      case STBI__F_avg_first:
+        memcpy(cur, raw, filter_bytes);
+        for (k = filter_bytes; k < nk; ++k)
+          cur[k] = STBI__BYTECAST(raw[k] + (cur[k - filter_bytes] >> 1));
         break;
-        STBI__CASE(STBI__F_avg_first) {
-          cur[k] = STBI__BYTECAST(raw[k] + (cur[k - output_bytes] >> 1));
-        }
-        break;
-        STBI__CASE(STBI__F_paeth_first) {
-          cur[k] =
-              STBI__BYTECAST(raw[k] + stbi__paeth(cur[k - output_bytes], 0, 0));
-        }
-        break;
-      }
-#undef STBI__CASE
-
-      // the loop above sets the high byte of the pixels' alpha, but for
-      // 16 bit png files we also need the low byte set. we'll do that here.
-      if (depth == 16) {
-        cur = a->out + stride * j;  // start at the beginning of the row again
-        for (i = 0; i < x; ++i, cur += output_bytes) {
-          cur[filter_bytes + 1] = 255;
-        }
-      }
     }
-  }
 
-  // we make a separate pass to expand bits to pixels; for performance,
-  // this could run two scanlines behind the above code, so it won't
-  // intefere with filtering but will still be in the cache.
-  if (depth < 8) {
-    for (j = 0; j < y; ++j) {
-      unsigned char *cur = a->out + stride * j;
-      unsigned char *in = a->out + stride * j + x * out_n - img_width_bytes;
-      // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common
-      // 8-bit path optimal at minimal cost for 1/2/4-bit png guarante byte
-      // alignment, if width is not multiple of 8/4/2 we'll decode dummy
-      // trailing data that will be skipped in the later loop
+    raw += nk;
+
+    // expand decoded bits in cur to dest, also adding an extra alpha channel
+    // if desired
+    if (depth < 8) {
       unsigned char scale = (color == 0)
                                 ? stbi__depth_scale_table[depth]
                                 : 1;  // scale grayscale values to 0..255 range
+      unsigned char *in = cur;
+      unsigned char *out = dest;
+      unsigned char inb = 0;
+      uint32_t nsmp = x * img_n;
 
-      // note that the final byte might overshoot and write more data than
-      // desired. we can allocate enough data that this never writes out of
-      // memory, but it could also overwrite the next scanline. can it
-      // overwrite non-empty data on the next scanline? yes, consider
-      // 1-pixel-wide scanlines with 1-bit-per-pixel. so we need to explicitly
-      // clamp the final ones
-
+      // expand bits to bytes first
       if (depth == 4) {
-        for (k = x * img_n; k >= 2; k -= 2, ++in) {
-          *cur++ = scale * ((*in >> 4));
-          *cur++ = scale * ((*in) & 0x0f);
+        for (i = 0; i < nsmp; ++i) {
+          if ((i & 1) == 0) inb = *in++;
+          *out++ = scale * (inb >> 4);
+          inb <<= 4;
         }
-        if (k > 0) *cur++ = scale * ((*in >> 4));
       } else if (depth == 2) {
-        for (k = x * img_n; k >= 4; k -= 4, ++in) {
-          *cur++ = scale * ((*in >> 6));
-          *cur++ = scale * ((*in >> 4) & 0x03);
-          *cur++ = scale * ((*in >> 2) & 0x03);
-          *cur++ = scale * ((*in) & 0x03);
+        for (i = 0; i < nsmp; ++i) {
+          if ((i & 3) == 0) inb = *in++;
+          *out++ = scale * (inb >> 6);
+          inb <<= 2;
         }
-        if (k > 0) *cur++ = scale * ((*in >> 6));
-        if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03);
-        if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03);
-      } else if (depth == 1) {
-        for (k = x * img_n; k >= 8; k -= 8, ++in) {
-          *cur++ = scale * ((*in >> 7));
-          *cur++ = scale * ((*in >> 6) & 0x01);
-          *cur++ = scale * ((*in >> 5) & 0x01);
-          *cur++ = scale * ((*in >> 4) & 0x01);
-          *cur++ = scale * ((*in >> 3) & 0x01);
-          *cur++ = scale * ((*in >> 2) & 0x01);
-          *cur++ = scale * ((*in >> 1) & 0x01);
-          *cur++ = scale * ((*in) & 0x01);
+      } else {
+        assert(depth == 1);
+        for (i = 0; i < nsmp; ++i) {
+          if ((i & 7) == 0) inb = *in++;
+          *out++ = scale * (inb >> 7);
+          inb <<= 1;
         }
-        if (k > 0) *cur++ = scale * ((*in >> 7));
-        if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
-        if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
-        if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
-        if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
-        if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
-        if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
       }
-      if (img_n != out_n) {
-        int q;
-        // insert alpha = 255
-        cur = a->out + stride * j;
+
+      // insert alpha=255 values if desired
+      if (img_n != out_n) stbi__create_png_alpha_expand8(dest, dest, x, img_n);
+    } else if (depth == 8) {
+      if (img_n == out_n)
+        memcpy(dest, cur, x * img_n);
+      else
+        stbi__create_png_alpha_expand8(dest, cur, x, img_n);
+    } else if (depth == 16) {
+      // convert the image data from big-endian to platform-native
+      // TODO TYPES
+      uint16_t *dest16 = (uint16_t *)dest;
+      uint32_t nsmp = x * img_n;
+
+      if (img_n == out_n) {
+        for (i = 0; i < nsmp; ++i, ++dest16, cur += 2)
+          *dest16 = (cur[0] << 8) | cur[1];
+      } else {
+        assert(img_n + 1 == out_n);
         if (img_n == 1) {
-          for (q = x - 1; q >= 0; --q) {
-            cur[q * 2 + 1] = 255;
-            cur[q * 2 + 0] = cur[q];
+          for (i = 0; i < x; ++i, dest16 += 2, cur += 2) {
+            dest16[0] = (cur[0] << 8) | cur[1];
+            dest16[1] = 0xffff;
           }
         } else {
           assert(img_n == 3);
-          for (q = x - 1; q >= 0; --q) {
-            cur[q * 4 + 3] = 255;
-            cur[q * 4 + 2] = cur[q * 3 + 2];
-            cur[q * 4 + 1] = cur[q * 3 + 1];
-            cur[q * 4 + 0] = cur[q * 3 + 0];
+          for (i = 0; i < x; ++i, dest16 += 4, cur += 6) {
+            dest16[0] = (cur[0] << 8) | cur[1];
+            dest16[1] = (cur[2] << 8) | cur[3];
+            dest16[2] = (cur[4] << 8) | cur[5];
+            dest16[3] = 0xffff;
           }
         }
       }
     }
-  } else if (depth == 16) {
-    // force the image data from big-endian to platform-native.
-    // this is done in a separate pass due to the decoding relying
-    // on the data being untouched, but could probably be done
-    // per-line during decode if care is taken.
-    unsigned char *cur = a->out;
-    uint16_t *cur16 = (uint16_t *)cur;
-
-    for (i = 0; i < x * y * out_n; ++i, cur16++, cur += 2) {
-      *cur16 = (cur[0] << 8) | cur[1];
-    }
   }
 
+  free(filler_buf);
+  if (!all_ok) return 0;
+
   return 1;
 }
 
@@ -3767,6 +3936,7 @@ static int stbi__create_png_image(stbi__png *a, unsigned char *image_data,
                                       a->s->img_x, a->s->img_y, depth, color);
   // de-interlacing
   final = stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+  if (!final) return stbi__err("outofmem", "Out of memory");
   for (p = 0; p < 7; ++p) {
     int xorig[] = {0, 4, 0, 2, 0, 1, 0};
     int yorig[] = {0, 0, 4, 0, 2, 0, 1};
@@ -3954,10 +4124,10 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) {
         first = 0;
         if (c.length != 13) return stbi__err("bad IHDR len", "Corrupt PNG");
         s->img_x = stbi__get32be(s);
-        if (s->img_x > (1 << 24))
-          return stbi__err("too large", "Very large image (corrupt?)");
         s->img_y = stbi__get32be(s);
-        if (s->img_y > (1 << 24))
+        if (s->img_y > STBI_MAX_DIMENSIONS)
+          return stbi__err("too large", "Very large image (corrupt?)");
+        if (s->img_x > STBI_MAX_DIMENSIONS)
           return stbi__err("too large", "Very large image (corrupt?)");
         z->depth = stbi__get8(s);
         if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 &&
@@ -3985,15 +4155,14 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) {
           s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
           if ((1 << 30) / s->img_x / s->img_n < s->img_y)
             return stbi__err("too large", "Image too large to decode");
-          if (scan == STBI__SCAN_header) return 1;
         } else {
           // if paletted, then pal_n is our final components, and
           // img_n is # components to decompress/filter.
           s->img_n = 1;
           if ((1 << 30) / s->img_x / 4 < s->img_y)
             return stbi__err("too large", "Corrupt PNG");
-          // if SCAN_header, have to scan to see if we have a tRNS
         }
+        // even with SCAN_header, have to scan to see if we have a tRNS
         break;
       }
 
@@ -4031,6 +4200,12 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) {
           if (c.length != (uint32_t)s->img_n * 2)
             return stbi__err("bad tRNS len", "Corrupt PNG");
           has_trans = 1;
+          // non-paletted with tRNS = constant alpha.
+          // if header-scanning, we can stop now.
+          if (scan == STBI__SCAN_header) {
+            ++s->img_n;
+            return 1;
+          }
           if (z->depth == 16) {
             for (k = 0; k < s->img_n; ++k)
               tc16[k] = (uint16_t)stbi__get16be(s);  // copy the values as-is
@@ -4048,17 +4223,19 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) {
         if (first) return stbi__err("first not IHDR", "Corrupt PNG");
         if (pal_img_n && !pal_len) return stbi__err("no PLTE", "Corrupt PNG");
         if (scan == STBI__SCAN_header) {
-          s->img_n = pal_img_n;
+          // header scan definitely stops at first IDAT
+          if (pal_img_n) s->img_n = pal_img_n;
           return 1;
         }
+        if (c.length > (1u << 30))
+          return stbi__err("IDAT size limit",
+                           "IDAT section larger than 2^30 bytes");
         if ((int)(ioff + c.length) < (int)ioff) return 0;
         if (ioff + c.length > idata_limit) {
-          uint32_t idata_limit_old = idata_limit;
           unsigned char *p;
           if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
           while (ioff + c.length > idata_limit) idata_limit *= 2;
-          (void)idata_limit_old;
-          p = STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit);
+          p = realloc(z->idata, idata_limit);
           if (p == NULL) return stbi__err("outofmem", "Out of memory");
           z->idata = p;
         }
@@ -4114,7 +4291,8 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) {
         }
         free(z->expanded);
         z->expanded = NULL;
-        stbi__get32be(s); /* nothings/stb#835 */
+        // end of PNG chunk, read and skip CRC
+        stbi__get32be(s);
         return 1;
       }
 
@@ -4122,7 +4300,7 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) {
         // if critical, fail
         if (first) return stbi__err("first not IHDR", "Corrupt PNG");
         if ((c.type & (1 << 29)) == 0) {
-#ifndef STBI_NO_FAILURE_STRINGS
+#if !defined(STBI_NO_FAILURE_STRINGS) && !defined(STBI_FAILURE_USERMSG)
           // not threadsafe
           static char invalid_chunk[] = "XXXX PNG chunk not known";
           invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
@@ -4147,10 +4325,13 @@ static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp,
   if (req_comp < 0 || req_comp > 4)
     return stbi__errpuc("bad req_comp", "Internal error");
   if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
-    if (p->depth < 8)
+    if (p->depth <= 8)
       ri->bits_per_channel = 8;
+    else if (p->depth == 16)
+      ri->bits_per_channel = 16;
     else
-      ri->bits_per_channel = p->depth;
+      return stbi__errpuc("bad bits_per_channel",
+                          "PNG not supported: unsupported color depth");
     result = p->out;
     p->out = NULL;
     if (req_comp && req_comp != p->s->img_out_n) {
@@ -4297,6 +4478,10 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp,
   g->bgindex = stbi__get8(s);
   g->ratio = stbi__get8(s);
   g->transparent = -1;
+  if (g->w > STBI_MAX_DIMENSIONS)
+    return stbi__err("too large", "Very large image (corrupt?)");
+  if (g->h > STBI_MAX_DIMENSIONS)
+    return stbi__err("too large", "Very large image (corrupt?)");
   if (comp != 0) {
     *comp = 4;  // can't actually tell whether it's 3 or 4 until we parse the
                 // comments
@@ -4310,6 +4495,7 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp,
 
 static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp) {
   stbi__gif *g = (stbi__gif *)malloc(sizeof(stbi__gif));
+  if (!g) return stbi__err("outofmem", "Out of memory");
   if (!stbi__gif_header(s, g, comp, 1)) {
     free(g);
     stbi__rewind(s);
@@ -4458,7 +4644,7 @@ static unsigned char *stbi__gif_load_next(stbi__context *s, stbi__gif *g,
     if (!g->out || !g->background || !g->history)
       return stbi__errpuc("outofmem", "Out of memory");
 
-    // image is treated as "transparent" at the start - ie, nothing overwrites
+    // image is treated as "transparent" at the start - i.e. nothing overwrites
     // the current background; background colour is only used for pixels that
     // are not rendered first frame, after that "background" color refers to
     // the color that was there the previous frame.
@@ -4469,7 +4655,7 @@ static unsigned char *stbi__gif_load_next(stbi__context *s, stbi__gif *g,
           pcount);  // pixels that were affected previous frame
     first_frame = 1;
   } else {
-    // second frame - how do we dispoase of the previous one?
+    // second frame - how do we dispose of the previous one?
     dispose = (g->eflags & 0x1C) >> 2;
     pcount = g->w * g->h;
 
@@ -4492,10 +4678,10 @@ static unsigned char *stbi__gif_load_next(stbi__context *s, stbi__gif *g,
         }
       }
     } else {
-      // This is a non-disposal case eithe way, so just
+      // This is a non-disposal case either way, so just
       // leave the pixels as is, and they will become the new background
       // 1: do not dispose
-      // 0:  not specified.
+      // 0: not specified.
     }
 
     // background is what out is after the undoing of the previou frame;
@@ -4622,6 +4808,16 @@ static unsigned char *stbi__gif_load_next(stbi__context *s, stbi__gif *g,
   }
 }
 
+static void *stbi__load_gif_main_outofmem(stbi__gif *g, unsigned char *out,
+                                          int **delays) {
+  free(g->out);
+  free(g->history);
+  free(g->background);
+  if (out) free(out);
+  if (delays && *delays) free(*delays);
+  return stbi__errpuc("outofmem", "Out of memory");
+}
+
 static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y,
                                  int *z, int *comp, int req_comp) {
   if (stbi__gif_test(s)) {
@@ -4644,21 +4840,29 @@ static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y,
         ++layers;
         stride = g->w * g->h * 4;
         if (out) {
-          out = (unsigned char *)realloc(out, layers * stride);
-          if (!out) abort();
+          void *tmp = (unsigned char *)realloc(out, layers * stride);
+          if (!tmp)
+            return stbi__load_gif_main_outofmem(g, out, delays);
+          else {
+            out = (unsigned char *)tmp;
+          }
           if (delays) {
-            *delays = (int *)realloc(*delays, sizeof(int) * layers);
-            if (!*delays) abort();
+            int *new_delays = (int *)realloc(*delays, sizeof(int) * layers);
+            if (!new_delays)
+              return stbi__load_gif_main_outofmem(g, out, delays);
+            *delays = new_delays;
           }
         } else {
           out = malloc(layers * stride);
+          if (!out) return stbi__load_gif_main_outofmem(g, out, delays);
           if (delays) {
             *delays = malloc(layers * sizeof(int));
+            if (!*delays) return stbi__load_gif_main_outofmem(g, out, delays);
           }
         }
         memcpy(out + ((layers - 1) * stride), u, stride);
         if (layers >= 2) {
-          two_back = out - 2 * stride;
+          two_back = out + ((layers - 2) * stride);
         }
         if (delays) {
           (*delays)[layers - 1U] = g->delay;
@@ -4720,7 +4924,6 @@ static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp) {
 // Known limitations:
 //    Does not support comments in the header section
 //    Does not support ASCII image data (formats P2 and P3)
-//    Does not support 16-bit-per-channel
 
 static int stbi__pnm_test(stbi__context *s) {
   char p, t;
@@ -4737,20 +4940,37 @@ static dontinline void *stbi__pnm_load(stbi__context *s, int *x, int *y,
                                        int *comp, int req_comp,
                                        stbi__result_info *ri) {
   unsigned char *out;
-  if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y,
-                      (int *)&s->img_n)) {
-    return 0;
+  ri->bits_per_channel =
+      stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n);
+  if (ri->bits_per_channel == 0) return 0;
+  if (s->img_y > STBI_MAX_DIMENSIONS) {
+    return stbi__errpuc("too large", "Very large image (corrupt?)");
+  }
+  if (s->img_x > STBI_MAX_DIMENSIONS) {
+    return stbi__errpuc("too large", "Very large image (corrupt?)");
   }
   *x = s->img_x;
   *y = s->img_y;
   if (comp) *comp = s->img_n;
-  if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0)) {
+  if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y,
+                             ri->bits_per_channel / 8, 0)) {
     return stbi__errpuc("too large", "PNM too large");
   }
-  out = stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
-  stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
+  out = stbi__malloc_mad4(s->img_n, s->img_x, s->img_y,
+                          ri->bits_per_channel / 8, 0);
+  if (!stbi__getn(
+          s, out,
+          s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) {
+    free(out);
+    return stbi__errpuc("bad PNM", "PNM file truncated");
+  }
   if (req_comp && req_comp != s->img_n) {
-    out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
+    if (ri->bits_per_channel == 16) {
+      out = (unsigned char *)stbi__convert_format16(
+          (uint16_t *)out, s->img_n, req_comp, s->img_x, s->img_y);
+    } else {
+      out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y);
+    }
     if (out == NULL) return out;  // stbi__convert_format frees input on failure
   }
   return out;
@@ -4779,6 +4999,12 @@ static int stbi__pnm_getinteger(stbi__context *s, char *c) {
   while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) {
     value = value * 10 + (*c - '0');
     *c = (char)stbi__get8(s);
+    // TODO INT_MAX
+    if ((value > 214748364) || (value == 214748364 && *c > '7')) {
+      return stbi__err(
+          "integer parse overflow",
+          "Parsing an integer in the PPM header overflowed a 32-bit int");
+    }
   }
   return value;
 }
@@ -4802,15 +5028,30 @@ static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp) {
   c = (char)stbi__get8(s);
   stbi__pnm_skip_whitespace(s, &c);
   *x = stbi__pnm_getinteger(s, &c);  // read width
+  if (*x == 0) {
+    return stbi__err("invalid_width",
+                     "PPM image header had zero or overflowing width");
+  }
   stbi__pnm_skip_whitespace(s, &c);
   *y = stbi__pnm_getinteger(s, &c);  // read height
+  if (*y == 0) {
+    return stbi__err("invalid height",
+                     "PPM image header had zero or overflowing height");
+  }
   stbi__pnm_skip_whitespace(s, &c);
   maxv = stbi__pnm_getinteger(s, &c);  // read max value
-  if (maxv > 255)
-    return stbi__err("max value > 255", "PPM image not 8-bit");
-  else {
-    return 1;
-  }
+  if (maxv > 65535)
+    return stbi__err("max value > 65535",
+                     "PPM image supports only 8-bit and 16-bit images");
+  else if (maxv > 255)
+    return 16;
+  else
+    return 8;
+}
+
+static int stbi__pnm_is16(stbi__context *s) {
+  if (stbi__pnm_info(s, NULL, NULL, NULL) == 16) return 1;
+  return 0;
 }
 
 static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp) {
@@ -4831,7 +5072,12 @@ static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp) {
 }
 
 static int stbi__is_16_main(stbi__context *s) {
+#ifndef STBI_NO_PNG
   if (stbi__png_is16(s)) return 1;
+#endif
+#ifndef STBI_NO_PNM
+  if (stbi__pnm_is16(s)) return 1;
+#endif
   return 0;
 }
 
@@ -4848,9 +5094,10 @@ int stbi_info_from_file(FILE *f, int *x, int *y, int *comp) {
   int r;
   stbi__context s;
   long pos = ftell(f);
+  if (pos < 0) return stbi__err("bad file", "ftell() failed");
   stbi__start_file(&s, f);
   r = stbi__info_main(&s, x, y, comp);
-  fseek(f, pos, SEEK_SET);
+  if (fseek(f, pos, SEEK_SET)) return stbi__err("bad file", "fseek() failed");
   return r;
 }
 
@@ -4867,9 +5114,10 @@ int stbi_is_16_bit_from_file(FILE *f) {
   int r;
   stbi__context s;
   long pos = ftell(f);
+  if (pos < 0) return stbi__err("bad file", "ftell() failed");
   stbi__start_file(&s, f);
   r = stbi__is_16_main(&s);
-  fseek(f, pos, SEEK_SET);
+  if (fseek(f, pos, SEEK_SET)) return stbi__err("bad file", "fseek() failed");
   return r;
 }
 
diff --git a/third_party/stb/stb_image.h b/third_party/stb/stb_image.h
index 9f3076887..558506908 100644
--- a/third_party/stb/stb_image.h
+++ b/third_party/stb/stb_image.h
@@ -13,12 +13,14 @@ enum {
 struct FILE;
 
 typedef struct {
-  int (*read)(void *user, char *data,
-              int size);  // fill 'data' with 'size' bytes.  return number of
-                          // bytes actually read
-  void (*skip)(void *user, int n);  // skip the next 'n' bytes, or 'unget' the
-                                    // last -n bytes if negative
-  int (*eof)(void *user);  // returns nonzero if we are at end of file/data
+  // fill 'data' with 'size' bytes.  return number of bytes actually read
+  int (*read)(void *user, char *data, int size);
+
+  // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+  void (*skip)(void *user, int n);
+
+  // returns nonzero if we are at end of file/data
+  int (*eof)(void *user);
 } stbi_io_callbacks;
 
 //
@@ -63,7 +65,6 @@ unsigned short *stbi_load_from_file_16(struct FILE *f, int *x, int *y,
                                        int desired_channels);
 
 // get a VERY brief reason for failure
-// NOT THREADSAFE
 const char *stbi_failure_reason(void);
 
 // free the loaded image -- this is just free()
diff --git a/third_party/stb/stb_image_resize.c b/third_party/stb/stb_image_resize.c
index 55d1b8351..24cb373e5 100644
--- a/third_party/stb/stb_image_resize.c
+++ b/third_party/stb/stb_image_resize.c
@@ -1,7 +1,7 @@
 /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
 │ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
+│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
 │ Permission to use, copy, modify, and/or distribute this software for         │
 │ any purpose with or without fee is hereby granted, provided that the         │
@@ -655,9 +655,14 @@ static void stbir__calculate_coefficients_upsample(
     total_filter += coefficient_group[i];
   }
 
-  STBIR_ASSERT(stbir__filter_info_table[filter].kernel(
-                   (float)(in_last_pixel + 1) + 0.5f - in_center_of_out,
-                   1 / scale) == 0);
+  // NOTE(fg): Not actually true in general, nor is there any reason to expect
+  // it should be. It would be true in exact math but is at best approximately
+  // true in floating-point math, and it would not make sense to try and put
+  // actual bounds on this here because it depends on the image aspect ratio
+  // which can get pretty extreme.
+  // STBIR_ASSERT(stbir__filter_info_table[filter].kernel(
+  //                 (float)(in_last_pixel + 1) + 0.5f - in_center_of_out,
+  //                 1 / scale) == 0);
 
   STBIR_ASSERT(total_filter > 0.9);
   STBIR_ASSERT(total_filter < 1.1f);  // Make sure it's not way off.
@@ -701,9 +706,14 @@ static void stbir__calculate_coefficients_downsample(
         stbir__filter_info_table[filter].kernel(x, scale_ratio) * scale_ratio;
   }
 
-  STBIR_ASSERT(stbir__filter_info_table[filter].kernel(
-                   (float)(out_last_pixel + 1) + 0.5f - out_center_of_in,
-                   scale_ratio) == 0);
+  // NOTE(fg): Not actually true in general, nor is there any reason to expect
+  // it should be. It would be true in exact math but is at best approximately
+  // true in floating-point math, and it would not make sense to try and put
+  // actual bounds on this here because it depends on the image aspect ratio
+  // which can get pretty extreme.
+  // STBIR_ASSERT(stbir__filter_info_table[filter].kernel(
+  //                 (float)(out_last_pixel + 1) + 0.5f - out_center_of_in,
+  //                 scale_ratio) == 0);
 
   for (i = out_last_pixel - out_first_pixel; i >= 0; i--) {
     if (coefficient_group[i]) break;
@@ -851,7 +861,7 @@ static float* stbir__get_decode_buffer(stbir__info* stbir_info) {
 }
 
 #define STBIR__DECODE(type, colorspace) \
-  ((type) * (STBIR_MAX_COLORSPACES) + (colorspace))
+  ((int)(type) * (STBIR_MAX_COLORSPACES) + (int)(colorspace))
 
 static void stbir__decode_scanline(stbir__info* stbir_info, int n) {
   int c;
@@ -1199,7 +1209,6 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info,
           int out_pixel_index = k * 1;
           float coefficient =
               horizontal_coefficients[coefficient_group + k - n0];
-          STBIR_ASSERT(coefficient != 0);
           output_buffer[out_pixel_index + 0] +=
               decode_buffer[in_pixel_index + 0] * coefficient;
         }
@@ -1220,7 +1229,6 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info,
           int out_pixel_index = k * 2;
           float coefficient =
               horizontal_coefficients[coefficient_group + k - n0];
-          STBIR_ASSERT(coefficient != 0);
           output_buffer[out_pixel_index + 0] +=
               decode_buffer[in_pixel_index + 0] * coefficient;
           output_buffer[out_pixel_index + 1] +=
@@ -1243,7 +1251,6 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info,
           int out_pixel_index = k * 3;
           float coefficient =
               horizontal_coefficients[coefficient_group + k - n0];
-          STBIR_ASSERT(coefficient != 0);
           output_buffer[out_pixel_index + 0] +=
               decode_buffer[in_pixel_index + 0] * coefficient;
           output_buffer[out_pixel_index + 1] +=
@@ -1268,7 +1275,6 @@ static void stbir__resample_horizontal_downsample(stbir__info* stbir_info,
           int out_pixel_index = k * 4;
           float coefficient =
               horizontal_coefficients[coefficient_group + k - n0];
-          STBIR_ASSERT(coefficient != 0);
           output_buffer[out_pixel_index + 0] +=
               decode_buffer[in_pixel_index + 0] * coefficient;
           output_buffer[out_pixel_index + 1] +=
diff --git a/third_party/stb/stb_image_write.c b/third_party/stb/stb_image_write.c
index 63187511d..1f984bc78 100644
--- a/third_party/stb/stb_image_write.c
+++ b/third_party/stb/stb_image_write.c
@@ -1,123 +1,21 @@
-/* stb_image_write - v1.13 - public domain - http://nothings.org/stb
- * writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
- *                                  no warranty implied; use at your own risk
- *
- * ABOUT:
- *
- *    This file is a library for writing images to stdio or a callback.
- *
- *    The PNG output is not optimal; it is 20-50% larger than the file
- *    written by a decent optimizing implementation; though providing a
- *    custom zlib compress function (see STBIW_ZLIB_COMPRESS) can
- *    mitigate that. This library is designed for source code
- *    compactness and simplicity, not optimal image file size or
- *    run-time performance.
- *
- * USAGE:
- *
- *    There are five functions, one for each image file format:
- *
- *      stbi_write_png
- *      stbi_write_bmp
- *      stbi_write_tga
- *      stbi_write_jpg
- *      stbi_write_hdr
- *
- *      stbi_flip_vertically_on_write
- *
- *    There are also five equivalent functions that use an arbitrary
- *    write function. You are expected to open/close your
- *    file-equivalent before and after calling these:
- *
- *      stbi_write_png_to_func
- *      stbi_write_bmp_to_func
- *      stbi_write_tga_to_func
- *      stbi_write_hdr_to_func
- *      stbi_write_jpg_to_func
- *
- *    where the callback is:
- *       void stbi_write_func(void *context, void *data, int size);
- *
- *    You can configure it with these:
- *       stbi_write_tga_with_rle
- *       stbi_write_png_compression_level
- *       stbi_write_force_png_filter
- *
- *    Each function returns 0 on failure and non-0 on success.
- *
- *    The functions create an image file defined by the parameters. The
- *    image is a rectangle of pixels stored from left-to-right,
- *    top-to-bottom. Each pixel contains 'comp' channels of data stored
- *    interleaved with 8-bits per channel, in the following order: 1=Y,
- *    2=YA, 3=RGB, 4=RGBA. (Y is monochrome color.) The rectangle is 'w'
- *    pixels wide and 'h' pixels tall. The *data pointer points to the
- *    first byte of the top-left-most pixel. For PNG, "stride_in_bytes"
- *    is the distance in bytes from the first byte of a row of pixels to
- *    the first byte of the next row of pixels.
- *
- *    PNG creates output files with the same number of components as the
- *    input. The BMP format expands Y to RGB in the file format and does
- *    not output alpha.
- *
- *    PNG supports writing rectangles of data even when the bytes
- *    storing rows of data are not consecutive in memory (e.g.
- *    sub-rectangles of a larger image), by supplying the stride between
- *    the beginning of adjacent rows. The other formats do not. (Thus
- *    you cannot write a native-format BMP through the BMP writer, both
- *    because it is in BGR order and because it may have padding at the
- *    end of the line.)
- *
- *    PNG allows you to set the deflate compression level by setting the
- *    global variable 'stbi_write_png_compression_level' (it defaults to
- *    8).
- *
- *    HDR expects linear float data. Since the format is always 32-bit
- *    rgb(e) data, alpha (if provided) is discarded, and for monochrome
- *    data it is replicated across all three channels.
- *
- *    TGA supports RLE or non-RLE compressed data. To use
- *    non-RLE-compressed data, set the global variable
- *    'stbi_write_tga_with_rle' to 0.
- *
- *    JPEG does ignore alpha channels in input data; quality is between
- *    1 and 100. Higher quality looks better but results in a bigger
- *    image. JPEG baseline (no JPEG progressive).
- *
- * CREDITS:
- *
- *
- *    Sean Barrett           -    PNG/BMP/TGA
- *    Baldur Karlsson        -    HDR
- *    Jean-Sebastien Guay    -    TGA monochrome
- *    Tim Kelsey             -    misc enhancements
- *    Alan Hickman           -    TGA RLE
- *    Emmanuel Julien        -    initial file IO callback implementation
- *    Jon Olick              -    original jo_jpeg.cpp code
- *    Daniel Gibson          -    integrate JPEG, allow external zlib
- *    Aarni Koskela          -    allow choosing PNG filter
- *
- *    bugfixes:
- *       github:Chribba
- *       Guillaume Chereau
- *       github:jry2
- *       github:romigrou
- *       Sergio Gonzalez
- *       Jonas Karlsson
- *       Filip Wasil
- *       Thatcher Ulrich
- *       github:poppolopoppo
- *       Patrick Boettcher
- *       github:xeekworx
- *       Cap Petschulat
- *       Simon Rodriguez
- *       Ivan Tikhonov
- *       github:ignotion
- *       Adam Schackart
- *
- * LICENSE
- *
- *   Public Domain (www.unlicense.org)
- */
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
 #include "third_party/stb/stb_image_write.h"
 #include "dsp/core/core.h"
 #include "libc/assert.h"
@@ -131,16 +29,32 @@
 #include "libc/str/str.h"
 #include "third_party/zlib/zlib.h"
 
+asm(".ident\t\"\\n\\n\
+stb_image_write (Public Domain)\\n\
+Credit: Sean Barrett, et al.\\n\
+http://nothings.org/stb\"");
+
 #define STBIW_UCHAR(x)                       (unsigned char)((x)&0xff)
-#define STBIW_REALLOC_SIZED(p, oldsz, newsz) realloc(p, newsz)
+
+#define stbiw__wpng4(o, a, b, c, d)                                           \
+  ((o)[0] = STBIW_UCHAR(a), (o)[1] = STBIW_UCHAR(b), (o)[2] = STBIW_UCHAR(c), \
+   (o)[3] = STBIW_UCHAR(d), (o) += 4)
+// big-endian 32-bit store; no trailing ';' so it is safe in if/else bodies
+#define stbiw__wp32(d, v) stbiw__wpng4(d, (v) >> 24, (v) >> 16, (v) >> 8, (v))
+#define stbiw__wptag(data, s) stbiw__wpng4(data, s[0], s[1], s[2], s[3])
 
 typedef struct {
   stbi_write_func *func;
   void *context;
+  unsigned char buffer[64];
+  int buf_used;
 } stbi__write_context;
 
-int stbi__flip_vertically_on_write = 0;
+int stbi_write_png_compression_level = 4;
 int stbi_write_tga_with_rle = 1;
+int stbi_write_force_png_filter = -1;
+
+static int stbi__flip_vertically_on_write = 0;
 
 void stbi_flip_vertically_on_write(int flag) {
   stbi__flip_vertically_on_write = flag;
@@ -168,9 +82,6 @@ static void stbi__end_write_file(stbi__write_context *s) {
   fclose((FILE *)s->context);
 }
 
-typedef unsigned int stbiw_uint32;
-typedef int stb_image_write_test[sizeof(stbiw_uint32) == 4 ? 1 : -1];
-
 static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v) {
   while (*fmt) {
     switch (*fmt++) {
@@ -190,7 +101,7 @@ static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v) {
         break;
       }
       case '4': {
-        stbiw_uint32 x = va_arg(v, int);
+        unsigned int x = va_arg(v, int);
         unsigned char b[4];
         b[0] = STBIW_UCHAR(x);
         b[1] = STBIW_UCHAR(x >> 8);
@@ -212,17 +123,31 @@ static void stbiw__writef(stbi__write_context *s, const char *fmt, ...) {
   va_end(v);
 }
 
+static void stbiw__write_flush(stbi__write_context *s) {
+  if (s->buf_used) {
+    s->func(s->context, &s->buffer, s->buf_used);
+    s->buf_used = 0;
+  }
+}
+
 static void stbiw__putc(stbi__write_context *s, unsigned char c) {
   s->func(s->context, &c, 1);
 }
 
+static void stbiw__write1(stbi__write_context *s, unsigned char a) {
+  if ((size_t)s->buf_used + 1 > sizeof(s->buffer)) stbiw__write_flush(s);
+  s->buffer[s->buf_used++] = a;
+}
+
 static void stbiw__write3(stbi__write_context *s, unsigned char a,
                           unsigned char b, unsigned char c) {
-  unsigned char arr[3];
-  arr[0] = a;
-  arr[1] = b;
-  arr[2] = c;
-  s->func(s->context, arr, 3);
+  int n;
+  if ((size_t)s->buf_used + 3 > sizeof(s->buffer)) stbiw__write_flush(s);
+  n = s->buf_used;
+  s->buf_used = n + 3;
+  s->buffer[n + 0] = a;
+  s->buffer[n + 1] = b;
+  s->buffer[n + 2] = c;
 }
 
 static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp,
@@ -231,7 +156,7 @@ static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp,
   unsigned char bg[3] = {255, 0, 255}, px[3];
   int k;
 
-  if (write_alpha < 0) s->func(s->context, &d[comp - 1], 1);
+  if (write_alpha < 0) stbiw__write1(s, d[comp - 1]);
 
   switch (comp) {
     case 2:  // 2 pixels = mono + alpha, alpha is written separately, so same as
@@ -240,7 +165,7 @@ static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp,
       if (expand_mono)
         stbiw__write3(s, d[0], d[0], d[0]);  // monochrome bmp
       else
-        s->func(s->context, d, 1);  // monochrome TGA
+        stbiw__write1(s, d[0]);  // monochrome TGA
       break;
     case 4:
       if (!write_alpha) {
@@ -254,14 +179,14 @@ static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp,
       stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
       break;
   }
-  if (write_alpha > 0) s->func(s->context, &d[comp - 1], 1);
+  if (write_alpha > 0) stbiw__write1(s, d[comp - 1]);
 }
 
 static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir,
                                 int x, int y, int comp, void *data,
                                 int write_alpha, int scanline_pad,
                                 int expand_mono) {
-  stbiw_uint32 zero = 0;
+  unsigned int zero = 0;
   int i, j, j_end;
   if (y <= 0) return;
   if (stbi__flip_vertically_on_write) vdir *= -1;
@@ -277,6 +202,7 @@ static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir,
       unsigned char *d = (unsigned char *)data + (j * x + i) * comp;
       stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
     }
+    stbiw__write_flush(s);
     s->func(s->context, &zero, scanline_pad);
   }
 }
@@ -299,25 +225,41 @@ static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x,
 
 static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp,
                                const void *data) {
-  int pad = (-x * 3) & 3;
-  return stbiw__outfile(s, -1, -1, x, y, comp, 1, (void *)data, 0, pad,
-                        "11 4 22 4"
-                        "4 44 22 444444",
-                        'B', 'M', 14 + 40 + (x * 3 + pad) * y, 0, 0,
-                        14 + 40,                             // file header
-                        40, x, y, 1, 24, 0, 0, 0, 0, 0, 0);  // bitmap header
+  if (comp != 4) {
+    // write RGB bitmap
+    int pad;
+    pad = (-x * 3) & 3;
+    return stbiw__outfile(s, -1, -1, x, y, comp, 1, (void *)data, 0, pad,
+                          "11 4 22 4"
+                          "4 44 22 444444",
+                          'B', 'M', 14 + 40 + (x * 3 + pad) * y, 0, 0,
+                          14 + 40,                             // file header
+                          40, x, y, 1, 24, 0, 0, 0, 0, 0, 0);  // bitmap header
+  } else {
+    // RGBA bitmaps need a v4 header
+    // use BI_BITFIELDS mode with 32bpp and alpha mask
+    // (straight BI_RGB with alpha mask doesn't work in most readers)
+    return stbiw__outfile(s, -1, -1, x, y, comp, 1, (void *)data, 1, 0,
+                          "11 4 22 4"
+                          "4 44 22 444444 4444 4 444 444 444 444",
+                          'B', 'M', 14 + 108 + x * y * 4, 0, 0,
+                          14 + 108,  // file header
+                          108, x, y, 1, 32, 3, 0, 0, 0, 0, 0, 0xff0000, 0xff00,
+                          0xff, 0xff000000u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                          0);  // bitmap V4 header
+  }
 }
 
 int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y,
                            int comp, const void *data) {
-  stbi__write_context s;
+  stbi__write_context s = {0};
   stbi__start_write_callbacks(&s, func, context);
   return stbi_write_bmp_core(&s, x, y, comp, data);
 }
 
 int stbi_write_bmp(char const *filename, int x, int y, int comp,
                    const void *data) {
-  stbi__write_context s;
+  stbi__write_context s = {0};
   if (stbi__start_write_file(&s, filename)) {
     int r = stbi_write_bmp_core(&s, x, y, comp, data);
     stbi__end_write_file(&s);
@@ -393,31 +335,32 @@ static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp,
 
         if (diff) {
           unsigned char header = STBIW_UCHAR(len - 1);
-          s->func(s->context, &header, 1);
+          stbiw__write1(s, header);
           for (k = 0; k < len; ++k) {
             stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
           }
         } else {
           unsigned char header = STBIW_UCHAR(len - 129);
-          s->func(s->context, &header, 1);
+          stbiw__write1(s, header);
           stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
         }
       }
     }
+    stbiw__write_flush(s);
   }
   return 1;
 }
 
 int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y,
                            int comp, const void *data) {
-  stbi__write_context s;
+  stbi__write_context s = {0};
   stbi__start_write_callbacks(&s, func, context);
   return stbi_write_tga_core(&s, x, y, comp, (void *)data);
 }
 
 int stbi_write_tga(char const *filename, int x, int y, int comp,
                    const void *data) {
-  stbi__write_context s;
+  stbi__write_context s = {0};
   if (stbi__start_write_file(&s, filename)) {
     int r = stbi_write_tga_core(&s, x, y, comp, (void *)data);
     stbi__end_write_file(&s);
@@ -426,6 +369,250 @@ int stbi_write_tga(char const *filename, int x, int y, int comp,
     return 0;
 }
 
+/*
+ * PNG writer
+ */
+
+// Deflates data[0..size) into a fresh malloc()'d zlib stream at the given
+// quality (compression level). Returns NULL on failure, else the buffer
+// with its length stored in *out_len; the caller frees it.
+static unsigned char *stbi_zlib_compress(unsigned char *data, int size,
+                                         int *out_len, int quality) {
+  unsigned long newsize;
+  unsigned char *newdata, *trimdata;
+  assert(0 <= size && size <= INT_MAX);
+  if ((newdata = malloc((newsize = compressBound(size)))) &&
+      compress2(newdata, &newsize, data, size, quality) == Z_OK) {
+    *out_len = newsize;
+    // trimming is best-effort: the untrimmed buffer is still valid
+    trimdata = realloc(newdata, newsize);
+    return trimdata ? trimdata : newdata;
+  }
+  free(newdata);
+  return NULL;
+}
+
+static void stbiw__wpcrc(unsigned char **data, int len) {
+  unsigned int crc = crc32(0, *data - len - 4, len + 4);
+  stbiw__wp32(*data, crc);
+}
+
+forceinline unsigned char stbiw__paeth(int a, int b, int c) {
+  int p = a + b - c, pa = abs(p - a), pb = abs(p - b), pc = abs(p - c);
+  if (pa <= pb && pa <= pc) return STBIW_UCHAR(a);
+  if (pb <= pc) return STBIW_UCHAR(b);
+  return STBIW_UCHAR(c);
+}
+
+// @OPTIMIZE: provide an option that always forces left-predict or paeth predict
+static void stbiw__encode_png_line(const unsigned char *pixels,
+                                   int stride_bytes, int width, int height,
+                                   int y, int n, int filter_type,
+                                   signed char *line_buffer) {
+  int mapping[] = {0, 1, 2, 3, 4};
+  int firstmap[] = {0, 1, 0, 5, 6};
+  const unsigned char *z;
+  int *mymap, i, type, signed_stride;
+
+  mymap = (y != 0) ? mapping : firstmap;
+  type = mymap[filter_type];
+  z = pixels +
+      stride_bytes * (stbi__flip_vertically_on_write ? height - 1 - y : y);
+  signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
+
+  if (type == 0) {
+    memcpy(line_buffer, z, width * n);
+    return;
+  }
+
+  for (i = 0; i < n; ++i) {
+    switch (type) {
+      case 1:
+        line_buffer[i] = z[i];
+        break;
+      case 2:
+        line_buffer[i] = z[i] - z[i - signed_stride];
+        break;
+      case 3:
+        line_buffer[i] = z[i] - (z[i - signed_stride] >> 1);
+        break;
+      case 4:
+        line_buffer[i] =
+            (signed char)(z[i] - stbiw__paeth(0, z[i - signed_stride], 0));
+        break;
+      case 5:
+        line_buffer[i] = z[i];
+        break;
+      case 6:
+        line_buffer[i] = z[i];
+        break;
+    }
+  }
+
+  switch (type) {
+    case 1:
+      for (i = n; i < width * n; ++i) {
+        line_buffer[i] = z[i] - z[i - n];
+      }
+      break;
+    case 2:
+      for (i = n; i < width * n; ++i) {
+        line_buffer[i] = z[i] - z[i - signed_stride];
+      }
+      break;
+    case 3:
+      for (i = n; i < width * n; ++i) {
+        line_buffer[i] = z[i] - ((z[i - n] + z[i - signed_stride]) >> 1);
+      }
+      break;
+    case 4:
+      for (i = n; i < width * n; ++i) {
+        line_buffer[i] = z[i] - stbiw__paeth(z[i - n], z[i - signed_stride],
+                                             z[i - signed_stride - n]);
+      }
+      break;
+    case 5:
+      for (i = n; i < width * n; ++i) {
+        line_buffer[i] = z[i] - (z[i - n] >> 1);
+      }
+      break;
+    case 6:
+      for (i = n; i < width * n; ++i) {
+        line_buffer[i] = z[i] - stbiw__paeth(z[i - n], 0, 0);
+      }
+      break;
+  }
+}
+
+// Encodes 8-bit pixels with n channels (1=Y, 2=YA, 3=RGB, 4=RGBA) as an
+// in-memory PNG. Returns a malloc()'d buffer owned by the caller and stores
+// its size in *out_len; returns NULL if n is unsupported or memory runs out.
+unsigned char *stbi_write_png_to_mem(const unsigned char *pixels,
+                                     int stride_bytes, int x, int y, int n,
+                                     int *out_len) {
+  int force_filter = stbi_write_force_png_filter;
+  int ctype[5] = {-1, 0, 4, 2, 6};
+  unsigned char sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
+  unsigned char *out, *o, *filt, *zlib;
+  signed char *line_buffer;
+  int j, zlen;
+
+  if (n < 1 || n > 4) return 0;  // ctype[n] is only defined for 1..4
+  if (stride_bytes == 0) stride_bytes = x * n;
+  if (force_filter >= 5) force_filter = -1;  // out of range: auto-select
+
+  filt = malloc((x * n + 1) * y);
+  if (!filt) return 0;
+  line_buffer = malloc(x * n);
+  if (!line_buffer) {
+    free(filt);
+    return 0;
+  }
+  for (j = 0; j < y; ++j) {
+    int filter_type;
+    if (force_filter > -1) {
+      filter_type = force_filter;
+      stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, force_filter,
+                             line_buffer);
+    } else {  // Estimate the best filter by running through all of them:
+      int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
+      for (filter_type = 0; filter_type < 5; filter_type++) {
+        stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, filter_type,
+                               line_buffer);
+
+        // Sum of absolute residuals estimates the entropy; less is better.
+        est = 0;
+        for (i = 0; i < x * n; ++i) {
+          est += abs((signed char)line_buffer[i]);
+        }
+        if (est < best_filter_val) {
+          best_filter_val = est;
+          best_filter = filter_type;
+        }
+      }
+      if (filter_type != best_filter) {  // If the last iteration already got us
+                                         // the best filter, don't redo it
+        stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, best_filter,
+                               line_buffer);
+        filter_type = best_filter;
+      }
+    }
+    // here filter_type is the chosen filter and line_buffer holds its output
+    filt[j * (x * n + 1)] = (unsigned char)filter_type;
+    memmove(filt + j * (x * n + 1) + 1, line_buffer, x * n);
+  }
+  free(line_buffer);
+  zlib = stbi_zlib_compress(filt, y * (x * n + 1), &zlen,
+                            stbi_write_png_compression_level);
+  free(filt);
+  if (!zlib) return 0;
+
+  // each tag requires 12 bytes of overhead
+  out = malloc(8 + 12 + 13 + 12 + zlen + 12);
+  if (!out) {
+    free(zlib);  // the compressed stream would otherwise leak here
+    return 0;
+  }
+  *out_len = 8 + 12 + 13 + 12 + zlen + 12;
+
+  memmove(out, sig, 8);
+  o = out + 8;
+  stbiw__wp32(o, 13);  // header length
+  stbiw__wptag(o, "IHDR");
+  stbiw__wp32(o, x);
+  stbiw__wp32(o, y);
+  *o++ = 8;                      // bit depth
+  *o++ = STBIW_UCHAR(ctype[n]);  // color type
+  *o++ = 0;                      // compression method
+  *o++ = 0;                      // filter method
+  *o++ = 0;                      // interlace method
+  stbiw__wpcrc(&o, 13);
+
+  stbiw__wp32(o, zlen);
+  stbiw__wptag(o, "IDAT");
+  memmove(o, zlib, zlen);
+  o += zlen;
+  free(zlib);
+  stbiw__wpcrc(&o, zlen);
+
+  stbiw__wp32(o, 0);
+  stbiw__wptag(o, "IEND");
+  stbiw__wpcrc(&o, 0);
+
+  assert(o == out + *out_len);
+  return out;
+}
+
+int stbi_write_png(const char *filename, int x, int y, int comp,
+                   const void *data, int stride_bytes) {
+  int len, ok;
+  FILE *f;
+  unsigned char *png;
+  png = stbi_write_png_to_mem(data, stride_bytes, x, y, comp, &len);
+  if (png == NULL) return 0;
+  ok = 0;
+  // report short writes and flush errors (e.g. disk full) as failure
+  if ((f = fopen(filename, "wb"))) {
+    ok = fwrite(png, 1, len, f) == (size_t)len;
+    // close unconditionally so the stream isn't leaked on a short write
+    if (fclose(f)) ok = 0;
+  }
+  free(png);
+  return ok;
+}
+
+int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y,
+                           int comp, const void *data, int stride_bytes) {
+  int len;
+  unsigned char *png;
+  png = stbi_write_png_to_mem((const unsigned char *)data, stride_bytes, x, y,
+                              comp, &len);
+  if (png == NULL) return 0;
+  func(context, png, len);
+  free(png);
+  return 1;
+}
+
 /* JPEG writer
  *
  * This is based on Jon Olick's jo_jpeg.cpp:
@@ -472,24 +659,25 @@ static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
 }
 
 static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf,
-                                int *bitCnt, float *CDU, float *fdtbl, int DC,
+                                int *bitCnt, float *CDU, unsigned du_stride,
+                                float *fdtbl, int DC,
                                 const unsigned short HTDC[256][2],
                                 const unsigned short HTAC[256][2]) {
   const unsigned short EOB[2] = {HTAC[0x00][0], HTAC[0x00][1]};
   const unsigned short M16zeroes[2] = {HTAC[0xF0][0], HTAC[0xF0][1]};
-  unsigned i, diff, end0pos;
+  unsigned i, j, diff, end0pos, x, y;
   int DU[64];
 
-  dctjpeg((void *)CDU);
+  dctjpeg((void *)CDU, du_stride / 8);
 
   // Quantize/descale/zigzag the coefficients
-  for (i = 0; i < 64; ++i) {
-    float v = CDU[i] * fdtbl[i];
-    DU[stbiw__jpg_ZigZag[i]] = v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f);
-    // DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v +
-    // 0.5f)); ceilf() and floorf() are C99, not C89, but I /think/ they're not
-    // needed here anyway?
-    /* DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? v - 0.5f : v + 0.5f); */
+  for (j = 0, y = 0; y < 8; ++y) {
+    for (x = 0; x < 8; ++x, ++j) {
+      float v;
+      i = y * du_stride + x;
+      v = CDU[i] * fdtbl[j];
+      DU[stbiw__jpg_ZigZag[j]] = v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f);
+    }
   }
 
   // Encode DC
@@ -709,7 +897,7 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height,
       1.0f * 2.828427125f,         0.785694958f * 2.828427125f,
       0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f};
 
-  int row, col, i, k;
+  int row, col, i, k, subsample;
   float fdtbl_Y[64], fdtbl_UV[64];
   unsigned char YTable[64], UVTable[64];
 
@@ -718,6 +906,7 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height,
   }
 
   quality = quality ? quality : 97;
+  subsample = quality <= 97 ? 1 : 0;
   quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
   quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
 
@@ -758,7 +947,7 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height,
         STBIW_UCHAR(width),
         3,
         1,
-        0x11,
+        (unsigned char)(subsample ? 0x22 : 0x11),
         0,
         2,
         0x11,
@@ -802,42 +991,92 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height,
   // Encode 8x8 macroblocks
   {
     static const unsigned short fillBits[] = {0x7F, 7};
-    const unsigned char *imageData = (const unsigned char *)data;
     int DCY = 0, DCU = 0, DCV = 0;
     int bitBuf = 0, bitCnt = 0;
     // comp == 2 is grey+alpha (alpha is ignored)
     int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
+    const unsigned char *dataR = (const unsigned char *)data;
+    const unsigned char *dataG = dataR + ofsG;
+    const unsigned char *dataB = dataR + ofsB;
     int x, y, pos;
-    for (y = 0; y < height; y += 8) {
-      for (x = 0; x < width; x += 8) {
-        float YDU[64], UDU[64], VDU[64];
-        for (row = y, pos = 0; row < y + 8; ++row) {
-          // row >= height => use last input row
-          int clamped_row = (row < height) ? row : height - 1;
-          int base_p =
-              (stbi__flip_vertically_on_write ? (height - 1 - clamped_row)
-                                              : clamped_row) *
-              width * comp;
-          for (col = x; col < x + 8; ++col, ++pos) {
-            float r, g, b;
-            // if col >= width => use pixel from last input column
-            int p = base_p + ((col < width) ? col : (width - 1)) * comp;
+    if (subsample) {
+      for (y = 0; y < height; y += 16) {
+        for (x = 0; x < width; x += 16) {
+          float Y[256], U[256], V[256];
+          for (row = y, pos = 0; row < y + 16; ++row) {
+            // row >= height => use last input row
+            int clamped_row = (row < height) ? row : height - 1;
+            int base_p =
+                (stbi__flip_vertically_on_write ? (height - 1 - clamped_row)
+                                                : clamped_row) *
+                width * comp;
+            for (col = x; col < x + 16; ++col, ++pos) {
+              // if col >= width => use pixel from last input column
+              int p = base_p + ((col < width) ? col : (width - 1)) * comp;
+              float r = dataR[p], g = dataG[p], b = dataB[p];
+              Y[pos] = +0.29900f * r + 0.58700f * g + 0.11400f * b - 128;
+              U[pos] = -0.16874f * r - 0.33126f * g + 0.50000f * b;
+              V[pos] = +0.50000f * r - 0.41869f * g - 0.08131f * b;
+            }
+          }
 
-            r = imageData[p + 0];
-            g = imageData[p + ofsG];
-            b = imageData[p + ofsB];
-            YDU[pos] = +0.29900f * r + 0.58700f * g + 0.11400f * b - 128;
-            UDU[pos] = -0.16874f * r - 0.33126f * g + 0.50000f * b;
-            VDU[pos] = +0.50000f * r - 0.41869f * g - 0.08131f * b;
+          DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y + 0, 16, fdtbl_Y,
+                                     DCY, YDC_HT, YAC_HT);
+          DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y + 8, 16, fdtbl_Y,
+                                     DCY, YDC_HT, YAC_HT);
+          DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y + 128, 16, fdtbl_Y,
+                                     DCY, YDC_HT, YAC_HT);
+          DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y + 136, 16, fdtbl_Y,
+                                     DCY, YDC_HT, YAC_HT);
+
+          // subsample U,V
+          {
+            float subU[64], subV[64];
+            int yy, xx;
+            for (yy = 0, pos = 0; yy < 8; ++yy) {
+              for (xx = 0; xx < 8; ++xx, ++pos) {
+                int j = yy * 32 + xx * 2;
+                subU[pos] =
+                    (U[j + 0] + U[j + 1] + U[j + 16] + U[j + 17]) * 0.25f;
+                subV[pos] =
+                    (V[j + 0] + V[j + 1] + V[j + 16] + V[j + 17]) * 0.25f;
+              }
+            }
+            DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subU, 8, fdtbl_UV,
+                                       DCU, UVDC_HT, UVAC_HT);
+            DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, subV, 8, fdtbl_UV,
+                                       DCV, UVDC_HT, UVAC_HT);
           }
         }
+      }
+    } else {
+      for (y = 0; y < height; y += 8) {
+        for (x = 0; x < width; x += 8) {
+          float Y[64], U[64], V[64];
+          for (row = y, pos = 0; row < y + 8; ++row) {
+            // row >= height => use last input row
+            int clamped_row = (row < height) ? row : height - 1;
+            int base_p =
+                (stbi__flip_vertically_on_write ? (height - 1 - clamped_row)
+                                                : clamped_row) *
+                width * comp;
+            for (col = x; col < x + 8; ++col, ++pos) {
+              // if col >= width => use pixel from last input column
+              int p = base_p + ((col < width) ? col : (width - 1)) * comp;
+              float r = dataR[p], g = dataG[p], b = dataB[p];
+              Y[pos] = +0.29900f * r + 0.58700f * g + 0.11400f * b - 128;
+              U[pos] = -0.16874f * r - 0.33126f * g + 0.50000f * b;
+              V[pos] = +0.50000f * r - 0.41869f * g - 0.08131f * b;
+            }
+          }
 
-        DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, YDU, fdtbl_Y, DCY,
-                                   YDC_HT, YAC_HT);
-        DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, UDU, fdtbl_UV, DCU,
-                                   UVDC_HT, UVAC_HT);
-        DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, VDU, fdtbl_UV, DCV,
-                                   UVDC_HT, UVAC_HT);
+          DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, Y, 8, fdtbl_Y, DCY,
+                                     YDC_HT, YAC_HT);
+          DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, U, 8, fdtbl_UV, DCU,
+                                     UVDC_HT, UVAC_HT);
+          DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, V, 8, fdtbl_UV, DCV,
+                                     UVDC_HT, UVAC_HT);
+        }
       }
     }
 
@@ -854,14 +1093,14 @@ static int stbi_write_jpg_core(stbi__write_context *s, int width, int height,
 
 int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y,
                            int comp, const void *data, int quality) {
-  stbi__write_context s;
+  stbi__write_context s = {0};
   stbi__start_write_callbacks(&s, func, context);
   return stbi_write_jpg_core(&s, x, y, comp, (void *)data, quality);
 }
 
 int stbi_write_jpg(char const *filename, int x, int y, int comp,
                    const void *data, int quality) {
-  stbi__write_context s;
+  stbi__write_context s = {0};
   if (stbi__start_write_file(&s, filename)) {
     int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
     stbi__end_write_file(&s);
@@ -1026,14 +1265,14 @@ static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp,
 
 int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y,
                            int comp, const float *data) {
-  stbi__write_context s;
+  stbi__write_context s = {0};
   stbi__start_write_callbacks(&s, func, context);
   return stbi_write_hdr_core(&s, x, y, comp, (float *)data);
 }
 
 int stbi_write_hdr(char const *filename, int x, int y, int comp,
                    const float *data) {
-  stbi__write_context s;
+  stbi__write_context s = {0};
   if (stbi__start_write_file(&s, filename)) {
     int r = stbi_write_hdr_core(&s, x, y, comp, (float *)data);
     stbi__end_write_file(&s);
diff --git a/third_party/stb/stb_image_write.h b/third_party/stb/stb_image_write.h
index 2b21f4f15..f1ad8e167 100644
--- a/third_party/stb/stb_image_write.h
+++ b/third_party/stb/stb_image_write.h
@@ -3,7 +3,6 @@
 COSMOPOLITAN_C_START_
 
 extern int stbi_write_png_compression_level;
-extern int stbi__flip_vertically_on_write;
 extern int stbi_write_tga_with_rle;
 extern int stbi_write_force_png_filter;
 
diff --git a/third_party/stb/stb_image_write_png.c b/third_party/stb/stb_image_write_png.c
deleted file mode 100644
index 9ee926f58..000000000
--- a/third_party/stb/stb_image_write_png.c
+++ /dev/null
@@ -1,379 +0,0 @@
-/* stb_image_write - v1.13 - public domain - http://nothings.org/stb
- * writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
- *                                  no warranty implied; use at your own risk
- *
- * ABOUT:
- *
- *    This file is a library for writing images to stdio or a callback.
- *
- *    The PNG output is not optimal; it is 20-50% larger than the file
- *    written by a decent optimizing implementation; though providing a
- *    custom zlib compress function (see STBIW_ZLIB_COMPRESS) can
- *    mitigate that. This library is designed for source code
- *    compactness and simplicity, not optimal image file size or
- *    run-time performance.
- *
- * USAGE:
- *
- *    There are five functions, one for each image file format:
- *
- *      stbi_write_png
- *      stbi_write_bmp
- *      stbi_write_tga
- *      stbi_write_jpg
- *      stbi_write_hdr
- *
- *      stbi_flip_vertically_on_write
- *
- *    There are also five equivalent functions that use an arbitrary
- *    write function. You are expected to open/close your
- *    file-equivalent before and after calling these:
- *
- *      stbi_write_png_to_func
- *      stbi_write_bmp_to_func
- *      stbi_write_tga_to_func
- *      stbi_write_hdr_to_func
- *      stbi_write_jpg_to_func
- *
- *    where the callback is:
- *       void stbi_write_func(void *context, void *data, int size);
- *
- *    You can configure it with these:
- *       stbi_write_tga_with_rle
- *       stbi_write_png_compression_level
- *       stbi_write_force_png_filter
- *
- *    Each function returns 0 on failure and non-0 on success.
- *
- *    The functions create an image file defined by the parameters. The
- *    image is a rectangle of pixels stored from left-to-right,
- *    top-to-bottom. Each pixel contains 'comp' channels of data stored
- *    interleaved with 8-bits per channel, in the following order: 1=Y,
- *    2=YA, 3=RGB, 4=RGBA. (Y is monochrome color.) The rectangle is 'w'
- *    pixels wide and 'h' pixels tall. The *data pointer points to the
- *    first byte of the top-left-most pixel. For PNG, "stride_in_bytes"
- *    is the distance in bytes from the first byte of a row of pixels to
- *    the first byte of the next row of pixels.
- *
- *    PNG creates output files with the same number of components as the
- *    input. The BMP format expands Y to RGB in the file format and does
- *    not output alpha.
- *
- *    PNG supports writing rectangles of data even when the bytes
- *    storing rows of data are not consecutive in memory (e.g.
- *    sub-rectangles of a larger image), by supplying the stride between
- *    the beginning of adjacent rows. The other formats do not. (Thus
- *    you cannot write a native-format BMP through the BMP writer, both
- *    because it is in BGR order and because it may have padding at the
- *    end of the line.)
- *
- *    PNG allows you to set the deflate compression level by setting the
- *    global variable 'stbi_write_png_compression_level' (it defaults to
- *    8).
- *
- *    HDR expects linear float data. Since the format is always 32-bit
- *    rgb(e) data, alpha (if provided) is discarded, and for monochrome
- *    data it is replicated across all three channels.
- *
- *    TGA supports RLE or non-RLE compressed data. To use
- *    non-RLE-compressed data, set the global variable
- *    'stbi_write_tga_with_rle' to 0.
- *
- *    JPEG does ignore alpha channels in input data; quality is between
- *    1 and 100. Higher quality looks better but results in a bigger
- *    image. JPEG baseline (no JPEG progressive).
- *
- * CREDITS:
- *
- *
- *    Sean Barrett           -    PNG/BMP/TGA
- *    Baldur Karlsson        -    HDR
- *    Jean-Sebastien Guay    -    TGA monochrome
- *    Tim Kelsey             -    misc enhancements
- *    Alan Hickman           -    TGA RLE
- *    Emmanuel Julien        -    initial file IO callback implementation
- *    Jon Olick              -    original jo_jpeg.cpp code
- *    Daniel Gibson          -    integrate JPEG, allow external zlib
- *    Aarni Koskela          -    allow choosing PNG filter
- *
- *    bugfixes:
- *       github:Chribba
- *       Guillaume Chereau
- *       github:jry2
- *       github:romigrou
- *       Sergio Gonzalez
- *       Jonas Karlsson
- *       Filip Wasil
- *       Thatcher Ulrich
- *       github:poppolopoppo
- *       Patrick Boettcher
- *       github:xeekworx
- *       Cap Petschulat
- *       Simon Rodriguez
- *       Ivan Tikhonov
- *       github:ignotion
- *       Adam Schackart
- *
- * LICENSE
- *
- *   Public Domain (www.unlicense.org)
- */
-#include "libc/assert.h"
-#include "libc/fmt/conv.h"
-#include "libc/limits.h"
-#include "libc/mem/mem.h"
-#include "libc/stdio/stdio.h"
-#include "libc/str/str.h"
-#include "third_party/stb/stb_image_write.h"
-#include "third_party/zlib/zlib.h"
-
-#define STBIW_UCHAR(x) (unsigned char)((x)&0xff)
-#define stbiw__wpng4(o, a, b, c, d)                                           \
-  ((o)[0] = STBIW_UCHAR(a), (o)[1] = STBIW_UCHAR(b), (o)[2] = STBIW_UCHAR(c), \
-   (o)[3] = STBIW_UCHAR(d), (o) += 4)
-#define stbiw__wp32(data, v) \
-  stbiw__wpng4(data, (v) >> 24, (v) >> 16, (v) >> 8, (v));
-#define stbiw__wptag(data, s) stbiw__wpng4(data, s[0], s[1], s[2], s[3])
-
-int stbi_write_png_compression_level = 4;
-int stbi_write_force_png_filter = -1;
-
-static unsigned char *stbi_zlib_compress(unsigned char *data, int size,
-                                         int *out_len, int quality) {
-  unsigned long newsize;
-  unsigned char *newdata, *trimdata;
-  assert(0 <= size && size <= INT_MAX);
-  if ((newdata = malloc((newsize = compressBound(size)))) &&
-      compress2(newdata, &newsize, data, size,
-                stbi_write_png_compression_level) == Z_OK) {
-    *out_len = newsize;
-    if ((trimdata = realloc(newdata, newsize))) {
-      return trimdata;
-    } else {
-      return newdata;
-    }
-  }
-  free(newdata);
-  return NULL;
-}
-
-static void stbiw__wpcrc(unsigned char **data, int len) {
-  unsigned int crc = crc32(0, *data - len - 4, len + 4);
-  stbiw__wp32(*data, crc);
-}
-
-forceinline unsigned char stbiw__paeth(int a, int b, int c) {
-  int p = a + b - c, pa = abs(p - a), pb = abs(p - b), pc = abs(p - c);
-  if (pa <= pb && pa <= pc) return STBIW_UCHAR(a);
-  if (pb <= pc) return STBIW_UCHAR(b);
-  return STBIW_UCHAR(c);
-}
-
-// @OPTIMIZE: provide an option that always forces left-predict or paeth predict
-static void stbiw__encode_png_line(const unsigned char *pixels,
-                                   int stride_bytes, int width, int height,
-                                   int y, int n, int filter_type,
-                                   signed char *line_buffer) {
-  int mapping[] = {0, 1, 2, 3, 4};
-  int firstmap[] = {0, 1, 0, 5, 6};
-  const unsigned char *z;
-  int *mymap, i, type, signed_stride;
-
-  mymap = (y != 0) ? mapping : firstmap;
-  type = mymap[filter_type];
-  z = pixels +
-      stride_bytes * (stbi__flip_vertically_on_write ? height - 1 - y : y);
-  signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes;
-
-  if (type == 0) {
-    memcpy(line_buffer, z, width * n);
-    return;
-  }
-
-  for (i = 0; i < n; ++i) {
-    switch (type) {
-      case 1:
-        line_buffer[i] = z[i];
-        break;
-      case 2:
-        line_buffer[i] = z[i] - z[i - signed_stride];
-        break;
-      case 3:
-        line_buffer[i] = z[i] - (z[i - signed_stride] >> 1);
-        break;
-      case 4:
-        line_buffer[i] =
-            (signed char)(z[i] - stbiw__paeth(0, z[i - signed_stride], 0));
-        break;
-      case 5:
-        line_buffer[i] = z[i];
-        break;
-      case 6:
-        line_buffer[i] = z[i];
-        break;
-    }
-  }
-
-  switch (type) {
-    case 1:
-      for (i = n; i < width * n; ++i) {
-        line_buffer[i] = z[i] - z[i - n];
-      }
-      break;
-    case 2:
-      for (i = n; i < width * n; ++i) {
-        line_buffer[i] = z[i] - z[i - signed_stride];
-      }
-      break;
-    case 3:
-      for (i = n; i < width * n; ++i) {
-        line_buffer[i] = z[i] - ((z[i - n] + z[i - signed_stride]) >> 1);
-      }
-      break;
-    case 4:
-      for (i = n; i < width * n; ++i) {
-        line_buffer[i] = z[i] - stbiw__paeth(z[i - n], z[i - signed_stride],
-                                             z[i - signed_stride - n]);
-      }
-      break;
-    case 5:
-      for (i = n; i < width * n; ++i) {
-        line_buffer[i] = z[i] - (z[i - n] >> 1);
-      }
-      break;
-    case 6:
-      for (i = n; i < width * n; ++i) {
-        line_buffer[i] = z[i] - stbiw__paeth(z[i - n], 0, 0);
-      }
-      break;
-  }
-}
-
-unsigned char *stbi_write_png_to_mem(const unsigned char *pixels,
-                                     int stride_bytes, int x, int y, int n,
-                                     int *out_len) {
-  int force_filter = stbi_write_force_png_filter;
-  int ctype[5] = {-1, 0, 4, 2, 6};
-  unsigned char sig[8] = {137, 80, 78, 71, 13, 10, 26, 10};
-  unsigned char *out, *o, *filt, *zlib;
-  signed char *line_buffer;
-  int j, zlen;
-
-  if (stride_bytes == 0) stride_bytes = x * n;
-
-  if (force_filter >= 5) {
-    force_filter = -1;
-  }
-
-  filt = malloc((x * n + 1) * y);
-  if (!filt) return 0;
-  line_buffer = malloc(x * n);
-  if (!line_buffer) {
-    free(filt);
-    return 0;
-  }
-  for (j = 0; j < y; ++j) {
-    int filter_type;
-    if (force_filter > -1) {
-      filter_type = force_filter;
-      stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, force_filter,
-                             line_buffer);
-    } else {  // Estimate the best filter by running through all of them:
-      int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
-      for (filter_type = 0; filter_type < 5; filter_type++) {
-        stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, filter_type,
-                               line_buffer);
-
-        // Estimate the entropy of the line using this filter; the less, the
-        // better.
-        est = 0;
-        for (i = 0; i < x * n; ++i) {
-          est += abs((signed char)line_buffer[i]);
-        }
-        if (est < best_filter_val) {
-          best_filter_val = est;
-          best_filter = filter_type;
-        }
-      }
-      if (filter_type != best_filter) {  // If the last iteration already got us
-                                         // the best filter, don't redo it
-        stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, best_filter,
-                               line_buffer);
-        filter_type = best_filter;
-      }
-    }
-    // when we get here, filter_type contains the filter type, and line_buffer
-    // contains the data
-    filt[j * (x * n + 1)] = (unsigned char)filter_type;
-    memmove(filt + j * (x * n + 1) + 1, line_buffer, x * n);
-  }
-  free(line_buffer);
-  zlib = stbi_zlib_compress(filt, y * (x * n + 1), &zlen,
-                            stbi_write_png_compression_level);
-  free(filt);
-  if (!zlib) return 0;
-
-  // each tag requires 12 bytes of overhead
-  out = malloc(8 + 12 + 13 + 12 + zlen + 12);
-  if (!out) return 0;
-  *out_len = 8 + 12 + 13 + 12 + zlen + 12;
-
-  o = out;
-  memmove(o, sig, 8);
-  o += 8;
-  stbiw__wp32(o, 13);  // header length
-  stbiw__wptag(o, "IHDR");
-  stbiw__wp32(o, x);
-  stbiw__wp32(o, y);
-  *o++ = 8;
-  *o++ = STBIW_UCHAR(ctype[n]);
-  *o++ = 0;
-  *o++ = 0;
-  *o++ = 0;
-  stbiw__wpcrc(&o, 13);
-
-  stbiw__wp32(o, zlen);
-  stbiw__wptag(o, "IDAT");
-  memmove(o, zlib, zlen);
-  o += zlen;
-  free(zlib);
-  stbiw__wpcrc(&o, zlen);
-
-  stbiw__wp32(o, 0);
-  stbiw__wptag(o, "IEND");
-  stbiw__wpcrc(&o, 0);
-
-  assert(o == out + *out_len);
-
-  return out;
-}
-
-int stbi_write_png(const char *filename, int x, int y, int comp,
-                   const void *data, int stride_bytes) {
-  int len;
-  FILE *f;
-  unsigned char *png;
-  png = stbi_write_png_to_mem(data, stride_bytes, x, y, comp, &len);
-  if (png == NULL) return 0;
-  f = fopen(filename, "wb");
-  if (!f) {
-    free(png);
-    return 0;
-  }
-  fwrite(png, 1, len, f);
-  fclose(f);
-  free(png);
-  return 1;
-}
-
-int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y,
-                           int comp, const void *data, int stride_bytes) {
-  int len;
-  unsigned char *png;
-  png = stbi_write_png_to_mem((const unsigned char *)data, stride_bytes, x, y,
-                              comp, &len);
-  if (png == NULL) return 0;
-  func(context, png, len);
-  free(png);
-  return 1;
-}
diff --git a/third_party/stb/stb_rect_pack.c b/third_party/stb/stb_rect_pack.c
index 052a2c3f1..65fab7afa 100644
--- a/third_party/stb/stb_rect_pack.c
+++ b/third_party/stb/stb_rect_pack.c
@@ -1,29 +1,20 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:3;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=3 sts=3 sw=3 fenc=utf-8                               :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
 │                                                                              │
-│  stb_truetype                                                                │
-│  Copyright 2017 Sean Barrett                                                 │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
 │                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "third_party/stb/stb_rect_pack.h"
 #include "libc/assert.h"
@@ -41,8 +32,6 @@ asm(".include \"libc/disclaimer.inc\"");
 // Useful for e.g. packing rectangular textures into an atlas.
 // Does not do rotation.
 //
-// in the file that you want to have the implementation.
-//
 // Not necessarily the awesomest packing method, but better than
 // the totally naive one in stb_truetype (which is primarily what
 // this is meant to replace).
@@ -390,7 +379,11 @@ static int rect_height_compare(const void *a, const void *b)
       return -1;
    if (p->h < q->h)
       return  1;
-   return (p->w > q->w) ? -1 : (p->w < q->w);
+   if (p->w > q->w)
+      return -1;
+   if (p->w < q->w)
+      return  1;
+   return (p->was_packed < q->was_packed) ? -1 : (p->was_packed > q->was_packed);
 }
 
 static int rect_original_order(const void *a, const void *b)
diff --git a/third_party/stb/stb_truetype.c b/third_party/stb/stb_truetype.c
index 742b03469..a59346df3 100644
--- a/third_party/stb/stb_truetype.c
+++ b/third_party/stb/stb_truetype.c
@@ -1,5 +1,5 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:3;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=3 sts=3 sw=3 fenc=utf-8                               :vi │
+/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
+│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
 ╞══════════════════════════════════════════════════════════════════════════════╡
 │                                                                              │
 │  stb_truetype                                                                │
diff --git a/third_party/stb/stb_vorbis.c b/third_party/stb/stb_vorbis.c
index 88124b77d..3b3da48f3 100644
--- a/third_party/stb/stb_vorbis.c
+++ b/third_party/stb/stb_vorbis.c
@@ -32,6 +32,7 @@
 //    manxorist@github   saga musix          github:infatum
 //    Timur Gagiev       Maxwell Koo
 //
+
 #include "third_party/stb/stb_vorbis.h"
 #include "libc/assert.h"
 #include "libc/calls/calls.h"
@@ -45,6 +46,11 @@
 #include "libc/mem/mem.h"
 #include "libc/str/str.h"
 
+asm(".ident\t\"\\n\\n\
+stb_vorbis (Public Domain)\\n\
+Credit: Sean Barrett, et al.\\n\
+http://nothings.org/stb\"");
+
 // STB_VORBIS_NO_PUSHDATA_API
 //     does not compile the code for the various stb_vorbis_*_pushdata()
 //     functions
@@ -343,6 +349,10 @@ struct stb_vorbis {
   unsigned int temp_memory_required;
   unsigned int setup_temp_memory_required;
 
+  char *vendor;
+  int comment_list_length;
+  char **comment_list;
+
   // input config
 #ifndef STB_VORBIS_NO_STDIO
   FILE *f;
@@ -358,8 +368,11 @@ struct stb_vorbis {
 
   uint8 push_mode;
 
+  // the page to seek to when seeking to start, may be zero
   uint32 first_audio_page_offset;
 
+  // p_first is the page on which the first audio packet ends
+  // (but not necessarily the page on which it starts)
   ProbedPage p_first, p_last;
 
   // memory management
@@ -493,7 +506,7 @@ static dontinline void *make_block_array(void *mem, int count, int size) {
 }
 
 static dontinline void *setup_malloc(vorb *f, int sz) {
-  sz = (sz + 3) & ~3;
+  sz = (sz + 7) & ~7;  // round up to nearest 8 for alignment of future allocs.
   f->setup_memory_required += sz;
   if (f->alloc.alloc_buffer) {
     void *p = (char *)f->alloc.alloc_buffer + f->setup_offset;
@@ -510,7 +523,7 @@ static dontinline void setup_free(vorb *f, void *p) {
 }
 
 static dontinline void *setup_temp_malloc(vorb *f, int sz) {
-  sz = (sz + 3) & ~3;
+  sz = (sz + 7) & ~7;  // round up to nearest 8 for alignment of future allocs.
   if (f->alloc.alloc_buffer) {
     if (f->temp_offset - sz < f->setup_offset) return NULL;
     f->temp_offset -= sz;
@@ -521,7 +534,7 @@ static dontinline void *setup_temp_malloc(vorb *f, int sz) {
 
 static dontinline void setup_temp_free(vorb *f, void *p, int sz) {
   if (f->alloc.alloc_buffer) {
-    f->temp_offset += (sz + 3) & ~3;
+    f->temp_offset += (sz + 7) & ~7;
     return;
   }
   free(p);
@@ -593,7 +606,7 @@ static float float32_unpack(uint32 x) {
   uint32 sign = x & 0x80000000;
   uint32 exp = (x & 0x7fe00000) >> 21;
   double res = sign ? -(double)mantissa : (double)mantissa;
-  return (float)ldexp((float)res, exp - 788);
+  return (float)ldexp((float)res, (int)exp - 788);
 }
 
 // zlib & jpeg huffman tables assume that the output symbols
@@ -636,6 +649,8 @@ static int compute_codewords(Codebook *c, uint8 *len, int n, uint32 *values) {
     assert(c->sorted_entries == 0);
     return TRUE;
   }
+  // no error return required, code reading lens checks this
+  assert(len[k] < 32);
   // add to the list
   add_entry(c, 0, k, m++, len[k], values);
   // add all available leaves
@@ -648,6 +663,8 @@ static int compute_codewords(Codebook *c, uint8 *len, int n, uint32 *values) {
     uint32 res;
     int z = len[i], y;
     if (z == NO_CODE) continue;
+    // no error return required, code reading lens checks this
+    assert(z < 32);
     // find lowest available leaf (should always be earliest,
     // which is what the specification calls for)
     // note that this property, and the fact we can never have
@@ -659,12 +676,10 @@ static int compute_codewords(Codebook *c, uint8 *len, int n, uint32 *values) {
       return FALSE;
     }
     res = available[z];
-    assert(z >= 0 && z < 32);
     available[z] = 0;
     add_entry(c, ReverseBits32(res), i, m++, len[i], values);
     // propagate availability up the tree
     if (z != len[i]) {
-      assert(len[i] >= 0 && len[i] < 32);
       for (y = len[i]; y > z; --y) {
         assert(available[y] == 0);
         available[y] = res + (1 << (32 - y));
@@ -991,6 +1006,9 @@ static int capture_pattern(vorb *f) {
 
 static int start_page_no_capturepattern(vorb *f) {
   uint32 loc0, loc1, n;
+  if (f->first_decode && !IS_PUSH_MODE(f)) {
+    f->p_first.page_start = stb_vorbis_get_file_offset(f) - 4;
+  }
   // stream structure version
   if (0 != get8(f)) return error(f, VORBIS_invalid_stream_structure_version);
   // header flag
@@ -1027,14 +1045,12 @@ static int start_page_no_capturepattern(vorb *f) {
   }
   if (f->first_decode) {
     int i, len;
-    ProbedPage p;
     len = 0;
     for (i = 0; i < f->segment_count; ++i) len += f->segments[i];
     len += 27 + f->segment_count;
-    p.page_start = f->first_audio_page_offset;
-    p.page_end = p.page_start + len;
-    p.last_decoded_sample = loc0;
-    f->p_first = p;
+
+    f->p_first.page_end = f->p_first.page_start + len;
+    f->p_first.last_decoded_sample = loc0;
   }
   f->next_seg = 0;
   return TRUE;
@@ -1124,6 +1140,15 @@ static int get8_packet(vorb *f) {
   return x;
 }
 
+static int get32_packet(vorb *f) {
+  uint32 word = 0;
+  int shift;
+  for (shift = 0; shift < 32; shift += 8) {
+    word += (uint32)get8_packet(f) << shift;  // least significant byte first
+  }
+  return word;
+}
+
 static void flush_packet(vorb *f) {
   while (get8_packet_raw(f) != EOP)
     ;
@@ -1153,7 +1178,7 @@ static uint32 get_bits(vorb *f, int n) {
       f->valid_bits += 8;
     }
   }
-  if (f->valid_bits < 0) return 0;
+  assert(f->valid_bits >= n);
   z = f->acc & ((1 << n) - 1);
   f->acc >>= n;
   f->valid_bits -= n;
@@ -1225,7 +1250,7 @@ static int codebook_decode_scalar_raw(vorb *f, Codebook *c) {
   assert(!c->sparse);
   for (i = 0; i < c->entries; ++i) {
     if (c->codeword_lengths[i] == NO_CODE) continue;
-    if (c->codewords[i] == (f->acc & ((1 << c->codeword_lengths[i]) - 1))) {
+    if (c->codewords[i] == (f->acc & ((1u << c->codeword_lengths[i]) - 1))) {
       if (f->valid_bits >= c->codeword_lengths[i]) {
         f->acc >>= c->codeword_lengths[i];
         f->valid_bits -= c->codeword_lengths[i];
@@ -1414,7 +1439,8 @@ static int codebook_decode_deinterleave_repeat(vorb *f, Codebook *c,
     // buffer (len*ch), our current offset within it (p_inter*ch)+(c_inter),
     // and the length we'll be using (effective)
     if (c_inter + p_inter * ch + effective > len * ch) {
-      effective = len * ch - (p_inter * ch - c_inter);
+      // https://github.com/nothings/stb/pull/1490
+      effective = len * ch - (p_inter * ch + c_inter);
     }
 
 #ifdef STB_VORBIS_DIVIDES_IN_CODEBOOK
@@ -1717,49 +1743,7 @@ static void decode_residue(vorb *f, float *residue_buffers[], int ch, int n,
           ++class_set;
 #endif
         }
-      } else if (ch == 1) {
-        while (pcount < part_read) {
-          int z = r->begin + pcount * r->part_size;
-          int c_inter = 0, p_inter = z;
-          if (pass_ == 0) {
-            Codebook *c = f->codebooks + r->classbook;
-            int q;
-            DECODE(q, f, c);
-            if (q == EOP) goto done;
-#ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
-            part_classdata[0][class_set] = r->classdata[q];
-#else
-            for (i = classwords - 1; i >= 0; --i) {
-              classifications[0][i + pcount] = q % r->classifications;
-              q /= r->classifications;
-            }
-#endif
-          }
-          for (i = 0; i < classwords && pcount < part_read; ++i, ++pcount) {
-            int z = r->begin + pcount * r->part_size;
-#ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
-            int c = part_classdata[0][class_set][i];
-#else
-            int c = classifications[0][pcount];
-#endif
-            int b = r->residue_books[c][pass_];
-            if (b >= 0) {
-              Codebook *book = f->codebooks + b;
-              if (!codebook_decode_deinterleave_repeat(f, book, residue_buffers,
-                                                       ch, &c_inter, &p_inter,
-                                                       n, r->part_size))
-                goto done;
-            } else {
-              z += r->part_size;
-              c_inter = 0;
-              p_inter = z;
-            }
-          }
-#ifndef STB_VORBIS_DIVIDES_IN_RESIDUE
-          ++class_set;
-#endif
-        }
-      } else {
+      } else if (ch > 2) {
         while (pcount < part_read) {
           int z = r->begin + pcount * r->part_size;
           int c_inter = z % ch, p_inter = z / ch;
@@ -2165,34 +2149,33 @@ static void imdct_step3_inner_s_loop_ld654(int n, float *e, int i_off, float *A,
 
   while (z > base) {
     float k00, k11;
+    float l00, l11;
 
     k00 = z[-0] - z[-8];
     k11 = z[-1] - z[-9];
+    l00 = z[-2] - z[-10];
+    l11 = z[-3] - z[-11];
     z[-0] = z[-0] + z[-8];
     z[-1] = z[-1] + z[-9];
-    z[-8] = k00;
-    z[-9] = k11;
-
-    k00 = z[-2] - z[-10];
-    k11 = z[-3] - z[-11];
     z[-2] = z[-2] + z[-10];
     z[-3] = z[-3] + z[-11];
-    z[-10] = (k00 + k11) * A2;
-    z[-11] = (k11 - k00) * A2;
+    z[-8] = k00;
+    z[-9] = k11;
+    z[-10] = (l00 + l11) * A2;
+    z[-11] = (l11 - l00) * A2;
 
-    k00 = z[-12] - z[-4];  // reverse to avoid a unary negation
+    k00 = z[-4] - z[-12];
     k11 = z[-5] - z[-13];
+    l00 = z[-6] - z[-14];
+    l11 = z[-7] - z[-15];
     z[-4] = z[-4] + z[-12];
     z[-5] = z[-5] + z[-13];
-    z[-12] = k11;
-    z[-13] = k00;
-
-    k00 = z[-14] - z[-6];  // reverse to avoid a unary negation
-    k11 = z[-7] - z[-15];
     z[-6] = z[-6] + z[-14];
     z[-7] = z[-7] + z[-15];
-    z[-14] = (k00 + k11) * A2;
-    z[-15] = (k00 - k11) * A2;
+    z[-12] = k11;
+    z[-13] = -k00;
+    z[-14] = (l11 - l00) * A2;
+    z[-15] = (l00 + l11) * -A2;
 
     iter_54(z);
     iter_54(z - 8);
@@ -2630,7 +2613,8 @@ void inverse_mdct_naive(float *buffer, int n)
 #endif
 
 static float *get_window(vorb *f, int len) {
-  len <<= 1;
+  // https://github.com/nothings/stb/pull/1499
+  len = (unsigned int)len << 1;
   if (len == f->blocksize_0) return f->window[0];
   if (len == f->blocksize_1) return f->window[1];
   return NULL;
@@ -2755,8 +2739,8 @@ static int vorbis_decode_packet_rest(vorb *f, int *len, Mode *m, int left_start,
                                      int right_end, int *p_left) {
   Mapping *map;
   int i, j, k, n, n2;
-  int zero_channel[256];
-  int really_zero_channel[256];
+  int zero_channel[256] = {0};
+  int really_zero_channel[256] = {0};
 
   // WINDOWING
 
@@ -2959,7 +2943,9 @@ static int vorbis_decode_packet_rest(vorb *f, int *len, Mode *m, int left_start,
     // this isn't to spec, but spec would require us to read ahead
     // and decode the size of all current frames--could be done,
     // but presumably it's not a commonly used feature
-    f->current_loc = -n2;  // start of first frame is positioned for discard
+    f->current_loc = 0u - n2;  // start of first frame is positioned for discard
+                               // (NB this is an intentional unsigned
+                               // overflow wrap-around)
     // we might have to discard samples "from" the next frame too,
     // if we're lapping a large block then a small at the start?
     f->discard_samples_deferred = n - right_end;
@@ -3089,7 +3075,7 @@ static int vorbis_pump_first_frame(stb_vorbis *f) {
 }
 
 #ifndef STB_VORBIS_NO_PUSHDATA_API
-static int is_whole_packet_present(stb_vorbis *f, int end_page) {
+static int is_whole_packet_present(stb_vorbis *f) {
   // make sure that we have the packet available before continuing...
   // this requires a full ogg parse, but we know we can fetch from f->stream
 
@@ -3109,8 +3095,6 @@ static int is_whole_packet_present(stb_vorbis *f, int end_page) {
         break;
     }
     // either this continues, or it ends it...
-    if (end_page)
-      if (s < f->segment_count - 1) return error(f, VORBIS_invalid_stream);
     if (s == f->segment_count) s = -1;  // set 'crosses page' flag
     if (p > f->stream_end) return error(f, VORBIS_need_more_data);
     first = FALSE;
@@ -3144,8 +3128,6 @@ static int is_whole_packet_present(stb_vorbis *f, int end_page) {
       p += q[s];
       if (q[s] < 255) break;
     }
-    if (end_page)
-      if (s < n - 1) return error(f, VORBIS_invalid_stream);
     if (s == n) s = -1;  // set 'crosses page' flag
     if (p > f->stream_end) return error(f, VORBIS_need_more_data);
     first = FALSE;
@@ -3160,6 +3142,7 @@ static int start_decoder(vorb *f) {
   int longest_floorlist = 0;
 
   // first page, first packet
+  f->first_decode = TRUE;
 
   if (!start_page(f)) return FALSE;
   // validate page flag
@@ -3218,6 +3201,61 @@ static int start_decoder(vorb *f) {
   if (!start_page(f)) return FALSE;
 
   if (!start_packet(f)) return FALSE;
+
+  if (!next_segment(f)) return FALSE;
+
+  if (get8_packet(f) != VORBIS_packet_comment)
+    return error(f, VORBIS_invalid_setup);
+
+  for (i = 0; i < 6; ++i) header[i] = get8_packet(f);
+
+  if (!vorbis_validate(header)) return error(f, VORBIS_invalid_setup);
+  // file vendor: the length is read straight from the stream, so reject
+  // negative values before they reach the allocation size or the
+  // terminator store (len == -1 would otherwise write f->vendor[-1])
+  len = get32_packet(f);
+  if (len < 0) return error(f, VORBIS_invalid_setup);
+  f->vendor = (char *)setup_malloc(f, sizeof(char) * ((size_t)len + 1));
+  if (f->vendor == NULL) return error(f, VORBIS_outofmem);
+  for (i = 0; i < len; ++i) {
+    f->vendor[i] = get8_packet(f);
+  }
+  f->vendor[len] = (char)'\0';
+  // user comments: comment_list_length stays 0 until the list has been
+  // allocated and zeroed, so vorbis_deinit never walks a NULL list or
+  // frees uninitialized slots if setup fails part-way through
+  f->comment_list = NULL;
+  f->comment_list_length = 0;
+  len = get32_packet(f);
+  if (len < 0 || len > 0x7fffffff / (int)sizeof(char *))
+    return error(f, VORBIS_invalid_setup);
+  if (len > 0) {
+    f->comment_list = (char **)setup_malloc(f, sizeof(char *) * len);
+    if (f->comment_list == NULL) return error(f, VORBIS_outofmem);
+    memset(f->comment_list, 0, sizeof(char *) * len);
+    f->comment_list_length = len;
+  }
+
+  for (i = 0; i < f->comment_list_length; ++i) {
+    len = get32_packet(f);
+    if (len < 0) return error(f, VORBIS_invalid_setup);
+    f->comment_list[i] =
+        (char *)setup_malloc(f, sizeof(char) * ((size_t)len + 1));
+    if (f->comment_list[i] == NULL) return error(f, VORBIS_outofmem);
+
+    for (j = 0; j < len; ++j) {
+      f->comment_list[i][j] = get8_packet(f);
+    }
+    f->comment_list[i][len] = (char)'\0';
+  }
+
+  // framing_flag
+  x = get8_packet(f);
+  if (!(x & 1)) return error(f, VORBIS_invalid_setup);
+
+  skip(f, f->bytes_in_seg);
+  f->bytes_in_seg = 0;
+
   do {
     len = next_segment(f);
     skip(f, len);
@@ -3229,7 +3256,7 @@ static int start_decoder(vorb *f) {
 
 #ifndef STB_VORBIS_NO_PUSHDATA_API
   if (IS_PUSH_MODE(f)) {
-    if (!is_whole_packet_present(f, TRUE)) {
+    if (!is_whole_packet_present(f)) {
       // convert error in ogg header to write type
       if (f->error == VORBIS_invalid_stream) f->error = VORBIS_invalid_setup;
       return FALSE;
@@ -3302,7 +3329,10 @@ static int start_decoder(vorb *f) {
         if (present) {
           lengths[j] = get_bits(f, 5) + 1;
           ++total;
-          if (lengths[j] == 32) return error(f, VORBIS_invalid_setup);
+          if (lengths[j] == 32) {
+            if (c->sparse) setup_temp_free(f, lengths, c->entries);
+            return error(f, VORBIS_invalid_setup);
+          }
         } else {
           lengths[j] = NO_CODE;
         }
@@ -3315,7 +3345,10 @@ static int start_decoder(vorb *f) {
         f->setup_temp_memory_required = c->entries;
 
       c->codeword_lengths = (uint8 *)setup_malloc(f, c->entries);
-      if (c->codeword_lengths == NULL) return error(f, VORBIS_outofmem);
+      if (c->codeword_lengths == NULL) {
+        setup_temp_free(f, lengths, c->entries);
+        return error(f, VORBIS_outofmem);
+      }
       memcpy(c->codeword_lengths, lengths, c->entries);
       setup_temp_free(f, lengths,
                       c->entries);  // note this is only safe if there have been
@@ -3349,13 +3382,22 @@ static int start_decoder(vorb *f) {
       unsigned int size;
       if (c->sorted_entries) {
         c->codeword_lengths = (uint8 *)setup_malloc(f, c->sorted_entries);
-        if (!c->codeword_lengths) return error(f, VORBIS_outofmem);
+        if (!c->codeword_lengths) {
+          setup_temp_free(f, lengths, c->entries);
+          return error(f, VORBIS_outofmem);
+        }
         c->codewords = (uint32 *)setup_temp_malloc(
             f, sizeof(*c->codewords) * c->sorted_entries);
-        if (!c->codewords) return error(f, VORBIS_outofmem);
+        if (!c->codewords) {
+          setup_temp_free(f, lengths, c->entries);
+          return error(f, VORBIS_outofmem);
+        }
         values =
             (uint32 *)setup_temp_malloc(f, sizeof(*values) * c->sorted_entries);
-        if (!values) return error(f, VORBIS_outofmem);
+        if (!values) {
+          setup_temp_free(f, lengths, c->entries);
+          return error(f, VORBIS_outofmem);
+        }
       }
       size = c->entries +
              (sizeof(*c->codewords) + sizeof(*values)) * c->sorted_entries;
@@ -3364,7 +3406,10 @@ static int start_decoder(vorb *f) {
     }
 
     if (!compute_codewords(c, lengths, c->entries, values)) {
-      if (c->sparse) setup_temp_free(f, values, 0);
+      if (c->sparse) {
+        setup_temp_free(f, values, 0);
+        setup_temp_free(f, lengths, c->entries);
+      }
       return error(f, VORBIS_invalid_setup);
     }
 
@@ -3372,12 +3417,18 @@ static int start_decoder(vorb *f) {
       // allocate an extra slot for sentinels
       c->sorted_codewords = (uint32 *)setup_malloc(
           f, sizeof(*c->sorted_codewords) * (c->sorted_entries + 1));
-      if (c->sorted_codewords == NULL) return error(f, VORBIS_outofmem);
+      if (c->sorted_codewords == NULL) {
+        if (c->sparse) setup_temp_free(f, lengths, c->entries);
+        return error(f, VORBIS_outofmem);
+      }
       // allocate an extra slot at the front so that c->sorted_values[-1] is
       // defined so that we can catch that case without an extra if
       c->sorted_values = (int *)setup_malloc(
           f, sizeof(*c->sorted_values) * (c->sorted_entries + 1));
-      if (c->sorted_values == NULL) return error(f, VORBIS_outofmem);
+      if (c->sorted_values == NULL) {
+        if (c->sparse) setup_temp_free(f, lengths, c->entries);
+        return error(f, VORBIS_outofmem);
+      }
       ++c->sorted_values;
       c->sorted_values[-1] = -1;
       compute_sorted_huffman(c, lengths, values);
@@ -3446,8 +3497,7 @@ static int start_decoder(vorb *f) {
           unsigned int div = 1;
           for (k = 0; k < c->dimensions; ++k) {
             int off = (z / div) % c->lookup_values;
-            float val = mults[off];
-            val = mults[off] * c->delta_value + c->minimum_value + last;
+            float val = mults[off] * c->delta_value + c->minimum_value + last;
             c->multiplicands[j * c->dimensions + k] = val;
             if (c->sequence_p) last = val;
             if (k + 1 < c->dimensions) {
@@ -3532,7 +3582,7 @@ static int start_decoder(vorb *f) {
             return error(f, VORBIS_invalid_setup);
         }
         for (k = 0; k < 1 << g->class_subclasses[j]; ++k) {
-          g->subclass_books[j][k] = get_bits(f, 8) - 1;
+          g->subclass_books[j][k] = (int16)get_bits(f, 8) - 1;
           if (g->subclass_books[j][k] >= f->codebook_count)
             return error(f, VORBIS_invalid_setup);
         }
@@ -3560,7 +3610,7 @@ static int start_decoder(vorb *f) {
       for (j = 0; j < g->values; ++j) g->sorted_order[j] = (uint8)p[j].id;
       // precompute the neighbors
       for (j = 2; j < g->values; ++j) {
-        int low, hi;
+        int low = 0, hi = 0;
         neighbors(g->Xlist, j, &low, &hi);
         g->neighbors[j][0] = low;
         g->neighbors[j][1] = hi;
@@ -3738,7 +3788,9 @@ static int start_decoder(vorb *f) {
     int i, max_part_read = 0;
     for (i = 0; i < f->residue_count; ++i) {
       Residue *r = f->residue_config + i;
-      unsigned int actual_size = f->blocksize_1 / 2;
+      unsigned int rtype = f->residue_types[i];
+      unsigned int actual_size =
+          rtype == 2 ? f->blocksize_1 : f->blocksize_1 / 2;
       unsigned int limit_r_begin =
           r->begin < actual_size ? r->begin : actual_size;
       unsigned int limit_r_end = r->end < actual_size ? r->end : actual_size;
@@ -3761,8 +3813,6 @@ static int start_decoder(vorb *f) {
       f->temp_memory_required = imdct_mem;
   }
 
-  f->first_decode = TRUE;
-
   if (f->alloc.alloc_buffer) {
     assert(f->temp_offset == f->alloc.alloc_buffer_length_in_bytes);
     // check if there's enough temp memory so we don't error later
@@ -3771,13 +3821,30 @@ static int start_decoder(vorb *f) {
       return error(f, VORBIS_outofmem);
   }
 
-  f->first_audio_page_offset = stb_vorbis_get_file_offset(f);
+  // @TODO: stb_vorbis_seek_start expects first_audio_page_offset to point
+  // to a page without PAGEFLAG_continued_packet, so this either points
+  // to the first page, or the page after the end of the headers. It might
+  // be cleaner to point to a page in the middle of the headers, when that's
+  // the page where the first audio packet starts, but we'd have to also
+  // correctly skip the end of any continued packet in stb_vorbis_seek_start.
+  if (f->next_seg == -1) {
+    f->first_audio_page_offset = stb_vorbis_get_file_offset(f);
+  } else {
+    f->first_audio_page_offset = 0;
+  }
 
   return TRUE;
 }
 
 static void vorbis_deinit(stb_vorbis *p) {
   int i, j;
+
+  setup_free(p, p->vendor);
+  for (i = 0; i < p->comment_list_length; ++i) {
+    setup_free(p, p->comment_list[i]);
+  }
+  setup_free(p, p->comment_list);
+
   if (p->residue_config) {
     for (i = 0; i < p->residue_count; ++i) {
       Residue *r = p->residue_config + i;
@@ -3840,8 +3907,7 @@ static void vorbis_init(stb_vorbis *p, const stb_vorbis_alloc *z) {
   memset(p, 0, sizeof(*p));  // NULL out all malloc'd pointers to start
   if (z) {
     p->alloc = *z;
-    p->alloc.alloc_buffer_length_in_bytes =
-        (p->alloc.alloc_buffer_length_in_bytes + 3) & ~3;
+    p->alloc.alloc_buffer_length_in_bytes &= ~7;
     p->temp_offset = p->alloc.alloc_buffer_length_in_bytes;
   }
   p->eof = 0;
@@ -3873,6 +3939,14 @@ stb_vorbis_info stb_vorbis_get_info(stb_vorbis *f) {
   return d;
 }
 
+stb_vorbis_comment stb_vorbis_get_comment(stb_vorbis *f) {  // returns f's comment fields by value
+  stb_vorbis_comment d;
+  d.vendor = f->vendor;  // pointer is shared with f, not copied
+  d.comment_list_length = f->comment_list_length;
+  d.comment_list = f->comment_list;  // shared too; vorbis_deinit frees these strings
+  return d;
+}
+
 int stb_vorbis_get_error(stb_vorbis *f) {
   int e = f->error;
   f->error = VORBIS__no_error;
@@ -4007,7 +4081,7 @@ int stb_vorbis_decode_frame_pushdata(
   f->error = VORBIS__no_error;
 
   // check that we have the entire packet in memory
-  if (!is_whole_packet_present(f, FALSE)) {
+  if (!is_whole_packet_present(f)) {
     *samples = 0;
     return 0;
   }
@@ -4069,6 +4143,7 @@ stb_vorbis *stb_vorbis_open_pushdata(
       *error = VORBIS_need_more_data;
     else
       *error = p.error;
+    vorbis_deinit(&p);
     return NULL;
   }
   f = vorbis_alloc(&p);
@@ -4121,7 +4196,7 @@ static uint32 vorbis_find_page(stb_vorbis *f, uint32 *end, uint32 *last) {
         if (f->eof) return 0;
         if (header[4] != 0) goto invalid;
         goal = header[22] + (header[23] << 8) + (header[24] << 16) +
-               (header[25] << 24);
+               ((uint32)header[25] << 24);
         for (i = 22; i < 26; ++i) header[i] = 0;
         crc = 0;
         for (i = 0; i < 27; ++i) crc = crc32_update(crc, header[i]);
@@ -4232,8 +4307,8 @@ static int go_to_page_before(stb_vorbis *f, unsigned int limit_offset) {
 static int seek_to_sample_coarse(stb_vorbis *f, uint32 sample_number) {
   ProbedPage left, right, mid;
   int i, start_seg_with_known_loc, end_pos, page_start;
-  uint32 delta, stream_length, padding;
-  double offset, bytes_per_sample;
+  uint32 delta, stream_length, padding, last_sample_limit;
+  double offset = 0.0, bytes_per_sample = 0.0;
   int probe = 0;
 
   bytes_per_sample = 2; /* TODO(jart): ???? */
@@ -4249,9 +4324,9 @@ static int seek_to_sample_coarse(stb_vorbis *f, uint32 sample_number) {
   // indicates should be the granule position (give or take one)).
   padding = ((f->blocksize_1 - f->blocksize_0) >> 2);
   if (sample_number < padding)
-    sample_number = 0;
+    last_sample_limit = 0;
   else
-    sample_number -= padding;
+    last_sample_limit = sample_number - padding;
 
   left = f->p_first;
   while (left.last_decoded_sample == ~0U) {
@@ -4264,8 +4339,11 @@ static int seek_to_sample_coarse(stb_vorbis *f, uint32 sample_number) {
   assert(right.last_decoded_sample != ~0U);
 
   // starting from the start is handled differently
-  if (sample_number <= left.last_decoded_sample) {
-    if (stb_vorbis_seek_start(f)) return 1;
+  if (last_sample_limit <= left.last_decoded_sample) {
+    if (stb_vorbis_seek_start(f)) {
+      if (f->current_loc > sample_number) return error(f, VORBIS_seek_failed);
+      return 1;
+    }
     return 0;
   }
 
@@ -4284,10 +4362,10 @@ static int seek_to_sample_coarse(stb_vorbis *f, uint32 sample_number) {
           bytes_per_sample = data_bytes / right.last_decoded_sample;
           offset =
               left.page_start +
-              bytes_per_sample * (sample_number - left.last_decoded_sample);
+              bytes_per_sample * (last_sample_limit - left.last_decoded_sample);
         } else {
           // second probe (try to bound the other side)
-          double error = ((double)sample_number - mid.last_decoded_sample) *
+          double error = ((double)last_sample_limit - mid.last_decoded_sample) *
                          bytes_per_sample;
           if (error >= 0 && error < 8000) error = 8000;
           if (error < 0 && error > -8000) error = -8000;
@@ -4318,13 +4396,15 @@ static int seek_to_sample_coarse(stb_vorbis *f, uint32 sample_number) {
     }
 
     // if we've just found the last page again then we're in a tricky file,
-    // and we're close enough.
-    if (mid.page_start == right.page_start) break;
-
-    if (sample_number < mid.last_decoded_sample)
-      right = mid;
-    else
-      left = mid;
+    // and we're close enough (if it wasn't an interpolation probe).
+    if (mid.page_start == right.page_start) {
+      if (probe >= 2 || delta <= 65536) break;
+    } else {
+      if (last_sample_limit < mid.last_decoded_sample)
+        right = mid;
+      else
+        left = mid;
+    }
 
     ++probe;
   }
@@ -4437,8 +4517,8 @@ int stb_vorbis_seek_frame(stb_vorbis *f, unsigned int sample_number) {
       flush_packet(f);
     }
   }
-  // the next frame will start with the sample
-  assert(f->current_loc == sample_number);
+  // the next frame should start with the sample
+  if (f->current_loc != sample_number) return error(f, VORBIS_seek_failed);
   return 1;
 }
 
@@ -4514,7 +4594,8 @@ unsigned int stb_vorbis_stream_length_in_samples(stb_vorbis *f) {
         // set. whoops!
         break;
       }
-      previous_safe = last_page_loc + 1;
+      // NOTE: not used after this point, but note for debugging
+      // previous_safe = last_page_loc + 1;
       last_page_loc = stb_vorbis_get_file_offset(f);
     }
 
@@ -4618,7 +4699,10 @@ stb_vorbis *stb_vorbis_open_filename(const char *filename, int *error,
 stb_vorbis *stb_vorbis_open_memory(const unsigned char *data, int len,
                                    int *error, const stb_vorbis_alloc *alloc) {
   stb_vorbis *f, p;
-  if (data == NULL) return NULL;
+  if (!data) {
+    if (error) *error = VORBIS_unexpected_eof;
+    return NULL;
+  }
   vorbis_init(&p, alloc);
   p.stream = (uint8 *)data;
   p.stream_end = (uint8 *)data + len;
@@ -4684,18 +4768,18 @@ static void copy_samples(short *dest, float *src, int len) {
   for (i = 0; i < len; ++i) {
     FASTDEF(temp);
     int v = FAST_SCALED_FLOAT_TO_INT(temp, src[i], 15);
-    if ((unsigned int)(v + 32768) > 65535) v = v < 0 ? -32768 : 32767;
+    if (((unsigned int)v + 32768) > 65535) v = v < 0 ? -32768 : 32767;
     dest[i] = v;
   }
 }
 
 static void compute_samples(int mask, short *output, int num_c, float **data,
                             int d_offset, int len) {
-#define BUFFER_SIZE 32
-  float buffer[BUFFER_SIZE];
-  int i, j, o, n = BUFFER_SIZE;
+#define STB_BUFFER_SIZE 32
+  float buffer[STB_BUFFER_SIZE];
+  int i, j, o, n = STB_BUFFER_SIZE;
   check_endianness();
-  for (o = 0; o < len; o += BUFFER_SIZE) {
+  for (o = 0; o < len; o += STB_BUFFER_SIZE) {
     memset(buffer, 0, sizeof(buffer));
     if (o + n > len) n = len - o;
     for (j = 0; j < num_c; ++j) {
@@ -4706,20 +4790,21 @@ static void compute_samples(int mask, short *output, int num_c, float **data,
     for (i = 0; i < n; ++i) {
       FASTDEF(temp);
       int v = FAST_SCALED_FLOAT_TO_INT(temp, buffer[i], 15);
-      if ((unsigned int)(v + 32768) > 65535) v = v < 0 ? -32768 : 32767;
+      if (((unsigned int)v + 32768) > 65535) v = v < 0 ? -32768 : 32767;
       output[o + i] = v;
     }
   }
+#undef STB_BUFFER_SIZE
 }
 
 static void compute_stereo_samples(short *output, int num_c, float **data,
                                    int d_offset, int len) {
-#define BUFFER_SIZE 32
-  float buffer[BUFFER_SIZE];
-  int i, j, o, n = BUFFER_SIZE >> 1;
+#define STB_BUFFER_SIZE 32
+  float buffer[STB_BUFFER_SIZE];
+  int i, j, o, n = STB_BUFFER_SIZE >> 1;
   // o is the offset in the source data
   check_endianness();
-  for (o = 0; o < len; o += (BUFFER_SIZE >> 1)) {
+  for (o = 0; o < len; o += (STB_BUFFER_SIZE >> 1)) {
     // o2 is the offset in the output data
     int o2 = o << 1;
     memset(buffer, 0, sizeof(buffer));
@@ -4744,10 +4829,11 @@ static void compute_stereo_samples(short *output, int num_c, float **data,
     for (i = 0; i < (n << 1); ++i) {
       FASTDEF(temp);
       int v = FAST_SCALED_FLOAT_TO_INT(temp, buffer[i], 15);
-      if ((unsigned int)(v + 32768) > 65535) v = v < 0 ? -32768 : 32767;
+      if (((unsigned int)v + 32768) > 65535) v = v < 0 ? -32768 : 32767;
       output[o2 + i] = v;
     }
   }
+#undef STB_BUFFER_SIZE
 }
 
 static void convert_samples_short(int buf_c, short **buffer, int b_offset,
@@ -4771,7 +4857,7 @@ static void convert_samples_short(int buf_c, short **buffer, int b_offset,
 
 int stb_vorbis_get_frame_short(stb_vorbis *f, int num_c, short **buffer,
                                int num_samples) {
-  float **output;
+  float **output = NULL;
   int len = stb_vorbis_get_frame_float(f, NULL, &output);
   if (len > num_samples) len = num_samples;
   if (len) convert_samples_short(num_c, buffer, 0, f->channels, output, 0, len);
@@ -4796,7 +4882,7 @@ static void convert_channels_short_interleaved(int buf_c, short *buffer,
         float f = data[i][d_offset + j];
         int v =
             FAST_SCALED_FLOAT_TO_INT(temp, f, 15);  // data[i][d_offset+j],15);
-        if ((unsigned int)(v + 32768) > 65535) v = v < 0 ? -32768 : 32767;
+        if (((unsigned int)v + 32768) > 65535) v = v < 0 ? -32768 : 32767;
         *buffer++ = v;
       }
       for (; i < buf_c; ++i) *buffer++ = 0;
@@ -4824,8 +4910,6 @@ int stb_vorbis_get_samples_short_interleaved(stb_vorbis *f, int channels,
   float **outputs;
   int len = num_shorts / channels;
   int n = 0;
-  int z = f->channels;
-  if (z > channels) z = channels;
   while (n < len) {
     int k = f->channel_buffer_end - f->channel_buffer_start;
     if (n + k >= len) k = len - n;
@@ -4846,8 +4930,6 @@ int stb_vorbis_get_samples_short(stb_vorbis *f, int channels, short **buffer,
                                  int len) {
   float **outputs;
   int n = 0;
-  int z = f->channels;
-  if (z > channels) z = channels;
   while (n < len) {
     int k = f->channel_buffer_end - f->channel_buffer_start;
     if (n + k >= len) k = len - n;
diff --git a/third_party/stb/stb_vorbis.h b/third_party/stb/stb_vorbis.h
index 758e251b4..4510b0188 100644
--- a/third_party/stb/stb_vorbis.h
+++ b/third_party/stb/stb_vorbis.h
@@ -43,9 +43,18 @@ typedef struct {
   int max_frame_size;
 } stb_vorbis_info;
 
+typedef struct {
+  char *vendor;             // vendor string
+  int comment_list_length;  // number of entries in comment_list
+  char **comment_list;      // user comment strings
+} stb_vorbis_comment;  // result type of stb_vorbis_get_comment()
+
 // get general information about the file
 stb_vorbis_info stb_vorbis_get_info(stb_vorbis *f);
 
+// get ogg comments
+stb_vorbis_comment stb_vorbis_get_comment(stb_vorbis *f);
+
 // get the last error detected (clears it, too)
 int stb_vorbis_get_error(stb_vorbis *f);
 
@@ -119,6 +128,12 @@ int stb_vorbis_decode_frame_pushdata(
 // channel. In other words, (*output)[0][0] contains the first sample from
 // the first channel, and (*output)[1][0] contains the first sample from
 // the second channel.
+//
+// *output points into stb_vorbis's internal output buffer storage; these
+// buffers are owned by stb_vorbis and application code should not free
+// them or modify their contents. They are transient and will be overwritten
+// once you ask for more data to get decoded, so be sure to grab any data
+// you need before then.
 
 void stb_vorbis_flush_pushdata(stb_vorbis *f);
 // inform stb_vorbis that your next datablock will not be contiguous with
diff --git a/tool/viz/derasterize.c b/tool/viz/derasterize.c
index 27c44c6ed..9f43b398d 100644
--- a/tool/viz/derasterize.c
+++ b/tool/viz/derasterize.c
@@ -551,8 +551,8 @@ static int ParseNumberOption(const char *arg) {
   return x;
 }
 
-static void PrintUsage(int rc, FILE *f) {
-  fputs(HELPTEXT, f);
+static void PrintUsage(int rc, int fd) {  // prints HELPTEXT to descriptor fd, then exits with rc
+  tinyprint(fd, HELPTEXT, NULL);  // NULL ends the string argument list
   exit(rc);
 }
 
@@ -573,9 +573,12 @@ static void GetOpts(int argc, char *argv[]) {
         break;
       case '?':
       case 'H':
-        PrintUsage(EXIT_SUCCESS, stdout);
       default:
-        PrintUsage(EX_USAGE, stderr);
+        if (opt == optopt) {
+          PrintUsage(EXIT_SUCCESS, STDOUT_FILENO);
+        } else {
+          PrintUsage(EX_USAGE, STDERR_FILENO);
+        }
     }
   }
 }
diff --git a/tool/viz/memzoom.c b/tool/viz/memzoom.c
index 63c98dced..a0bcf9297 100644
--- a/tool/viz/memzoom.c
+++ b/tool/viz/memzoom.c
@@ -45,6 +45,7 @@
 #include "libc/str/unicode.h"
 #include "libc/sysv/consts/ex.h"
 #include "libc/sysv/consts/exit.h"
+#include "libc/sysv/consts/fileno.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/o.h"
 #include "libc/sysv/consts/poll.h"
@@ -63,7 +64,7 @@ DESCRIPTION\n\
 \n\
 FLAGS\n\
 \n\
-  -h         help\n\
+  -h or -?   help\n\
   -z         zoom\n\
   -m         morton ordering\n\
   -H         hilbert ordering\n\
@@ -887,10 +888,8 @@ static void MemZoom(void) {
   } while (!(action & INTERRUPTED));
 }
 
-static wontreturn void PrintUsage(int rc) {
-  Write("SYNOPSIS\n\n  ");
-  Write(program_invocation_name);
-  Write(USAGE);
+static wontreturn void PrintUsage(int rc, int fd) {  // prints synopsis + USAGE to fd, then exits with rc
+  tinyprint(fd, "SYNOPSIS\n\n  ", program_invocation_name, USAGE, NULL);
   exit(rc);
 }
 
@@ -898,7 +897,7 @@ static void GetOpts(int argc, char *argv[]) {
   int opt;
   char *p;
   fps = 10;
-  while ((opt = getopt(argc, argv, "hzHNWf:p:")) != -1) {
+  while ((opt = getopt(argc, argv, "?hmzHNWf:p:")) != -1) {
     switch (opt) {
       case 'z':
         ++zoom;
@@ -927,9 +926,13 @@ static void GetOpts(int argc, char *argv[]) {
         }
         break;
       case 'h':
-        PrintUsage(EXIT_SUCCESS);
+      case '?':
       default:
-        PrintUsage(EX_USAGE);
+        if (opt == optopt) {
+          PrintUsage(EXIT_SUCCESS, STDOUT_FILENO);
+        } else {
+          PrintUsage(EX_USAGE, STDERR_FILENO);
+        }
     }
   }
   if (pid) {
@@ -941,10 +944,10 @@ static void GetOpts(int argc, char *argv[]) {
     stpcpy(p, "/maps");
   } else {
     if (optind == argc) {
-      PrintUsage(EX_USAGE);
+      PrintUsage(EX_USAGE, STDERR_FILENO);
     }
     if (!memccpy(path, argv[optind], '\0', sizeof(path))) {
-      PrintUsage(EX_USAGE);
+      PrintUsage(EX_USAGE, STDERR_FILENO);
     }
   }
 }
diff --git a/tool/viz/od16.c b/tool/viz/od16.c
index 16e221fc6..9fabccc9d 100644
--- a/tool/viz/od16.c
+++ b/tool/viz/od16.c
@@ -16,6 +16,7 @@
 │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/calls.h"
 #include "libc/errno.h"
 #include "libc/fmt/conv.h"
 #include "libc/log/check.h"
@@ -25,6 +26,7 @@
 #include "libc/str/str.h"
 #include "libc/sysv/consts/ex.h"
 #include "libc/sysv/consts/exit.h"
+#include "libc/sysv/consts/fileno.h"
 #include "third_party/getopt/getopt.internal.h"
 
 #define USAGE \
@@ -36,17 +38,15 @@ Flags:\n\
   -c INT\n\
   -w INT     width (aka cols) [default 8]\n\
   -o PATH    output path [default -]\n\
-  -h         shows this information\n\
+  -h or -?   shows this information\n\
 \n"
 
 static long width_;
 static FILE *in_, *out_;
 static char *inpath_, *outpath_;
 
-void PrintUsage(int rc, FILE *f) {
-  fputs("Usage: ", f);
-  fputs(program_invocation_name, f);
-  fputs(USAGE, f);
+void PrintUsage(int rc, int fd) {  // prints "Usage: <program>" + USAGE to fd, then exits with rc
+  tinyprint(fd, "Usage: ", program_invocation_name, USAGE, NULL);  // NULL ends the string list
   exit(rc);
 }
 
@@ -63,11 +63,14 @@ void GetOpts(int *argc, char *argv[]) {
       case 'w':
         width_ = strtol(optarg, NULL, 0);
         break;
-      case '?':
       case 'h':
-        PrintUsage(EXIT_SUCCESS, stdout);
+      case '?':
       default:
-        PrintUsage(EX_USAGE, stderr);
+        if (opt == optopt) {
+          PrintUsage(EXIT_SUCCESS, STDOUT_FILENO);
+        } else {
+          PrintUsage(EX_USAGE, STDERR_FILENO);
+        }
     }
   }
   if (optind == *argc) {
diff --git a/tool/viz/printansi.c b/tool/viz/printansi.c
index 6e98e6014..810d8bb79 100644
--- a/tool/viz/printansi.c
+++ b/tool/viz/printansi.c
@@ -71,8 +71,8 @@ static struct Flags {
   enum TtyQuantizationAlgorithm quant;
 } g_flags;
 
-static wontreturn void PrintUsage(int rc, FILE *f) {
-  fprintf(f, "Usage: %s%s", program_invocation_name, "\
+static wontreturn void PrintUsage(int rc, int fd) {
+  tinyprint(fd, "Usage: ", program_invocation_name, "\
  [FLAGS] [PATH]\n\
 \n\
 FLAGS\n\
@@ -86,7 +86,7 @@ EXAMPLES\n\
 \n\
   printansi.com -w80 -h40 logo.png\n\
 \n\
-\n");
+\n", NULL);
   exit(rc);
 }
 
@@ -107,7 +107,7 @@ static void GetOpts(int *argc, char *argv[]) {
   g_flags.blocks = IsWindows() ? kTtyBlocksCp437 : kTtyBlocksUnicode;
   if (*argc == 2 &&
       (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-help") == 0)) {
-    PrintUsage(EXIT_SUCCESS, stdout);
+    PrintUsage(EXIT_SUCCESS, STDOUT_FILENO);
   }
   while ((opt = getopt(*argc, argv, "?ivpfrtxads234o:w:h:")) != -1) {
     switch (opt) {
@@ -162,9 +162,12 @@ static void GetOpts(int *argc, char *argv[]) {
         ++__log_level;
         break;
       case '?':
-        PrintUsage(EXIT_SUCCESS, stdout);
       default:
-        PrintUsage(EX_USAGE, stderr);
+        if (opt == optopt) {
+          PrintUsage(EXIT_SUCCESS, STDOUT_FILENO);
+        } else {
+          PrintUsage(EX_USAGE, STDERR_FILENO);
+        }
     }
   }
   if (optind == *argc) {
diff --git a/tool/viz/printimage.c b/tool/viz/printimage.c
index e82c39b57..7c2d35c67 100644
--- a/tool/viz/printimage.c
+++ b/tool/viz/printimage.c
@@ -66,8 +66,8 @@ static struct Flags {
 
 struct winsize g_winsize;
 
-static wontreturn void PrintUsage(int rc, FILE *f) {
-  fprintf(f, "Usage: %s%s", program_invocation_name, "\
+static wontreturn void PrintUsage(int rc, int fd) {
+  tinyprint(fd, "Usage: ", program_invocation_name, "\
  [FLAGS] [PATH]\n\
 \n\
 FLAGS\n\
@@ -94,7 +94,7 @@ FLAGS\n\
 EXAMPLES\n\
 \n\
   printimage.com -sxd lemurs.jpg  # 256-color dither unsharp\n\
-\n");
+\n", NULL);
   exit(rc);
 }
 
@@ -114,7 +114,7 @@ static void GetOpts(int *argc, char *argv[]) {
   g_flags.blocks = IsWindows() ? kTtyBlocksCp437 : kTtyBlocksUnicode;
   if (*argc == 2 &&
       (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-help") == 0)) {
-    PrintUsage(EXIT_SUCCESS, stdout);
+    PrintUsage(EXIT_SUCCESS, STDOUT_FILENO);
   }
   while ((opt = getopt(*argc, argv, "?vpmfirtxads234o:w:h:")) != -1) {
     switch (opt) {
@@ -170,9 +170,12 @@ static void GetOpts(int *argc, char *argv[]) {
         ++__log_level;
         break;
       case '?':
-        PrintUsage(EXIT_SUCCESS, stdout);
       default:
-        PrintUsage(EX_USAGE, stderr);
+        if (opt == optopt) {
+          PrintUsage(EXIT_SUCCESS, STDOUT_FILENO);
+        } else {
+          PrintUsage(EX_USAGE, STDERR_FILENO);
+        }
     }
   }
   g_winsize.ws_col = 80;
@@ -435,7 +438,7 @@ int main(int argc, char *argv[]) {
   int i;
   ShowCrashReports();
   GetOpts(&argc, argv);
-  if (optind == argc) PrintUsage(0, stdout);
+  if (optind == argc) PrintUsage(EXIT_SUCCESS, STDOUT_FILENO);
   stbi_set_unpremultiply_on_load(true);
   for (i = optind; i < argc; ++i) {
     WithImageFile(argv[i], ProcessImage);
diff --git a/tool/viz/printvideo.c b/tool/viz/printvideo.c
index ccff3df9f..727afad59 100644
--- a/tool/viz/printvideo.c
+++ b/tool/viz/printvideo.c
@@ -123,7 +123,7 @@ Flags & Keyboard Shortcuts:\n\
   -v         increases verbosity        [flag]\n\
   -L PATH    redirects stderr to path   [flag]\n\
   -y         yes to interactive prompts [flag]\n\
-  -h         shows this information     [flag]\n\
+  -h or -?   shows this information     [flag]\n\
   UP/DOWN    adjust volume              [keyboard]\n\
   CTRL+L     redraw                     [keyboard]\n\
   CTRL+Z     suspend                    [keyboard]\n\
@@ -1374,10 +1374,8 @@ static bool CanPlayAudio(void) {
   }
 }
 
-static void PrintUsage(int rc, FILE *f) {
-  fputs("Usage: ", f);
-  fputs(program_invocation_name, f);
-  fputs(USAGE, f);
+static void PrintUsage(int rc, int fd) {  // prints "Usage: <program>" + USAGE to fd, then exits with rc
+  tinyprint(fd, "Usage: ", program_invocation_name, USAGE, NULL);  // NULL ends the string list
   exit(rc);
 }
 
@@ -1399,12 +1397,15 @@ static void GetOpts(int argc, char *argv[]) {
       case 'Y':
         yonly_ = true;
         break;
-      case '?':
       case 'h':
-        PrintUsage(EXIT_SUCCESS, stdout);
+      case '?':
       default:
         if (!ProcessOptKey(opt)) {
-          PrintUsage(EX_USAGE, stderr);
+          if (opt == optopt) {
+            PrintUsage(EXIT_SUCCESS, STDOUT_FILENO);
+          } else {
+            PrintUsage(EX_USAGE, STDERR_FILENO);
+          }
         }
     }
   }
@@ -1562,7 +1563,7 @@ int main(int argc, char *argv[]) {
   fullclear_ = true;
   GetOpts(argc, argv);
   if (!tuned_) PickDefaults();
-  if (optind == argc) PrintUsage(EX_USAGE, stderr);
+  if (optind == argc) PrintUsage(EX_USAGE, STDERR_FILENO);
   patharg_ = argv[optind];
   s = commandvenv("SOX", "sox");
   sox_ = s ? strdup(s) : 0;