Make AARCH64 harder, better, faster, stronger

- Perform some housekeeping on scalar math function code
- Import ARM's Optimized Routines for SIMD string processing
- Upgrade to latest Chromium zlib and enable more SIMD optimizations
Justine Tunney 2023-05-15 01:51:29 -07:00
parent 550b52abf6
commit cc1732bc42
143 changed files with 15661 additions and 1329 deletions


@@ -1,3 +1,4 @@
// -*- c++ -*-
#ifndef COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
#define COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
#include "libc/macros.internal.h"


@@ -1,3 +1,4 @@
// -*- c++ -*-
#ifndef GPTNEOX_UTIL_H
#define GPTNEOX_UTIL_H
#include "libc/calls/calls.h"


@@ -1,3 +1,4 @@
// -*- c++ -*-
#ifndef GPTNEOX_H
#define GPTNEOX_H
// clang-format off


@@ -687,8 +687,6 @@ ulg crc32(crc, buf, len)
pointer, then initialize the crc shift register contents instead.
Return the current crc in either case. */
{
return crc32_z(crc,buf,len);
register z_uint4 c;
register ZCONST ulg near *crc_32_tab;

third_party/zlib/LICENSE.chromium (new vendored file, 27 lines)

@@ -0,0 +1,27 @@
// Copyright 2015 The Chromium Authors
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google LLC nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

third_party/zlib/LICENSE.zlib (new vendored file, 22 lines)

@@ -0,0 +1,22 @@
Copyright notice:
(C) 1995-2022 Jean-loup Gailly and Mark Adler
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Jean-loup Gailly Mark Adler
jloup@gzip.org madler@alumni.caltech.edu


@@ -11,23 +11,9 @@ ORIGIN
The zlib sources were obtained from Chromium's zlib fork.
https://chromium.googlesource.com/chromium/src/third_party/zlib
commit 8f22e90f007a7dd466b426513725c13191248315
Author: Hans Wennborg <hans@chromium.org>
Date: Fri Sep 16 16:14:51 2022 +0000
[zlib][fuzz] Cap the input size for zlib_inflate_with_header_fuzzer
To prevent timeouts when processing large inputs with small chunk sizes.
Bug: 1362206
Change-Id: Ie21ea48abf85ee49897243857bf84b0f32d24bd5
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3899099
Reviewed-by: Adenilson Cavalcanti <cavalcantii@chromium.org>
Auto-Submit: Hans Wennborg <hans@chromium.org>
Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1048044}
NOKEYCHECK=True
GitOrigin-RevId: fd75b8c2768e7cc3a3e7a06bc563bb03c5ba0ec2
commit 14dd4c4455602c9b71a1a89b5cafd1f4030d2e3f
Author: Adenilson Cavalcanti <cavalcantii@chromium.org>
Date: Tue Apr 11 17:40:40 2023 +0000
The source code for puff was obtained from zlib itself:
@@ -42,7 +28,7 @@ LOCAL CHANGES
- Changed Trace(stderr) calls to use kprintf()
- We use our own crc32() implementation from LIBC_STR
- Made the type signature of crc32_z() less obnoxious
- Fixed a Chromium zlib regression where malloc() failures inside
  deflateInit2() resulted in a segmentation fault (illustrated below)
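That deflateInit2() fix deserves a caller-side illustration: zlib signals allocation failure through return codes, so the regression turned a reportable Z_MEM_ERROR into a crash. Here is a minimal sketch using only the documented zlib API (the function name and parameter choices are illustrative, not from this commit):

#include "third_party/zlib/zlib.h"

int start_deflate(z_stream *zs) {
  zs->zalloc = Z_NULL;   /* use zlib's default allocator */
  zs->zfree  = Z_NULL;
  zs->opaque = Z_NULL;
  /* With the fix, an internal malloc() failure surfaces here as
     Z_MEM_ERROR instead of faulting inside deflateInit2(). */
  int rc = deflateInit2(zs, Z_BEST_COMPRESSION, Z_DEFLATED,
                        15 /* windowBits */, 8 /* memLevel */,
                        Z_DEFAULT_STRATEGY);
  return rc;  /* on Z_MEM_ERROR, abandon zs without calling deflate() */
}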

third_party/zlib/adler32.c

@@ -1,22 +1,15 @@
// clang-format off
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
*/
// clang-format off
/* adler32.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011, 2016 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "libc/nexgen32e/x86feature.h"
#include "third_party/zlib/macros.internal.h"
#include "third_party/zlib/zconf.h"
#include "third_party/zlib/zutil.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* @(#) $Id$ */
#include "third_party/zlib/macros.internal.h"
local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
@@ -70,7 +63,10 @@ local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
# define MOD63(a) a %= BASE
#endif
uint32_t ZLIB_INTERNAL adler32_simd_(uint32_t, const unsigned char *, z_size_t);
#include "third_party/zlib/cpu_features.internal.h"
#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON)
#include "third_party/zlib/adler32_simd.inc"
#endif
/* ========================================================================= */
uLong ZEXPORT adler32_z(adler, buf, len)
@@ -82,7 +78,7 @@ uLong ZEXPORT adler32_z(adler, buf, len)
unsigned n;
#if defined(ADLER32_SIMD_SSSE3)
if (buf != Z_NULL && len >= 64 && X86_HAVE(SSSE3))
if (buf != Z_NULL && len >= 64 && x86_cpu_enable_ssse3)
return adler32_simd_(adler, buf, len);
#elif defined(ADLER32_SIMD_NEON)
if (buf != Z_NULL && len >= 64)
@@ -104,9 +100,24 @@ uLong ZEXPORT adler32_z(adler, buf, len)
return adler | (sum2 << 16);
}
#if defined(ADLER32_SIMD_SSSE3)
/*
* Use SSSE3 to compute the adler32. Since this routine can be
* freely used, check CPU features here. zlib convention is to
* call adler32(0, NULL, 0), before making calls to adler32().
* So this is a good early (and infrequent) place to cache CPU
* features for those later, more interesting adler32() calls.
*/
if (buf == Z_NULL) {
if (!len) /* Assume user is calling adler32(0, NULL, 0); */
cpu_check_features();
return 1L;
}
#else
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (buf == Z_NULL)
return 1L;
#endif
/* in case short lengths are provided, keep it somewhat fast */
if (len < 16) {
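Note how the new branch hinges on zlib's seeding convention: callers are expected to start with adler32(0, Z_NULL, 0) before real updates, and this commit reuses that call as the early, infrequent place to cache CPU features on aarch64. A short usage sketch (data and datalen are hypothetical):

#include "third_party/zlib/zlib.h"

uLong checksum(const Bytef *data, uInt datalen) {
  uLong a = adler32(0L, Z_NULL, 0);  /* returns 1; also caches CPU features */
  return adler32(a, data, datalen);  /* may now take the SIMD fast path */
}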

third_party/zlib/adler32_simd.c (new vendored file, 371 lines)

@@ -0,0 +1,371 @@
asm(".ident\t\"\\n\\n\
Chromium (BSD-3 License)\\n\
Copyright 2017 The Chromium Authors\"");
// clang-format off
/* adler32_simd.c
*
* Copyright 2017 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*
* Per http://en.wikipedia.org/wiki/Adler-32 the adler32 A value (aka s1) is
* the sum of N input data bytes D1 ... DN,
*
* A = A0 + D1 + D2 + ... + DN
*
* where A0 is the initial value.
*
* SSE2 _mm_sad_epu8() can be used for byte sums (see http://bit.ly/2wpUOeD,
* for example) and accumulating the byte sums can use SSE shuffle-adds (see
* the "Integer" section of http://bit.ly/2erPT8t for details). Arm NEON has
* similar instructions.
*
* The adler32 B value (aka s2) sums the A values from each step:
*
* B0 + (A0 + D1) + (A0 + D1 + D2) + ... + (A0 + D1 + D2 + ... + DN) or
*
* B0 + N.A0 + N.D1 + (N-1).D2 + (N-2).D3 + ... + (N-(N-1)).DN
*
* B0 being the initial value. For 32 bytes (ideal for garden-variety SIMD):
*
* B = B0 + 32.A0 + [D1 D2 D3 ... D32] x [32 31 30 ... 1].
*
* Adjacent blocks of 32 input bytes can be iterated with the expressions to
* compute the adler32 s1 s2 of M >> 32 input bytes [1].
*
* As M grows, the s1 s2 sums grow. If left unchecked, they would eventually
* overflow the precision of their integer representation (bad). However, s1
* and s2 also need to be computed modulo the adler BASE value (reduced). If
* at most NMAX bytes are processed before a reduce, s1 s2 _cannot_ overflow
* a uint32_t type (the NMAX constraint) [2].
*
* [1] the iterative equations for s2 contain constant factors; these can be
* hoisted from the n-blocks do loop of the SIMD code.
*
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
* of the adler s1 s2 of uint32_t type (see adler32.c).
*/
#include "third_party/zlib/adler32_simd.inc"
/* Definitions from adler32.c: largest prime smaller than 65536 */
#define BASE 65521U
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
#define NMAX 5552
#if defined(ADLER32_SIMD_SSSE3)
#include "third_party/intel/tmmintrin.internal.h"
uint32_t ZLIB_INTERNAL adler32_simd_( /* SSSE3 */
uint32_t adler,
const unsigned char *buf,
z_size_t len)
{
/*
* Split Adler-32 into component sums.
*/
uint32_t s1 = adler & 0xffff;
uint32_t s2 = adler >> 16;
/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
z_size_t blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
while (blocks)
{
unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
if (n > blocks)
n = (unsigned) blocks;
blocks -= n;
const __m128i tap1 =
_mm_setr_epi8(32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17);
const __m128i tap2 =
_mm_setr_epi8(16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i zero =
_mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
const __m128i ones =
_mm_set_epi16( 1, 1, 1, 1, 1, 1, 1, 1);
/*
* Process n blocks of data. At most NMAX data bytes can be
* processed before s2 must be reduced modulo BASE.
*/
__m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
__m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
__m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);
do {
/*
* Load 32 input bytes.
*/
const __m128i bytes1 = _mm_loadu_si128((__m128i*)(buf));
const __m128i bytes2 = _mm_loadu_si128((__m128i*)(buf + 16));
/*
* Add previous block byte sum to v_ps.
*/
v_ps = _mm_add_epi32(v_ps, v_s1);
/*
* Horizontally add the bytes for s1, multiply-adds the
* bytes by [ 32, 31, 30, ... ] for s2.
*/
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero));
const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1);
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones));
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero));
const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));
buf += BLOCK_SIZE;
} while (--n);
v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));
/*
* Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
*/
#define S23O1 _MM_SHUFFLE(2,3,0,1) /* A B C D -> B A D C */
#define S1O32 _MM_SHUFFLE(1,0,3,2) /* A B C D -> C D A B */
v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1));
v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32));
s1 += _mm_cvtsi128_si32(v_s1);
v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1));
v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32));
s2 = _mm_cvtsi128_si32(v_s2);
#undef S23O1
#undef S1O32
/*
* Reduce.
*/
s1 %= BASE;
s2 %= BASE;
}
/*
* Handle leftover data.
*/
if (len) {
if (len >= 16) {
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
len -= 16;
}
while (len--) {
s2 += (s1 += *buf++);
}
if (s1 >= BASE)
s1 -= BASE;
s2 %= BASE;
}
/*
* Return the recombined sums.
*/
return s1 | (s2 << 16);
}
#elif defined(ADLER32_SIMD_NEON)
#include "third_party/aarch64/arm_neon.h"
uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
uint32_t adler,
const unsigned char *buf,
z_size_t len)
{
/*
* Split Adler-32 into component sums.
*/
uint32_t s1 = adler & 0xffff;
uint32_t s2 = adler >> 16;
/*
* Serially compute s1 & s2, until the data is 16-byte aligned.
*/
if ((uintptr_t)buf & 15) {
while ((uintptr_t)buf & 15) {
s2 += (s1 += *buf++);
--len;
}
if (s1 >= BASE)
s1 -= BASE;
s2 %= BASE;
}
/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
z_size_t blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
while (blocks)
{
unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
if (n > blocks)
n = (unsigned) blocks;
blocks -= n;
/*
* Process n blocks of data. At most NMAX data bytes can be
* processed before s2 must be reduced modulo BASE.
*/
uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, s1 * n };
uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
uint16x8_t v_column_sum_1 = vdupq_n_u16(0);
uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
uint16x8_t v_column_sum_4 = vdupq_n_u16(0);
do {
/*
* Load 32 input bytes.
*/
const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(buf));
const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(buf + 16));
/*
* Add previous block byte sum to v_s2.
*/
v_s2 = vaddq_u32(v_s2, v_s1);
/*
* Horizontally add the bytes for s1.
*/
v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2));
/*
* Vertically add the bytes for s2.
*/
v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8 (bytes1));
v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1));
v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8 (bytes2));
v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
buf += BLOCK_SIZE;
} while (--n);
v_s2 = vshlq_n_u32(v_s2, 5);
/*
* Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
*/
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_1),
(uint16x4_t) { 32, 31, 30, 29 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1),
(uint16x4_t) { 28, 27, 26, 25 });
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_2),
(uint16x4_t) { 24, 23, 22, 21 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2),
(uint16x4_t) { 20, 19, 18, 17 });
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_3),
(uint16x4_t) { 16, 15, 14, 13 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3),
(uint16x4_t) { 12, 11, 10, 9 });
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_4),
(uint16x4_t) { 8, 7, 6, 5 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4),
(uint16x4_t) { 4, 3, 2, 1 });
/*
* Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
*/
uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1));
uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2));
uint32x2_t s1s2 = vpadd_u32(sum1, sum2);
s1 += vget_lane_u32(s1s2, 0);
s2 += vget_lane_u32(s1s2, 1);
/*
* Reduce.
*/
s1 %= BASE;
s2 %= BASE;
}
/*
* Handle leftover data.
*/
if (len) {
if (len >= 16) {
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
len -= 16;
}
while (len--) {
s2 += (s1 += *buf++);
}
if (s1 >= BASE)
s1 -= BASE;
s2 %= BASE;
}
/*
* Return the recombined sums.
*/
return s1 | (s2 << 16);
}
#endif /* ADLER32_SIMD_SSSE3 */
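For contrast with both SIMD paths above, here is the scalar shape of the algorithm that the file's header comment derives — my own reference sketch, not part of the vendored sources — showing the A/B recurrences and the NMAX-deferred modulo that keeps both sums inside a uint32_t:

#include <stdint.h>
#include <stddef.h>

#define BASE 65521u  /* largest prime below 65536 */
#define NMAX 5552    /* max bytes before s1/s2 must be reduced mod BASE */

uint32_t adler32_scalar(uint32_t adler, const unsigned char *buf, size_t len) {
  uint32_t s1 = adler & 0xffff;   /* A: running sum of bytes */
  uint32_t s2 = adler >> 16;      /* B: running sum of the A values */
  while (len) {
    size_t n = len < NMAX ? len : NMAX;
    len -= n;
    while (n--) {
      s1 += *buf++;               /* A += Di */
      s2 += s1;                   /* B += A  */
    }
    s1 %= BASE;                   /* deferred reduction: by the NMAX bound, */
    s2 %= BASE;                   /* neither sum can overflow a uint32_t    */
  }
  return s1 | (s2 << 16);
}

The SIMD versions compute exactly these sums 32 bytes at a time, which is why they can reuse the same NMAX batching.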

third_party/zlib/adler32_simd.inc (new vendored file, 15 lines)

@@ -0,0 +1,15 @@
/* adler32_simd.h
*
* Copyright 2017 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
#include "libc/inttypes.h"
#include "libc/limits.h"
#include "libc/literal.h"
#include "third_party/zlib/zconf.h"
#include "third_party/zlib/zutil.internal.h"
uint32_t ZLIB_INTERNAL adler32_simd_(uint32_t adler, const unsigned char *buf,
z_size_t len);


@ -1,198 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
Copyright 2017 The Chromium Authors
Use of this source code is governed by the BSD-style licenses that can
be found in the third_party/zlib/LICENSE file.
*/
#include "third_party/intel/emmintrin.internal.h"
#include "third_party/intel/tmmintrin.internal.h"
#include "third_party/zlib/internal.h"
asm(".ident\t\"\\n\\n\
Chromium (BSD-3 License)\\n\
Copyright 2017 The Chromium Authors\"");
asm(".include \"libc/disclaimer.inc\"");
/**
* Per http://en.wikipedia.org/wiki/Adler-32 the adler32 A value (aka s1) is
* the sum of N input data bytes D1 ... DN,
*
* A = A0 + D1 + D2 + ... + DN
*
* where A0 is the initial value.
*
* SSE2 _mm_sad_epu8() can be used for byte sums (see http://bit.ly/2wpUOeD,
* for example) and accumulating the byte sums can use SSE shuffle-adds (see
* the "Integer" section of http://bit.ly/2erPT8t for details). Arm NEON has
* similar instructions.
*
* The adler32 B value (aka s2) sums the A values from each step:
*
* B0 + (A0 + D1) + (A0 + D1 + D2) + ... + (A0 + D1 + D2 + ... + DN) or
*
* B0 + N.A0 + N.D1 + (N-1).D2 + (N-2).D3 + ... + (N-(N-1)).DN
*
* B0 being the initial value. For 32 bytes (ideal for garden-variety SIMD):
*
* B = B0 + 32.A0 + [D1 D2 D3 ... D32] x [32 31 30 ... 1].
*
* Adjacent blocks of 32 input bytes can be iterated with the expressions to
* compute the adler32 s1 s2 of M >> 32 input bytes [1].
*
* As M grows, the s1 s2 sums grow. If left unchecked, they would eventually
* overflow the precision of their integer representation (bad). However, s1
* and s2 also need to be computed modulo the adler BASE value (reduced). If
* at most NMAX bytes are processed before a reduce, s1 s2 _cannot_ overflow
* a uint32_t type (the NMAX constraint) [2].
*
* [1] the iterative equations for s2 contain constant factors; these can be
* hoisted from the n-blocks do loop of the SIMD code.
*
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
* of the adler s1 s2 of uint32_t type (see adler32.c).
*/
/* Definitions from adler32.c: largest prime smaller than 65536 */
#define BASE 65521U
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
#define NMAX 5552
#ifdef ADLER32_SIMD_SSSE3
uint32_t adler32_simd_(uint32_t adler, const unsigned char *buf, size_t len) {
/*
* Split Adler-32 into component sums.
*/
uint32_t s1 = adler & 0xffff;
uint32_t s2 = adler >> 16;
/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
size_t blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
while (blocks) {
unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
if (n > blocks) n = (unsigned)blocks;
blocks -= n;
const __m128i tap1 = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23,
22, 21, 20, 19, 18, 17);
const __m128i tap2 =
_mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i zero =
_mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
const __m128i ones = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1);
/*
* Process n blocks of data. At most NMAX data bytes can be
* processed before s2 must be reduced modulo BASE.
*/
__m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
__m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
__m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);
do {
/*
* Load 32 input bytes.
*/
const __m128i bytes1 = _mm_loadu_si128((__m128i *)(buf));
const __m128i bytes2 = _mm_loadu_si128((__m128i *)(buf + 16));
/*
* Add previous block byte sum to v_ps.
*/
v_ps = _mm_add_epi32(v_ps, v_s1);
/*
* Horizontally add the bytes for s1, multiply-adds the
* bytes by [ 32, 31, 30, ... ] for s2.
*/
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero));
const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1);
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones));
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero));
const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));
buf += BLOCK_SIZE;
} while (--n);
v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));
/*
* Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
*/
#define S23O1 _MM_SHUFFLE(2, 3, 0, 1) /* A B C D -> B A D C */
#define S1O32 _MM_SHUFFLE(1, 0, 3, 2) /* A B C D -> C D A B */
v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1));
v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32));
s1 += _mm_cvtsi128_si32(v_s1);
v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1));
v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32));
s2 = _mm_cvtsi128_si32(v_s2);
#undef S23O1
#undef S1O32
/*
* Reduce.
*/
s1 %= BASE;
s2 %= BASE;
}
/*
* Handle leftover data.
*/
if (len) {
if (len >= 16) {
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
len -= 16;
}
while (len--) {
s2 += (s1 += *buf++);
}
if (s1 >= BASE) s1 -= BASE;
s2 %= BASE;
}
/*
* Return the recombined sums.
*/
return s1 | (s2 << 16);
}
#endif /* ADLER32_SIMD_SSSE3 */

third_party/zlib/chunkcopy.inc (new vendored file, 491 lines)

@@ -0,0 +1,491 @@
// clang-format off
/* chunkcopy.h -- fast chunk copy and set operations
* Copyright (C) 2017 ARM, Inc.
* Copyright 2017 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
#ifndef CHUNKCOPY_H
#define CHUNKCOPY_H
#include "libc/inttypes.h"
#include "libc/limits.h"
#include "libc/literal.h"
#include "third_party/zlib/zutil.internal.h"
#define Z_STATIC_ASSERT(name, assert) typedef char name[(assert) ? 1 : -1]
#if __STDC_VERSION__ >= 199901L
#define Z_RESTRICT restrict
#else
#define Z_RESTRICT
#endif
#if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
#define Z_BUILTIN_MEMCPY __builtin_memcpy
#else
#define Z_BUILTIN_MEMCPY zmemcpy
#endif
#if defined(INFLATE_CHUNK_SIMD_NEON)
#include "third_party/aarch64/arm_neon.h"
typedef uint8x16_t z_vec128i_t;
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
#include "third_party/intel/emmintrin.internal.h"
typedef __m128i z_vec128i_t;
#else
#error chunkcopy.h inflate chunk SIMD is not defined for your build target
#endif
/*
* Suppress MSan errors about copying uninitialized bytes (crbug.com/1376033).
*/
#define Z_DISABLE_MSAN
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#undef Z_DISABLE_MSAN
#define Z_DISABLE_MSAN __attribute__((no_sanitize("memory")))
#endif
#endif
/*
* chunk copy type: the z_vec128i_t type size should be exactly 128-bits
* and equal to CHUNKCOPY_CHUNK_SIZE.
*/
#define CHUNKCOPY_CHUNK_SIZE sizeof(z_vec128i_t)
Z_STATIC_ASSERT(vector_128_bits_wide,
CHUNKCOPY_CHUNK_SIZE == sizeof(int8_t) * 16);
/*
* Ask the compiler to perform a wide, unaligned load with a machine
* instruction appropriate for the z_vec128i_t type.
*/
static inline z_vec128i_t loadchunk(
const unsigned char FAR* s) Z_DISABLE_MSAN {
z_vec128i_t v;
Z_BUILTIN_MEMCPY(&v, s, sizeof(v));
return v;
}
/*
* Ask the compiler to perform a wide, unaligned store with a machine
* instruction appropriate for the z_vec128i_t type.
*/
static inline void storechunk(
unsigned char FAR* d,
const z_vec128i_t v) {
Z_BUILTIN_MEMCPY(d, &v, sizeof(v));
}
/*
* Perform a memcpy-like operation, assuming that length is non-zero and that
* it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
* the length is shorter than this.
*
* It also guarantees that it will properly unroll the data if the distance
* between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on
* in chunkcopy_relaxed().
*
* Aside from better memory bus utilisation, this means that short copies
* (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop
* without iteration, which will hopefully make the branch prediction more
* reliable.
*/
static inline unsigned char FAR* chunkcopy_core(
unsigned char FAR* out,
const unsigned char FAR* from,
unsigned len) Z_DISABLE_MSAN {
const int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
storechunk(out, loadchunk(from));
out += bump;
from += bump;
len /= CHUNKCOPY_CHUNK_SIZE;
while (len-- > 0) {
storechunk(out, loadchunk(from));
out += CHUNKCOPY_CHUNK_SIZE;
from += CHUNKCOPY_CHUNK_SIZE;
}
return out;
}
/*
 * Like chunkcopy_core(), but avoids writing beyond the end of the legal output.
*
* Accepts an additional pointer to the end of safe output. A generic safe
* copy would use (out + len), but it's normally the case that the end of the
* output buffer is beyond the end of the current copy, and this can still be
* exploited.
*/
static inline unsigned char FAR* chunkcopy_core_safe(
unsigned char FAR* out,
const unsigned char FAR* from,
unsigned len,
unsigned char FAR* limit) {
Assert(out + len <= limit, "chunk copy exceeds safety limit");
if ((limit - out) < (ptrdiff_t)CHUNKCOPY_CHUNK_SIZE) {
const unsigned char FAR* Z_RESTRICT rfrom = from;
Assert((uintptr_t)out - (uintptr_t)from >= len,
"invalid restrict in chunkcopy_core_safe");
Assert((uintptr_t)from - (uintptr_t)out >= len,
"invalid restrict in chunkcopy_core_safe");
if (len & 8) {
Z_BUILTIN_MEMCPY(out, rfrom, 8);
out += 8;
rfrom += 8;
}
if (len & 4) {
Z_BUILTIN_MEMCPY(out, rfrom, 4);
out += 4;
rfrom += 4;
}
if (len & 2) {
Z_BUILTIN_MEMCPY(out, rfrom, 2);
out += 2;
rfrom += 2;
}
if (len & 1) {
*out++ = *rfrom++;
}
return out;
}
return chunkcopy_core(out, from, len);
}
/*
* Perform short copies until distance can be rewritten as being at least
* CHUNKCOPY_CHUNK_SIZE.
*
* Assumes it's OK to overwrite at least the first 2*CHUNKCOPY_CHUNK_SIZE
* bytes of output even if the copy is shorter than this. This assumption
* holds within zlib inflate_fast(), which starts every iteration with at
* least 258 bytes of output space available (258 being the maximum length
* output from a single token; see inffast.c).
*/
static inline unsigned char FAR* chunkunroll_relaxed(
unsigned char FAR* out,
unsigned FAR* dist,
unsigned FAR* len) Z_DISABLE_MSAN {
const unsigned char FAR* from = out - *dist;
while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) {
storechunk(out, loadchunk(from));
out += *dist;
*len -= *dist;
*dist += *dist;
}
return out;
}
#if defined(INFLATE_CHUNK_SIMD_NEON)
/*
* v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
* every 64-bit component of the 128-bit result (64-bit int splat).
*/
static inline z_vec128i_t v_load64_dup(const void* src) {
return vcombine_u8(vld1_u8(src), vld1_u8(src));
}
/*
* v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
* every 32-bit component of the 128-bit result (32-bit int splat).
*/
static inline z_vec128i_t v_load32_dup(const void* src) {
int32_t i32;
Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
return vreinterpretq_u8_s32(vdupq_n_s32(i32));
}
/*
* v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
* every 16-bit component of the 128-bit result (16-bit int splat).
*/
static inline z_vec128i_t v_load16_dup(const void* src) {
int16_t i16;
Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
return vreinterpretq_u8_s16(vdupq_n_s16(i16));
}
/*
* v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
* component of the 128-bit result (8-bit int splat).
*/
static inline z_vec128i_t v_load8_dup(const void* src) {
return vld1q_dup_u8((const uint8_t*)src);
}
/*
* v_store_128(): store the 128-bit vec in a memory destination (that might
* not be 16-byte aligned) void* out.
*/
static inline void v_store_128(void* out, const z_vec128i_t vec) {
vst1q_u8(out, vec);
}
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
/*
* v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
* every 64-bit component of the 128-bit result (64-bit int splat).
*/
static inline z_vec128i_t v_load64_dup(const void* src) {
int64_t i64;
Z_BUILTIN_MEMCPY(&i64, src, sizeof(i64));
return _mm_set1_epi64x(i64);
}
/*
* v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
* every 32-bit component of the 128-bit result (32-bit int splat).
*/
static inline z_vec128i_t v_load32_dup(const void* src) {
int32_t i32;
Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
return _mm_set1_epi32(i32);
}
/*
* v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
* every 16-bit component of the 128-bit result (16-bit int splat).
*/
static inline z_vec128i_t v_load16_dup(const void* src) {
int16_t i16;
Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
return _mm_set1_epi16(i16);
}
/*
* v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
* component of the 128-bit result (8-bit int splat).
*/
static inline z_vec128i_t v_load8_dup(const void* src) {
return _mm_set1_epi8(*(const char*)src);
}
/*
* v_store_128(): store the 128-bit vec in a memory destination (that might
* not be 16-byte aligned) void* out.
*/
static inline void v_store_128(void* out, const z_vec128i_t vec) {
_mm_storeu_si128((__m128i*)out, vec);
}
#endif
/*
* Perform an overlapping copy which behaves as a memset() operation, but
* supporting periods other than one, and assume that length is non-zero and
* that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output
* even if the length is shorter than this.
*/
static inline unsigned char FAR* chunkset_core(
unsigned char FAR* out,
unsigned period,
unsigned len) {
z_vec128i_t v;
const int bump = ((len - 1) % sizeof(v)) + 1;
switch (period) {
case 1:
v = v_load8_dup(out - 1);
v_store_128(out, v);
out += bump;
len -= bump;
while (len > 0) {
v_store_128(out, v);
out += sizeof(v);
len -= sizeof(v);
}
return out;
case 2:
v = v_load16_dup(out - 2);
v_store_128(out, v);
out += bump;
len -= bump;
if (len > 0) {
v = v_load16_dup(out - 2);
do {
v_store_128(out, v);
out += sizeof(v);
len -= sizeof(v);
} while (len > 0);
}
return out;
case 4:
v = v_load32_dup(out - 4);
v_store_128(out, v);
out += bump;
len -= bump;
if (len > 0) {
v = v_load32_dup(out - 4);
do {
v_store_128(out, v);
out += sizeof(v);
len -= sizeof(v);
} while (len > 0);
}
return out;
case 8:
v = v_load64_dup(out - 8);
v_store_128(out, v);
out += bump;
len -= bump;
if (len > 0) {
v = v_load64_dup(out - 8);
do {
v_store_128(out, v);
out += sizeof(v);
len -= sizeof(v);
} while (len > 0);
}
return out;
}
out = chunkunroll_relaxed(out, &period, &len);
return chunkcopy_core(out, out - period, len);
}
/*
* Perform a memcpy-like operation, but assume that length is non-zero and that
* it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
* the length is shorter than this.
*
* Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour
* of overlapping buffers, regardless of the distance between the pointers.
* This is reflected in the `restrict`-qualified pointers, allowing the
* compiler to re-order loads and stores.
*/
static inline unsigned char FAR* chunkcopy_relaxed(
unsigned char FAR* Z_RESTRICT out,
const unsigned char FAR* Z_RESTRICT from,
unsigned len) {
Assert((uintptr_t)out - (uintptr_t)from >= len,
"invalid restrict in chunkcopy_relaxed");
Assert((uintptr_t)from - (uintptr_t)out >= len,
"invalid restrict in chunkcopy_relaxed");
return chunkcopy_core(out, from, len);
}
/*
 * Like chunkcopy_relaxed(), but avoids writing beyond the end of the legal
 * output.
*
* Unlike chunkcopy_core_safe() above, no guarantee is made regarding the
* behaviour of overlapping buffers, regardless of the distance between the
* pointers. This is reflected in the `restrict`-qualified pointers, allowing
* the compiler to re-order loads and stores.
*
* Accepts an additional pointer to the end of safe output. A generic safe
* copy would use (out + len), but it's normally the case that the end of the
* output buffer is beyond the end of the current copy, and this can still be
* exploited.
*/
static inline unsigned char FAR* chunkcopy_safe(
unsigned char FAR* out,
const unsigned char FAR* Z_RESTRICT from,
unsigned len,
unsigned char FAR* limit) {
Assert(out + len <= limit, "chunk copy exceeds safety limit");
Assert((uintptr_t)out - (uintptr_t)from >= len,
"invalid restrict in chunkcopy_safe");
Assert((uintptr_t)from - (uintptr_t)out >= len,
"invalid restrict in chunkcopy_safe");
return chunkcopy_core_safe(out, from, len, limit);
}
/*
* Perform chunky copy within the same buffer, where the source and destination
* may potentially overlap.
*
* Assumes that len > 0 on entry, and that it's safe to write at least
* CHUNKCOPY_CHUNK_SIZE*3 bytes to the output.
*/
static inline unsigned char FAR* chunkcopy_lapped_relaxed(
unsigned char FAR* out,
unsigned dist,
unsigned len) {
if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) {
return chunkset_core(out, dist, len);
}
return chunkcopy_core(out, out - dist, len);
}
/*
 * Behaves like chunkcopy_lapped_relaxed(), but avoids writing beyond the end
 * of the legal output.
*
* Accepts an additional pointer to the end of safe output. A generic safe
* copy would use (out + len), but it's normally the case that the end of the
* output buffer is beyond the end of the current copy, and this can still be
* exploited.
*/
static inline unsigned char FAR* chunkcopy_lapped_safe(
unsigned char FAR* out,
unsigned dist,
unsigned len,
unsigned char FAR* limit) {
Assert(out + len <= limit, "chunk copy exceeds safety limit");
if ((limit - out) < (ptrdiff_t)(3 * CHUNKCOPY_CHUNK_SIZE)) {
/* TODO(cavalcantii): try harder to optimise this */
while (len-- > 0) {
*out = *(out - dist);
out++;
}
return out;
}
return chunkcopy_lapped_relaxed(out, dist, len);
}
/* TODO(cavalcanti): see crbug.com/1110083. */
static inline unsigned char FAR* chunkcopy_safe_ugly(unsigned char FAR* out,
unsigned dist,
unsigned len,
unsigned char FAR* limit) {
#if defined(__GNUC__) && !defined(__clang__)
/* Speed is the same as using chunkcopy_safe
w/ GCC on ARM (tested gcc 6.3 and 7.5) and avoids
undefined behavior.
*/
return chunkcopy_core_safe(out, out - dist, len, limit);
#elif defined(__clang__) && defined(ARMV8_OS_ANDROID) && !defined(__aarch64__)
/* Seems to perform better on 32bit (i.e. Android). */
return chunkcopy_core_safe(out, out - dist, len, limit);
#else
/* Seems to perform better on 64bit. */
return chunkcopy_lapped_safe(out, dist, len, limit);
#endif
}
/*
* The chunk-copy code above deals with writing the decoded DEFLATE data to
* the output with SIMD methods to increase decode speed. Reading the input
* to the DEFLATE decoder with a wide, SIMD method can also increase decode
* speed. This option is supported on little endian machines, and reads the
* input data in 64-bit (8 byte) chunks.
*/
#ifdef INFLATE_CHUNK_READ_64LE
/*
* Buffer the input in a uint64_t (8 bytes) in the wide input reading case.
*/
typedef uint64_t inflate_holder_t;
/*
* Ask the compiler to perform a wide, unaligned load of a uint64_t using a
* machine instruction appropriate for the uint64_t type.
*/
static inline inflate_holder_t read64le(const unsigned char FAR *in) {
inflate_holder_t input;
Z_BUILTIN_MEMCPY(&input, in, sizeof(input));
return input;
}
#else
/*
* Otherwise, buffer the input bits using zlib's default input buffer type.
*/
typedef unsigned long inflate_holder_t;
#endif /* INFLATE_CHUNK_READ_64LE */
#undef Z_STATIC_ASSERT
#undef Z_RESTRICT
#undef Z_BUILTIN_MEMCPY
#undef Z_DISABLE_MSAN
#endif /* CHUNKCOPY_H */
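Everything in chunkcopy.inc trades safety margins for speed: the relaxed copies may store up to CHUNKCOPY_CHUNK_SIZE bytes (3x that for chunkset_core()) past the requested length, which is sound inside inflate_fast() because every iteration starts with at least 258 writable output bytes. A hedged sketch of what any caller outside inflate would have to guarantee (the helper name is hypothetical):

#include <stdlib.h>

#define CHUNK 16  /* CHUNKCOPY_CHUNK_SIZE: one 128-bit vector */

/* Over-allocate so wide stores past `payload` bytes stay inside the
   allocation, mirroring the slack inflate_fast() gets for free. */
unsigned char *alloc_with_chunk_slack(size_t payload) {
  return malloc(payload + 3 * CHUNK);
}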


@@ -8,11 +8,6 @@
#include "third_party/zlib/internal.h"
#include "third_party/zlib/macros.internal.h"
#include "third_party/zlib/zlib.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* @(#) $Id$ */

third_party/zlib/cpu_features.c (new file, 37 lines)

@@ -0,0 +1,37 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/dce.h"
#include "libc/runtime/runtime.h"
#include "libc/sysv/consts/auxv.h"
#include "libc/sysv/consts/hwap.h"
#include "third_party/zlib/cpu_features.internal.h"
#include "third_party/zlib/zlib.h"
int arm_cpu_enable_crc32;
int arm_cpu_enable_pmull;
void(cpu_check_features)(void) {
#if defined(__aarch64__) && defined(__ARM_NEON)
if (IsLinux()) {
unsigned long features = getauxval(AT_HWCAP);
arm_cpu_enable_crc32 = !!(features & HWCAP_CRC32);
arm_cpu_enable_pmull = !!(features & HWCAP_PMULL);
}
#endif
}
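Nothing calls this probe automatically; the zlib convention of seeding with adler32(0, Z_NULL, 0) or crc32(0, NULL, 0) is what normally reaches it. A program that bypasses that convention would need something like the following (the hook name is hypothetical; on x86 cpu_check_features() is a no-op macro):

#include "third_party/zlib/cpu_features.internal.h"

void my_zlib_setup(void) {
  cpu_check_features();  /* on aarch64, fills arm_cpu_enable_{crc32,pmull} */
}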

third_party/zlib/cpu_features.internal.h (new file, 24 lines)

@@ -0,0 +1,24 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_ZLIB_CPU_FEATURES_H_
#define COSMOPOLITAN_THIRD_PARTY_ZLIB_CPU_FEATURES_H_
#include "libc/nexgen32e/x86feature.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#ifdef __x86_64__
#define x86_cpu_enable_sse2 X86_HAVE(SSE2)
#define x86_cpu_enable_ssse3 X86_HAVE(SSSE3)
#define x86_cpu_enable_simd (X86_HAVE(SSE4_2) && X86_HAVE(PCLMUL))
#define x86_cpu_enable_avx512 X86_HAVE(AVX512F)
#define cpu_check_features() (void)0
#elif defined(__aarch64__)
#define cpu_check_features zlib_cpu_check_features
_Hide extern int arm_cpu_enable_crc32;
_Hide extern int arm_cpu_enable_pmull;
_Hide void cpu_check_features(void);
#endif
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_ZLIB_CPU_FEATURES_H_ */
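On x86 this header makes the feature "variables" compile down to X86_HAVE() bit tests against feature words that cosmopolitan fills at startup, so no init call is needed there; only aarch64 carries mutable state. A small sketch of querying it portably (the function name is hypothetical):

#include "third_party/zlib/cpu_features.internal.h"

int zlib_can_fold_crc(void) {
#ifdef __x86_64__
  return x86_cpu_enable_simd;   /* SSE4.2 + PCLMUL, tested via CPUID bits */
#elif defined(__aarch64__)
  return arm_cpu_enable_pmull;  /* valid only after cpu_check_features() */
#else
  return 0;
#endif
}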

third_party/zlib/crc32.c

@@ -13,11 +13,6 @@
#include "third_party/zlib/deflate.internal.h"
#include "third_party/zlib/internal.h"
#include "third_party/zlib/macros.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* @(#) $Id$ */
@@ -33,8 +28,19 @@ asm(".include \"libc/disclaimer.inc\"");
produced, so that this one source file can be compiled to an executable.
*/
#ifdef MAKECRCH
//# include <stdio.h>
# ifndef DYNAMIC_CRC_TABLE
# define DYNAMIC_CRC_TABLE
# endif /* !DYNAMIC_CRC_TABLE */
#endif /* MAKECRCH */
#include "third_party/zlib/deflate.internal.h"
#include "third_party/zlib/cpu_features.internal.h"
#include "third_party/zlib/zutil.internal.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
# include "crc32_simd.h"
#include "third_party/zlib/crc32_simd.internal.h"
#endif
/*
@@ -106,16 +112,25 @@ asm(".include \"libc/disclaimer.inc\"");
# endif
#endif
/* Local functions. */
local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
/* If available, use the ARM processor CRC32 instruction. */
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8 \
&& defined(USE_CANONICAL_ARMV8_CRC32)
# define ARMCRC32_CANONICAL_ZLIB
#endif
/* Local functions. */
local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
#if defined(W) && (!defined(ARMCRC32_CANONICAL_ZLIB) || defined(DYNAMIC_CRC_TABLE))
local z_word_t byte_swap OF((z_word_t word));
#endif
#if defined(W) && !defined(ARMCRC32_CANONICAL_ZLIB)
local z_crc_t crc_word OF((z_word_t data));
local z_word_t crc_word_big OF((z_word_t data));
#endif
#if defined(W) && (!defined(ARMCRC32_CANONICAL_ZLIB) || defined(DYNAMIC_CRC_TABLE))
/*
Swap the bytes in a z_word_t to convert between little and big endian. Any
@@ -149,7 +164,6 @@ local z_word_t byte_swap(word)
/* CRC polynomial. */
#define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */
#define DYNAMIC_CRC_TABLE
#ifdef DYNAMIC_CRC_TABLE
local z_crc_t FAR crc_table[256];
@@ -534,7 +548,7 @@ local void braid(ltl, big, n, w)
* Tables for byte-wise and braided CRC-32 calculations, and a table of powers
* of x for combining CRC-32s, all made by make_crc_table().
*/
# include "crc32.h"
#include "third_party/zlib/crc32.inc"
#endif /* DYNAMIC_CRC_TABLE */
/* ========================================================================
@@ -617,15 +631,15 @@ const z_crc_t FAR * ZEXPORT get_crc_table()
#define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 */
#define Z_BATCH_MIN 800 /* fewest words in a final batch */
#error this is arm?
unsigned long ZEXPORT crc32_z(crc, buf, len)
unsigned long crc;
const unsigned char FAR *buf;
z_size_t len;
uint32_t ZEXPORT crc32_z(crc, buf_, len)
uint32_t crc;
const void FAR *buf_;
size_t len;
{
z_crc_t val;
z_word_t crc1, crc2;
const z_word_t *word;
const unsigned char FAR *buf = buf_;
z_word_t val0, val1, val2;
z_size_t last, last2, i;
z_size_t num;
@@ -743,13 +757,13 @@ local z_word_t crc_word_big(data)
#endif
#if 0 /* [jart] favor LIBC_STR crc32() */
/* ========================================================================= */
unsigned long ZEXPORT crc32_z(crc, buf, len)
unsigned long crc;
const unsigned char FAR *buf;
z_size_t len;
uint32_t ZEXPORT crc32_z(crc, buf_, len)
uint32_t crc;
const void FAR *buf_;
size_t len;
{
const unsigned char FAR *buf = buf_;
/*
* zlib convention is to call crc32(0, NULL, 0); before making
* calls to crc32(). So this is a good, early (and infrequent)
@@ -767,7 +781,19 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
}
#endif
#if defined(CRC32_SIMD_SSE42_PCLMUL)
#if defined(CRC32_SIMD_AVX512_PCLMUL)
if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) {
/* crc32 64-byte chunks */
z_size_t chunk_size = len & ~Z_CRC32_AVX512_CHUNKSIZE_MASK;
crc = ~crc32_avx512_simd_(buf, chunk_size, ~(uint32_t)crc);
/* check remaining data */
len -= chunk_size;
if (!len)
return crc;
/* Fall into the default crc32 for the remaining data. */
buf += chunk_size;
}
#elif defined(CRC32_SIMD_SSE42_PCLMUL)
if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
/* crc32 16-byte chunks */
z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
@@ -1114,11 +1140,9 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
/* Return the CRC, post-conditioned. */
return crc ^ 0xffffffff;
}
#endif
#endif
#if 0 /* [jart] favor LIBC_STR crc32() */
/* ========================================================================= */
unsigned long ZEXPORT crc32(crc, buf, len)
unsigned long crc;
@@ -1162,7 +1186,6 @@ unsigned long ZEXPORT crc32(crc, buf, len)
#endif
return crc32_z(crc, buf, len); /* Armv7 or Armv8 w/o crypto extensions. */
}
#endif
/* ========================================================================= */
uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
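crc32_combine64(), whose old-style definition opens above, is the consumer of the multmodp()/x2nmodp() polynomial helpers declared earlier: it splices the CRCs of two buffers into the CRC of their concatenation without touching the data again. The documented call pattern looks like this (part1/part2 are hypothetical halves of one stream):

#include "third_party/zlib/zlib.h"

uLong crc_of_whole(const Bytef *part1, uInt len1,
                   const Bytef *part2, uInt len2) {
  uLong c1 = crc32(crc32(0L, Z_NULL, 0), part1, len1);
  uLong c2 = crc32(crc32(0L, Z_NULL, 0), part2, len2);
  return crc32_combine64(c1, c2, len2);  /* == crc32 of part1 || part2 */
}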

third_party/zlib/crc32.inc (new vendored file, 9448 lines; diff suppressed because it is too large)

third_party/zlib/crc32_simd.c (new vendored file, 626 lines)

@@ -0,0 +1,626 @@
/* crc32_simd.c
*
* Copyright 2017 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
asm(".ident\t\"\\n\\n\
Chromium (BSD-3 License)\\n\
Copyright 2017 The Chromium Authors\"");
// clang-format off
#include "third_party/zlib/crc32_simd.internal.h"
#if defined(CRC32_SIMD_AVX512_PCLMUL)
/*
* crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 256, and a multiple of 64. Based on:
*
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
*/
#include "third_party/intel/emmintrin.internal.h"
#include "third_party/intel/smmintrin.internal.h"
#include "third_party/intel/wmmintrin.internal.h"
#include "third_party/intel/immintrin.internal.h"
uint32_t ZLIB_INTERNAL crc32_avx512_simd_( /* AVX512+PCLMUL */
const unsigned char *buf,
z_size_t len,
uint32_t crc)
{
/*
* Definitions of the bit-reflected domain constants k1,k2,k3,k4
* are similar to those given at the end of the paper, and remaining
* constants and CRC32+Barrett polynomials remain unchanged.
*
* Replace the index of x from 128 to 512. As follows:
* k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a
* k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430
* k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4
* k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596
*/
static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430 };
static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596 };
static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e };
static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 };
static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
__m128i a0, a1, a2, a3;
/*
* There's at least one block of 256.
*/
x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
x0 = _mm512_load_si512((__m512i *)k1k2);
buf += 256;
len -= 256;
/*
* Parallel fold blocks of 256, if any.
*/
while (len >= 256)
{
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, x5);
x2 = _mm512_xor_si512(x2, x6);
x3 = _mm512_xor_si512(x3, x7);
x4 = _mm512_xor_si512(x4, x8);
x1 = _mm512_xor_si512(x1, y5);
x2 = _mm512_xor_si512(x2, y6);
x3 = _mm512_xor_si512(x3, y7);
x4 = _mm512_xor_si512(x4, y8);
buf += 256;
len -= 256;
}
/*
* Fold into 512-bits.
*/
x0 = _mm512_load_si512((__m512i *)k3k4);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x3);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x4);
x1 = _mm512_xor_si512(x1, x5);
/*
* Single fold blocks of 64, if any.
*/
while (len >= 64)
{
x2 = _mm512_loadu_si512((__m512i *)buf);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
buf += 64;
len -= 64;
}
/*
* Fold 512-bits to 384-bits.
*/
a0 = _mm_load_si128((__m128i *)k5k6);
a1 = _mm512_extracti32x4_epi32(x1, 0);
a2 = _mm512_extracti32x4_epi32(x1, 1);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
/*
* Fold 384-bits to 256-bits.
*/
a2 = _mm512_extracti32x4_epi32(x1, 2);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
/*
* Fold 256-bits to 128-bits.
*/
a2 = _mm512_extracti32x4_epi32(x1, 3);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
/*
* Fold 128-bits to 64-bits.
*/
a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
a3 = _mm_setr_epi32(~0, 0, ~0, 0);
a1 = _mm_srli_si128(a1, 8);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_loadl_epi64((__m128i*)k7k8);
a2 = _mm_srli_si128(a1, 4);
a1 = _mm_and_si128(a1, a3);
a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
/*
 * Barrett reduce to 32-bits.
*/
a0 = _mm_load_si128((__m128i*)poly);
a2 = _mm_and_si128(a1, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
a2 = _mm_and_si128(a2, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
/*
* Return the crc32.
*/
return _mm_extract_epi32(a1, 1);
}
#elif defined(CRC32_SIMD_SSE42_PCLMUL)
/*
* crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 64, and a multiple of 16.
*/
#include "third_party/intel/emmintrin.internal.h"
#include "third_party/intel/smmintrin.internal.h"
#include "third_party/intel/wmmintrin.internal.h"
uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */
const unsigned char *buf,
z_size_t len,
uint32_t crc)
{
/*
* Definitions of the bit-reflected domain constants k1,k2,k3, etc and
* the CRC32+Barrett polynomials given at the end of the paper.
*/
static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
/*
* There's at least one block of 64.
*/
x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
x0 = _mm_load_si128((__m128i *)k1k2);
buf += 64;
len -= 64;
/*
* Parallel fold blocks of 64, if any.
*/
while (len >= 64)
{
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_xor_si128(x4, x8);
x1 = _mm_xor_si128(x1, y5);
x2 = _mm_xor_si128(x2, y6);
x3 = _mm_xor_si128(x3, y7);
x4 = _mm_xor_si128(x4, y8);
buf += 64;
len -= 64;
}
/*
* Fold into 128-bits.
*/
x0 = _mm_load_si128((__m128i *)k3k4);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x3);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x4);
x1 = _mm_xor_si128(x1, x5);
/*
* Single fold blocks of 16, if any.
*/
while (len >= 16)
{
x2 = _mm_loadu_si128((__m128i *)buf);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
buf += 16;
len -= 16;
}
/*
* Fold 128-bits to 64-bits.
*/
x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
x3 = _mm_setr_epi32(~0, 0, ~0, 0);
x1 = _mm_srli_si128(x1, 8);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_loadl_epi64((__m128i*)k5k0);
x2 = _mm_srli_si128(x1, 4);
x1 = _mm_and_si128(x1, x3);
x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
/*
 * Barrett reduce to 32-bits.
*/
x0 = _mm_load_si128((__m128i*)poly);
x2 = _mm_and_si128(x1, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
x2 = _mm_and_si128(x2, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
/*
* Return the crc32.
*/
return _mm_extract_epi32(x1, 1);
}
#elif defined(CRC32_ARMV8_CRC32)
/* CRC32 checksums using ARMv8-a crypto instructions.
*/
#if defined(__clang__)
/* We need some extra types for using PMULL.
*/
#if defined(__aarch64__)
#include "third_party/aarch64/arm_neon.h"
#include "third_party/aarch64/arm_acle.h"
#endif
/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
* armv8 target, which is incompatible with ThinLTO optimizations on Android.
* (Namely, mixing and matching different module-level targets makes ThinLTO
* warn, and Android defaults to armv7-a. This restriction does not apply to
* function-level `target`s, however.)
*
* Since we only need four crc intrinsics, and since clang's implementation of
* those are just wrappers around compiler builtins, it's simplest to #define
* those builtins directly. If this #define list grows too much (or we depend on
* an intrinsic that isn't a trivial wrapper), we may have to find a better way
* to go about this.
*
* NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
* feature for this target (ignoring feature)." This appears to be a harmless
* bug in clang.
*
* These definitions must appear *after* including arm_acle.h otherwise that
* header may end up defining functions named __builtin_arm_crc32* that call
* themselves, creating an infinite loop when the intrinsic is called.
*/
/* XXX: Cannot hook into builtins with XCode for arm64. */
#if !defined(ARMV8_OS_MACOS)
#define __crc32b __builtin_arm_crc32b
#define __crc32d __builtin_arm_crc32d
#define __crc32w __builtin_arm_crc32w
#define __crc32cw __builtin_arm_crc32cw
#endif
#if defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("aes,crc")))
#else // !defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
#endif // defined(__aarch64__)
#elif defined(__GNUC__)
/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
* allowed. We can just include arm_acle.h.
*/
#include "third_party/aarch64/arm_neon.h"
#include "third_party/aarch64/arm_acle.h"
#define TARGET_ARMV8_WITH_CRC
#else // !defined(__GNUC__) && !defined(_aarch64__)
#error ARM CRC32 SIMD extensions only supported for Clang and GCC
#endif
TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_little(
const unsigned char *buf,
z_size_t len,
uint32_t crc)
{
uint32_t c = (uint32_t) ~crc;
while (len && ((uintptr_t)buf & 7)) {
c = __crc32b(c, *buf++);
--len;
}
const uint64_t *buf8 = (const uint64_t *)buf;
while (len >= 64) {
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
len -= 64;
}
while (len >= 8) {
c = __crc32d(c, *buf8++);
len -= 8;
}
buf = (const unsigned char *)buf8;
while (len--) {
c = __crc32b(c, *buf++);
}
return ~c;
}
#if defined(__aarch64__) || defined(ARMV8_OS_MACOS) /* aarch64 specific code. */
/*
* crc32_pmull_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 64, and a multiple of 16. Based on:
*
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
*/
TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_lo(const uint64x2_t a, const uint64x2_t b)
{
uint8x16_t r;
__asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
: "=w" (r) : "w" (a), "w" (b) );
return r;
}
TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_01(const uint64x2_t a, const uint64x2_t b)
{
uint8x16_t r;
__asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
: "=w" (r) : "w" (a), "w" (vgetq_lane_u64(b, 1)) );
return r;
}
TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_hi(const uint64x2_t a, const uint64x2_t b)
{
uint8x16_t r;
__asm__ __volatile__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
: "=w" (r) : "w" (a), "w" (b) );
return r;
}
TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(
const unsigned char *buf,
z_size_t len,
uint32_t crc)
{
/*
* Definitions of the bit-reflected domain constants k1,k2,k3, etc and
* the CRC32+Barrett polynomials given at the end of the paper.
*/
static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
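/*
* One fold step below computes, in GF(2) polynomial arithmetic,
*
*     x' = (x_lo * k_lo) xor (x_hi * k_hi) xor (next 16 bytes of input)
*
* pmull_lo() produces the low-half product and pmull_hi() the high-half
* product; the veorq_u64() pairs after each load perform the two xors.
*/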
uint64x2_t x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
/*
* There's at least one block of 64.
*/
x1 = vld1q_u64((const uint64_t *)(buf + 0x00));
x2 = vld1q_u64((const uint64_t *)(buf + 0x10));
x3 = vld1q_u64((const uint64_t *)(buf + 0x20));
x4 = vld1q_u64((const uint64_t *)(buf + 0x30));
x1 = veorq_u64(x1, (uint64x2_t) vsetq_lane_u32(crc, vdupq_n_u32(0), 0));
x0 = vld1q_u64(k1k2);
buf += 64;
len -= 64;
/*
* Parallel fold blocks of 64, if any.
*/
while (len >= 64)
{
x5 = (uint64x2_t) pmull_lo(x1, x0);
x6 = (uint64x2_t) pmull_lo(x2, x0);
x7 = (uint64x2_t) pmull_lo(x3, x0);
x8 = (uint64x2_t) pmull_lo(x4, x0);
y5 = vld1q_u64((const uint64_t *)(buf + 0x00));
y6 = vld1q_u64((const uint64_t *)(buf + 0x10));
y7 = vld1q_u64((const uint64_t *)(buf + 0x20));
y8 = vld1q_u64((const uint64_t *)(buf + 0x30));
x1 = (uint64x2_t) pmull_hi(x1, x0);
x2 = (uint64x2_t) pmull_hi(x2, x0);
x3 = (uint64x2_t) pmull_hi(x3, x0);
x4 = (uint64x2_t) pmull_hi(x4, x0);
x1 = veorq_u64(x1, x5);
x2 = veorq_u64(x2, x6);
x3 = veorq_u64(x3, x7);
x4 = veorq_u64(x4, x8);
x1 = veorq_u64(x1, y5);
x2 = veorq_u64(x2, y6);
x3 = veorq_u64(x3, y7);
x4 = veorq_u64(x4, y8);
buf += 64;
len -= 64;
}
/*
* Fold into 128-bits.
*/
x0 = vld1q_u64(k3k4);
x5 = (uint64x2_t) pmull_lo(x1, x0);
x1 = (uint64x2_t) pmull_hi(x1, x0);
x1 = veorq_u64(x1, x2);
x1 = veorq_u64(x1, x5);
x5 = (uint64x2_t) pmull_lo(x1, x0);
x1 = (uint64x2_t) pmull_hi(x1, x0);
x1 = veorq_u64(x1, x3);
x1 = veorq_u64(x1, x5);
x5 = (uint64x2_t) pmull_lo(x1, x0);
x1 = (uint64x2_t) pmull_hi(x1, x0);
x1 = veorq_u64(x1, x4);
x1 = veorq_u64(x1, x5);
/*
* Single fold blocks of 16, if any.
*/
while (len >= 16)
{
x2 = vld1q_u64((const uint64_t *)buf);
x5 = (uint64x2_t) pmull_lo(x1, x0);
x1 = (uint64x2_t) pmull_hi(x1, x0);
x1 = veorq_u64(x1, x2);
x1 = veorq_u64(x1, x5);
buf += 16;
len -= 16;
}
/*
* Fold 128-bits to 64-bits.
*/
static uint32_t zalign(16) mask[] = { ~0u, 0u, ~0u, 0u };
x2 = (uint64x2_t) pmull_01(x1, x0);
x1 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 8);
x3 = (uint64x2_t) vld1q_u32(mask);
x1 = veorq_u64(x1, x2);
x0 = vld1q_u64(k5k0);
x2 = (uint64x2_t) pmull_01(x2, x0);
x2 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 4);
x1 = vandq_u64(x1, x3);
x1 = (uint64x2_t) pmull_lo(x1, x0);
x1 = veorq_u64(x1, x2);
/*
* Barrett reduce to 32 bits.
*/
x0 = vld1q_u64(poly);
x2 = vandq_u64(x1, x3);
x2 = (uint64x2_t) pmull_01(x2, x0);
x2 = vandq_u64(x2, x3);
x2 = (uint64x2_t) pmull_lo(x2, x0);
x1 = veorq_u64(x1, x2);
/*
* Return the crc32.
*/
return vgetq_lane_u32(vreinterpretq_u32_u64(x1), 1);
}
#endif /* aarch64 specific code. */
#endif

57
third_party/zlib/crc32_simd.internal.h vendored Executable file
View file

@@ -0,0 +1,57 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_ZLIB_CRC32_SIMD_INTERNAL_H_
#define COSMOPOLITAN_THIRD_PARTY_ZLIB_CRC32_SIMD_INTERNAL_H_
#include "third_party/zlib/deflate.internal.h"
#include "third_party/zlib/zconf.h"
#include "third_party/zlib/zutil.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
// clang-format off
/*
* crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 64, and a multiple of 16.
*/
uint32_t ZLIB_INTERNAL crc32_sse42_simd_(const unsigned char* buf,
z_size_t len,
uint32_t crc);
uint32_t ZLIB_INTERNAL crc32_avx512_simd_(const unsigned char* buf,
z_size_t len,
uint32_t crc);
/*
* crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c
* for computing the crc32 of an arbitrary length buffer.
*/
#define Z_CRC32_SSE42_MINIMUM_LENGTH 64
#define Z_CRC32_SSE42_CHUNKSIZE_MASK 15
#define Z_CRC32_AVX512_MINIMUM_LENGTH 256
#define Z_CRC32_AVX512_CHUNKSIZE_MASK 63
/*
* CRC32 checksums using ARMv8-a crypto instructions.
*/
uint32_t ZLIB_INTERNAL armv8_crc32_little(const unsigned char* buf,
z_size_t len,
uint32_t crc);
/* aarch64 specific code. */
#if defined(__aarch64__)
/* 128 is the sweet spot at the time of coding (late 2020). */
#define Z_CRC32_PMULL_MINIMUM_LENGTH 128
#define Z_CRC32_PMULL_CHUNKSIZE_MASK 15
/*
* CRC32 checksums using ARMv8-a PMULL instructions, where the buffer
* length must be at least 64, and a multiple of 16.
*/
uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(const unsigned char* buf,
z_size_t len,
uint32_t crc);
#endif
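/*
* Rough shape of the dispatch in crc32.c (an illustrative sketch only;
* the checked-in logic in crc32.c is authoritative): the PMULL kernel
* folds the large 16-byte-multiple chunk, then the hardware CRC
* instructions finish the tail.
*
*   if (len >= Z_CRC32_PMULL_MINIMUM_LENGTH) {
*       z_size_t chunk = len & ~(z_size_t)Z_CRC32_PMULL_CHUNKSIZE_MASK;
*       crc = ~armv8_crc32_pmull_little(buf, chunk, ~crc);
*       buf += chunk;
*       len -= chunk;
*   }
*   crc = armv8_crc32_little(buf, len, crc);
*/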
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_ZLIB_CRC32_SIMD_INTERNAL_H_ */

503
third_party/zlib/crc_folding.c vendored Normal file
View file

@@ -0,0 +1,503 @@
#include "libc/fmt/conv.h"
#include "libc/inttypes.h"
#include "libc/limits.h"
#include "libc/literal.h"
#include "libc/str/str.h"
#include "third_party/intel/emmintrin.internal.h"
#include "third_party/intel/immintrin.internal.h"
#include "third_party/intel/wmmintrin.internal.h"
#include "third_party/zlib/deflate.internal.h"
// clang-format off
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef CRC32_SIMD_SSE42_PCLMUL
#define CRC_LOAD(s) \
do { \
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);\
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);\
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);\
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);\
__m128i xmm_crc_part = _mm_loadu_si128((__m128i *)s->crc0 + 4);
#define CRC_SAVE(s) \
_mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);\
_mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);\
_mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);\
_mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);\
_mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);\
} while (0);
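/*
* Pairing note: CRC_LOAD opens a block and materializes the five fold
* registers from s->crc0[]; CRC_SAVE spills them back and closes that
* block, so every use must be a matched pair within one function:
*
*   CRC_LOAD(s)
*   ...work on xmm_crc0..xmm_crc3 and xmm_crc_part...
*   CRC_SAVE(s)
*/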
ZLIB_INTERNAL void crc_fold_init(deflate_state *const s)
{
CRC_LOAD(s)
xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
xmm_crc1 = _mm_setzero_si128();
xmm_crc2 = _mm_setzero_si128();
xmm_crc3 = _mm_setzero_si128();
CRC_SAVE(s)
s->strm->adler = 0;
}
local void fold_1(deflate_state *const s,
__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3)
{
const __m128i xmm_fold4 = _mm_set_epi32(
0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3;
__m128 ps_crc0, ps_crc3, ps_res;
x_tmp3 = *xmm_crc3;
*xmm_crc3 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
*xmm_crc0 = *xmm_crc1;
*xmm_crc1 = *xmm_crc2;
*xmm_crc2 = x_tmp3;
*xmm_crc3 = _mm_castps_si128(ps_res);
}
local void fold_2(deflate_state *const s,
__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3)
{
const __m128i xmm_fold4 = _mm_set_epi32(
0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3, x_tmp2;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
x_tmp3 = *xmm_crc3;
x_tmp2 = *xmm_crc2;
*xmm_crc3 = *xmm_crc1;
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_res31= _mm_xor_ps(ps_crc3, ps_crc1);
*xmm_crc2 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_res20= _mm_xor_ps(ps_crc0, ps_crc2);
*xmm_crc0 = x_tmp2;
*xmm_crc1 = x_tmp3;
*xmm_crc2 = _mm_castps_si128(ps_res20);
*xmm_crc3 = _mm_castps_si128(ps_res31);
}
local void fold_3(deflate_state *const s,
__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3)
{
const __m128i xmm_fold4 = _mm_set_epi32(
0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
x_tmp3 = *xmm_crc3;
*xmm_crc3 = *xmm_crc2;
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
*xmm_crc2 = *xmm_crc1;
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_res21= _mm_xor_ps(ps_crc1, ps_crc2);
*xmm_crc1 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_res10= _mm_xor_ps(ps_crc0, ps_crc1);
*xmm_crc0 = x_tmp3;
*xmm_crc1 = _mm_castps_si128(ps_res10);
*xmm_crc2 = _mm_castps_si128(ps_res21);
*xmm_crc3 = _mm_castps_si128(ps_res32);
}
local void fold_4(deflate_state *const s,
__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3)
{
const __m128i xmm_fold4 = _mm_set_epi32(
0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
__m128 ps_t0, ps_t1, ps_t2, ps_t3;
__m128 ps_res0, ps_res1, ps_res2, ps_res3;
x_tmp0 = *xmm_crc0;
x_tmp1 = *xmm_crc1;
x_tmp2 = *xmm_crc2;
x_tmp3 = *xmm_crc3;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_t0 = _mm_castsi128_ps(x_tmp0);
ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_t1 = _mm_castsi128_ps(x_tmp1);
ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_t2 = _mm_castsi128_ps(x_tmp2);
ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_t3 = _mm_castsi128_ps(x_tmp3);
ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
*xmm_crc0 = _mm_castps_si128(ps_res0);
*xmm_crc1 = _mm_castps_si128(ps_res1);
*xmm_crc2 = _mm_castps_si128(ps_res2);
*xmm_crc3 = _mm_castps_si128(ps_res3);
}
local const unsigned zalign(32) pshufb_shf_table[60] = {
0x84838281,0x88878685,0x8c8b8a89,0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
0x85848382,0x89888786,0x8d8c8b8a,0x01008f8e, /* shl 14 (16 - 2)/shr2 */
0x86858483,0x8a898887,0x8e8d8c8b,0x0201008f, /* shl 13 (16 - 3)/shr3 */
0x87868584,0x8b8a8988,0x8f8e8d8c,0x03020100, /* shl 12 (16 - 4)/shr4 */
0x88878685,0x8c8b8a89,0x008f8e8d,0x04030201, /* shl 11 (16 - 5)/shr5 */
0x89888786,0x8d8c8b8a,0x01008f8e,0x05040302, /* shl 10 (16 - 6)/shr6 */
0x8a898887,0x8e8d8c8b,0x0201008f,0x06050403, /* shl 9 (16 - 7)/shr7 */
0x8b8a8988,0x8f8e8d8c,0x03020100,0x07060504, /* shl 8 (16 - 8)/shr8 */
0x8c8b8a89,0x008f8e8d,0x04030201,0x08070605, /* shl 7 (16 - 9)/shr9 */
0x8d8c8b8a,0x01008f8e,0x05040302,0x09080706, /* shl 6 (16 -10)/shr10*/
0x8e8d8c8b,0x0201008f,0x06050403,0x0a090807, /* shl 5 (16 -11)/shr11*/
0x8f8e8d8c,0x03020100,0x07060504,0x0b0a0908, /* shl 4 (16 -12)/shr12*/
0x008f8e8d,0x04030201,0x08070605,0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
0x01008f8e,0x05040302,0x09080706,0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
0x0201008f,0x06050403,0x0a090807,0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
};
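/*
* How the table works: _mm_shuffle_epi8 writes zero to any destination
* byte whose control byte has its high bit set, so the 0x80..0x8f entries
* above act as "emit zero" slots. Each 16-byte row therefore encodes a
* byte left-shift, and XORing a row with 0x80808080 (as partial_fold does
* below) flips the high bits to yield the complementary right-shift
* control.
*/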
local void partial_fold(deflate_state *const s, const size_t len,
__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3,
__m128i *xmm_crc_part)
{
const __m128i xmm_fold4 = _mm_set_epi32(
0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
__m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
__m128i xmm_a0_0, xmm_a0_1;
__m128 ps_crc3, psa0_0, psa0_1, ps_res;
xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
xmm_shr = xmm_shl;
xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
*xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
*xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
*xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
*xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
*xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
*xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
*xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
*xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
*xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
psa0_0 = _mm_castsi128_ps(xmm_a0_0);
psa0_1 = _mm_castsi128_ps(xmm_a0_1);
ps_res = _mm_xor_ps(ps_crc3, psa0_0);
ps_res = _mm_xor_ps(ps_res, psa0_1);
*xmm_crc3 = _mm_castps_si128(ps_res);
}
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s,
unsigned char *dst, const unsigned char *src, long len)
{
unsigned long algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
CRC_LOAD(s)
if (len < 16) {
if (len == 0)
return;
goto partial;
}
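/* Fold away any misaligned prefix first, so the 64-byte loop below can
use aligned 16-byte loads on src. */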
algn_diff = (0 - (uintptr_t)src) & 0xF;
if (algn_diff) {
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
dst += algn_diff;
src += algn_diff;
len -= algn_diff;
partial_fold(s, algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
&xmm_crc_part);
}
while ((len -= 64) >= 0) {
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
xmm_t3 = _mm_load_si128((__m128i *)src + 3);
fold_4(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
_mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
src += 64;
dst += 64;
}
/*
* len = num bytes left - 64
*/
if (len + 16 >= 0) {
len += 16;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
fold_3(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
if (len == 0)
goto done;
dst += 48;
src += 48;
} else if (len + 32 >= 0) {
len += 32;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
fold_2(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
if (len == 0)
goto done;
dst += 32;
src += 32;
} else if (len + 48 >= 0) {
len += 48;
xmm_t0 = _mm_load_si128((__m128i *)src);
fold_1(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
_mm_storeu_si128((__m128i *)dst, xmm_t0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
if (len == 0)
goto done;
dst += 16;
src += 16;
} else {
len += 64;
if (len == 0)
goto done;
}
partial:
#if defined(_MSC_VER)
/* VS does not permit the use of _mm_set_epi64x in 32-bit builds */
{
int32_t parts[4] = {0, 0, 0, 0};
memcpy(&parts, src, len);
xmm_crc_part = _mm_set_epi32(parts[3], parts[2], parts[1], parts[0]);
}
#else
{
int64_t parts[2] = {0, 0};
memcpy(&parts, src, len);
xmm_crc_part = _mm_set_epi64x(parts[1], parts[0]);
}
#endif
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
partial_fold(s, len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
&xmm_crc_part);
done:
CRC_SAVE(s)
}
local const unsigned zalign(16) crc_k[] = {
0xccaa009e, 0x00000000, /* rk1 */
0x751997d0, 0x00000001, /* rk2 */
0xccaa009e, 0x00000000, /* rk5 */
0x63cd6124, 0x00000001, /* rk6 */
0xf7011640, 0x00000001, /* rk7 */
0xdb710640, 0x00000001 /* rk8 */
};
local const unsigned zalign(16) crc_mask[4] = {
0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
};
local const unsigned zalign(16) crc_mask2[4] = {
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s)
{
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
unsigned crc;
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);
/*
* k1
*/
crc_fold = _mm_load_si128((__m128i *)crc_k);
x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
/*
* k5
*/
crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
xmm_crc0 = xmm_crc3;
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
xmm_crc0 = xmm_crc3;
xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
/*
* k7
*/
xmm_crc1 = xmm_crc3;
xmm_crc2 = xmm_crc3;
crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
xmm_crc2 = xmm_crc3;
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
crc = _mm_extract_epi32(xmm_crc3, 2);
return ~crc;
}
#endif /* CRC32_SIMD_SSE42_PCLMUL */
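/*
* How these entry points fit together, sketched from the deflate
* integration (illustrative; the hypothetical read loop below is not
* part of this file):
*
*   crc_fold_init(s);                   // when the gzip stream starts
*   while (more input)
*       crc_fold_copy(s, dst, src, n);  // copy bytes and fold CRC at once
*   crc = crc_fold_512to32(s);          // reduce 512-bit state at the end
*/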

View file

@@ -5,13 +5,15 @@
* Copyright (C) 1995-2022 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "libc/assert.h"
#include "third_party/zlib/cpu_features.internal.h"
#include "third_party/zlib/deflate.internal.h"
#include "third_party/zlib/insert_string.internal.h"
#include "third_party/zlib/internal.h"
#include "third_party/zlib/zutil.internal.h"
asm(".ident\t\"\\n\\n\
deflate 1.2.12.1 (zlib License)\\n\
zlib 1.2.13 (zlib License)\\n\
Copyright 1995-2022 Jean-loup Gailly and Mark Adler\\n\
Invented 1990 Phillip Walter Katz\"");
// clang-format off
@@ -62,6 +64,12 @@ Invented 1990 Phillip Walter Katz\"");
/* @(#) $Id$ */
#if defined(DEFLATE_SLIDE_HASH_SSE2) || defined(DEFLATE_SLIDE_HASH_NEON)
#include "third_party/zlib/slide_hash_simd.inc"
#endif
#include "third_party/zlib/insert_string.inc"
#ifdef FASTEST
/* See http://crbug.com/1113596 */
#error "FASTEST is not supported in Chromium's zlib."
@@ -101,13 +109,7 @@ local void lm_init OF((deflate_state *s));
local void putShortMSB OF((deflate_state *s, uInt b));
local void flush_pending OF((z_streamp strm));
local unsigned read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
#ifdef ASMV
# pragma message("Assembler code may have bugs -- use at your own risk")
void match_init OF((void)); /* asm code initialization */
uInt longest_match OF((deflate_state *s, IPos cur_match));
#else
local uInt longest_match OF((deflate_state *s, IPos cur_match));
#endif
#ifdef ZLIB_DEBUG
local void check_match OF((deflate_state *s, IPos start, IPos match,
@@ -181,9 +183,9 @@ local const config configuration_table[10] = {
*/
#define CLEAR_HASH(s) \
do { \
s->head[s->hash_size-1] = NIL; \
s->head[s->hash_size - 1] = NIL; \
zmemzero((Bytef *)s->head, \
(unsigned)(s->hash_size-1)*sizeof(*s->head)); \
(unsigned)(s->hash_size - 1)*sizeof(*s->head)); \
} while (0)
/* ===========================================================================
@@ -245,6 +247,14 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
deflate_state *s;
int wrap = 1;
// Needed to activate optimized insert_string() that helps compression
// for all wrapper formats (e.g. RAW, ZLIB, GZIP).
// Feature detection is not triggered while using RAW mode (i.e. we never
// call crc32() with a NULL buffer).
#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL)
cpu_check_features();
#endif
if (strm == Z_NULL) return Z_STREAM_ERROR;
strm->msg = Z_NULL;
@@ -271,6 +281,8 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
if (windowBits < 0) { /* suppress zlib wrapper */
wrap = 0;
if (windowBits < -15)
return Z_STREAM_ERROR;
windowBits = -windowBits;
}
#ifdef GZIP
@@ -300,7 +312,7 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
s->chromium_zlib_hash = 0;
#if !defined(USE_ZLIB_RABIN_KARP_ROLLING_HASH)
#if defined(TARGET_CPU_WITH_CRC) && defined(CRC32_SIMD_SSE42_PCLMUL)
if (X86_HAVE(SSE4_2))
if (x86_cpu_enable_simd)
s->chromium_zlib_hash = 1;
#elif defined(TARGET_CPU_WITH_CRC) && defined(CRC32_ARMV8_CRC32)
if (arm_cpu_enable_crc32)
@@ -315,16 +327,15 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
s->hash_size = 1 << s->hash_bits;
s->hash_mask = s->hash_size - 1;
s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);
s->hash_shift = ((s->hash_bits + MIN_MATCH-1) / MIN_MATCH);
s->window = (Bytef *) ZALLOC(strm,
s->w_size + window_padding,
2*sizeof(Byte));
/* Avoid use of unitialized values in the window, see crbug.com/1137613 and
* crbug.com/1144420 */
if (s->window) { /* [jart] fix regression in malloc failure checking */
zmemzero(s->window, (s->w_size + window_padding) * (2 * sizeof(Byte)));
zmemzero(s->window, (s->w_size + window_padding) * (2 * sizeof(Byte)));
}
s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos));
/* Avoid use of uninitialized value, see:
@@ -355,11 +366,11 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
* sym_buf value to read moves forward three bytes. From that symbol, up to
* 31 bits are written to pending_buf. The closest the written pending_buf
* bits gets to the next sym_buf symbol to read is just before the last
* code is written. At that time, 31*(n-2) bits have been written, just
* after 24*(n-2) bits have been consumed from sym_buf. sym_buf starts at
* 8*n bits into pending_buf. (Note that the symbol buffer fills when n-1
* code is written. At that time, 31*(n - 2) bits have been written, just
* after 24*(n - 2) bits have been consumed from sym_buf. sym_buf starts at
* 8*n bits into pending_buf. (Note that the symbol buffer fills when n - 1
* symbols are written.) The closest the writing gets to what is unread is
* then n+14 bits. Here n is lit_bufsize, which is 16384 by default, and
* then n + 14 bits. Here n is lit_bufsize, which is 16384 by default, and
* can range from 128 to 32768.
*
* Therefore, at a minimum, there are 142 bits of space between what is
@@ -404,7 +415,7 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
/* =========================================================================
* Check for a valid deflate stream state. Return 0 if ok, 1 if not.
*/
local int deflateStateCheck (strm)
local int deflateStateCheck(strm)
z_streamp strm;
{
deflate_state *s;
@@ -427,7 +438,7 @@ local int deflateStateCheck (strm)
}
/* ========================================================================= */
int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
int ZEXPORT deflateSetDictionary(strm, dictionary, dictLength)
z_streamp strm;
const Bytef *dictionary;
uInt dictLength;
@@ -492,7 +503,7 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
}
/* ========================================================================= */
int ZEXPORT deflateGetDictionary (strm, dictionary, dictLength)
int ZEXPORT deflateGetDictionary(strm, dictionary, dictLength)
z_streamp strm;
Bytef *dictionary;
uInt *dictLength;
@@ -514,7 +525,7 @@ int ZEXPORT deflateGetDictionary (strm, dictionary, dictLength)
}
/* ========================================================================= */
int ZEXPORT deflateResetKeep (strm)
int ZEXPORT deflateResetKeep(strm)
z_streamp strm;
{
deflate_state *s;
@@ -552,7 +563,7 @@ int ZEXPORT deflateResetKeep (strm)
}
/* ========================================================================= */
int ZEXPORT deflateReset (strm)
int ZEXPORT deflateReset(strm)
z_streamp strm;
{
int ret;
@@ -564,7 +575,7 @@ int ZEXPORT deflateReset (strm)
}
/* ========================================================================= */
int ZEXPORT deflateSetHeader (strm, head)
int ZEXPORT deflateSetHeader(strm, head)
z_streamp strm;
gz_headerp head;
{
@@ -575,7 +586,7 @@ int ZEXPORT deflateSetHeader (strm, head)
}
/* ========================================================================= */
int ZEXPORT deflatePending (strm, pending, bits)
int ZEXPORT deflatePending(strm, pending, bits)
unsigned *pending;
int *bits;
z_streamp strm;
@@ -589,7 +600,7 @@ int ZEXPORT deflatePending (strm, pending, bits)
}
/* ========================================================================= */
int ZEXPORT deflatePrime (strm, bits, value)
int ZEXPORT deflatePrime(strm, bits, value)
z_streamp strm;
int bits;
int value;
@@ -684,36 +695,50 @@ int ZEXPORT deflateTune(strm, good_length, max_lazy, nice_length, max_chain)
}
/* =========================================================================
* For the default windowBits of 15 and memLevel of 8, this function returns
* a close to exact, as well as small, upper bound on the compressed size.
* They are coded as constants here for a reason--if the #define's are
* changed, then this function needs to be changed as well. The return
* value for 15 and 8 only works for those exact settings.
* For the default windowBits of 15 and memLevel of 8, this function returns a
* close to exact, as well as small, upper bound on the compressed size. This
* is an expansion of ~0.03%, plus a small constant.
*
* For any setting other than those defaults for windowBits and memLevel,
* the value returned is a conservative worst case for the maximum expansion
* resulting from using fixed blocks instead of stored blocks, which deflate
* can emit on compressed data for some combinations of the parameters.
* For any setting other than those defaults for windowBits and memLevel, one
* of two worst case bounds is returned. This is at most an expansion of ~4% or
* ~13%, plus a small constant.
*
* This function could be more sophisticated to provide closer upper bounds for
* every combination of windowBits and memLevel. But even the conservative
* upper bound of about 14% expansion does not seem onerous for output buffer
* allocation.
* Both the 0.03% and 4% derive from the overhead of stored blocks. The first
* one is for stored blocks of 16383 bytes (memLevel == 8), whereas the second
* is for stored blocks of 127 bytes (the worst case memLevel == 1). The
* expansion results from five bytes of header for each stored block.
*
* The larger expansion of 13% results from a window size less than or equal to
* the symbols buffer size (windowBits <= memLevel + 7). In that case some of
* the data being compressed may have slid out of the sliding window, impeding
* a stored block from being emitted. Then the only choice is a fixed or
* dynamic block, where a fixed block limits the maximum expansion to 9 bits
* per 8-bit byte, plus 10 bits for every block. The smallest block size for
* which this can occur is 255 (memLevel == 2).
*
* Shifts are used to approximate divisions, for speed.
*/
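/*
* Worked example of the shift approximations for sourceLen = 1 MiB:
* fixedlen adds 1/8 + 1/256 + 1/512 of sourceLen (~13.1%), storelen adds
* 1/32 + 1/128 + 1/2048 (~4.0%), and the default-settings bound below
* adds 1/4096 + 1/16384 + 1/2^25 (~0.03%), matching the percentages
* described above.
*/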
uLong ZEXPORT deflateBound(strm, sourceLen)
z_streamp strm;
uLong sourceLen;
{
deflate_state *s;
uLong complen, wraplen;
uLong fixedlen, storelen, wraplen;
/* conservative upper bound for compressed data */
complen = sourceLen +
((sourceLen + 7) >> 3) + ((sourceLen + 63) >> 6) + 5;
/* upper bound for fixed blocks with 9-bit literals and length 255
(memLevel == 2, which is the lowest that may not use stored blocks) --
~13% overhead plus a small constant */
fixedlen = sourceLen + (sourceLen >> 3) + (sourceLen >> 8) +
(sourceLen >> 9) + 4;
/* if can't get parameters, return conservative bound plus zlib wrapper */
/* upper bound for stored blocks with length 127 (memLevel == 1) --
~4% overhead plus a small constant */
storelen = sourceLen + (sourceLen >> 5) + (sourceLen >> 7) +
(sourceLen >> 11) + 7;
/* if can't get parameters, return larger bound plus a zlib wrapper */
if (deflateStateCheck(strm))
return complen + 6;
return (fixedlen > storelen ? fixedlen : storelen) + 6;
/* compute wrapper length */
s = strm->state;
@@ -750,11 +775,12 @@ uLong ZEXPORT deflateBound(strm, sourceLen)
wraplen = 6;
}
/* if not default parameters, return conservative bound */
/* if not default parameters, return one of the conservative bounds */
if (s->w_bits != 15 || s->hash_bits != 8 + 7)
return complen + wraplen;
return (s->w_bits <= s->hash_bits ? fixedlen : storelen) + wraplen;
/* default settings: return tight bound for that case */
/* default settings: return tight bound for that case -- ~0.03% overhead
plus a small constant */
return sourceLen + (sourceLen >> 12) + (sourceLen >> 14) +
(sourceLen >> 25) + 13 - 6 + wraplen;
}
@@ -764,7 +790,7 @@ uLong ZEXPORT deflateBound(strm, sourceLen)
* IN assertion: the stream state is correct and there is enough room in
* pending_buf.
*/
local void putShortMSB (s, b)
local void putShortMSB(s, b)
deflate_state *s;
uInt b;
{
@@ -811,7 +837,7 @@ local void flush_pending(strm)
} while (0)
/* ========================================================================= */
int ZEXPORT deflate (strm, flush)
int ZEXPORT deflate(strm, flush)
z_streamp strm;
int flush;
{
@@ -866,7 +892,7 @@ int ZEXPORT deflate (strm, flush)
s->status = BUSY_STATE;
if (s->status == INIT_STATE) {
/* zlib header */
uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8;
uInt header = (Z_DEFLATED + ((s->w_bits - 8) << 4)) << 8;
uInt level_flags;
if (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2)
@@ -1127,7 +1153,7 @@ int ZEXPORT deflate (strm, flush)
}
/* ========================================================================= */
int ZEXPORT deflateEnd (strm)
int ZEXPORT deflateEnd(strm)
z_streamp strm;
{
int status;
@@ -1153,7 +1179,7 @@ int ZEXPORT deflateEnd (strm)
* To simplify the source, this is not supported for 16-bit MSDOS (which
* doesn't have enough memory anyway to duplicate compression states).
*/
int ZEXPORT deflateCopy (dest, source)
int ZEXPORT deflateCopy(dest, source)
z_streamp dest;
z_streamp source;
{
@@ -1243,7 +1269,7 @@ local unsigned read_buf(strm, buf, size)
/* ===========================================================================
* Initialize the "longest match" routines for a new zlib stream
*/
local void lm_init (s)
local void lm_init(s)
deflate_state *s;
{
s->window_size = (ulg)2L*s->w_size;
@@ -1264,11 +1290,6 @@ local void lm_init (s)
s->match_length = s->prev_length = MIN_MATCH-1;
s->match_available = 0;
s->ins_h = 0;
#ifndef FASTEST
#ifdef ASMV
match_init(); /* initialize the asm code */
#endif
#endif
}
#ifndef FASTEST
@@ -1281,10 +1302,6 @@ local void lm_init (s)
* string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
* OUT assertion: the match length is not greater than s->lookahead.
*/
#ifndef ASMV
/* For 80x86 and 680x0, an optimized version will be provided in match.asm or
* match.S. The code will be functionally equivalent.
*/
local uInt longest_match(s, cur_match)
deflate_state *s;
IPos cur_match; /* current match */
@@ -1309,10 +1326,10 @@ local uInt longest_match(s, cur_match)
*/
register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
register ush scan_start = *(ushf*)scan;
register ush scan_end = *(ushf*)(scan+best_len-1);
register ush scan_end = *(ushf*)(scan + best_len - 1);
#else
register Bytef *strend = s->window + s->strstart + MAX_MATCH;
register Byte scan_end1 = scan[best_len-1];
register Byte scan_end1 = scan[best_len - 1];
register Byte scan_end = scan[best_len];
#endif
@@ -1330,7 +1347,8 @@ local uInt longest_match(s, cur_match)
*/
if ((uInt)nice_match > s->lookahead) nice_match = (int)s->lookahead;
Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
"need lookahead");
do {
Assert(cur_match < s->strstart, "no future");
@@ -1348,15 +1366,15 @@ local uInt longest_match(s, cur_match)
/* This code assumes sizeof(unsigned short) == 2. Do not use
* UNALIGNED_OK if your compiler uses a different size.
*/
if (*(ushf*)(match+best_len-1) != scan_end ||
if (*(ushf*)(match + best_len - 1) != scan_end ||
*(ushf*)match != scan_start) continue;
/* It is not necessary to compare scan[2] and match[2] since they are
* always equal when the other bytes match, given that the hash keys
* are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
* strstart+3, +5, ... up to strstart+257. We check for insufficient
* strstart + 3, + 5, up to strstart + 257. We check for insufficient
* lookahead only every 4th comparison; the 128th check will be made
* at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
* at strstart + 257. If MAX_MATCH-2 is not a multiple of 8, it is
* necessary to put more guard bytes at the end of the window, or
* to check more often for insufficient lookahead.
*/
@@ -1372,28 +1390,29 @@ local uInt longest_match(s, cur_match)
}
scan++, match++;
do {
} while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
} while (*(ushf*)(scan += 2) == *(ushf*)(match += 2) &&
*(ushf*)(scan += 2) == *(ushf*)(match += 2) &&
*(ushf*)(scan += 2) == *(ushf*)(match += 2) &&
*(ushf*)(scan += 2) == *(ushf*)(match += 2) &&
scan < strend);
/* The funny "do {}" generates better code on most compilers */
/* Here, scan <= window+strstart+257 */
Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
/* Here, scan <= window + strstart + 257 */
Assert(scan <= s->window+(unsigned)(s->window_size - 1),
"wild scan");
if (*scan == *match) scan++;
len = (MAX_MATCH - 1) - (int)(strend-scan);
len = (MAX_MATCH - 1) - (int)(strend - scan);
scan = strend - (MAX_MATCH-1);
#else /* UNALIGNED_OK */
if (match[best_len] != scan_end ||
match[best_len-1] != scan_end1 ||
match[best_len - 1] != scan_end1 ||
*match != *scan ||
*++match != scan[1]) continue;
/* The check at best_len-1 can be removed because it will be made
/* The check at best_len - 1 can be removed because it will be made
* again later. (This heuristic is not always a win.)
* It is not necessary to compare scan[2] and match[2] since they
* are always equal when the other bytes match, given that
@@ -1412,7 +1431,7 @@ local uInt longest_match(s, cur_match)
}
/* We check for insufficient lookahead only every 8th comparison;
* the 256th check will be made at strstart+258.
* the 256th check will be made at strstart + 258.
*/
do {
} while (*++scan == *++match && *++scan == *++match &&
@@ -1421,7 +1440,8 @@ local uInt longest_match(s, cur_match)
*++scan == *++match && *++scan == *++match &&
scan < strend);
Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
Assert(scan <= s->window + (unsigned)(s->window_size - 1),
"wild scan");
len = MAX_MATCH - (int)(strend - scan);
scan = strend - MAX_MATCH;
@@ -1433,9 +1453,9 @@ local uInt longest_match(s, cur_match)
best_len = len;
if (len >= nice_match) break;
#ifdef UNALIGNED_OK
scan_end = *(ushf*)(scan+best_len-1);
scan_end = *(ushf*)(scan + best_len - 1);
#else
scan_end1 = scan[best_len-1];
scan_end1 = scan[best_len - 1];
scan_end = scan[best_len];
#endif
}
@@ -1445,7 +1465,6 @@ local uInt longest_match(s, cur_match)
if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
return s->lookahead;
}
#endif /* ASMV */
#else /* FASTEST */
@@ -1466,7 +1485,8 @@ local uInt longest_match(s, cur_match)
*/
Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
"need lookahead");
Assert(cur_match < s->strstart, "no future");
@@ -1476,7 +1496,7 @@ local uInt longest_match(s, cur_match)
*/
if (match[0] != scan[0] || match[1] != scan[1]) return MIN_MATCH-1;
/* The check at best_len-1 can be removed because it will be made
/* The check at best_len - 1 can be removed because it will be made
* again later. (This heuristic is not always a win.)
* It is not necessary to compare scan[2] and match[2] since they
* are always equal when the other bytes match, given that
@@ -1486,7 +1506,7 @@ local uInt longest_match(s, cur_match)
Assert(*scan == *match, "match[2]?");
/* We check for insufficient lookahead only every 8th comparison;
* the 256th check will be made at strstart+258.
* the 256th check will be made at strstart + 258.
*/
do {
} while (*++scan == *++match && *++scan == *++match &&
@@ -1495,7 +1515,7 @@ local uInt longest_match(s, cur_match)
*++scan == *++match && *++scan == *++match &&
scan < strend);
Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
Assert(scan <= s->window + (unsigned)(s->window_size - 1), "wild scan");
len = MAX_MATCH - (int)(strend - scan);
@@ -1531,7 +1551,7 @@ local void check_match(s, start, match, length)
z_error(__FILE__, __LINE__, "invalid match");
}
if (z_verbose > 1) {
kprintf("\\[%d,%d]", start-match, length);
kprintf("\\[%d,%d]", start - match, length);
do { kprintf("%c", s->window[start++]); } while (--length != 0);
}
}
@@ -1577,9 +1597,9 @@ local void fill_window(s)
/* If the window is almost full and there is insufficient lookahead,
* move the upper half to the lower one to make room in the upper half.
*/
if (s->strstart >= wsize+MAX_DIST(s)) {
if (s->strstart >= wsize + MAX_DIST(s)) {
zmemcpy(s->window, s->window+wsize, (unsigned)wsize - more);
zmemcpy(s->window, s->window + wsize, (unsigned)wsize - more);
s->match_start -= wsize;
s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
s->block_start -= (long) wsize;
@@ -1934,7 +1954,7 @@ local block_state deflate_fast(s, flush)
if (s->lookahead == 0) break; /* flush the current block */
}
/* Insert the string window[strstart .. strstart+2] in the
/* Insert the string window[strstart .. strstart + 2] in the
* dictionary, and set hash_head to the head of the hash chain:
*/
hash_head = NIL;
@@ -1984,7 +2004,7 @@ local block_state deflate_fast(s, flush)
if (!s->chromium_zlib_hash) {
s->ins_h = s->window[s->strstart];
UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
UPDATE_HASH(s, s->ins_h, s->window[s->strstart + 1]);
#if MIN_MATCH != 3
Call UPDATE_HASH() MIN_MATCH-3 more times
#endif
@@ -1996,7 +2016,7 @@ local block_state deflate_fast(s, flush)
} else {
/* No match, output a literal byte */
Tracevv(("%c", s->window[s->strstart]));
_tr_tally_lit (s, s->window[s->strstart], bflush);
_tr_tally_lit(s, s->window[s->strstart], bflush);
s->lookahead--;
s->strstart++;
}
@@ -2040,7 +2060,7 @@ local block_state deflate_slow(s, flush)
if (s->lookahead == 0) break; /* flush the current block */
}
/* Insert the string window[strstart .. strstart+2] in the
/* Insert the string window[strstart .. strstart + 2] in the
* dictionary, and set hash_head to the head of the hash chain:
*/
hash_head = NIL;
@@ -2085,20 +2105,20 @@ local block_state deflate_slow(s, flush)
if (s->prev_match == -1) {
/* The window has slid one byte past the previous match,
* so the first byte cannot be compared. */
check_match(s, s->strstart, s->prev_match+1, s->prev_length-1);
check_match(s, s->strstart, s->prev_match + 1, s->prev_length - 1);
} else {
check_match(s, s->strstart-1, s->prev_match, s->prev_length);
check_match(s, s->strstart - 1, s->prev_match, s->prev_length);
}
_tr_tally_dist(s, s->strstart -1 - s->prev_match,
_tr_tally_dist(s, s->strstart - 1 - s->prev_match,
s->prev_length - MIN_MATCH, bflush);
/* Insert in hash table all strings up to the end of the match.
* strstart-1 and strstart are already inserted. If there is not
* strstart - 1 and strstart are already inserted. If there is not
* enough lookahead, the last two strings are not inserted in
* the hash table.
*/
s->lookahead -= s->prev_length-1;
s->lookahead -= s->prev_length - 1;
s->prev_length -= 2;
do {
if (++s->strstart <= max_insert) {
@@ -2116,8 +2136,8 @@ local block_state deflate_slow(s, flush)
* single literal. If there was a match but the current match
* is longer, truncate the previous match to a single literal.
*/
Tracevv(("%c", s->window[s->strstart-1]));
_tr_tally_lit(s, s->window[s->strstart-1], bflush);
Tracevv(("%c", s->window[s->strstart - 1]));
_tr_tally_lit(s, s->window[s->strstart - 1], bflush);
if (bflush) {
FLUSH_BLOCK_ONLY(s, 0);
}
@@ -2135,8 +2155,8 @@ local block_state deflate_slow(s, flush)
}
Assert (flush != Z_NO_FLUSH, "no flush?");
if (s->match_available) {
Tracevv(("%c", s->window[s->strstart-1]));
_tr_tally_lit(s, s->window[s->strstart-1], bflush);
Tracevv(("%c", s->window[s->strstart - 1]));
_tr_tally_lit(s, s->window[s->strstart - 1], bflush);
s->match_available = 0;
}
s->insert = s->strstart < MIN_MATCH-1 ? s->strstart : MIN_MATCH-1;
@@ -2193,7 +2213,8 @@ local block_state deflate_rle(s, flush)
if (s->match_length > s->lookahead)
s->match_length = s->lookahead;
}
Assert(scan <= s->window+(uInt)(s->window_size-1), "wild scan");
Assert(scan <= s->window + (uInt)(s->window_size - 1),
"wild scan");
}
/* Emit match if have run of MIN_MATCH or longer, else emit literal */
@@ -2208,7 +2229,7 @@ local block_state deflate_rle(s, flush)
} else {
/* No match, output a literal byte */
Tracevv(("%c", s->window[s->strstart]));
_tr_tally_lit (s, s->window[s->strstart], bflush);
_tr_tally_lit(s, s->window[s->strstart], bflush);
s->lookahead--;
s->strstart++;
}
@@ -2248,7 +2269,7 @@ local block_state deflate_huff(s, flush)
/* Output a literal byte */
s->match_length = 0;
Tracevv(("%c", s->window[s->strstart]));
_tr_tally_lit (s, s->window[s->strstart], bflush);
_tr_tally_lit(s, s->window[s->strstart], bflush);
s->lookahead--;
s->strstart++;
if (bflush) FLUSH_BLOCK(s, 0);

View file

@@ -7,11 +7,6 @@
*/
#include "third_party/zlib/gz/gzguts.inc"
#include "third_party/zlib/macros.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* gzclose() is in a separate file so that it is linked in only if it is used.

View file

@@ -14,15 +14,10 @@
#include "third_party/zlib/gz/gzguts.inc"
#include "third_party/zlib/zlib.h"
#include "third_party/zlib/zutil.internal.h"
// clang-format off
#define LSEEK lseek
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* Local functions */
static void gz_reset(gz_statep);
static gzFile gz_open(const void *, int, const char *);

View file

@@ -12,11 +12,6 @@
#include "libc/str/str.h"
#include "third_party/zlib/gz/gzguts.inc"
#include "third_party/zlib/zlib.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* Local functions */

View file

@@ -9,11 +9,6 @@
#include "libc/fmt/fmt.h"
#include "libc/mem/mem.h"
#include "third_party/zlib/gz/gzguts.inc"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* Local functions */

View file

@@ -11,11 +11,6 @@
#include "third_party/zlib/internal.h"
#include "third_party/zlib/macros.internal.h"
#include "third_party/zlib/zutil.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/*

View file

@@ -5,11 +5,6 @@
* Copyright (C) 1995-2017 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
#include "third_party/zlib/zutil.internal.h"

387
third_party/zlib/inffast_chunk.c vendored Normal file
View file

@ -0,0 +1,387 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
*/
/* inffast_chunk.c -- fast decoding
* Copyright (C) 1995-2017 Mark Adler
* Copyright 2023 The Chromium Authors
* For conditions of distribution and use, see copyright notice in zlib.h
*/
asm(".ident\t\"\\n\\n\
Chromium (BSD-3 License)\\n\
Copyright 2017 The Chromium Authors\"");
// clang-format off
#include "third_party/zlib/zutil.internal.h"
#include "third_party/zlib/inftrees.internal.h"
#include "third_party/zlib/inflate.internal.h"
#include "third_party/zlib/inffast_chunk.internal.h"
#include "third_party/zlib/chunkcopy.inc"
#ifdef ASMINF
# pragma message("Assembler code may have bugs -- use at your own risk")
#else
/*
Decode literal, length, and distance codes and write out the resulting
literal and match bytes until either not enough input or output is
available, an end-of-block is encountered, or a data error is encountered.
When large enough input and output buffers are supplied to inflate(), for
example, a 16K input buffer and a 64K output buffer, more than 95% of the
inflate() execution time is spent in this routine.
Entry assumptions:
state->mode == LEN
strm->avail_in >= INFLATE_FAST_MIN_INPUT (6 or 8 bytes + 7 bytes)
strm->avail_out >= INFLATE_FAST_MIN_OUTPUT (258 bytes + 2 bytes)
start >= strm->avail_out
state->bits < 8
(state->hold >> state->bits) == 0
strm->next_out[0..strm->avail_out] does not overlap with
strm->next_in[0..strm->avail_in]
strm->state->window is allocated with an additional
CHUNKCOPY_CHUNK_SIZE-1 bytes of padding beyond strm->state->wsize
On return, state->mode is one of:
LEN -- ran out of enough output space or enough available input
TYPE -- reached end of block code, inflate() to interpret next block
BAD -- error in block data
Notes:
INFLATE_FAST_MIN_INPUT: 6 or 8 bytes + 7 bytes
- The maximum input bits used by a length/distance pair is 15 bits for the
length code, 5 bits for the length extra, 15 bits for the distance code,
and 13 bits for the distance extra. This totals 48 bits, or six bytes.
Therefore if strm->avail_in >= 6, then there is enough input to avoid
checking for available input while decoding.
- The wide input data reading option reads 64 input bits at a time. Thus,
if strm->avail_in >= 8, then there is enough input to avoid checking for
available input while decoding. Reading consumes the input with:
hold |= read64le(in) << bits;
in += 6;
bits += 48;
reporting 6 bytes of new input because |bits| is 0..15 (2 bytes rounded
up, worst case) and 6 bytes is enough to decode as noted above. At exit,
hold &= (1U << bits) - 1 drops excess input to keep the invariant:
(state->hold >> state->bits) == 0
INFLATE_FAST_MIN_OUTPUT: 258 bytes + 2 bytes for literals = 260 bytes
- The maximum bytes that a single length/distance pair can output is 258
bytes, which is the maximum length that can be coded. inflate_fast()
requires strm->avail_out >= 260 for each loop to avoid checking for
available output space while decoding.
*/
void ZLIB_INTERNAL inflate_fast_chunk_(strm, start)
z_streamp strm;
unsigned start; /* inflate()'s starting value for strm->avail_out */
{
struct inflate_state FAR *state;
z_const unsigned char FAR *in; /* local strm->next_in */
z_const unsigned char FAR *last; /* have enough input while in < last */
unsigned char FAR *out; /* local strm->next_out */
unsigned char FAR *beg; /* inflate()'s initial strm->next_out */
unsigned char FAR *end; /* while out < end, enough space available */
unsigned char FAR *limit; /* safety limit for chunky copies */
#ifdef INFLATE_STRICT
unsigned dmax; /* maximum distance from zlib header */
#endif
unsigned wsize; /* window size or zero if not using window */
unsigned whave; /* valid bytes in the window */
unsigned wnext; /* window write index */
unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */
inflate_holder_t hold; /* local strm->hold */
unsigned bits; /* local strm->bits */
code const FAR *lcode; /* local strm->lencode */
code const FAR *dcode; /* local strm->distcode */
unsigned lmask; /* mask for first level of length codes */
unsigned dmask; /* mask for first level of distance codes */
code const *here; /* retrieved table entry */
unsigned op; /* code bits, operation, extra bits, or */
/* window position, window bytes to copy */
unsigned len; /* match length, unused bytes */
unsigned dist; /* match distance */
unsigned char FAR *from; /* where to copy match from */
/* copy state to local variables */
state = (struct inflate_state FAR *)strm->state;
in = strm->next_in;
last = in + (strm->avail_in - (INFLATE_FAST_MIN_INPUT - 1));
out = strm->next_out;
beg = out - (start - strm->avail_out);
end = out + (strm->avail_out - (INFLATE_FAST_MIN_OUTPUT - 1));
limit = out + strm->avail_out;
#ifdef INFLATE_STRICT
dmax = state->dmax;
#endif
wsize = state->wsize;
whave = state->whave;
wnext = (state->wnext == 0 && whave >= wsize) ? wsize : state->wnext;
window = state->window;
hold = state->hold;
bits = state->bits;
lcode = state->lencode;
dcode = state->distcode;
lmask = (1U << state->lenbits) - 1;
dmask = (1U << state->distbits) - 1;
#ifdef INFLATE_CHUNK_READ_64LE
#define REFILL() do { \
Assert(bits < 64, "### Too many bits in inflate_fast."); \
hold |= read64le(in) << bits; \
in += 7; \
in -= bits >> 3; \
bits |= 56; \
} while (0)
#endif
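/*
* REFILL() accounting, worked through (for illustration): since 56 is
* 0b111000, bits |= 56 is the same as bits = 56 + (bits % 8). With
* bits == 5 the pointer advances 7 - (5 >> 3) = 7 bytes (56 fresh bits,
* bits -> 61); with bits == 13 it advances only 6 bytes (48 fresh bits,
* bits -> 61). Either way at least 56 valid bits are buffered afterwards,
* which is what lets the decode loop below skip per-symbol input checks.
*/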
/* decode literals and length/distances until end-of-block or not enough
input data or output space */
do {
#ifdef INFLATE_CHUNK_READ_64LE
REFILL();
#else
if (bits < 15) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
#endif
here = lcode + (hold & lmask);
#ifdef INFLATE_CHUNK_READ_64LE
if (here->op == 0) { /* literal */
Tracevv((here->val >= 0x20 && here->val < 0x7f ?
"inflate: literal '%c'\n" :
"inflate: literal 0x%02x\n", here->val));
*out++ = (unsigned char)(here->val);
hold >>= here->bits;
bits -= here->bits;
here = lcode + (hold & lmask);
if (here->op == 0) { /* literal */
Tracevv((here->val >= 0x20 && here->val < 0x7f ?
"inflate: 2nd literal '%c'\n" :
"inflate: 2nd literal 0x%02x\n", here->val));
*out++ = (unsigned char)(here->val);
hold >>= here->bits;
bits -= here->bits;
here = lcode + (hold & lmask);
}
}
#endif
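/* The two back-to-back literal decodes above are why the 64-bit variant
raises INFLATE_FAST_MIN_OUTPUT to 258 + 2: each root-table literal costs
at most 10 bits, which the 56-bit refill comfortably covers. */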
dolen:
op = (unsigned)(here->bits);
hold >>= op;
bits -= op;
op = (unsigned)(here->op);
if (op == 0) { /* literal */
Tracevv((here->val >= 0x20 && here->val < 0x7f ?
"inflate: literal '%c'\n" :
"inflate: literal 0x%02x\n", here->val));
*out++ = (unsigned char)(here->val);
}
else if (op & 16) { /* length base */
len = (unsigned)(here->val);
op &= 15; /* number of extra bits */
if (op) {
#ifndef INFLATE_CHUNK_READ_64LE
if (bits < op) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
#endif
len += (unsigned)hold & ((1U << op) - 1);
hold >>= op;
bits -= op;
}
Tracevv(("inflate: length %u\n", len));
#ifndef INFLATE_CHUNK_READ_64LE
if (bits < 15) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
#endif
here = dcode + (hold & dmask);
dodist:
op = (unsigned)(here->bits);
hold >>= op;
bits -= op;
op = (unsigned)(here->op);
if (op & 16) { /* distance base */
dist = (unsigned)(here->val);
op &= 15; /* number of extra bits */
/* we have two fast-path loads: 10+10 + 15+5 + 15 = 55,
but we may need to refill here in the worst case */
if (bits < op) {
#ifdef INFLATE_CHUNK_READ_64LE
REFILL();
#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
if (bits < op) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
#endif
}
dist += (unsigned)hold & ((1U << op) - 1);
#ifdef INFLATE_STRICT
if (dist > dmax) {
strm->msg = (char *)"invalid distance too far back";
state->mode = BAD;
break;
}
#endif
hold >>= op;
bits -= op;
Tracevv(("inflate: distance %u\n", dist));
op = (unsigned)(out - beg); /* max distance in output */
if (dist > op) { /* see if copy from window */
op = dist - op; /* distance back in window */
if (op > whave) {
if (state->sane) {
strm->msg =
(char *)"invalid distance too far back";
state->mode = BAD;
break;
}
#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR
if (len <= op - whave) {
do {
*out++ = 0;
} while (--len);
continue;
}
len -= op - whave;
do {
*out++ = 0;
} while (--op > whave);
if (op == 0) {
from = out - dist;
do {
*out++ = *from++;
} while (--len);
continue;
}
#endif
}
from = window;
if (wnext >= op) { /* contiguous in window */
from += wnext - op;
}
else { /* wrap around window */
op -= wnext;
from += wsize - op;
if (op < len) { /* some from end of window */
len -= op;
out = chunkcopy_safe(out, from, op, limit);
from = window; /* more from start of window */
op = wnext;
/* This (rare) case can create a situation where
the first chunkcopy below must be checked.
*/
}
}
if (op < len) { /* still need some from output */
out = chunkcopy_safe(out, from, op, limit);
len -= op;
/* When dist is small the amount of data that can be
copied from the window is also small, and progress
towards the dangerous end of the output buffer is
also small. This means that for trivial memsets and
for chunkunroll_relaxed() a safety check is
unnecessary. However, these conditions may not be
entered at all, and in that case it's possible that
the main copy is near the end.
*/
out = chunkunroll_relaxed(out, &dist, &len);
out = chunkcopy_safe_ugly(out, dist, len, limit);
} else {
/* from points to window, so there is no risk of
overlapping pointers requiring memset-like behaviour
*/
out = chunkcopy_safe(out, from, len, limit);
}
}
else {
/* Whole reference is in range of current output. No
range checks are necessary because we start with room
for at least 258 bytes of output, so unroll and roundoff
operations can write beyond `out+len` so long as they
stay within 258 bytes of `out`.
*/
out = chunkcopy_lapped_relaxed(out, dist, len);
}
}
else if ((op & 64) == 0) { /* 2nd level distance code */
here = dcode + here->val + (hold & ((1U << op) - 1));
goto dodist;
}
else {
strm->msg = (char *)"invalid distance code";
state->mode = BAD;
break;
}
}
else if ((op & 64) == 0) { /* 2nd level length code */
here = lcode + here->val + (hold & ((1U << op) - 1));
goto dolen;
}
else if (op & 32) { /* end-of-block */
Tracevv(("inflate: end of block\n"));
state->mode = TYPE;
break;
}
else {
strm->msg = (char *)"invalid literal/length code";
state->mode = BAD;
break;
}
} while (in < last && out < end);
/* return unused bytes (on entry, bits < 8, so in won't go too far back) */
len = bits >> 3;
in -= len;
bits -= len << 3;
hold &= (1U << bits) - 1;
/* update state and return */
strm->next_in = in;
strm->next_out = out;
strm->avail_in = (unsigned)(in < last ?
(INFLATE_FAST_MIN_INPUT - 1) + (last - in) :
(INFLATE_FAST_MIN_INPUT - 1) - (in - last));
strm->avail_out = (unsigned)(out < end ?
(INFLATE_FAST_MIN_OUTPUT - 1) + (end - out) :
(INFLATE_FAST_MIN_OUTPUT - 1) - (out - end));
state->hold = hold;
state->bits = bits;
Assert((state->hold >> state->bits) == 0, "invalid input data state");
}
/*
inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe):
- Using bit fields for code structure
- Different op definition to avoid & for extra bits (do & for table bits)
- Three separate decoding do-loops for direct, window, and wnext == 0
- Special case for distance > 1 copies to do overlapped load and store copy
- Explicit branch predictions (based on measured branch probabilities)
 - Deferring match copy and interspersing it with decoding subsequent codes
- Swapping literal/length else
- Swapping window/direct else
- Larger unrolled copy loops (three is about right)
- Moving len -= 3 statement into middle of loop
*/
#endif /* !ASMINF */
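A note on the pattern above, since every branch funnels into one of the chunkcopy helpers: rather than copying exactly `len` bytes, they copy in fixed vector-width chunks and deliberately overshoot the end. That is only legal because inflate over-allocates the window by CHUNKCOPY_CHUNK_SIZE and the fast loop guarantees at least 258 spare output bytes. A minimal sketch of the idea (illustrative, not the vendored code; `CHUNK` stands in for CHUNKCOPY_CHUNK_SIZE, and overlapped copies with dist < CHUNK would first be widened, as chunkunroll_relaxed() does):

    #include <string.h>

    #define CHUNK 16 /* stand-in for CHUNKCOPY_CHUNK_SIZE */

    static unsigned char *chunkcopy_relaxed_sketch(unsigned char *out,
                                                   const unsigned char *from,
                                                   unsigned len) {
        /* requires: len >= 1, |out - from| >= CHUNK, and CHUNK - 1 bytes
           of writable slack past out + len (the fast loop's invariants) */
        unsigned char *end = out + len;
        do {
            memcpy(out, from, CHUNK); /* one unaligned vector load/store */
            out += CHUNK;
            from += CHUNK;
        } while (out < end);
        return end; /* logical end; stores may have landed past it */
    }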

View file

@@ -0,0 +1,42 @@
/* inffast_chunk.h -- header to use inffast_chunk.c
* Copyright (C) 1995-2003, 2010 Mark Adler
* Copyright (C) 2017 ARM, Inc.
* Copyright 2023 The Chromium Authors
* For conditions of distribution and use, see copyright notice in zlib.h
*/
/* WARNING: this file should *not* be used by applications. It is
part of the implementation of the compression library and is
subject to change. Applications should only use zlib.h.
*/
#include "third_party/zlib/inffast.internal.h"
/* INFLATE_FAST_MIN_INPUT:
The minimum number of input bytes needed so that we can safely call
inflate_fast() with only one up-front bounds check. One
length/distance code pair (15 bits for the length code, 5 bits for length
extra, 15 bits for the distance code, 13 bits for distance extra) requires
   reading up to 48 input bits. Additionally, in the same iteration, we may
decode two literals from the root-table (requiring MIN_OUTPUT = 258 + 2).
Each root-table entry is up to 10 bits, for a total of 68 input bits each
   iteration.
The refill variant reads 8 bytes from the buffer at a time, and advances
   the input pointer by up to 7 bytes, ensuring there are at least 56 bits
available in the bit-buffer. The technique was documented by Fabian Giesen
on his blog as variant 4 in the article 'Reading bits in far too many ways':
https://fgiesen.wordpress.com/2018/02/20/
   In the worst case, we may refill twice in the same iteration, requiring
MIN_INPUT = 8 + 7.
*/
#ifdef INFLATE_CHUNK_READ_64LE
#undef INFLATE_FAST_MIN_INPUT
#define INFLATE_FAST_MIN_INPUT 15
#undef INFLATE_FAST_MIN_OUTPUT
#define INFLATE_FAST_MIN_OUTPUT 260
#endif
void inflate_fast_chunk_(z_streamp strm, unsigned start);
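The refill trick described above is easier to follow in code. This is a sketch of Giesen's variant 4 under the INFLATE_CHUNK_READ_64LE assumptions (little-endian target, unaligned loads allowed); the shape matches the description, but the exact vendored macro may differ:

    #include <stdint.h>
    #include <string.h>

    static inline uint64_t read64le(const unsigned char *p) {
        uint64_t v;
        memcpy(&v, p, sizeof(v)); /* unaligned-safe 8-byte load */
        return v;
    }

    /* Top up the bit buffer so at least 56 bits are valid:
       - OR eight fresh bytes in above the `bits` already held;
       - advance `in` by 7 minus the whole bytes already buffered,
         so at most 7 bytes per refill (hence MIN_INPUT = 8 + 7);
       - bits |= 56 records the new validity floor. */
    #define REFILL()                        \
        do {                                \
            hold |= read64le(in) << bits;   \
            in += 7;                        \
            in -= (bits >> 3) & 7;          \
            bits |= 56;                     \
        } while (0)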

View file

@@ -5,15 +5,11 @@
* Copyright (C) 1995-2022 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "third_party/zlib/inffast.internal.h"
#include "third_party/zlib/inflate.internal.h"
#include "third_party/zlib/inftrees.internal.h"
#include "third_party/zlib/internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
zlib 1.2.13 (zlib License)\\n\
Copyright 1995-2022 Jean-loup Gailly and Mark Adler\\n\
Invented 1990 Phillip Walter Katz\"");
// clang-format off
/*
@@ -93,6 +89,13 @@ asm(".include \"libc/disclaimer.inc\"");
* The history for versions after 1.2.0 are in ChangeLog in zlib distribution.
*/
#include "third_party/zlib/zutil.internal.h"
#include "third_party/zlib/inftrees.internal.h"
#include "third_party/zlib/inflate.internal.h"
#include "third_party/zlib/inffast_chunk.internal.h"
#include "third_party/zlib/internal.h"
#include "third_party/zlib/chunkcopy.inc"
#ifdef MAKEFIXED
# ifndef BUILDFIXED
# define BUILDFIXED
@@ -176,6 +179,8 @@ int windowBits;
/* extract wrap request from windowBits parameter */
if (windowBits < 0) {
if (windowBits < -15)
return Z_STREAM_ERROR;
wrap = 0;
windowBits = -windowBits;
}
@@ -322,7 +327,14 @@ struct inflate_state FAR *state;
}
#ifdef MAKEFIXED
#include <stdio.h>
#include "libc/calls/calls.h"
#include "libc/calls/dprintf.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/mem/fmt.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
/*
Write out the inffixed.h that is #include'd above. Defining MAKEFIXED also
@@ -408,10 +420,20 @@ unsigned copy;
/* if it hasn't been done already, allocate space for the window */
if (state->window == Z_NULL) {
unsigned wsize = 1U << state->wbits;
state->window = (unsigned char FAR *)
ZALLOC(strm, 1U << state->wbits,
ZALLOC(strm, wsize + CHUNKCOPY_CHUNK_SIZE,
sizeof(unsigned char));
if (state->window == Z_NULL) return 1;
#ifdef INFLATE_CLEAR_UNUSED_UNDEFINED
/* Copies from the overflow portion of this buffer are undefined and
may cause analysis tools to raise a warning if we don't initialize
it. However, this undefined data overwrites other undefined data
and is subsequently either overwritten or left deliberately
undefined at the end of decode; so there's really no point.
*/
zmemzero(state->window + wsize, CHUNKCOPY_CHUNK_SIZE);
#endif
}
/* if window not in use yet, initialize */
@@ -1065,7 +1087,7 @@ int flush;
if (have >= INFLATE_FAST_MIN_INPUT &&
left >= INFLATE_FAST_MIN_OUTPUT) {
RESTORE();
inflate_fast(strm, out);
inflate_fast_chunk_(strm, out);
LOAD();
if (state->mode == TYPE)
state->back = -1;
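                /* (editor's note) with INFLATE_CHUNK_READ_64LE in effect,
                   these thresholds are INFLATE_FAST_MIN_INPUT = 15 bytes
                   and INFLATE_FAST_MIN_OUTPUT = 260 bytes, per the
                   inffast_chunk.internal.h header added earlier */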
@@ -1200,17 +1222,16 @@ int flush;
else
from = state->window + (state->wnext - copy);
if (copy > state->length) copy = state->length;
if (copy > left) copy = left;
put = chunkcopy_safe(put, from, copy, put + left);
}
else { /* copy from output */
from = put - state->offset;
copy = state->length;
if (copy > left) copy = left;
put = chunkcopy_lapped_safe(put, state->offset, copy, put + left);
}
if (copy > left) copy = left;
left -= copy;
state->length -= copy;
do {
*put++ = *from++;
} while (--copy);
if (state->length == 0) state->mode = LEN;
break;
case LIT:
@@ -1279,6 +1300,29 @@ int flush;
Note: a memory error from inflate() is non-recoverable.
*/
inf_leave:
#if defined(ZLIB_DEBUG)
/* XXX(cavalcantii): I put this in place back in 2017 to help debug faulty
* client code relying on undefined behavior when chunk_copy first landed.
*
 * It is safe to say after all these years that Chromium code is well
* behaved and works fine with the optimization, therefore we can enable
* this only for DEBUG builds.
*
* We write a defined value in the unused space to help mark
* where the stream has ended. We don't use zeros as that can
* mislead clients relying on undefined behavior (i.e. assuming
* that the data is over when the buffer has a zero/null value).
*
* The basic idea is that if client code is not relying on the zlib context
 * to report the amount of decompressed data, but instead reads the output
 * buffer until a zero/null is found, it will fail faster and harder
 * when the remainder of the buffer is marked with a symbol (e.g. 0x55).
*/
if (left >= CHUNKCOPY_CHUNK_SIZE)
memset(put, 0x55, CHUNKCOPY_CHUNK_SIZE);
else
memset(put, 0x55, left);
#endif
RESTORE();
if (state->wsize || (out != strm->avail_out && state->mode < BAD &&
(state->mode < CHECK || flush != Z_FINISH)))

View file

@@ -1,3 +1,4 @@
// clang-format off
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
*/
@@ -6,17 +7,16 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "third_party/zlib/inftrees.internal.h"
#include "third_party/zlib/zutil.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
zlib 1.2.13 (zlib License)\\n\
Copyright 1995-2022 Jean-loup Gailly and Mark Adler\\n\
Invented 1990 Phillip Walter Katz\"");
// clang-format off
#define MAXBITS 15
const char inflate_copyright[] =
" inflate 1.2.12.1 Copyright 1995-2022 Mark Adler ";
/*
If you use the zlib library in a product, an acknowledgment is welcome
in the documentation of your product. If for some reason you cannot
@@ -69,7 +69,7 @@ unsigned short FAR *work;
35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
static const unsigned short lext[31] = { /* Length codes 257..285 extra */
16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18,
19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 76, 202};
19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 194, 65};
static const unsigned short dbase[32] = { /* Distance codes 0..29 base */
1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,

View file

@@ -5,7 +5,6 @@
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
/* clang-format off */
/* inftrees.h -- header to use inftrees.c
* Copyright (C) 1995-2005, 2010 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h

View file

@@ -1,11 +1,13 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_ZLIB_INSERT_STRING_H_
#define COSMOPOLITAN_THIRD_PARTY_ZLIB_INSERT_STRING_H_
#include "libc/nexgen32e/x86feature.h"
#include "third_party/zlib/deflate.internal.h"
#include "third_party/zlib/zutil.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
/* clang-format off */
// clang-format off
/* insert_string.h
*
* Copyright 2019 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
#ifndef INSERT_STRING_H
#define INSERT_STRING_H
#ifndef INLINE
#if defined(_MSC_VER) && !defined(__clang__)
@@ -15,9 +17,11 @@ COSMOPOLITAN_C_START_
#endif
#endif
#include "third_party/zlib/cpu_features.internal.h"
// clang-format off
#if defined(CRC32_SIMD_SSE42_PCLMUL)
// #include <smmintrin.h> /* Required to make MSVC bot build pass. */
#include "third_party/intel/smmintrin.internal.h" /* Required to make MSVC bot build pass. */
#if defined(__clang__) || defined(__GNUC__)
#define TARGET_CPU_WITH_CRC __attribute__((target("sse4.2")))
@@ -51,19 +55,17 @@ COSMOPOLITAN_C_START_
#if defined(TARGET_CPU_WITH_CRC)
// TODO(jart): Why does this fail alignment check?
noubsan local INLINE Pos insert_string_simd(deflate_state* const s,
const Pos str) {
TARGET_CPU_WITH_CRC
local INLINE Pos insert_string_simd(deflate_state* const s, const Pos str) {
Pos ret;
unsigned *ip, val, h = 0;
unsigned val, h = 0;
ip = (unsigned*)&s->window[str];
val = *ip;
zmemcpy(&val, &s->window[str], sizeof(val));
if (s->level >= 6) val &= 0xFFFFFF;
/* Compute hash from the CRC32C of |val|. */
asm("crc32l\t%1,%0" : "+r"(h) : "rm"(val));
h = _cpu_crc32c_hash_u32(h, val);
ret = s->head[h & s->hash_mask];
s->head[h & s->hash_mask] = str;
@@ -131,13 +133,11 @@ local INLINE Pos insert_string(deflate_state* const s, const Pos str) {
* the Rabin-Karp hash instead.
*/ /* FALLTHROUGH Rabin-Karp */
#elif defined(TARGET_CPU_WITH_CRC) && defined(CRC32_SIMD_SSE42_PCLMUL)
if (X86_HAVE(SSE4_2)) return insert_string_simd(s, str);
if (x86_cpu_enable_simd) return insert_string_simd(s, str);
#elif defined(TARGET_CPU_WITH_CRC) && defined(CRC32_ARMV8_CRC32)
if (arm_cpu_enable_crc32) return insert_string_simd(s, str);
#endif
return insert_string_c(s, str); /* Rabin-Karp */
}
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_ZLIB_INSERT_STRING_H_ */
#endif /* INSERT_STRING_H */
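To make the fast path concrete: the hash is a single hardware CRC32C step over four window bytes. A standalone sketch assuming the SSE4.2 intrinsic _mm_crc32_u32 (presumably what the _cpu_crc32c_hash_u32 helper wraps on x86); the function name and parameter list here are illustrative, not the library's API:

    #include <nmmintrin.h> /* SSE4.2: _mm_crc32_u32; compile with -msse4.2 */
    #include <string.h>

    static unsigned insert_string_crc_sketch(const unsigned char *window,
                                             unsigned short *head,
                                             unsigned hash_mask,
                                             unsigned str, int level) {
        unsigned val, h = 0, ret;
        memcpy(&val, window + str, sizeof(val)); /* 4 bytes, no alignment UB */
        if (level >= 6) val &= 0xFFFFFF; /* hash only 3 bytes at high levels */
        h = _mm_crc32_u32(h, val);       /* CRC32C mixes all input bits */
        ret = head[h & hash_mask];       /* prior occupant = match candidate */
        head[h & hash_mask] = (unsigned short)str; /* str is the new head */
        return ret;
    }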

117
third_party/zlib/slide_hash_simd.inc vendored Normal file
View file

@@ -0,0 +1,117 @@
// clang-format off
/* slide_hash_simd.h
*
* Copyright 2022 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
#ifndef SLIDE_HASH_SIMD_H
#define SLIDE_HASH_SIMD_H
#include "third_party/zlib/deflate.internal.h"
#ifndef INLINE
#if defined(_MSC_VER) && !defined(__clang__)
#define INLINE __inline
#else
#define INLINE inline
#endif
#endif
#if defined(CPU_NO_SIMD)
#error SIMD has been disabled for your build target
#elif defined(DEFLATE_SLIDE_HASH_SSE2)
#include "third_party/intel/emmintrin.internal.h" /* SSE2 */
#define Z_SLIDE_INIT_SIMD(wsize) _mm_set1_epi16((ush)(wsize))
#define Z_SLIDE_HASH_SIMD(table, size, vector_wsize) \
for (const Posf* const end = table + size; table != end;) { \
        __m128i v0 = _mm_loadu_si128((__m128i *)(table + 0)); \
        v0 = _mm_subs_epu16(v0, vector_wsize); \
        _mm_storeu_si128((__m128i *)(table + 0), v0); \
table += 8; \
}
typedef __m128i z_vec128i_u16x8_t;
#elif defined(DEFLATE_SLIDE_HASH_NEON)
#include "third_party/aarch64/arm_neon.h" /* NEON */
#define Z_SLIDE_INIT_SIMD(wsize) vdupq_n_u16((ush)(wsize))
#define Z_SLIDE_HASH_SIMD(table, size, vector_wsize) \
for (const Posf* const end = table + size; table != end;) { \
        uint16x8_t v0 = vld1q_u16(table + 0); \
        uint16x8_t v8 = vld1q_u16(table + 8); \
        v0 = vqsubq_u16(v0, vector_wsize); \
        v8 = vqsubq_u16(v8, vector_wsize); \
        vst1q_u16(table + 0, v0); \
        vst1q_u16(table + 8, v8); \
table += 8 + 8; \
}
typedef uint16x8_t z_vec128i_u16x8_t;
#else
#error slide_hash_simd is not defined for your build target
#endif
/* ===========================================================================
* Slide the hash table when sliding the window down (could be avoided with 32
* bit values at the expense of memory usage). We slide even when level == 0 to
* keep the hash table consistent if we switch back to level > 0 later.
*/
local INLINE void slide_hash_simd(
Posf *head, Posf *prev, const uInt w_size, const uInt hash_size) {
/*
* The SIMD implementation of the hash table slider assumes:
*
* 1. hash chain offset is 2 bytes. Should be true as Pos is "ush" type.
*/
Assert(sizeof(Pos) == 2, "Pos type size error: should be 2 bytes");
Assert(sizeof(ush) == 2, "ush type size error: should be 2 bytes");
Assert(hash_size <= (1 << 16), "Hash table maximum size error");
Assert(hash_size >= (1 << 8), "Hash table minimum size error");
Assert(w_size == (ush)w_size, "Prev table size error");
/*
* 2. The hash & prev table sizes are a multiple of 32 bytes (256 bits),
* since the NEON table slider moves two 128-bit items per loop (loop is
* unrolled on NEON for performance, see http://crbug.com/863257).
*/
Assert(!((hash_size * sizeof(head[0])) & (32 - 1)),
"Hash table size error: should be a multiple of 32 bytes");
Assert(!((w_size * sizeof(prev[0])) & (32 - 1)),
"Prev table size error: should be a multiple of 32 bytes");
/*
* Duplicate (ush)w_size in each uint16_t component of a 128-bit vector.
*/
const z_vec128i_u16x8_t vec_wsize = Z_SLIDE_INIT_SIMD(w_size);
/*
* Slide {head,prev} hash chain values: subtracts (ush)w_size from every
 * value with a saturating SIMD subtract, to clamp the result to 0 (NIL),
* to implement slide_hash() `(m >= wsize ? m - wsize : NIL);` code.
*/
Z_SLIDE_HASH_SIMD(head, hash_size, vec_wsize);
#ifndef FASTEST
Z_SLIDE_HASH_SIMD(prev, w_size, vec_wsize);
#endif
}
#undef z_vec128i_u16x8_t
#undef Z_SLIDE_HASH_SIMD
#undef Z_SLIDE_INIT_SIMD
#endif /* SLIDE_HASH_SIMD_H */
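For reference, this is the scalar loop the saturating subtract replaces, essentially zlib's generic slide_hash() reshaped to the signature above (a comparison sketch, not part of the commit):

    local void slide_hash_scalar(Posf *table, uInt size, uInt wsize) {
        uInt n;
        for (n = 0; n < size; n++) {
            Pos m = table[n];
            /* entries >= wsize slide down by wsize; older ones clamp to 0 */
            table[n] = (Pos)(m >= wsize ? m - wsize : 0);
        }
    }

Each SIMD iteration performs 8 (SSE2) or 16 (NEON, unrolled) of these subtract-and-clamp steps at once; _mm_subs_epu16 and vqsubq_u16 provide the clamp to zero for free.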

View file

@@ -14,11 +14,6 @@
#include "libc/stdio/temp.h"
#include "libc/str/str.h"
#include "third_party/zlib/deflate.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/*
@@ -174,7 +169,7 @@ local void gen_trees_header OF((void));
#else /* !ZLIB_DEBUG */
# define send_code(s, c, tree) \
{ if (z_verbose>2) kprintf("\ncd %3d ",(c)); \
{ if (z_verbose>2) kprintf("\ncd %3d ",(c)); \
send_bits(s, tree[c].Code, tree[c].Len); }
#endif
@@ -204,7 +199,7 @@ local void send_bits(s, value, length)
s->bits_sent += (ulg)length;
/* If not enough room in bi_buf, use (valid) bits from bi_buf and
* (16 - bi_valid) bits from value, leaving (width - (16-bi_valid))
* (16 - bi_valid) bits from value, leaving (width - (16 - bi_valid))
* unused bits in value.
*/
if (s->bi_valid > (int)Buf_size - length) {
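    /* (editor's note) a sketch of the packing the comment above describes,
       with Buf_size == 16 as in deflate.h; the release-build send_bits
       macro boils down to the same thing:

           if (s->bi_valid > Buf_size - length) {
               s->bi_buf |= (ush)value << s->bi_valid;
               put_short(s, s->bi_buf);
               s->bi_buf = (ush)value >> (Buf_size - s->bi_valid);
               s->bi_valid += length - Buf_size;
           } else {
               s->bi_buf |= (ush)value << s->bi_valid;
               s->bi_valid += length;
           }
    */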
@@ -267,7 +262,7 @@ local void tr_static_init()
length = 0;
for (code = 0; code < LENGTH_CODES-1; code++) {
base_length[code] = length;
for (n = 0; n < (1<<extra_lbits[code]); n++) {
for (n = 0; n < (1 << extra_lbits[code]); n++) {
_length_code[length++] = (uch)code;
}
}
@@ -276,13 +271,13 @@ local void tr_static_init()
* in two different ways: code 284 + 5 bits or code 285, so we
* overwrite length_code[255] to use the best encoding:
*/
_length_code[length-1] = (uch)code;
_length_code[length - 1] = (uch)code;
/* Initialize the mapping dist (0..32K) -> dist code (0..29) */
dist = 0;
for (code = 0 ; code < 16; code++) {
base_dist[code] = dist;
for (n = 0; n < (1<<extra_dbits[code]); n++) {
for (n = 0; n < (1 << extra_dbits[code]); n++) {
_dist_code[dist++] = (uch)code;
}
}
@@ -290,11 +285,11 @@ local void tr_static_init()
dist >>= 7; /* from now on, all distances are divided by 128 */
for ( ; code < D_CODES; code++) {
base_dist[code] = dist << 7;
for (n = 0; n < (1<<(extra_dbits[code]-7)); n++) {
for (n = 0; n < (1 << (extra_dbits[code] - 7)); n++) {
_dist_code[256 + dist++] = (uch)code;
}
}
Assert (dist == 256, "tr_static_init: 256+dist != 512");
Assert (dist == 256, "tr_static_init: 256 + dist != 512");
/* Construct the codes of the static literal tree */
for (bits = 0; bits <= MAX_BITS; bits++) bl_count[bits] = 0;
@@ -327,11 +322,12 @@ local void tr_static_init()
*/
#ifdef GEN_TREES_H
# ifndef ZLIB_DEBUG
//# include <stdio.h>
# endif
# define SEPARATOR(i, last, width) \
((i) == (last)? "\n};\n\n" : \
((i) % (width) == (width)-1 ? ",\n" : ", "))
((i) % (width) == (width) - 1 ? ",\n" : ", "))
void gen_trees_header()
{
@@ -468,7 +464,7 @@ local void pqdownheap(s, tree, k)
while (j <= s->heap_len) {
/* Set j to the smallest of the two sons: */
if (j < s->heap_len &&
smaller(tree, s->heap[j+1], s->heap[j], s->depth)) {
smaller(tree, s->heap[j + 1], s->heap[j], s->depth)) {
j++;
}
/* Exit if v is smaller than both sons */
@@ -517,7 +513,7 @@ local void gen_bitlen(s, desc)
*/
tree[s->heap[s->heap_max]].Len = 0; /* root of the heap */
for (h = s->heap_max+1; h < HEAP_SIZE; h++) {
for (h = s->heap_max + 1; h < HEAP_SIZE; h++) {
n = s->heap[h];
bits = tree[tree[n].Dad].Len + 1;
if (bits > max_length) bits = max_length, overflow++;
@@ -528,7 +524,7 @@ local void gen_bitlen(s, desc)
s->bl_count[bits]++;
xbits = 0;
if (n >= base) xbits = extra[n-base];
if (n >= base) xbits = extra[n - base];
f = tree[n].Freq;
s->opt_len += (ulg)f * (unsigned)(bits + xbits);
if (stree) s->static_len += (ulg)f * (unsigned)(stree[n].Len + xbits);
@@ -540,10 +536,10 @@ local void gen_bitlen(s, desc)
/* Find the first bit length which could increase: */
do {
bits = max_length-1;
bits = max_length - 1;
while (s->bl_count[bits] == 0) bits--;
s->bl_count[bits]--; /* move one leaf down the tree */
s->bl_count[bits+1] += 2; /* move one overflow item as its brother */
s->bl_count[bits]--; /* move one leaf down the tree */
s->bl_count[bits + 1] += 2; /* move one overflow item as its brother */
s->bl_count[max_length]--;
/* The brother of the overflow item also moves one step up,
* but this does not affect bl_count[max_length]
@@ -579,7 +575,7 @@ local void gen_bitlen(s, desc)
* OUT assertion: the field code is set for all tree elements of non
* zero code length.
*/
local void gen_codes (tree, max_code, bl_count)
local void gen_codes(tree, max_code, bl_count)
ct_data *tree; /* the tree to decorate */
int max_code; /* largest code with non zero frequency */
ushf *bl_count; /* number of codes at each bit length */
@@ -593,13 +589,13 @@ local void gen_codes (tree, max_code, bl_count)
* without bit reversal.
*/
for (bits = 1; bits <= MAX_BITS; bits++) {
code = (code + bl_count[bits-1]) << 1;
code = (code + bl_count[bits - 1]) << 1;
next_code[bits] = (ush)code;
}
/* Check that the bit counts in bl_count are consistent. The last code
* must be all ones.
*/
Assert (code + bl_count[MAX_BITS]-1 == (1<<MAX_BITS)-1,
Assert (code + bl_count[MAX_BITS] - 1 == (1 << MAX_BITS) - 1,
"inconsistent bit counts");
Tracev(("\ngen_codes: max_code %d ", max_code));
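    /* (editor's note) toy walkthrough of the loop above: for code lengths
       {1, 2, 3, 3}, bl_count[1..3] = {1, 1, 2}, so
           bits=1: code = (0 + 0) << 1 = 0  ->  next_code[1] = 0
           bits=2: code = (0 + 1) << 1 = 2  ->  next_code[2] = 10 (binary)
           bits=3: code = (2 + 1) << 1 = 6  ->  next_code[3] = 110 (binary)
       yielding canonical codes 0, 10, 110, 111; the last 3-bit code is all
       ones, exactly what the Assert above verifies. */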
@@ -610,7 +606,7 @@ local void gen_codes (tree, max_code, bl_count)
tree[n].Code = (ush)bi_reverse(next_code[len]++, len);
Tracecv(tree != static_ltree, ("\nn %3d %c l %2d c %4x (%x) ",
n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len] - 1));
}
}
@@ -634,7 +630,7 @@ local void build_tree(s, desc)
int node; /* new node being created */
/* Construct the initial heap, with least frequent element in
* heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n+1].
* heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n + 1].
* heap[0] is not used.
*/
s->heap_len = 0, s->heap_max = HEAP_SIZE;
@@ -662,7 +658,7 @@ local void build_tree(s, desc)
}
desc->max_code = max_code;
/* The elements heap[heap_len/2+1 .. heap_len] are leaves of the tree,
/* The elements heap[heap_len/2 + 1 .. heap_len] are leaves of the tree,
* establish sub-heaps of increasing lengths:
*/
for (n = s->heap_len/2; n >= 1; n--) pqdownheap(s, tree, n);
@@ -710,7 +706,7 @@ local void build_tree(s, desc)
* Scan a literal or distance tree to determine the frequencies of the codes
* in the bit length tree.
*/
local void scan_tree (s, tree, max_code)
local void scan_tree(s, tree, max_code)
deflate_state *s;
ct_data *tree; /* the tree to be scanned */
int max_code; /* and its largest code of non zero frequency */
@@ -724,10 +720,10 @@ local void scan_tree (s, tree, max_code)
int min_count = 4; /* min repeat count */
if (nextlen == 0) max_count = 138, min_count = 3;
tree[max_code+1].Len = (ush)0xffff; /* guard */
tree[max_code + 1].Len = (ush)0xffff; /* guard */
for (n = 0; n <= max_code; n++) {
curlen = nextlen; nextlen = tree[n+1].Len;
curlen = nextlen; nextlen = tree[n + 1].Len;
if (++count < max_count && curlen == nextlen) {
continue;
} else if (count < min_count) {
@@ -755,7 +751,7 @@ local void scan_tree (s, tree, max_code)
* Send a literal or distance tree in compressed form, using the codes in
* bl_tree.
*/
local void send_tree (s, tree, max_code)
local void send_tree(s, tree, max_code)
deflate_state *s;
ct_data *tree; /* the tree to be scanned */
int max_code; /* and its largest code of non zero frequency */
@@ -768,11 +764,11 @@ local void send_tree (s, tree, max_code)
int max_count = 7; /* max repeat count */
int min_count = 4; /* min repeat count */
/* tree[max_code+1].Len = -1; */ /* guard already set */
/* tree[max_code + 1].Len = -1; */ /* guard already set */
if (nextlen == 0) max_count = 138, min_count = 3;
for (n = 0; n <= max_code; n++) {
curlen = nextlen; nextlen = tree[n+1].Len;
curlen = nextlen; nextlen = tree[n + 1].Len;
if (++count < max_count && curlen == nextlen) {
continue;
} else if (count < min_count) {
@@ -783,13 +779,13 @@ local void send_tree (s, tree, max_code)
send_code(s, curlen, s->bl_tree); count--;
}
Assert(count >= 3 && count <= 6, " 3_6?");
send_code(s, REP_3_6, s->bl_tree); send_bits(s, count-3, 2);
send_code(s, REP_3_6, s->bl_tree); send_bits(s, count - 3, 2);
} else if (count <= 10) {
send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count-3, 3);
send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count - 3, 3);
} else {
send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count-11, 7);
send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count - 11, 7);
}
count = 0; prevlen = curlen;
if (nextlen == 0) {
@@ -817,8 +813,8 @@ local int build_bl_tree(s)
/* Build the bit length tree: */
build_tree(s, (tree_desc *)(&(s->bl_desc)));
/* opt_len now includes the length of the tree representations, except
* the lengths of the bit lengths codes and the 5+5+4 bits for the counts.
/* opt_len now includes the length of the tree representations, except the
* lengths of the bit lengths codes and the 5 + 5 + 4 bits for the counts.
*/
/* Determine the number of bit length codes to send. The pkzip format
@@ -829,8 +825,8 @@ local int build_bl_tree(s)
if (s->bl_tree[bl_order[max_blindex]].Len != 0) break;
}
/* Update opt_len to include the bit length tree and counts */
s->opt_len += 3*((ulg)max_blindex+1) + 5+5+4;
Tracev(("\ndyn trees: dyn %ld, stat %ld",
s->opt_len += 3*((ulg)max_blindex + 1) + 5 + 5 + 4;
Tracev(( "\ndyn trees: dyn %ld, stat %ld",
s->opt_len, s->static_len));
return max_blindex;
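    /* (editor's note) concretely: if all 19 bit-length codes must be sent
       (max_blindex == 18), the header costs 3*(18 + 1) + 5 + 5 + 4 = 71 bits
       before the compressed tree descriptions themselves. */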
@@ -850,21 +846,21 @@ local void send_all_trees(s, lcodes, dcodes, blcodes)
Assert (lcodes >= 257 && dcodes >= 1 && blcodes >= 4, "not enough codes");
Assert (lcodes <= L_CODES && dcodes <= D_CODES && blcodes <= BL_CODES,
"too many codes");
Tracev(("\nbl counts: "));
send_bits(s, lcodes-257, 5); /* not +255 as stated in appnote.txt */
send_bits(s, dcodes-1, 5);
send_bits(s, blcodes-4, 4); /* not -3 as stated in appnote.txt */
Tracev(( "\nbl counts: "));
send_bits(s, lcodes - 257, 5); /* not +255 as stated in appnote.txt */
send_bits(s, dcodes - 1, 5);
send_bits(s, blcodes - 4, 4); /* not -3 as stated in appnote.txt */
for (rank = 0; rank < blcodes; rank++) {
Tracev(("\nbl code %2d ", bl_order[rank]));
Tracev(( "\nbl code %2d ", bl_order[rank]));
send_bits(s, s->bl_tree[bl_order[rank]].Len, 3);
}
Tracev(("\nbl tree: sent %ld", s->bits_sent));
Tracev(( "\nbl tree: sent %ld", s->bits_sent));
send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1); /* literal tree */
Tracev(("\nlit tree: sent %ld", s->bits_sent));
send_tree(s, (ct_data *)s->dyn_ltree, lcodes - 1); /* literal tree */
Tracev(( "\nlit tree: sent %ld", s->bits_sent));
send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1); /* distance tree */
Tracev(("\ndist tree: sent %ld", s->bits_sent));
send_tree(s, (ct_data *)s->dyn_dtree, dcodes - 1); /* distance tree */
Tracev(( "\ndist tree: sent %ld", s->bits_sent));
}
/* ===========================================================================
@@ -876,7 +872,7 @@ void ZLIB_INTERNAL _tr_stored_block(s, buf, stored_len, last)
ulg stored_len; /* length of input block */
int last; /* one if this is the last block for a file */
{
send_bits(s, (STORED_BLOCK<<1)+last, 3); /* send block type */
send_bits(s, (STORED_BLOCK<<1) + last, 3); /* send block type */
bi_windup(s); /* align on byte boundary */
put_short(s, (ush)stored_len);
put_short(s, (ush)~stored_len);
@@ -887,7 +883,7 @@ void ZLIB_INTERNAL _tr_stored_block(s, buf, stored_len, last)
s->compressed_len = (s->compressed_len + 3 + 7) & (ulg)~7L;
s->compressed_len += (stored_len + 4) << 3;
s->bits_sent += 2*16;
s->bits_sent += stored_len<<3;
s->bits_sent += stored_len << 3;
#endif
}
@@ -937,11 +933,11 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
/* Construct the literal and distance trees */
build_tree(s, (tree_desc *)(&(s->l_desc)));
Tracev(("\nlit data: dyn %ld, stat %ld", s->opt_len,
Tracev(( "\nlit data: dyn %ld, stat %ld", s->opt_len,
s->static_len));
build_tree(s, (tree_desc *)(&(s->d_desc)));
Tracev(("\ndist data: dyn %ld, stat %ld", s->opt_len,
Tracev(( "\ndist data: dyn %ld, stat %ld", s->opt_len,
s->static_len));
/* At this point, opt_len and static_len are the total bit lengths of
* the compressed block data, excluding the tree representations.
@@ -953,14 +949,17 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
max_blindex = build_bl_tree(s);
/* Determine the best encoding. Compute the block lengths in bytes. */
opt_lenb = (s->opt_len+3+7)>>3;
static_lenb = (s->static_len+3+7)>>3;
opt_lenb = (s->opt_len + 3 + 7) >> 3;
static_lenb = (s->static_len + 3 + 7) >> 3;
Tracev(("\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ",
Tracev(( "\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ",
opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len,
s->sym_next / 3));
if (static_lenb <= opt_lenb) opt_lenb = static_lenb;
#ifndef FORCE_STATIC
if (static_lenb <= opt_lenb || s->strategy == Z_FIXED)
#endif
opt_lenb = static_lenb;
} else {
Assert(buf != (char*)0, "lost buf");
@@ -970,7 +969,7 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
#ifdef FORCE_STORED
if (buf != (char*)0) { /* force stored block */
#else
if (stored_len+4 <= opt_lenb && buf != (char*)0) {
if (stored_len + 4 <= opt_lenb && buf != (char*)0) {
/* 4: two words for the lengths */
#endif
/* The test buf != NULL is only necessary if LIT_BUFSIZE > WSIZE.
@@ -981,21 +980,17 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
*/
_tr_stored_block(s, buf, stored_len, last);
#ifdef FORCE_STATIC
} else if (static_lenb >= 0) { /* force static trees */
#else
} else if (s->strategy == Z_FIXED || static_lenb == opt_lenb) {
#endif
send_bits(s, (STATIC_TREES<<1)+last, 3);
} else if (static_lenb == opt_lenb) {
send_bits(s, (STATIC_TREES<<1) + last, 3);
compress_block(s, (const ct_data *)static_ltree,
(const ct_data *)static_dtree);
#ifdef ZLIB_DEBUG
s->compressed_len += 3 + s->static_len;
#endif
} else {
send_bits(s, (DYN_TREES<<1)+last, 3);
send_all_trees(s, s->l_desc.max_code+1, s->d_desc.max_code+1,
max_blindex+1);
send_bits(s, (DYN_TREES<<1) + last, 3);
send_all_trees(s, s->l_desc.max_code + 1, s->d_desc.max_code + 1,
max_blindex + 1);
compress_block(s, (const ct_data *)s->dyn_ltree,
(const ct_data *)s->dyn_dtree);
#ifdef ZLIB_DEBUG
@@ -1014,18 +1009,18 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
s->compressed_len += 7; /* align on byte boundary */
#endif
}
Tracev(("\ncomprlen %lu(%lu) ", s->compressed_len>>3,
s->compressed_len-7*last));
Tracev(("\ncomprlen %lu(%lu) ", s->compressed_len >> 3,
s->compressed_len - 7*last));
}
/* ===========================================================================
* Save the match info and tally the frequency counts. Return true if
* the current block must be flushed.
*/
int ZLIB_INTERNAL _tr_tally (s, dist, lc)
int ZLIB_INTERNAL _tr_tally(s, dist, lc)
deflate_state *s;
unsigned dist; /* distance of matched string */
unsigned lc; /* match length-MIN_MATCH or unmatched char (if dist==0) */
unsigned lc; /* match length - MIN_MATCH or unmatched char (dist==0) */
{
s->sym_buf[s->sym_next++] = (uch)dist;
s->sym_buf[s->sym_next++] = (uch)(dist >> 8);
@@ -1041,7 +1036,7 @@ int ZLIB_INTERNAL _tr_tally (s, dist, lc)
(ush)lc <= (ush)(MAX_MATCH-MIN_MATCH) &&
(ush)d_code(dist) < (ush)D_CODES, "_tr_tally: bad match");
s->dyn_ltree[_length_code[lc]+LITERALS+1].Freq++;
s->dyn_ltree[_length_code[lc] + LITERALS + 1].Freq++;
s->dyn_dtree[d_code(dist)].Freq++;
}
return (s->sym_next == s->sym_end);
@@ -1071,7 +1066,7 @@ local void compress_block(s, ltree, dtree)
} else {
/* Here, lc is the match length - MIN_MATCH */
code = _length_code[lc];
send_code(s, code+LITERALS+1, ltree); /* send the length code */
send_code(s, code + LITERALS + 1, ltree); /* send length code */
extra = extra_lbits[code];
if (extra != 0) {
lc -= base_length[code];
@@ -1187,6 +1182,6 @@ local void bi_windup(s)
s->bi_buf = 0;
s->bi_valid = 0;
#ifdef ZLIB_DEBUG
s->bits_sent = (s->bits_sent+7) & ~7;
s->bits_sent = (s->bits_sent + 7) & ~7;
#endif
}

View file

@@ -125,4 +125,3 @@ local const int base_dist[D_CODES] = {
32, 48, 64, 96, 128, 192, 256, 384, 512, 768,
1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576
};

View file

@@ -9,11 +9,6 @@
#include "third_party/zlib/internal.h"
#include "third_party/zlib/macros.internal.h"
#include "third_party/zlib/zlib.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* @(#) $Id$ */

View file

@@ -1696,6 +1696,11 @@ uLong adler32_combine(uLong adler1, uLong adler2, int64_t len2);
*/
uLong crc32(uLong crc, const Bytef *buf, uInt len);
/**
* Same as crc32(), but with a size_t length.
*/
uint32_t crc32_z(uint32_t crc, const void *buf, size_t len);
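/**
 * (Editor's note) Usage sketch, not part of this header: the size_t length
 * lets one call cover buffers larger than crc32()'s uInt argument allows:
 *
 *     uint32_t c = crc32_z(0, 0, 0);  // fetch the initial CRC value
 *     c = crc32_z(c, buf, bufsize);   // bufsize may be any size_t
 */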
/**
* Combine two CRC-32 check values into one. For two sequences of bytes,
* seq1 and seq2 with lengths len1 and len2, CRC-32 check values were

View file

@@ -9,6 +9,7 @@ THIRD_PARTY_ZLIB_A = o/$(MODE)/third_party/zlib/zlib.a
THIRD_PARTY_ZLIB_A_FILES := $(wildcard third_party/zlib/*)
THIRD_PARTY_ZLIB_A_HDRS = $(filter %.h,$(THIRD_PARTY_ZLIB_A_FILES))
THIRD_PARTY_ZLIB_A_SRCS = $(filter %.c,$(THIRD_PARTY_ZLIB_A_FILES))
THIRD_PARTY_ZLIB_A_INCS = $(filter %.inc,$(THIRD_PARTY_ZLIB_A_FILES))
THIRD_PARTY_ZLIB_A_OBJS = $(THIRD_PARTY_ZLIB_A_SRCS:%.c=o/$(MODE)/%.o)
THIRD_PARTY_ZLIB_A_CHECKS = \
@@ -18,6 +19,7 @@ THIRD_PARTY_ZLIB_A_CHECKS = \
THIRD_PARTY_ZLIB_A_DIRECTDEPS = \
LIBC_INTRIN \
LIBC_NEXGEN32E \
LIBC_SYSV \
LIBC_STR \
LIBC_STUBS
@@ -34,18 +36,35 @@ $(THIRD_PARTY_ZLIB_A).pkg: \
$(foreach x,$(THIRD_PARTY_ZLIB_A_DIRECTDEPS),$($(x)_A).pkg)
ifeq ($(ARCH), x86_64)
o/$(MODE)/third_party/zlib/adler32simd.o: private \
OVERRIDE_CFLAGS += \
o/$(MODE)/third_party/zlib/adler32_simd.o: private \
TARGET_ARCH += \
-mssse3
o/$(MODE)/third_party/zlib/adler32simd.o: private \
o/$(MODE)/third_party/zlib/crc_folding.o \
o/$(MODE)/third_party/zlib/crc32_simd.o: private \
TARGET_ARCH += \
-msse4.2 \
-mpclmul
$(THIRD_PARTY_ZLIB_A_OBJS): private \
OVERRIDE_CPPFLAGS += \
-DADLER32_SIMD_SSSE3
o/$(MODE)/third_party/zlib/adler32.o: private \
-DADLER32_SIMD_SSSE3 \
-DCRC32_SIMD_SSE42_PCLMUL \
-DDEFLATE_SLIDE_HASH_SSE2 \
-DINFLATE_CHUNK_SIMD_SSE2 \
-DINFLATE_CHUNK_READ_64LE
endif
ifeq ($(ARCH), aarch64)
o/$(MODE)/third_party/zlib/deflate.o \
o/$(MODE)/third_party/zlib/crc32_simd.o: private \
TARGET_ARCH += \
-march=armv8-a+aes+crc
$(THIRD_PARTY_ZLIB_A_OBJS): private \
OVERRIDE_CPPFLAGS += \
-DADLER32_SIMD_SSSE3
o/$(MODE)/third_party/zlib/deflate.o: private \
OVERRIDE_CPPFLAGS += \
-DCRC32_SIMD_SSE42_PCLMUL
-DADLER32_SIMD_NEON \
-DCRC32_ARMV8_CRC32 \
-DDEFLATE_SLIDE_HASH_NEON \
-DINFLATE_CHUNK_SIMD_NEON \
-DINFLATE_CHUNK_READ_64LE
endif
$(THIRD_PARTY_ZLIB_A_OBJS): private \
@@ -56,6 +75,7 @@ $(THIRD_PARTY_ZLIB_A_OBJS): private \
THIRD_PARTY_ZLIB_LIBS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)))
THIRD_PARTY_ZLIB_SRCS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_SRCS))
THIRD_PARTY_ZLIB_HDRS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_HDRS))
THIRD_PARTY_ZLIB_INCS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_INCS))
THIRD_PARTY_ZLIB_BINS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_BINS))
THIRD_PARTY_ZLIB_CHECKS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_CHECKS))
THIRD_PARTY_ZLIB_OBJS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_OBJS))

View file

@@ -10,11 +10,6 @@
#include "libc/mem/mem.h"
#include "third_party/zlib/internal.h"
#include "third_party/zlib/zutil.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* @(#) $Id$ */

View file

@@ -128,6 +128,12 @@ typedef unsigned long ulg;
#endif
#endif
#ifdef _MSC_VER
#define zalign(x) __declspec(align(x))
#else
#define zalign(x) __attribute__((aligned((x))))
#endif
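/* (editor's note) usage sketch: zalign(16) requests 16-byte alignment
   portably, e.g. a hypothetical scratch buffer:
       static zalign(16) unsigned char crc_scratch[64];
   expanding to __declspec(align(16)) under MSVC and
   __attribute__((aligned(16))) elsewhere */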
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* ZUTIL_H */