Make AARCH64 harder, better, faster, stronger

- Perform some housekeeping on scalar math function code
- Import ARM's Optimized Routines for SIMD string processing
- Upgrade to latest Chromium zlib and enable more SIMD optimizations
Justine Tunney 2023-05-15 01:51:29 -07:00
parent 550b52abf6
commit cc1732bc42
143 changed files with 15661 additions and 1329 deletions


@@ -1,3 +1,4 @@
// -*- c++ -*-
#ifndef COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
#define COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
#include "libc/macros.internal.h"


@@ -1,3 +1,4 @@
// -*- c++ -*-
#ifndef GPTNEOX_UTIL_H
#define GPTNEOX_UTIL_H
#include "libc/calls/calls.h"


@@ -1,3 +1,4 @@
// -*- c++ -*-
#ifndef GPTNEOX_H
#define GPTNEOX_H
// clang-format off


@@ -687,8 +687,6 @@ ulg crc32(crc, buf, len)
pointer, then initialize the crc shift register contents instead.
Return the current crc in either case. */
{
return crc32_z(crc,buf,len);
register z_uint4 c;
register ZCONST ulg near *crc_32_tab;

third_party/zlib/LICENSE.chromium (new vendored file, 27 lines)

@@ -0,0 +1,27 @@
// Copyright 2015 The Chromium Authors
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google LLC nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

third_party/zlib/LICENSE.zlib (new vendored file, 22 lines)

@@ -0,0 +1,22 @@
Copyright notice:
(C) 1995-2022 Jean-loup Gailly and Mark Adler
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Jean-loup Gailly Mark Adler
jloup@gzip.org madler@alumni.caltech.edu


@@ -11,23 +11,9 @@ ORIGIN
The zlib sources were obtained from Chromium's zlib fork.
https://chromium.googlesource.com/chromium/src/third_party/zlib
commit 8f22e90f007a7dd466b426513725c13191248315
Author: Hans Wennborg <hans@chromium.org>
Date: Fri Sep 16 16:14:51 2022 +0000
[zlib][fuzz] Cap the input size for zlib_inflate_with_header_fuzzer
To prevent timeouts when processing large inputs with small chunk sizes.
Bug: 1362206
Change-Id: Ie21ea48abf85ee49897243857bf84b0f32d24bd5
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3899099
Reviewed-by: Adenilson Cavalcanti <cavalcantii@chromium.org>
Auto-Submit: Hans Wennborg <hans@chromium.org>
Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1048044}
NOKEYCHECK=True
GitOrigin-RevId: fd75b8c2768e7cc3a3e7a06bc563bb03c5ba0ec2
commit 14dd4c4455602c9b71a1a89b5cafd1f4030d2e3f
Author: Adenilson Cavalcanti <cavalcantii@chromium.org>
Date: Tue Apr 11 17:40:40 2023 +0000
The source code for puff was obtained from zlib itself:
@@ -42,7 +28,7 @@ LOCAL CHANGES
- Changed Trace(stderr) calls to use kprintf()
- We use our own crc32() implementation from LIBC_STR
- Made the type signature of crc32_z() less obnoxious
- Fixed a Chromium zlib regression where malloc() failures inside
  deflateInit2() resulted in a segmentation fault (illustrated below)
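That deflateInit2() fix deserves a caller-side illustration: zlib signals allocation failure through return codes, so the regression turned a reportable Z_MEM_ERROR into a crash. Here is a minimal sketch using only the documented zlib API (the function name and parameter choices are illustrative, not from this commit):

#include "third_party/zlib/zlib.h"

int start_deflate(z_stream *zs) {
  zs->zalloc = Z_NULL;   /* use zlib's default allocator */
  zs->zfree  = Z_NULL;
  zs->opaque = Z_NULL;
  /* With the fix, an internal malloc() failure surfaces here as
     Z_MEM_ERROR instead of faulting inside deflateInit2(). */
  int rc = deflateInit2(zs, Z_BEST_COMPRESSION, Z_DEFLATED,
                        15 /* windowBits */, 8 /* memLevel */,
                        Z_DEFAULT_STRATEGY);
  return rc;  /* on Z_MEM_ERROR, abandon zs without calling deflate() */
}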

third_party/zlib/adler32.c

@@ -1,22 +1,15 @@
// clang-format off
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
*/
// clang-format off
/* adler32.c -- compute the Adler-32 checksum of a data stream
* Copyright (C) 1995-2011, 2016 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "libc/nexgen32e/x86feature.h"
#include "third_party/zlib/macros.internal.h"
#include "third_party/zlib/zconf.h"
#include "third_party/zlib/zutil.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* @(#) $Id$ */
#include "third_party/zlib/macros.internal.h"
local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
@@ -70,7 +63,10 @@ local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
# define MOD63(a) a %= BASE
#endif
uint32_t ZLIB_INTERNAL adler32_simd_(uint32_t, const unsigned char *, z_size_t);
#include "third_party/zlib/cpu_features.internal.h"
#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON)
#include "third_party/zlib/adler32_simd.inc"
#endif
/* ========================================================================= */
uLong ZEXPORT adler32_z(adler, buf, len)
@@ -82,7 +78,7 @@ uLong ZEXPORT adler32_z(adler, buf, len)
unsigned n;
#if defined(ADLER32_SIMD_SSSE3)
if (buf != Z_NULL && len >= 64 && X86_HAVE(SSSE3))
if (buf != Z_NULL && len >= 64 && x86_cpu_enable_ssse3)
return adler32_simd_(adler, buf, len);
#elif defined(ADLER32_SIMD_NEON)
if (buf != Z_NULL && len >= 64)
@@ -104,9 +100,24 @@ uLong ZEXPORT adler32_z(adler, buf, len)
return adler | (sum2 << 16);
}
#if defined(ADLER32_SIMD_SSSE3)
/*
* Use SSSE3 to compute the adler32. Since this routine can be
* freely used, check CPU features here. zlib convention is to
* call adler32(0, NULL, 0), before making calls to adler32().
* So this is a good early (and infrequent) place to cache CPU
* features for those later, more interesting adler32() calls.
*/
if (buf == Z_NULL) {
if (!len) /* Assume user is calling adler32(0, NULL, 0); */
cpu_check_features();
return 1L;
}
#else
/* initial Adler-32 value (deferred check for len == 1 speed) */
if (buf == Z_NULL)
return 1L;
#endif
/* in case short lengths are provided, keep it somewhat fast */
if (len < 16) {
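Note how the new branch hinges on zlib's seeding convention: callers are expected to start with adler32(0, Z_NULL, 0) before real updates, and this commit reuses that call as the early, infrequent place to cache CPU features on aarch64. A short usage sketch (data and datalen are hypothetical):

#include "third_party/zlib/zlib.h"

uLong checksum(const Bytef *data, uInt datalen) {
  uLong a = adler32(0L, Z_NULL, 0);  /* returns 1; also caches CPU features */
  return adler32(a, data, datalen);  /* may now take the SIMD fast path */
}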

third_party/zlib/adler32_simd.c (new vendored file, 371 lines)

@@ -0,0 +1,371 @@
asm(".ident\t\"\\n\\n\
Chromium (BSD-3 License)\\n\
Copyright 2017 The Chromium Authors\"");
// clang-format off
/* adler32_simd.c
*
* Copyright 2017 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*
* Per http://en.wikipedia.org/wiki/Adler-32 the adler32 A value (aka s1) is
* the sum of N input data bytes D1 ... DN,
*
* A = A0 + D1 + D2 + ... + DN
*
* where A0 is the initial value.
*
* SSE2 _mm_sad_epu8() can be used for byte sums (see http://bit.ly/2wpUOeD,
* for example) and accumulating the byte sums can use SSE shuffle-adds (see
* the "Integer" section of http://bit.ly/2erPT8t for details). Arm NEON has
* similar instructions.
*
* The adler32 B value (aka s2) sums the A values from each step:
*
* B0 + (A0 + D1) + (A0 + D1 + D2) + ... + (A0 + D1 + D2 + ... + DN) or
*
* B0 + N.A0 + N.D1 + (N-1).D2 + (N-2).D3 + ... + (N-(N-1)).DN
*
* B0 being the initial value. For 32 bytes (ideal for garden-variety SIMD):
*
* B = B0 + 32.A0 + [D1 D2 D3 ... D32] x [32 31 30 ... 1].
*
* Adjacent blocks of 32 input bytes can be iterated with the expressions to
* compute the adler32 s1 s2 of M >> 32 input bytes [1].
*
* As M grows, the s1 s2 sums grow. If left unchecked, they would eventually
* overflow the precision of their integer representation (bad). However, s1
* and s2 also need to be computed modulo the adler BASE value (reduced). If
* at most NMAX bytes are processed before a reduce, s1 s2 _cannot_ overflow
* a uint32_t type (the NMAX constraint) [2].
*
* [1] the iterative equations for s2 contain constant factors; these can be
* hoisted from the n-blocks do loop of the SIMD code.
*
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
* of the adler s1 s2 of uint32_t type (see adler32.c).
*/
#include "third_party/zlib/adler32_simd.inc"
/* Definitions from adler32.c: largest prime smaller than 65536 */
#define BASE 65521U
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
#define NMAX 5552
#if defined(ADLER32_SIMD_SSSE3)
#include "third_party/intel/tmmintrin.internal.h"
uint32_t ZLIB_INTERNAL adler32_simd_( /* SSSE3 */
uint32_t adler,
const unsigned char *buf,
z_size_t len)
{
/*
* Split Adler-32 into component sums.
*/
uint32_t s1 = adler & 0xffff;
uint32_t s2 = adler >> 16;
/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
z_size_t blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
while (blocks)
{
unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
if (n > blocks)
n = (unsigned) blocks;
blocks -= n;
const __m128i tap1 =
_mm_setr_epi8(32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17);
const __m128i tap2 =
_mm_setr_epi8(16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i zero =
_mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
const __m128i ones =
_mm_set_epi16( 1, 1, 1, 1, 1, 1, 1, 1);
/*
* Process n blocks of data. At most NMAX data bytes can be
* processed before s2 must be reduced modulo BASE.
*/
__m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
__m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
__m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);
do {
/*
* Load 32 input bytes.
*/
const __m128i bytes1 = _mm_loadu_si128((__m128i*)(buf));
const __m128i bytes2 = _mm_loadu_si128((__m128i*)(buf + 16));
/*
* Add previous block byte sum to v_ps.
*/
v_ps = _mm_add_epi32(v_ps, v_s1);
/*
* Horizontally add the bytes for s1, multiply-adds the
* bytes by [ 32, 31, 30, ... ] for s2.
*/
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero));
const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1);
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones));
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero));
const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));
buf += BLOCK_SIZE;
} while (--n);
v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));
/*
* Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
*/
#define S23O1 _MM_SHUFFLE(2,3,0,1) /* A B C D -> B A D C */
#define S1O32 _MM_SHUFFLE(1,0,3,2) /* A B C D -> C D A B */
v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1));
v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32));
s1 += _mm_cvtsi128_si32(v_s1);
v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1));
v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32));
s2 = _mm_cvtsi128_si32(v_s2);
#undef S23O1
#undef S1O32
/*
* Reduce.
*/
s1 %= BASE;
s2 %= BASE;
}
/*
* Handle leftover data.
*/
if (len) {
if (len >= 16) {
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
len -= 16;
}
while (len--) {
s2 += (s1 += *buf++);
}
if (s1 >= BASE)
s1 -= BASE;
s2 %= BASE;
}
/*
* Return the recombined sums.
*/
return s1 | (s2 << 16);
}
#elif defined(ADLER32_SIMD_NEON)
#include "third_party/aarch64/arm_neon.h"
uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
uint32_t adler,
const unsigned char *buf,
z_size_t len)
{
/*
* Split Adler-32 into component sums.
*/
uint32_t s1 = adler & 0xffff;
uint32_t s2 = adler >> 16;
/*
* Serially compute s1 & s2, until the data is 16-byte aligned.
*/
if ((uintptr_t)buf & 15) {
while ((uintptr_t)buf & 15) {
s2 += (s1 += *buf++);
--len;
}
if (s1 >= BASE)
s1 -= BASE;
s2 %= BASE;
}
/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
z_size_t blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
while (blocks)
{
unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
if (n > blocks)
n = (unsigned) blocks;
blocks -= n;
/*
* Process n blocks of data. At most NMAX data bytes can be
* processed before s2 must be reduced modulo BASE.
*/
uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, s1 * n };
uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
uint16x8_t v_column_sum_1 = vdupq_n_u16(0);
uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
uint16x8_t v_column_sum_4 = vdupq_n_u16(0);
do {
/*
* Load 32 input bytes.
*/
const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(buf));
const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(buf + 16));
/*
* Add previous block byte sum to v_s2.
*/
v_s2 = vaddq_u32(v_s2, v_s1);
/*
* Horizontally add the bytes for s1.
*/
v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2));
/*
* Vertically add the bytes for s2.
*/
v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8 (bytes1));
v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1));
v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8 (bytes2));
v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
buf += BLOCK_SIZE;
} while (--n);
v_s2 = vshlq_n_u32(v_s2, 5);
/*
* Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
*/
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_1),
(uint16x4_t) { 32, 31, 30, 29 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1),
(uint16x4_t) { 28, 27, 26, 25 });
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_2),
(uint16x4_t) { 24, 23, 22, 21 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2),
(uint16x4_t) { 20, 19, 18, 17 });
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_3),
(uint16x4_t) { 16, 15, 14, 13 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3),
(uint16x4_t) { 12, 11, 10, 9 });
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_4),
(uint16x4_t) { 8, 7, 6, 5 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4),
(uint16x4_t) { 4, 3, 2, 1 });
/*
* Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
*/
uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1));
uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2));
uint32x2_t s1s2 = vpadd_u32(sum1, sum2);
s1 += vget_lane_u32(s1s2, 0);
s2 += vget_lane_u32(s1s2, 1);
/*
* Reduce.
*/
s1 %= BASE;
s2 %= BASE;
}
/*
* Handle leftover data.
*/
if (len) {
if (len >= 16) {
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
len -= 16;
}
while (len--) {
s2 += (s1 += *buf++);
}
if (s1 >= BASE)
s1 -= BASE;
s2 %= BASE;
}
/*
* Return the recombined sums.
*/
return s1 | (s2 << 16);
}
#endif /* ADLER32_SIMD_SSSE3 */
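For contrast with both SIMD paths above, here is the scalar shape of the algorithm that the file's header comment derives — my own reference sketch, not part of the vendored sources — showing the A/B recurrences and the NMAX-deferred modulo that keeps both sums inside a uint32_t:

#include <stdint.h>
#include <stddef.h>

#define BASE 65521u  /* largest prime below 65536 */
#define NMAX 5552    /* max bytes before s1/s2 must be reduced mod BASE */

uint32_t adler32_scalar(uint32_t adler, const unsigned char *buf, size_t len) {
  uint32_t s1 = adler & 0xffff;   /* A: running sum of bytes */
  uint32_t s2 = adler >> 16;      /* B: running sum of the A values */
  while (len) {
    size_t n = len < NMAX ? len : NMAX;
    len -= n;
    while (n--) {
      s1 += *buf++;               /* A += Di */
      s2 += s1;                   /* B += A  */
    }
    s1 %= BASE;                   /* deferred reduction: by the NMAX bound, */
    s2 %= BASE;                   /* neither sum can overflow a uint32_t    */
  }
  return s1 | (s2 << 16);
}

The SIMD versions compute exactly these sums 32 bytes at a time, which is why they can reuse the same NMAX batching.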

third_party/zlib/adler32_simd.inc (new vendored file, 15 lines)

@@ -0,0 +1,15 @@
/* adler32_simd.h
*
* Copyright 2017 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
#include "libc/inttypes.h"
#include "libc/limits.h"
#include "libc/literal.h"
#include "third_party/zlib/zconf.h"
#include "third_party/zlib/zutil.internal.h"
uint32_t ZLIB_INTERNAL adler32_simd_(uint32_t adler, const unsigned char *buf,
z_size_t len);


@ -1,198 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
Copyright 2017 The Chromium Authors
Use of this source code is governed by the BSD-style licenses that can
be found in the third_party/zlib/LICENSE file.
*/
#include "third_party/intel/emmintrin.internal.h"
#include "third_party/intel/tmmintrin.internal.h"
#include "third_party/zlib/internal.h"
asm(".ident\t\"\\n\\n\
Chromium (BSD-3 License)\\n\
Copyright 2017 The Chromium Authors\"");
asm(".include \"libc/disclaimer.inc\"");
/**
* Per http://en.wikipedia.org/wiki/Adler-32 the adler32 A value (aka s1) is
* the sum of N input data bytes D1 ... DN,
*
* A = A0 + D1 + D2 + ... + DN
*
* where A0 is the initial value.
*
* SSE2 _mm_sad_epu8() can be used for byte sums (see http://bit.ly/2wpUOeD,
* for example) and accumulating the byte sums can use SSE shuffle-adds (see
* the "Integer" section of http://bit.ly/2erPT8t for details). Arm NEON has
* similar instructions.
*
* The adler32 B value (aka s2) sums the A values from each step:
*
* B0 + (A0 + D1) + (A0 + D1 + D2) + ... + (A0 + D1 + D2 + ... + DN) or
*
* B0 + N.A0 + N.D1 + (N-1).D2 + (N-2).D3 + ... + (N-(N-1)).DN
*
* B0 being the initial value. For 32 bytes (ideal for garden-variety SIMD):
*
* B = B0 + 32.A0 + [D1 D2 D3 ... D32] x [32 31 30 ... 1].
*
* Adjacent blocks of 32 input bytes can be iterated with the expressions to
* compute the adler32 s1 s2 of M >> 32 input bytes [1].
*
* As M grows, the s1 s2 sums grow. If left unchecked, they would eventually
* overflow the precision of their integer representation (bad). However, s1
* and s2 also need to be computed modulo the adler BASE value (reduced). If
* at most NMAX bytes are processed before a reduce, s1 s2 _cannot_ overflow
* a uint32_t type (the NMAX constraint) [2].
*
* [1] the iterative equations for s2 contain constant factors; these can be
* hoisted from the n-blocks do loop of the SIMD code.
*
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
* of the adler s1 s2 of uint32_t type (see adler32.c).
*/
/* Definitions from adler32.c: largest prime smaller than 65536 */
#define BASE 65521U
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
#define NMAX 5552
#ifdef ADLER32_SIMD_SSSE3
uint32_t adler32_simd_(uint32_t adler, const unsigned char *buf, size_t len) {
/*
* Split Adler-32 into component sums.
*/
uint32_t s1 = adler & 0xffff;
uint32_t s2 = adler >> 16;
/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;
size_t blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;
while (blocks) {
unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
if (n > blocks) n = (unsigned)blocks;
blocks -= n;
const __m128i tap1 = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23,
22, 21, 20, 19, 18, 17);
const __m128i tap2 =
_mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
const __m128i zero =
_mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
const __m128i ones = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1);
/*
* Process n blocks of data. At most NMAX data bytes can be
* processed before s2 must be reduced modulo BASE.
*/
__m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
__m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
__m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);
do {
/*
* Load 32 input bytes.
*/
const __m128i bytes1 = _mm_loadu_si128((__m128i *)(buf));
const __m128i bytes2 = _mm_loadu_si128((__m128i *)(buf + 16));
/*
* Add previous block byte sum to v_ps.
*/
v_ps = _mm_add_epi32(v_ps, v_s1);
/*
* Horizontally add the bytes for s1, multiply-adds the
* bytes by [ 32, 31, 30, ... ] for s2.
*/
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero));
const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1);
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones));
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero));
const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));
buf += BLOCK_SIZE;
} while (--n);
v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));
/*
* Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
*/
#define S23O1 _MM_SHUFFLE(2, 3, 0, 1) /* A B C D -> B A D C */
#define S1O32 _MM_SHUFFLE(1, 0, 3, 2) /* A B C D -> C D A B */
v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1));
v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32));
s1 += _mm_cvtsi128_si32(v_s1);
v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1));
v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32));
s2 = _mm_cvtsi128_si32(v_s2);
#undef S23O1
#undef S1O32
/*
* Reduce.
*/
s1 %= BASE;
s2 %= BASE;
}
/*
* Handle leftover data.
*/
if (len) {
if (len >= 16) {
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
len -= 16;
}
while (len--) {
s2 += (s1 += *buf++);
}
if (s1 >= BASE) s1 -= BASE;
s2 %= BASE;
}
/*
* Return the recombined sums.
*/
return s1 | (s2 << 16);
}
#endif /* ADLER32_SIMD_SSSE3 */

third_party/zlib/chunkcopy.inc (new vendored file, 491 lines)

@@ -0,0 +1,491 @@
// clang-format off
/* chunkcopy.h -- fast chunk copy and set operations
* Copyright (C) 2017 ARM, Inc.
* Copyright 2017 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
#ifndef CHUNKCOPY_H
#define CHUNKCOPY_H
#include "libc/inttypes.h"
#include "libc/limits.h"
#include "libc/literal.h"
#include "third_party/zlib/zutil.internal.h"
#define Z_STATIC_ASSERT(name, assert) typedef char name[(assert) ? 1 : -1]
#if __STDC_VERSION__ >= 199901L
#define Z_RESTRICT restrict
#else
#define Z_RESTRICT
#endif
#if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
#define Z_BUILTIN_MEMCPY __builtin_memcpy
#else
#define Z_BUILTIN_MEMCPY zmemcpy
#endif
#if defined(INFLATE_CHUNK_SIMD_NEON)
#include "third_party/aarch64/arm_neon.h"
typedef uint8x16_t z_vec128i_t;
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
#include "third_party/intel/emmintrin.internal.h"
typedef __m128i z_vec128i_t;
#else
#error chunkcopy.h inflate chunk SIMD is not defined for your build target
#endif
/*
* Suppress MSan errors about copying uninitialized bytes (crbug.com/1376033).
*/
#define Z_DISABLE_MSAN
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#undef Z_DISABLE_MSAN
#define Z_DISABLE_MSAN __attribute__((no_sanitize("memory")))
#endif
#endif
/*
* chunk copy type: the z_vec128i_t type size should be exactly 128-bits
* and equal to CHUNKCOPY_CHUNK_SIZE.
*/
#define CHUNKCOPY_CHUNK_SIZE sizeof(z_vec128i_t)
Z_STATIC_ASSERT(vector_128_bits_wide,
CHUNKCOPY_CHUNK_SIZE == sizeof(int8_t) * 16);
/*
* Ask the compiler to perform a wide, unaligned load with a machine
* instruction appropriate for the z_vec128i_t type.
*/
static inline z_vec128i_t loadchunk(
const unsigned char FAR* s) Z_DISABLE_MSAN {
z_vec128i_t v;
Z_BUILTIN_MEMCPY(&v, s, sizeof(v));
return v;
}
/*
* Ask the compiler to perform a wide, unaligned store with a machine
* instruction appropriate for the z_vec128i_t type.
*/
static inline void storechunk(
unsigned char FAR* d,
const z_vec128i_t v) {
Z_BUILTIN_MEMCPY(d, &v, sizeof(v));
}
/*
* Perform a memcpy-like operation, assuming that length is non-zero and that
* it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
* the length is shorter than this.
*
* It also guarantees that it will properly unroll the data if the distance
* between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on
* in chunkcopy_relaxed().
*
* Aside from better memory bus utilisation, this means that short copies
* (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop
* without iteration, which will hopefully make the branch prediction more
* reliable.
*/
static inline unsigned char FAR* chunkcopy_core(
unsigned char FAR* out,
const unsigned char FAR* from,
unsigned len) Z_DISABLE_MSAN {
const int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
storechunk(out, loadchunk(from));
out += bump;
from += bump;
len /= CHUNKCOPY_CHUNK_SIZE;
while (len-- > 0) {
storechunk(out, loadchunk(from));
out += CHUNKCOPY_CHUNK_SIZE;
from += CHUNKCOPY_CHUNK_SIZE;
}
return out;
}
/*
 * Like chunkcopy_core(), but avoids writing beyond the end of the legal output.
*
* Accepts an additional pointer to the end of safe output. A generic safe
* copy would use (out + len), but it's normally the case that the end of the
* output buffer is beyond the end of the current copy, and this can still be
* exploited.
*/
static inline unsigned char FAR* chunkcopy_core_safe(
unsigned char FAR* out,
const unsigned char FAR* from,
unsigned len,
unsigned char FAR* limit) {
Assert(out + len <= limit, "chunk copy exceeds safety limit");
if ((limit - out) < (ptrdiff_t)CHUNKCOPY_CHUNK_SIZE) {
const unsigned char FAR* Z_RESTRICT rfrom = from;
Assert((uintptr_t)out - (uintptr_t)from >= len,
"invalid restrict in chunkcopy_core_safe");
Assert((uintptr_t)from - (uintptr_t)out >= len,
"invalid restrict in chunkcopy_core_safe");
if (len & 8) {
Z_BUILTIN_MEMCPY(out, rfrom, 8);
out += 8;
rfrom += 8;
}
if (len & 4) {
Z_BUILTIN_MEMCPY(out, rfrom, 4);
out += 4;
rfrom += 4;
}
if (len & 2) {
Z_BUILTIN_MEMCPY(out, rfrom, 2);
out += 2;
rfrom += 2;
}
if (len & 1) {
*out++ = *rfrom++;
}
return out;
}
return chunkcopy_core(out, from, len);
}
/*
* Perform short copies until distance can be rewritten as being at least
* CHUNKCOPY_CHUNK_SIZE.
*
* Assumes it's OK to overwrite at least the first 2*CHUNKCOPY_CHUNK_SIZE
* bytes of output even if the copy is shorter than this. This assumption
* holds within zlib inflate_fast(), which starts every iteration with at
* least 258 bytes of output space available (258 being the maximum length
* output from a single token; see inffast.c).
*/
static inline unsigned char FAR* chunkunroll_relaxed(
unsigned char FAR* out,
unsigned FAR* dist,
unsigned FAR* len) Z_DISABLE_MSAN {
const unsigned char FAR* from = out - *dist;
while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) {
storechunk(out, loadchunk(from));
out += *dist;
*len -= *dist;
*dist += *dist;
}
return out;
}
#if defined(INFLATE_CHUNK_SIMD_NEON)
/*
* v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
* every 64-bit component of the 128-bit result (64-bit int splat).
*/
static inline z_vec128i_t v_load64_dup(const void* src) {
return vcombine_u8(vld1_u8(src), vld1_u8(src));
}
/*
* v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
* every 32-bit component of the 128-bit result (32-bit int splat).
*/
static inline z_vec128i_t v_load32_dup(const void* src) {
int32_t i32;
Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
return vreinterpretq_u8_s32(vdupq_n_s32(i32));
}
/*
* v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
* every 16-bit component of the 128-bit result (16-bit int splat).
*/
static inline z_vec128i_t v_load16_dup(const void* src) {
int16_t i16;
Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
return vreinterpretq_u8_s16(vdupq_n_s16(i16));
}
/*
* v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
* component of the 128-bit result (8-bit int splat).
*/
static inline z_vec128i_t v_load8_dup(const void* src) {
return vld1q_dup_u8((const uint8_t*)src);
}
/*
* v_store_128(): store the 128-bit vec in a memory destination (that might
* not be 16-byte aligned) void* out.
*/
static inline void v_store_128(void* out, const z_vec128i_t vec) {
vst1q_u8(out, vec);
}
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
/*
* v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
* every 64-bit component of the 128-bit result (64-bit int splat).
*/
static inline z_vec128i_t v_load64_dup(const void* src) {
int64_t i64;
Z_BUILTIN_MEMCPY(&i64, src, sizeof(i64));
return _mm_set1_epi64x(i64);
}
/*
* v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
* every 32-bit component of the 128-bit result (32-bit int splat).
*/
static inline z_vec128i_t v_load32_dup(const void* src) {
int32_t i32;
Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
return _mm_set1_epi32(i32);
}
/*
* v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
* every 16-bit component of the 128-bit result (16-bit int splat).
*/
static inline z_vec128i_t v_load16_dup(const void* src) {
int16_t i16;
Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
return _mm_set1_epi16(i16);
}
/*
* v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
* component of the 128-bit result (8-bit int splat).
*/
static inline z_vec128i_t v_load8_dup(const void* src) {
return _mm_set1_epi8(*(const char*)src);
}
/*
* v_store_128(): store the 128-bit vec in a memory destination (that might
* not be 16-byte aligned) void* out.
*/
static inline void v_store_128(void* out, const z_vec128i_t vec) {
_mm_storeu_si128((__m128i*)out, vec);
}
#endif
/*
* Perform an overlapping copy which behaves as a memset() operation, but
* supporting periods other than one, and assume that length is non-zero and
* that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output
* even if the length is shorter than this.
*/
static inline unsigned char FAR* chunkset_core(
unsigned char FAR* out,
unsigned period,
unsigned len) {
z_vec128i_t v;
const int bump = ((len - 1) % sizeof(v)) + 1;
switch (period) {
case 1:
v = v_load8_dup(out - 1);
v_store_128(out, v);
out += bump;
len -= bump;
while (len > 0) {
v_store_128(out, v);
out += sizeof(v);
len -= sizeof(v);
}
return out;
case 2:
v = v_load16_dup(out - 2);
v_store_128(out, v);
out += bump;
len -= bump;
if (len > 0) {
v = v_load16_dup(out - 2);
do {
v_store_128(out, v);
out += sizeof(v);
len -= sizeof(v);
} while (len > 0);
}
return out;
case 4:
v = v_load32_dup(out - 4);
v_store_128(out, v);
out += bump;
len -= bump;
if (len > 0) {
v = v_load32_dup(out - 4);
do {
v_store_128(out, v);
out += sizeof(v);
len -= sizeof(v);
} while (len > 0);
}
return out;
case 8:
v = v_load64_dup(out - 8);
v_store_128(out, v);
out += bump;
len -= bump;
if (len > 0) {
v = v_load64_dup(out - 8);
do {
v_store_128(out, v);
out += sizeof(v);
len -= sizeof(v);
} while (len > 0);
}
return out;
}
out = chunkunroll_relaxed(out, &period, &len);
return chunkcopy_core(out, out - period, len);
}
/*
* Perform a memcpy-like operation, but assume that length is non-zero and that
* it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
* the length is shorter than this.
*
* Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour
* of overlapping buffers, regardless of the distance between the pointers.
* This is reflected in the `restrict`-qualified pointers, allowing the
* compiler to re-order loads and stores.
*/
static inline unsigned char FAR* chunkcopy_relaxed(
unsigned char FAR* Z_RESTRICT out,
const unsigned char FAR* Z_RESTRICT from,
unsigned len) {
Assert((uintptr_t)out - (uintptr_t)from >= len,
"invalid restrict in chunkcopy_relaxed");
Assert((uintptr_t)from - (uintptr_t)out >= len,
"invalid restrict in chunkcopy_relaxed");
return chunkcopy_core(out, from, len);
}
/*
 * Like chunkcopy_relaxed(), but avoids writing beyond the end of the legal
 * output.
*
* Unlike chunkcopy_core_safe() above, no guarantee is made regarding the
* behaviour of overlapping buffers, regardless of the distance between the
* pointers. This is reflected in the `restrict`-qualified pointers, allowing
* the compiler to re-order loads and stores.
*
* Accepts an additional pointer to the end of safe output. A generic safe
* copy would use (out + len), but it's normally the case that the end of the
* output buffer is beyond the end of the current copy, and this can still be
* exploited.
*/
static inline unsigned char FAR* chunkcopy_safe(
unsigned char FAR* out,
const unsigned char FAR* Z_RESTRICT from,
unsigned len,
unsigned char FAR* limit) {
Assert(out + len <= limit, "chunk copy exceeds safety limit");
Assert((uintptr_t)out - (uintptr_t)from >= len,
"invalid restrict in chunkcopy_safe");
Assert((uintptr_t)from - (uintptr_t)out >= len,
"invalid restrict in chunkcopy_safe");
return chunkcopy_core_safe(out, from, len, limit);
}
/*
* Perform chunky copy within the same buffer, where the source and destination
* may potentially overlap.
*
* Assumes that len > 0 on entry, and that it's safe to write at least
* CHUNKCOPY_CHUNK_SIZE*3 bytes to the output.
*/
static inline unsigned char FAR* chunkcopy_lapped_relaxed(
unsigned char FAR* out,
unsigned dist,
unsigned len) {
if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) {
return chunkset_core(out, dist, len);
}
return chunkcopy_core(out, out - dist, len);
}
/*
 * Behaves like chunkcopy_lapped_relaxed(), but avoids writing beyond the end
 * of the legal output.
*
* Accepts an additional pointer to the end of safe output. A generic safe
* copy would use (out + len), but it's normally the case that the end of the
* output buffer is beyond the end of the current copy, and this can still be
* exploited.
*/
static inline unsigned char FAR* chunkcopy_lapped_safe(
unsigned char FAR* out,
unsigned dist,
unsigned len,
unsigned char FAR* limit) {
Assert(out + len <= limit, "chunk copy exceeds safety limit");
if ((limit - out) < (ptrdiff_t)(3 * CHUNKCOPY_CHUNK_SIZE)) {
/* TODO(cavalcantii): try harder to optimise this */
while (len-- > 0) {
*out = *(out - dist);
out++;
}
return out;
}
return chunkcopy_lapped_relaxed(out, dist, len);
}
/* TODO(cavalcanti): see crbug.com/1110083. */
static inline unsigned char FAR* chunkcopy_safe_ugly(unsigned char FAR* out,
unsigned dist,
unsigned len,
unsigned char FAR* limit) {
#if defined(__GNUC__) && !defined(__clang__)
/* Speed is the same as using chunkcopy_safe
w/ GCC on ARM (tested gcc 6.3 and 7.5) and avoids
undefined behavior.
*/
return chunkcopy_core_safe(out, out - dist, len, limit);
#elif defined(__clang__) && defined(ARMV8_OS_ANDROID) && !defined(__aarch64__)
/* Seems to perform better on 32bit (i.e. Android). */
return chunkcopy_core_safe(out, out - dist, len, limit);
#else
/* Seems to perform better on 64bit. */
return chunkcopy_lapped_safe(out, dist, len, limit);
#endif
}
/*
* The chunk-copy code above deals with writing the decoded DEFLATE data to
* the output with SIMD methods to increase decode speed. Reading the input
* to the DEFLATE decoder with a wide, SIMD method can also increase decode
* speed. This option is supported on little endian machines, and reads the
* input data in 64-bit (8 byte) chunks.
*/
#ifdef INFLATE_CHUNK_READ_64LE
/*
* Buffer the input in a uint64_t (8 bytes) in the wide input reading case.
*/
typedef uint64_t inflate_holder_t;
/*
* Ask the compiler to perform a wide, unaligned load of a uint64_t using a
* machine instruction appropriate for the uint64_t type.
*/
static inline inflate_holder_t read64le(const unsigned char FAR *in) {
inflate_holder_t input;
Z_BUILTIN_MEMCPY(&input, in, sizeof(input));
return input;
}
#else
/*
* Otherwise, buffer the input bits using zlib's default input buffer type.
*/
typedef unsigned long inflate_holder_t;
#endif /* INFLATE_CHUNK_READ_64LE */
#undef Z_STATIC_ASSERT
#undef Z_RESTRICT
#undef Z_BUILTIN_MEMCPY
#undef Z_DISABLE_MSAN
#endif /* CHUNKCOPY_H */
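Everything in chunkcopy.inc trades safety margins for speed: the relaxed copies may store up to CHUNKCOPY_CHUNK_SIZE bytes (3x that for chunkset_core()) past the requested length, which is sound inside inflate_fast() because every iteration starts with at least 258 writable output bytes. A hedged sketch of what any caller outside inflate would have to guarantee (the helper name is hypothetical):

#include <stdlib.h>

#define CHUNK 16  /* CHUNKCOPY_CHUNK_SIZE: one 128-bit vector */

/* Over-allocate so wide stores past `payload` bytes stay inside the
   allocation, mirroring the slack inflate_fast() gets for free. */
unsigned char *alloc_with_chunk_slack(size_t payload) {
  return malloc(payload + 3 * CHUNK);
}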


@@ -8,11 +8,6 @@
#include "third_party/zlib/internal.h"
#include "third_party/zlib/macros.internal.h"
#include "third_party/zlib/zlib.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* @(#) $Id$ */

third_party/zlib/cpu_features.c (new file, 37 lines)

@@ -0,0 +1,37 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2023 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/dce.h"
#include "libc/runtime/runtime.h"
#include "libc/sysv/consts/auxv.h"
#include "libc/sysv/consts/hwap.h"
#include "third_party/zlib/cpu_features.internal.h"
#include "third_party/zlib/zlib.h"
int arm_cpu_enable_crc32;
int arm_cpu_enable_pmull;
void(cpu_check_features)(void) {
#if defined(__aarch64__) && defined(__ARM_NEON)
if (IsLinux()) {
unsigned long features = getauxval(AT_HWCAP);
arm_cpu_enable_crc32 = !!(features & HWCAP_CRC32);
arm_cpu_enable_pmull = !!(features & HWCAP_PMULL);
}
#endif
}
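Nothing calls this probe automatically; the zlib convention of seeding with adler32(0, Z_NULL, 0) or crc32(0, NULL, 0) is what normally reaches it. A program that bypasses that convention would need something like the following (the hook name is hypothetical; on x86 cpu_check_features() is a no-op macro):

#include "third_party/zlib/cpu_features.internal.h"

void my_zlib_setup(void) {
  cpu_check_features();  /* on aarch64, fills arm_cpu_enable_{crc32,pmull} */
}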

third_party/zlib/cpu_features.internal.h (new file, 24 lines)

@@ -0,0 +1,24 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_ZLIB_CPU_FEATURES_H_
#define COSMOPOLITAN_THIRD_PARTY_ZLIB_CPU_FEATURES_H_
#include "libc/nexgen32e/x86feature.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#ifdef __x86_64__
#define x86_cpu_enable_sse2 X86_HAVE(SSE2)
#define x86_cpu_enable_ssse3 X86_HAVE(SSSE3)
#define x86_cpu_enable_simd (X86_HAVE(SSE4_2) && X86_HAVE(PCLMUL))
#define x86_cpu_enable_avx512 X86_HAVE(AVX512F)
#define cpu_check_features() (void)0
#elif defined(__aarch64__)
#define cpu_check_features zlib_cpu_check_features
_Hide extern int arm_cpu_enable_crc32;
_Hide extern int arm_cpu_enable_pmull;
_Hide void cpu_check_features(void);
#endif
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_ZLIB_CPU_FEATURES_H_ */
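On x86 this header makes the feature "variables" compile down to X86_HAVE() bit tests against feature words that cosmopolitan fills at startup, so no init call is needed there; only aarch64 carries mutable state. A small sketch of querying it portably (the function name is hypothetical):

#include "third_party/zlib/cpu_features.internal.h"

int zlib_can_fold_crc(void) {
#ifdef __x86_64__
  return x86_cpu_enable_simd;   /* SSE4.2 + PCLMUL, tested via CPUID bits */
#elif defined(__aarch64__)
  return arm_cpu_enable_pmull;  /* valid only after cpu_check_features() */
#else
  return 0;
#endif
}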

third_party/zlib/crc32.c

@@ -13,11 +13,6 @@
#include "third_party/zlib/deflate.internal.h"
#include "third_party/zlib/internal.h"
#include "third_party/zlib/macros.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* @(#) $Id$ */
@@ -33,8 +28,19 @@ asm(".include \"libc/disclaimer.inc\"");
produced, so that this one source file can be compiled to an executable.
*/
#ifdef MAKECRCH
//# include <stdio.h>
# ifndef DYNAMIC_CRC_TABLE
# define DYNAMIC_CRC_TABLE
# endif /* !DYNAMIC_CRC_TABLE */
#endif /* MAKECRCH */
#include "third_party/zlib/deflate.internal.h"
#include "third_party/zlib/cpu_features.internal.h"
#include "third_party/zlib/zutil.internal.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
# include "crc32_simd.h"
#include "third_party/zlib/crc32_simd.internal.h"
#endif
/*
@@ -106,16 +112,25 @@ asm(".include \"libc/disclaimer.inc\"");
# endif
#endif
/* Local functions. */
local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
/* If available, use the ARM processor CRC32 instruction. */
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8 \
&& defined(USE_CANONICAL_ARMV8_CRC32)
# define ARMCRC32_CANONICAL_ZLIB
#endif
/* Local functions. */
local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
#if defined(W) && (!defined(ARMCRC32_CANONICAL_ZLIB) || defined(DYNAMIC_CRC_TABLE))
local z_word_t byte_swap OF((z_word_t word));
#endif
#if defined(W) && !defined(ARMCRC32_CANONICAL_ZLIB)
local z_crc_t crc_word OF((z_word_t data));
local z_word_t crc_word_big OF((z_word_t data));
#endif
#if defined(W) && (!defined(ARMCRC32_CANONICAL_ZLIB) || defined(DYNAMIC_CRC_TABLE))
/*
Swap the bytes in a z_word_t to convert between little and big endian. Any
@@ -149,7 +164,6 @@ local z_word_t byte_swap(word)
/* CRC polynomial. */
#define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */
#define DYNAMIC_CRC_TABLE
#ifdef DYNAMIC_CRC_TABLE
local z_crc_t FAR crc_table[256];
@@ -534,7 +548,7 @@ local void braid(ltl, big, n, w)
* Tables for byte-wise and braided CRC-32 calculations, and a table of powers
* of x for combining CRC-32s, all made by make_crc_table().
*/
# include "crc32.h"
#include "third_party/zlib/crc32.inc"
#endif /* DYNAMIC_CRC_TABLE */
/* ========================================================================
@@ -617,15 +631,15 @@ const z_crc_t FAR * ZEXPORT get_crc_table()
#define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 */
#define Z_BATCH_MIN 800 /* fewest words in a final batch */
#error this is arm?
unsigned long ZEXPORT crc32_z(crc, buf, len)
unsigned long crc;
const unsigned char FAR *buf;
z_size_t len;
uint32_t ZEXPORT crc32_z(crc, buf_, len)
uint32_t crc;
const void FAR *buf_;
size_t len;
{
z_crc_t val;
z_word_t crc1, crc2;
const z_word_t *word;
const unsigned char FAR *buf = buf_;
z_word_t val0, val1, val2;
z_size_t last, last2, i;
z_size_t num;
@@ -743,13 +757,13 @@ local z_word_t crc_word_big(data)
#endif
#if 0 /* [jart] favor LIBC_STR crc32() */
/* ========================================================================= */
unsigned long ZEXPORT crc32_z(crc, buf, len)
unsigned long crc;
const unsigned char FAR *buf;
z_size_t len;
uint32_t ZEXPORT crc32_z(crc, buf_, len)
uint32_t crc;
const void FAR *buf_;
size_t len;
{
const unsigned char FAR *buf = buf_;
/*
* zlib convention is to call crc32(0, NULL, 0); before making
* calls to crc32(). So this is a good, early (and infrequent)
@@ -767,7 +781,19 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
}
#endif
#if defined(CRC32_SIMD_SSE42_PCLMUL)
#if defined(CRC32_SIMD_AVX512_PCLMUL)
if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) {
/* crc32 64-byte chunks */
z_size_t chunk_size = len & ~Z_CRC32_AVX512_CHUNKSIZE_MASK;
crc = ~crc32_avx512_simd_(buf, chunk_size, ~(uint32_t)crc);
/* check remaining data */
len -= chunk_size;
if (!len)
return crc;
/* Fall into the default crc32 for the remaining data. */
buf += chunk_size;
}
#elif defined(CRC32_SIMD_SSE42_PCLMUL)
if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
/* crc32 16-byte chunks */
z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
@@ -1114,11 +1140,9 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
/* Return the CRC, post-conditioned. */
return crc ^ 0xffffffff;
}
#endif
#endif
#if 0 /* [jart] favor LIBC_STR crc32() */
/* ========================================================================= */
unsigned long ZEXPORT crc32(crc, buf, len)
unsigned long crc;
@@ -1162,7 +1186,6 @@ unsigned long ZEXPORT crc32(crc, buf, len)
#endif
return crc32_z(crc, buf, len); /* Armv7 or Armv8 w/o crypto extensions. */
}
#endif
/* ========================================================================= */
uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
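crc32_combine64(), whose old-style definition opens above, is the consumer of the multmodp()/x2nmodp() polynomial helpers declared earlier: it splices the CRCs of two buffers into the CRC of their concatenation without touching the data again. The documented call pattern looks like this (part1/part2 are hypothetical halves of one stream):

#include "third_party/zlib/zlib.h"

uLong crc_of_whole(const Bytef *part1, uInt len1,
                   const Bytef *part2, uInt len2) {
  uLong c1 = crc32(crc32(0L, Z_NULL, 0), part1, len1);
  uLong c2 = crc32(crc32(0L, Z_NULL, 0), part2, len2);
  return crc32_combine64(c1, c2, len2);  /* == crc32 of part1 || part2 */
}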

third_party/zlib/crc32.inc (new vendored file, 9448 lines; diff suppressed because it is too large)

third_party/zlib/crc32_simd.c (new vendored file, 626 lines)

@@ -0,0 +1,626 @@
/* crc32_simd.c
*
* Copyright 2017 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
asm(".ident\t\"\\n\\n\
Chromium (BSD-3 License)\\n\
Copyright 2017 The Chromium Authors\"");
// clang-format off
#include "third_party/zlib/crc32_simd.internal.h"
#if defined(CRC32_SIMD_AVX512_PCLMUL)
/*
* crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 256, and a multiple of 64. Based on:
*
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
*/
#include "third_party/intel/emmintrin.internal.h"
#include "third_party/intel/smmintrin.internal.h"
#include "third_party/intel/wmmintrin.internal.h"
#include "third_party/intel/immintrin.internal.h"
uint32_t ZLIB_INTERNAL crc32_avx512_simd_( /* AVX512+PCLMUL */
const unsigned char *buf,
z_size_t len,
uint32_t crc)
{
/*
* Definitions of the bit-reflected domain constants k1,k2,k3,k4
* are similar to those given at the end of the paper, and remaining
* constants and CRC32+Barrett polynomials remain unchanged.
*
* Replace the index of x from 128 to 512. As follows:
* k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a
* k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430
* k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4
* k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596
*/
static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430 };
static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596 };
static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e };
static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 };
static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
__m128i a0, a1, a2, a3;
/*
* There's at least one block of 256.
*/
x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
x0 = _mm512_load_si512((__m512i *)k1k2);
buf += 256;
len -= 256;
/*
* Parallel fold blocks of 256, if any.
*/
while (len >= 256)
{
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, x5);
x2 = _mm512_xor_si512(x2, x6);
x3 = _mm512_xor_si512(x3, x7);
x4 = _mm512_xor_si512(x4, x8);
x1 = _mm512_xor_si512(x1, y5);
x2 = _mm512_xor_si512(x2, y6);
x3 = _mm512_xor_si512(x3, y7);
x4 = _mm512_xor_si512(x4, y8);
buf += 256;
len -= 256;
}
/*
* Fold into 512-bits.
*/
x0 = _mm512_load_si512((__m512i *)k3k4);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x3);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x4);
x1 = _mm512_xor_si512(x1, x5);
/*
* Single fold blocks of 64, if any.
*/
while (len >= 64)
{
x2 = _mm512_loadu_si512((__m512i *)buf);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
buf += 64;
len -= 64;
}
/*
* Fold 512-bits to 384-bits.
*/
a0 = _mm_load_si128((__m128i *)k5k6);
a1 = _mm512_extracti32x4_epi32(x1, 0);
a2 = _mm512_extracti32x4_epi32(x1, 1);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
/*
* Fold 384-bits to 256-bits.
*/
a2 = _mm512_extracti32x4_epi32(x1, 2);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
/*
* Fold 256-bits to 128-bits.
*/
a2 = _mm512_extracti32x4_epi32(x1, 3);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
/*
* Fold 128-bits to 64-bits.
*/
a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
a3 = _mm_setr_epi32(~0, 0, ~0, 0);
a1 = _mm_srli_si128(a1, 8);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_loadl_epi64((__m128i*)k7k8);
a2 = _mm_srli_si128(a1, 4);
a1 = _mm_and_si128(a1, a3);
a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
/*
 * Barrett reduce to 32-bits.
*/
a0 = _mm_load_si128((__m128i*)poly);
a2 = _mm_and_si128(a1, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
a2 = _mm_and_si128(a2, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
/*
* Return the crc32.
*/
return _mm_extract_epi32(a1, 1);
}
#elif defined(CRC32_SIMD_SSE42_PCLMUL)
/*
* crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 64, and a multiple of 16.
*/
#include "third_party/intel/emmintrin.internal.h"
#include "third_party/intel/smmintrin.internal.h"
#include "third_party/intel/wmmintrin.internal.h"
uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */
const unsigned char *buf,
z_size_t len,
uint32_t crc)
{
/*
* Definitions of the bit-reflected domain constants k1,k2,k3, etc and
* the CRC32+Barrett polynomials given at the end of the paper.
*/
static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
/*
* There's at least one block of 64.
*/
x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
x0 = _mm_load_si128((__m128i *)k1k2);
buf += 64;
len -= 64;
/*
* Parallel fold blocks of 64, if any.
*/
while (len >= 64)
{
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_xor_si128(x4, x8);
x1 = _mm_xor_si128(x1, y5);
x2 = _mm_xor_si128(x2, y6);
x3 = _mm_xor_si128(x3, y7);
x4 = _mm_xor_si128(x4, y8);
buf += 64;
len -= 64;
}
/*
* Fold into 128-bits.
*/
x0 = _mm_load_si128((__m128i *)k3k4);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x3);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x4);
x1 = _mm_xor_si128(x1, x5);
/*
* Single fold blocks of 16, if any.
*/
while (len >= 16)
{
x2 = _mm_loadu_si128((__m128i *)buf);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
buf += 16;
len -= 16;
}
/*
* Fold 128-bits to 64-bits.
*/
x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
x3 = _mm_setr_epi32(~0, 0, ~0, 0);
x1 = _mm_srli_si128(x1, 8);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_loadl_epi64((__m128i*)k5k0);
x2 = _mm_srli_si128(x1, 4);
x1 = _mm_and_si128(x1, x3);
x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
/*
 * Barrett reduce to 32-bits.
*/
x0 = _mm_load_si128((__m128i*)poly);
x2 = _mm_and_si128(x1, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
x2 = _mm_and_si128(x2, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
/*
* Return the crc32.
*/
return _mm_extract_epi32(x1, 1);
}
#elif defined(CRC32_ARMV8_CRC32)
/* CRC32 checksums using ARMv8-a crypto instructions.
*/
#if defined(__clang__)
/* We need some extra types for using PMULL.
*/
#if defined(__aarch64__)
#include "third_party/aarch64/arm_neon.h"
#include "third_party/aarch64/arm_acle.h"
#endif
/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
* armv8 target, which is incompatible with ThinLTO optimizations on Android.
* (Namely, mixing and matching different module-level targets makes ThinLTO
* warn, and Android defaults to armv7-a. This restriction does not apply to
* function-level `target`s, however.)
*
* Since we only need four crc intrinsics, and since clang's implementation of
* those are just wrappers around compiler builtins, it's simplest to #define
* those builtins directly. If this #define list grows too much (or we depend on
* an intrinsic that isn't a trivial wrapper), we may have to find a better way
* to go about this.
*
* NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
* feature for this target (ignoring feature)." This appears to be a harmless
* bug in clang.
*
* These definitions must appear *after* including arm_acle.h otherwise that
* header may end up defining functions named __builtin_arm_crc32* that call
* themselves, creating an infinite loop when the intrinsic is called.
*/
/* XXX: Cannot hook into builtins with XCode for arm64. */
#if !defined(ARMV8_OS_MACOS)
#define __crc32b __builtin_arm_crc32b
#define __crc32d __builtin_arm_crc32d
#define __crc32w __builtin_arm_crc32w
#define __crc32cw __builtin_arm_crc32cw
#endif
#if defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("aes,crc")))
#else // !defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
#endif // defined(__aarch64__)
#elif defined(__GNUC__)
/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
* allowed. We can just include arm_acle.h.
*/
#include "third_party/aarch64/arm_neon.h"
#include "third_party/aarch64/arm_acle.h"
#define TARGET_ARMV8_WITH_CRC
#else // !defined(__GNUC__) && !defined(_aarch64__)
#error ARM CRC32 SIMD extensions only supported for Clang and GCC
#endif
TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_little(
const unsigned char *buf,
z_size_t len,
uint32_t crc)
{
uint32_t c = (uint32_t) ~crc;
while (len && ((uintptr_t)buf & 7)) {
c = __crc32b(c, *buf++);
--len;
}
const uint64_t *buf8 = (const uint64_t *)buf;
while (len >= 64) {
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
len -= 64;
}
while (len >= 8) {
c = __crc32d(c, *buf8++);
len -= 8;
}
buf = (const unsigned char *)buf8;
while (len--) {
c = __crc32b(c, *buf++);
}
return ~c;
}
#if defined(__aarch64__) || defined(ARMV8_OS_MACOS) /* aarch64 specific code. */
/*
* crc32_pmull_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 64, and a multiple of 16. Based on:
*
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
*/
TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_lo(const uint64x2_t a, const uint64x2_t b)
{
uint8x16_t r;
__asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
: "=w" (r) : "w" (a), "w" (b) );
return r;
}
TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_01(const uint64x2_t a, const uint64x2_t b)
{
uint8x16_t r;
__asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
: "=w" (r) : "w" (a), "w" (vgetq_lane_u64(b, 1)) );
return r;
}
TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_hi(const uint64x2_t a, const uint64x2_t b)
{
uint8x16_t r;
__asm__ __volatile__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
: "=w" (r) : "w" (a), "w" (b) );
return r;
}
TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(
const unsigned char *buf,
z_size_t len,
uint32_t crc)
{
/*
* Definitions of the bit-reflected domain constants k1,k2,k3, etc and
* the CRC32+Barrett polynomials given at the end of the paper.
*/
static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
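/*
* One fold step below computes, in GF(2) polynomial arithmetic,
*
*     x' = (x_lo * k_lo) xor (x_hi * k_hi) xor (next 16 bytes of input)
*
* pmull_lo() produces the low-half product and pmull_hi() the high-half
* product; the veorq_u64() pairs after each load perform the two xors.
*/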
uint64x2_t x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
/*
* There's at least one block of 64.
*/
x1 = vld1q_u64((const uint64_t *)(buf + 0x00));
x2 = vld1q_u64((const uint64_t *)(buf + 0x10));
x3 = vld1q_u64((const uint64_t *)(buf + 0x20));
x4 = vld1q_u64((const uint64_t *)(buf + 0x30));
x1 = veorq_u64(x1, (uint64x2_t) vsetq_lane_u32(crc, vdupq_n_u32(0), 0));
x0 = vld1q_u64(k1k2);
buf += 64;
len -= 64;
/*
* Parallel fold blocks of 64, if any.
*/
while (len >= 64)
{
x5 = (uint64x2_t) pmull_lo(x1, x0);
x6 = (uint64x2_t) pmull_lo(x2, x0);
x7 = (uint64x2_t) pmull_lo(x3, x0);
x8 = (uint64x2_t) pmull_lo(x4, x0);
y5 = vld1q_u64((const uint64_t *)(buf + 0x00));
y6 = vld1q_u64((const uint64_t *)(buf + 0x10));
y7 = vld1q_u64((const uint64_t *)(buf + 0x20));
y8 = vld1q_u64((const uint64_t *)(buf + 0x30));
x1 = (uint64x2_t) pmull_hi(x1, x0);
x2 = (uint64x2_t) pmull_hi(x2, x0);
x3 = (uint64x2_t) pmull_hi(x3, x0);
x4 = (uint64x2_t) pmull_hi(x4, x0);
x1 = veorq_u64(x1, x5);
x2 = veorq_u64(x2, x6);
x3 = veorq_u64(x3, x7);
x4 = veorq_u64(x4, x8);
x1 = veorq_u64(x1, y5);
x2 = veorq_u64(x2, y6);
x3 = veorq_u64(x3, y7);
x4 = veorq_u64(x4, y8);
buf += 64;
len -= 64;
}
/*
* Fold into 128-bits.
*/
x0 = vld1q_u64(k3k4);
x5 = (uint64x2_t) pmull_lo(x1, x0);
x1 = (uint64x2_t) pmull_hi(x1, x0);
x1 = veorq_u64(x1, x2);
x1 = veorq_u64(x1, x5);
x5 = (uint64x2_t) pmull_lo(x1, x0);
x1 = (uint64x2_t) pmull_hi(x1, x0);
x1 = veorq_u64(x1, x3);
x1 = veorq_u64(x1, x5);
x5 = (uint64x2_t) pmull_lo(x1, x0);
x1 = (uint64x2_t) pmull_hi(x1, x0);
x1 = veorq_u64(x1, x4);
x1 = veorq_u64(x1, x5);
/*
* Single fold blocks of 16, if any.
*/
while (len >= 16)
{
x2 = vld1q_u64((const uint64_t *)buf);
x5 = (uint64x2_t) pmull_lo(x1, x0);
x1 = (uint64x2_t) pmull_hi(x1, x0);
x1 = veorq_u64(x1, x2);
x1 = veorq_u64(x1, x5);
buf += 16;
len -= 16;
}
/*
* Fold 128-bits to 64-bits.
*/
static uint32_t zalign(16) mask[] = { ~0u, 0u, ~0u, 0u };
x2 = (uint64x2_t) pmull_01(x1, x0);
x1 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 8);
x3 = (uint64x2_t) vld1q_u32(mask);
x1 = veorq_u64(x1, x2);
x0 = vld1q_u64(k5k0);
x2 = (uint64x2_t) pmull_01(x2, x0);
x2 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 4);
x1 = vandq_u64(x1, x3);
x1 = (uint64x2_t) pmull_lo(x1, x0);
x1 = veorq_u64(x1, x2);
/*
* Barrett reduce to 32 bits.
*/
x0 = vld1q_u64(poly);
x2 = vandq_u64(x1, x3);
x2 = (uint64x2_t) pmull_01(x2, x0);
x2 = vandq_u64(x2, x3);
x2 = (uint64x2_t) pmull_lo(x2, x0);
x1 = veorq_u64(x1, x2);
/*
* Return the crc32.
*/
return vgetq_lane_u32(vreinterpretq_u32_u64(x1), 1);
}
#endif /* aarch64 specific code. */
#endif

57
third_party/zlib/crc32_simd.internal.h vendored Executable file
View file

@@ -0,0 +1,57 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_ZLIB_CRC32_SIMD_INTERNAL_H_
#define COSMOPOLITAN_THIRD_PARTY_ZLIB_CRC32_SIMD_INTERNAL_H_
#include "third_party/zlib/deflate.internal.h"
#include "third_party/zlib/zconf.h"
#include "third_party/zlib/zutil.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
// clang-format off
/*
* crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 64, and a multiple of 16.
*/
uint32_t ZLIB_INTERNAL crc32_sse42_simd_(const unsigned char* buf,
z_size_t len,
uint32_t crc);
uint32_t ZLIB_INTERNAL crc32_avx512_simd_(const unsigned char* buf,
z_size_t len,
uint32_t crc);
/*
* crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c
* for computing the crc32 of an arbitrary length buffer.
*/
#define Z_CRC32_SSE42_MINIMUM_LENGTH 64
#define Z_CRC32_SSE42_CHUNKSIZE_MASK 15
#define Z_CRC32_AVX512_MINIMUM_LENGTH 256
#define Z_CRC32_AVX512_CHUNKSIZE_MASK 63
/*
* CRC32 checksums using ARMv8-a crypto instructions.
*/
uint32_t ZLIB_INTERNAL armv8_crc32_little(const unsigned char* buf,
z_size_t len,
uint32_t crc);
/* aarch64 specific code. */
#if defined(__aarch64__)
/* 128 is the sweet spot at the time of coding (late 2020). */
#define Z_CRC32_PMULL_MINIMUM_LENGTH 128
#define Z_CRC32_PMULL_CHUNKSIZE_MASK 15
/*
* CRC32 checksums using ARMv8-a PMULL instructions, where the buffer
* length must be at least 64, and a multiple of 16.
*/
uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(const unsigned char* buf,
z_size_t len,
uint32_t crc);
#endif
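/*
* Rough shape of the dispatch in crc32.c (an illustrative sketch only;
* the checked-in logic in crc32.c is authoritative): the PMULL kernel
* folds the large 16-byte-multiple chunk, then the hardware CRC
* instructions finish the tail.
*
*   if (len >= Z_CRC32_PMULL_MINIMUM_LENGTH) {
*       z_size_t chunk = len & ~(z_size_t)Z_CRC32_PMULL_CHUNKSIZE_MASK;
*       crc = ~armv8_crc32_pmull_little(buf, chunk, ~crc);
*       buf += chunk;
*       len -= chunk;
*   }
*   crc = armv8_crc32_little(buf, len, crc);
*/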
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_ZLIB_CRC32_SIMD_INTERNAL_H_ */

503
third_party/zlib/crc_folding.c vendored Normal file
View file

@@ -0,0 +1,503 @@
#include "libc/fmt/conv.h"
#include "libc/inttypes.h"
#include "libc/limits.h"
#include "libc/literal.h"
#include "libc/str/str.h"
#include "third_party/intel/emmintrin.internal.h"
#include "third_party/intel/immintrin.internal.h"
#include "third_party/intel/wmmintrin.internal.h"
#include "third_party/zlib/deflate.internal.h"
// clang-format off
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#ifdef CRC32_SIMD_SSE42_PCLMUL
#define CRC_LOAD(s) \
do { \
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);\
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);\
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);\
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);\
__m128i xmm_crc_part = _mm_loadu_si128((__m128i *)s->crc0 + 4);
#define CRC_SAVE(s) \
_mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);\
_mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);\
_mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);\
_mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);\
_mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);\
} while (0);
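/*
* Pairing note: CRC_LOAD opens a block and materializes the five fold
* registers from s->crc0[]; CRC_SAVE spills them back and closes that
* block, so every use must be a matched pair within one function:
*
*   CRC_LOAD(s)
*   ...work on xmm_crc0..xmm_crc3 and xmm_crc_part...
*   CRC_SAVE(s)
*/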
ZLIB_INTERNAL void crc_fold_init(deflate_state *const s)
{
CRC_LOAD(s)
xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
xmm_crc1 = _mm_setzero_si128();
xmm_crc2 = _mm_setzero_si128();
xmm_crc3 = _mm_setzero_si128();
CRC_SAVE(s)
s->strm->adler = 0;
}
local void fold_1(deflate_state *const s,
__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3)
{
const __m128i xmm_fold4 = _mm_set_epi32(
0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3;
__m128 ps_crc0, ps_crc3, ps_res;
x_tmp3 = *xmm_crc3;
*xmm_crc3 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
*xmm_crc0 = *xmm_crc1;
*xmm_crc1 = *xmm_crc2;
*xmm_crc2 = x_tmp3;
*xmm_crc3 = _mm_castps_si128(ps_res);
}
local void fold_2(deflate_state *const s,
__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3)
{
const __m128i xmm_fold4 = _mm_set_epi32(
0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3, x_tmp2;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
x_tmp3 = *xmm_crc3;
x_tmp2 = *xmm_crc2;
*xmm_crc3 = *xmm_crc1;
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_res31= _mm_xor_ps(ps_crc3, ps_crc1);
*xmm_crc2 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_res20= _mm_xor_ps(ps_crc0, ps_crc2);
*xmm_crc0 = x_tmp2;
*xmm_crc1 = x_tmp3;
*xmm_crc2 = _mm_castps_si128(ps_res20);
*xmm_crc3 = _mm_castps_si128(ps_res31);
}
local void fold_3(deflate_state *const s,
__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3)
{
const __m128i xmm_fold4 = _mm_set_epi32(
0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp3;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
x_tmp3 = *xmm_crc3;
*xmm_crc3 = *xmm_crc2;
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
*xmm_crc2 = *xmm_crc1;
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_res21= _mm_xor_ps(ps_crc1, ps_crc2);
*xmm_crc1 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_res10= _mm_xor_ps(ps_crc0, ps_crc1);
*xmm_crc0 = x_tmp3;
*xmm_crc1 = _mm_castps_si128(ps_res10);
*xmm_crc2 = _mm_castps_si128(ps_res21);
*xmm_crc3 = _mm_castps_si128(ps_res32);
}
local void fold_4(deflate_state *const s,
__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3)
{
const __m128i xmm_fold4 = _mm_set_epi32(
0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
__m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
__m128 ps_t0, ps_t1, ps_t2, ps_t3;
__m128 ps_res0, ps_res1, ps_res2, ps_res3;
x_tmp0 = *xmm_crc0;
x_tmp1 = *xmm_crc1;
x_tmp2 = *xmm_crc2;
x_tmp3 = *xmm_crc3;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_t0 = _mm_castsi128_ps(x_tmp0);
ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_t1 = _mm_castsi128_ps(x_tmp1);
ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_t2 = _mm_castsi128_ps(x_tmp2);
ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_t3 = _mm_castsi128_ps(x_tmp3);
ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
*xmm_crc0 = _mm_castps_si128(ps_res0);
*xmm_crc1 = _mm_castps_si128(ps_res1);
*xmm_crc2 = _mm_castps_si128(ps_res2);
*xmm_crc3 = _mm_castps_si128(ps_res3);
}
local const unsigned zalign(32) pshufb_shf_table[60] = {
0x84838281,0x88878685,0x8c8b8a89,0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
0x85848382,0x89888786,0x8d8c8b8a,0x01008f8e, /* shl 14 (16 - 2)/shr2 */
0x86858483,0x8a898887,0x8e8d8c8b,0x0201008f, /* shl 13 (16 - 3)/shr3 */
0x87868584,0x8b8a8988,0x8f8e8d8c,0x03020100, /* shl 12 (16 - 4)/shr4 */
0x88878685,0x8c8b8a89,0x008f8e8d,0x04030201, /* shl 11 (16 - 5)/shr5 */
0x89888786,0x8d8c8b8a,0x01008f8e,0x05040302, /* shl 10 (16 - 6)/shr6 */
0x8a898887,0x8e8d8c8b,0x0201008f,0x06050403, /* shl 9 (16 - 7)/shr7 */
0x8b8a8988,0x8f8e8d8c,0x03020100,0x07060504, /* shl 8 (16 - 8)/shr8 */
0x8c8b8a89,0x008f8e8d,0x04030201,0x08070605, /* shl 7 (16 - 9)/shr9 */
0x8d8c8b8a,0x01008f8e,0x05040302,0x09080706, /* shl 6 (16 -10)/shr10*/
0x8e8d8c8b,0x0201008f,0x06050403,0x0a090807, /* shl 5 (16 -11)/shr11*/
0x8f8e8d8c,0x03020100,0x07060504,0x0b0a0908, /* shl 4 (16 -12)/shr12*/
0x008f8e8d,0x04030201,0x08070605,0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
0x01008f8e,0x05040302,0x09080706,0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
0x0201008f,0x06050403,0x0a090807,0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
};
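/*
* How the table works: _mm_shuffle_epi8 writes zero to any destination
* byte whose control byte has its high bit set, so the 0x80..0x8f entries
* above act as "emit zero" slots. Each 16-byte row therefore encodes a
* byte left-shift, and XORing a row with 0x80808080 (as partial_fold does
* below) flips the high bits to yield the complementary right-shift
* control.
*/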
local void partial_fold(deflate_state *const s, const size_t len,
__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3,
__m128i *xmm_crc_part)
{
const __m128i xmm_fold4 = _mm_set_epi32(
0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
__m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
__m128i xmm_a0_0, xmm_a0_1;
__m128 ps_crc3, psa0_0, psa0_1, ps_res;
xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
xmm_shr = xmm_shl;
xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
*xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
*xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
*xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
*xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
*xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
*xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
*xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
*xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
*xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
psa0_0 = _mm_castsi128_ps(xmm_a0_0);
psa0_1 = _mm_castsi128_ps(xmm_a0_1);
ps_res = _mm_xor_ps(ps_crc3, psa0_0);
ps_res = _mm_xor_ps(ps_res, psa0_1);
*xmm_crc3 = _mm_castps_si128(ps_res);
}
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s,
unsigned char *dst, const unsigned char *src, long len)
{
unsigned long algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
CRC_LOAD(s)
if (len < 16) {
if (len == 0)
return;
goto partial;
}
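/* Fold away any misaligned prefix first, so the 64-byte loop below can
use aligned 16-byte loads on src. */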
algn_diff = (0 - (uintptr_t)src) & 0xF;
if (algn_diff) {
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
dst += algn_diff;
src += algn_diff;
len -= algn_diff;
partial_fold(s, algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
&xmm_crc_part);
}
while ((len -= 64) >= 0) {
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
xmm_t3 = _mm_load_si128((__m128i *)src + 3);
fold_4(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
_mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
src += 64;
dst += 64;
}
/*
* len = num bytes left - 64
*/
if (len + 16 >= 0) {
len += 16;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
fold_3(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
if (len == 0)
goto done;
dst += 48;
src += 48;
} else if (len + 32 >= 0) {
len += 32;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
fold_2(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
_mm_storeu_si128((__m128i *)dst, xmm_t0);
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
if (len == 0)
goto done;
dst += 32;
src += 32;
} else if (len + 48 >= 0) {
len += 48;
xmm_t0 = _mm_load_si128((__m128i *)src);
fold_1(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
_mm_storeu_si128((__m128i *)dst, xmm_t0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
if (len == 0)
goto done;
dst += 16;
src += 16;
} else {
len += 64;
if (len == 0)
goto done;
}
partial:
#if defined(_MSC_VER)
/* VS does not permit the use of _mm_set_epi64x in 32-bit builds */
{
int32_t parts[4] = {0, 0, 0, 0};
memcpy(&parts, src, len);
xmm_crc_part = _mm_set_epi32(parts[3], parts[2], parts[1], parts[0]);
}
#else
{
int64_t parts[2] = {0, 0};
memcpy(&parts, src, len);
xmm_crc_part = _mm_set_epi64x(parts[1], parts[0]);
}
#endif
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
partial_fold(s, len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
&xmm_crc_part);
done:
CRC_SAVE(s)
}
local const unsigned zalign(16) crc_k[] = {
0xccaa009e, 0x00000000, /* rk1 */
0x751997d0, 0x00000001, /* rk2 */
0xccaa009e, 0x00000000, /* rk5 */
0x63cd6124, 0x00000001, /* rk6 */
0xf7011640, 0x00000001, /* rk7 */
0xdb710640, 0x00000001 /* rk8 */
};
local const unsigned zalign(16) crc_mask[4] = {
0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
};
local const unsigned zalign(16) crc_mask2[4] = {
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s)
{
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
unsigned crc;
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);
/*
* k1
*/
crc_fold = _mm_load_si128((__m128i *)crc_k);
x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
/*
* k5
*/
crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
xmm_crc0 = xmm_crc3;
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
xmm_crc0 = xmm_crc3;
xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
/*
* k7
*/
xmm_crc1 = xmm_crc3;
xmm_crc2 = xmm_crc3;
crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
xmm_crc2 = xmm_crc3;
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
crc = _mm_extract_epi32(xmm_crc3, 2);
return ~crc;
}
#endif /* CRC32_SIMD_SSE42_PCLMUL */
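/*
* How these entry points fit together, sketched from the deflate
* integration (illustrative; the hypothetical read loop below is not
* part of this file):
*
*   crc_fold_init(s);                   // when the gzip stream starts
*   while (more input)
*       crc_fold_copy(s, dst, src, n);  // copy bytes and fold CRC at once
*   crc = crc_fold_512to32(s);          // reduce 512-bit state at the end
*/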

View file

@@ -5,13 +5,15 @@
* Copyright (C) 1995-2022 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "libc/assert.h"
#include "third_party/zlib/cpu_features.internal.h"
#include "third_party/zlib/deflate.internal.h"
#include "third_party/zlib/insert_string.internal.h"
#include "third_party/zlib/internal.h"
#include "third_party/zlib/zutil.internal.h"
asm(".ident\t\"\\n\\n\
deflate 1.2.12.1 (zlib License)\\n\
zlib 1.2.13 (zlib License)\\n\
Copyright 1995-2022 Jean-loup Gailly and Mark Adler\\n\
Invented 1990 Phillip Walter Katz\"");
// clang-format off
@@ -62,6 +64,12 @@ Invented 1990 Phillip Walter Katz\"");
/* @(#) $Id$ */
#if defined(DEFLATE_SLIDE_HASH_SSE2) || defined(DEFLATE_SLIDE_HASH_NEON)
#include "third_party/zlib/slide_hash_simd.inc"
#endif
#include "third_party/zlib/insert_string.inc"
#ifdef FASTEST
/* See http://crbug.com/1113596 */
#error "FASTEST is not supported in Chromium's zlib."
@@ -101,13 +109,7 @@ local void lm_init OF((deflate_state *s));
local void putShortMSB OF((deflate_state *s, uInt b));
local void flush_pending OF((z_streamp strm));
local unsigned read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
#ifdef ASMV
# pragma message("Assembler code may have bugs -- use at your own risk")
void match_init OF((void)); /* asm code initialization */
uInt longest_match OF((deflate_state *s, IPos cur_match));
#else
local uInt longest_match OF((deflate_state *s, IPos cur_match));
#endif
#ifdef ZLIB_DEBUG
local void check_match OF((deflate_state *s, IPos start, IPos match,
@@ -181,9 +183,9 @@ local const config configuration_table[10] = {
*/
#define CLEAR_HASH(s) \
do { \
s->head[s->hash_size-1] = NIL; \
s->head[s->hash_size - 1] = NIL; \
zmemzero((Bytef *)s->head, \
(unsigned)(s->hash_size-1)*sizeof(*s->head)); \
(unsigned)(s->hash_size - 1)*sizeof(*s->head)); \
} while (0)
/* ===========================================================================
@@ -245,6 +247,14 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
deflate_state *s;
int wrap = 1;
// Needed to activate optimized insert_string() that helps compression
// for all wrapper formats (e.g. RAW, ZLIB, GZIP).
// Feature detection is not triggered while using RAW mode (i.e. we never
// call crc32() with a NULL buffer).
#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL)
cpu_check_features();
#endif
if (strm == Z_NULL) return Z_STREAM_ERROR;
strm->msg = Z_NULL;
@@ -271,6 +281,8 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
if (windowBits < 0) { /* suppress zlib wrapper */
wrap = 0;
if (windowBits < -15)
return Z_STREAM_ERROR;
windowBits = -windowBits;
}
#ifdef GZIP
@@ -300,7 +312,7 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
s->chromium_zlib_hash = 0;
#if !defined(USE_ZLIB_RABIN_KARP_ROLLING_HASH)
#if defined(TARGET_CPU_WITH_CRC) && defined(CRC32_SIMD_SSE42_PCLMUL)
if (X86_HAVE(SSE4_2))
if (x86_cpu_enable_simd)
s->chromium_zlib_hash = 1;
#elif defined(TARGET_CPU_WITH_CRC) && defined(CRC32_ARMV8_CRC32)
if (arm_cpu_enable_crc32)
@@ -315,16 +327,15 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
s->hash_size = 1 << s->hash_bits;
s->hash_mask = s->hash_size - 1;
s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);
s->hash_shift = ((s->hash_bits + MIN_MATCH-1) / MIN_MATCH);
s->window = (Bytef *) ZALLOC(strm,
s->w_size + window_padding,
2*sizeof(Byte));
/* Avoid use of unitialized values in the window, see crbug.com/1137613 and
* crbug.com/1144420 */
if (s->window) { /* [jart] fix regression in malloc failure checking */
zmemzero(s->window, (s->w_size + window_padding) * (2 * sizeof(Byte)));
zmemzero(s->window, (s->w_size + window_padding) * (2 * sizeof(Byte)));
}
s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos));
/* Avoid use of uninitialized value, see:
@@ -355,11 +366,11 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
* sym_buf value to read moves forward three bytes. From that symbol, up to
* 31 bits are written to pending_buf. The closest the written pending_buf
* bits gets to the next sym_buf symbol to read is just before the last
* code is written. At that time, 31*(n-2) bits have been written, just
* after 24*(n-2) bits have been consumed from sym_buf. sym_buf starts at
* 8*n bits into pending_buf. (Note that the symbol buffer fills when n-1
* code is written. At that time, 31*(n - 2) bits have been written, just
* after 24*(n - 2) bits have been consumed from sym_buf. sym_buf starts at
* 8*n bits into pending_buf. (Note that the symbol buffer fills when n - 1
* symbols are written.) The closest the writing gets to what is unread is
* then n+14 bits. Here n is lit_bufsize, which is 16384 by default, and
* then n + 14 bits. Here n is lit_bufsize, which is 16384 by default, and
* can range from 128 to 32768.
*
* Therefore, at a minimum, there are 142 bits of space between what is
@@ -404,7 +415,7 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
/* =========================================================================
* Check for a valid deflate stream state. Return 0 if ok, 1 if not.
*/
local int deflateStateCheck (strm)
local int deflateStateCheck(strm)
z_streamp strm;
{
deflate_state *s;
@@ -427,7 +438,7 @@ local int deflateStateCheck (strm)
}
/* ========================================================================= */
int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
int ZEXPORT deflateSetDictionary(strm, dictionary, dictLength)
z_streamp strm;
const Bytef *dictionary;
uInt dictLength;
@@ -492,7 +503,7 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
}
/* ========================================================================= */
int ZEXPORT deflateGetDictionary (strm, dictionary, dictLength)
int ZEXPORT deflateGetDictionary(strm, dictionary, dictLength)
z_streamp strm;
Bytef *dictionary;
uInt *dictLength;
@@ -514,7 +525,7 @@ int ZEXPORT deflateGetDictionary (strm, dictionary, dictLength)
}
/* ========================================================================= */
int ZEXPORT deflateResetKeep (strm)
int ZEXPORT deflateResetKeep(strm)
z_streamp strm;
{
deflate_state *s;
@@ -552,7 +563,7 @@ int ZEXPORT deflateResetKeep (strm)
}
/* ========================================================================= */
int ZEXPORT deflateReset (strm)
int ZEXPORT deflateReset(strm)
z_streamp strm;
{
int ret;
@@ -564,7 +575,7 @@ int ZEXPORT deflateReset (strm)
}
/* ========================================================================= */
int ZEXPORT deflateSetHeader (strm, head)
int ZEXPORT deflateSetHeader(strm, head)
z_streamp strm;
gz_headerp head;
{
@@ -575,7 +586,7 @@ int ZEXPORT deflateSetHeader (strm, head)
}
/* ========================================================================= */
int ZEXPORT deflatePending (strm, pending, bits)
int ZEXPORT deflatePending(strm, pending, bits)
unsigned *pending;
int *bits;
z_streamp strm;
@@ -589,7 +600,7 @@ int ZEXPORT deflatePending (strm, pending, bits)
}
/* ========================================================================= */
int ZEXPORT deflatePrime (strm, bits, value)
int ZEXPORT deflatePrime(strm, bits, value)
z_streamp strm;
int bits;
int value;
@@ -684,36 +695,50 @@ int ZEXPORT deflateTune(strm, good_length, max_lazy, nice_length, max_chain)
}
/* =========================================================================
* For the default windowBits of 15 and memLevel of 8, this function returns
* a close to exact, as well as small, upper bound on the compressed size.
* They are coded as constants here for a reason--if the #define's are
* changed, then this function needs to be changed as well. The return
* value for 15 and 8 only works for those exact settings.
* For the default windowBits of 15 and memLevel of 8, this function returns a
* close to exact, as well as small, upper bound on the compressed size. This
* is an expansion of ~0.03%, plus a small constant.
*
* For any setting other than those defaults for windowBits and memLevel,
* the value returned is a conservative worst case for the maximum expansion
* resulting from using fixed blocks instead of stored blocks, which deflate
* can emit on compressed data for some combinations of the parameters.
* For any setting other than those defaults for windowBits and memLevel, one
* of two worst case bounds is returned. This is at most an expansion of ~4% or
* ~13%, plus a small constant.
*
* This function could be more sophisticated to provide closer upper bounds for
* every combination of windowBits and memLevel. But even the conservative
* upper bound of about 14% expansion does not seem onerous for output buffer
* allocation.
* Both the 0.03% and 4% derive from the overhead of stored blocks. The first
* one is for stored blocks of 16383 bytes (memLevel == 8), whereas the second
* is for stored blocks of 127 bytes (the worst case memLevel == 1). The
* expansion results from five bytes of header for each stored block.
*
* The larger expansion of 13% results from a window size less than or equal to
* the symbols buffer size (windowBits <= memLevel + 7). In that case some of
* the data being compressed may have slid out of the sliding window, impeding
* a stored block from being emitted. Then the only choice is a fixed or
* dynamic block, where a fixed block limits the maximum expansion to 9 bits
* per 8-bit byte, plus 10 bits for every block. The smallest block size for
* which this can occur is 255 (memLevel == 2).
*
* Shifts are used to approximate divisions, for speed.
*/
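/*
* Worked example of the shift approximations for sourceLen = 1 MiB:
* fixedlen adds 1/8 + 1/256 + 1/512 of sourceLen (~13.1%), storelen adds
* 1/32 + 1/128 + 1/2048 (~4.0%), and the default-settings bound below
* adds 1/4096 + 1/16384 + 1/2^25 (~0.03%), matching the percentages
* described above.
*/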
uLong ZEXPORT deflateBound(strm, sourceLen)
z_streamp strm;
uLong sourceLen;
{
deflate_state *s;
uLong complen, wraplen;
uLong fixedlen, storelen, wraplen;
/* conservative upper bound for compressed data */
complen = sourceLen +
((sourceLen + 7) >> 3) + ((sourceLen + 63) >> 6) + 5;
/* upper bound for fixed blocks with 9-bit literals and length 255
(memLevel == 2, which is the lowest that may not use stored blocks) --
~13% overhead plus a small constant */
fixedlen = sourceLen + (sourceLen >> 3) + (sourceLen >> 8) +
(sourceLen >> 9) + 4;
/* if can't get parameters, return conservative bound plus zlib wrapper */
/* upper bound for stored blocks with length 127 (memLevel == 1) --
~4% overhead plus a small constant */
storelen = sourceLen + (sourceLen >> 5) + (sourceLen >> 7) +
(sourceLen >> 11) + 7;
/* if can't get parameters, return larger bound plus a zlib wrapper */
if (deflateStateCheck(strm))
return complen + 6;
return (fixedlen > storelen ? fixedlen : storelen) + 6;
/* compute wrapper length */
s = strm->state;
@@ -750,11 +775,12 @@ uLong ZEXPORT deflateBound(strm, sourceLen)
wraplen = 6;
}
/* if not default parameters, return conservative bound */
/* if not default parameters, return one of the conservative bounds */
if (s->w_bits != 15 || s->hash_bits != 8 + 7)
return complen + wraplen;
return (s->w_bits <= s->hash_bits ? fixedlen : storelen) + wraplen;
/* default settings: return tight bound for that case */
/* default settings: return tight bound for that case -- ~0.03% overhead
plus a small constant */
return sourceLen + (sourceLen >> 12) + (sourceLen >> 14) +
(sourceLen >> 25) + 13 - 6 + wraplen;
}
@@ -764,7 +790,7 @@ uLong ZEXPORT deflateBound(strm, sourceLen)
* IN assertion: the stream state is correct and there is enough room in
* pending_buf.
*/
local void putShortMSB (s, b)
local void putShortMSB(s, b)
deflate_state *s;
uInt b;
{
@@ -811,7 +837,7 @@ local void flush_pending(strm)
} while (0)
/* ========================================================================= */
int ZEXPORT deflate (strm, flush)
int ZEXPORT deflate(strm, flush)
z_streamp strm;
int flush;
{
@@ -866,7 +892,7 @@ int ZEXPORT deflate (strm, flush)
s->status = BUSY_STATE;
if (s->status == INIT_STATE) {
/* zlib header */
uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8;
uInt header = (Z_DEFLATED + ((s->w_bits - 8) << 4)) << 8;
uInt level_flags;
if (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2)
@@ -1127,7 +1153,7 @@ int ZEXPORT deflate (strm, flush)
}
/* ========================================================================= */
int ZEXPORT deflateEnd (strm)
int ZEXPORT deflateEnd(strm)
z_streamp strm;
{
int status;
@@ -1153,7 +1179,7 @@ int ZEXPORT deflateEnd (strm)
* To simplify the source, this is not supported for 16-bit MSDOS (which
* doesn't have enough memory anyway to duplicate compression states).
*/
int ZEXPORT deflateCopy (dest, source)
int ZEXPORT deflateCopy(dest, source)
z_streamp dest;
z_streamp source;
{
@@ -1243,7 +1269,7 @@ local unsigned read_buf(strm, buf, size)
/* ===========================================================================
* Initialize the "longest match" routines for a new zlib stream
*/
local void lm_init (s)
local void lm_init(s)
deflate_state *s;
{
s->window_size = (ulg)2L*s->w_size;
@@ -1264,11 +1290,6 @@ local void lm_init (s)
s->match_length = s->prev_length = MIN_MATCH-1;
s->match_available = 0;
s->ins_h = 0;
#ifndef FASTEST
#ifdef ASMV
match_init(); /* initialize the asm code */
#endif
#endif
}
#ifndef FASTEST
@@ -1281,10 +1302,6 @@ local void lm_init (s)
* string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
* OUT assertion: the match length is not greater than s->lookahead.
*/
#ifndef ASMV
/* For 80x86 and 680x0, an optimized version will be provided in match.asm or
* match.S. The code will be functionally equivalent.
*/
local uInt longest_match(s, cur_match)
deflate_state *s;
IPos cur_match; /* current match */
@@ -1309,10 +1326,10 @@ local uInt longest_match(s, cur_match)
*/
register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
register ush scan_start = *(ushf*)scan;
register ush scan_end = *(ushf*)(scan+best_len-1);
register ush scan_end = *(ushf*)(scan + best_len - 1);
#else
register Bytef *strend = s->window + s->strstart + MAX_MATCH;
register Byte scan_end1 = scan[best_len-1];
register Byte scan_end1 = scan[best_len - 1];
register Byte scan_end = scan[best_len];
#endif
@@ -1330,7 +1347,8 @@ local uInt longest_match(s, cur_match)
*/
if ((uInt)nice_match > s->lookahead) nice_match = (int)s->lookahead;
Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
"need lookahead");
do {
Assert(cur_match < s->strstart, "no future");
@@ -1348,15 +1366,15 @@ local uInt longest_match(s, cur_match)
/* This code assumes sizeof(unsigned short) == 2. Do not use
* UNALIGNED_OK if your compiler uses a different size.
*/
if (*(ushf*)(match+best_len-1) != scan_end ||
if (*(ushf*)(match + best_len - 1) != scan_end ||
*(ushf*)match != scan_start) continue;
/* It is not necessary to compare scan[2] and match[2] since they are
* always equal when the other bytes match, given that the hash keys
* are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
* strstart+3, +5, ... up to strstart+257. We check for insufficient
* strstart + 3, + 5, up to strstart + 257. We check for insufficient
* lookahead only every 4th comparison; the 128th check will be made
* at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
* at strstart + 257. If MAX_MATCH-2 is not a multiple of 8, it is
* necessary to put more guard bytes at the end of the window, or
* to check more often for insufficient lookahead.
*/
@@ -1372,28 +1390,29 @@ local uInt longest_match(s, cur_match)
}
scan++, match++;
do {
} while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
} while (*(ushf*)(scan += 2) == *(ushf*)(match += 2) &&
*(ushf*)(scan += 2) == *(ushf*)(match += 2) &&
*(ushf*)(scan += 2) == *(ushf*)(match += 2) &&
*(ushf*)(scan += 2) == *(ushf*)(match += 2) &&
scan < strend);
/* The funny "do {}" generates better code on most compilers */
/* Here, scan <= window+strstart+257 */
Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
/* Here, scan <= window + strstart + 257 */
Assert(scan <= s->window+(unsigned)(s->window_size - 1),
"wild scan");
if (*scan == *match) scan++;
len = (MAX_MATCH - 1) - (int)(strend-scan);
len = (MAX_MATCH - 1) - (int)(strend - scan);
scan = strend - (MAX_MATCH-1);
#else /* UNALIGNED_OK */
if (match[best_len] != scan_end ||
match[best_len-1] != scan_end1 ||
match[best_len - 1] != scan_end1 ||
*match != *scan ||
*++match != scan[1]) continue;
/* The check at best_len-1 can be removed because it will be made
/* The check at best_len - 1 can be removed because it will be made
* again later. (This heuristic is not always a win.)
* It is not necessary to compare scan[2] and match[2] since they
* are always equal when the other bytes match, given that
@@ -1412,7 +1431,7 @@ local uInt longest_match(s, cur_match)
}
/* We check for insufficient lookahead only every 8th comparison;
* the 256th check will be made at strstart+258.
* the 256th check will be made at strstart + 258.
*/
do {
} while (*++scan == *++match && *++scan == *++match &&
@@ -1421,7 +1440,8 @@ local uInt longest_match(s, cur_match)
*++scan == *++match && *++scan == *++match &&
scan < strend);
Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
Assert(scan <= s->window + (unsigned)(s->window_size - 1),
"wild scan");
len = MAX_MATCH - (int)(strend - scan);
scan = strend - MAX_MATCH;
@@ -1433,9 +1453,9 @@ local uInt longest_match(s, cur_match)
best_len = len;
if (len >= nice_match) break;
#ifdef UNALIGNED_OK
scan_end = *(ushf*)(scan+best_len-1);
scan_end = *(ushf*)(scan + best_len - 1);
#else
scan_end1 = scan[best_len-1];
scan_end1 = scan[best_len - 1];
scan_end = scan[best_len];
#endif
}
@@ -1445,7 +1465,6 @@ local uInt longest_match(s, cur_match)
if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
return s->lookahead;
}
#endif /* ASMV */
#else /* FASTEST */
@@ -1466,7 +1485,8 @@ local uInt longest_match(s, cur_match)
*/
Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
"need lookahead");
Assert(cur_match < s->strstart, "no future");
@@ -1476,7 +1496,7 @@ local uInt longest_match(s, cur_match)
*/
if (match[0] != scan[0] || match[1] != scan[1]) return MIN_MATCH-1;
/* The check at best_len-1 can be removed because it will be made
/* The check at best_len - 1 can be removed because it will be made
* again later. (This heuristic is not always a win.)
* It is not necessary to compare scan[2] and match[2] since they
* are always equal when the other bytes match, given that
@@ -1486,7 +1506,7 @@ local uInt longest_match(s, cur_match)
Assert(*scan == *match, "match[2]?");
/* We check for insufficient lookahead only every 8th comparison;
* the 256th check will be made at strstart+258.
* the 256th check will be made at strstart + 258.
*/
do {
} while (*++scan == *++match && *++scan == *++match &&
@@ -1495,7 +1515,7 @@ local uInt longest_match(s, cur_match)
*++scan == *++match && *++scan == *++match &&
scan < strend);
Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
Assert(scan <= s->window + (unsigned)(s->window_size - 1), "wild scan");
len = MAX_MATCH - (int)(strend - scan);
@@ -1531,7 +1551,7 @@ local void check_match(s, start, match, length)
z_error(__FILE__, __LINE__, "invalid match");
}
if (z_verbose > 1) {
kprintf("\\[%d,%d]", start-match, length);
kprintf("\\[%d,%d]", start - match, length);
do { kprintf("%c", s->window[start++]); } while (--length != 0);
}
}
@@ -1577,9 +1597,9 @@ local void fill_window(s)
/* If the window is almost full and there is insufficient lookahead,
* move the upper half to the lower one to make room in the upper half.
*/
if (s->strstart >= wsize+MAX_DIST(s)) {
if (s->strstart >= wsize + MAX_DIST(s)) {
zmemcpy(s->window, s->window+wsize, (unsigned)wsize - more);
zmemcpy(s->window, s->window + wsize, (unsigned)wsize - more);
s->match_start -= wsize;
s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
s->block_start -= (long) wsize;
@@ -1934,7 +1954,7 @@ local block_state deflate_fast(s, flush)
if (s->lookahead == 0) break; /* flush the current block */
}
/* Insert the string window[strstart .. strstart+2] in the
/* Insert the string window[strstart .. strstart + 2] in the
* dictionary, and set hash_head to the head of the hash chain:
*/
hash_head = NIL;
@@ -1984,7 +2004,7 @@ local block_state deflate_fast(s, flush)
if (!s->chromium_zlib_hash) {
s->ins_h = s->window[s->strstart];
UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
UPDATE_HASH(s, s->ins_h, s->window[s->strstart + 1]);
#if MIN_MATCH != 3
Call UPDATE_HASH() MIN_MATCH-3 more times
#endif
@@ -1996,7 +2016,7 @@ local block_state deflate_fast(s, flush)
} else {
/* No match, output a literal byte */
Tracevv(("%c", s->window[s->strstart]));
_tr_tally_lit (s, s->window[s->strstart], bflush);
_tr_tally_lit(s, s->window[s->strstart], bflush);
s->lookahead--;
s->strstart++;
}
@@ -2040,7 +2060,7 @@ local block_state deflate_slow(s, flush)
if (s->lookahead == 0) break; /* flush the current block */
}
/* Insert the string window[strstart .. strstart+2] in the
/* Insert the string window[strstart .. strstart + 2] in the
* dictionary, and set hash_head to the head of the hash chain:
*/
hash_head = NIL;
@@ -2085,20 +2105,20 @@ local block_state deflate_slow(s, flush)
if (s->prev_match == -1) {
/* The window has slid one byte past the previous match,
* so the first byte cannot be compared. */
check_match(s, s->strstart, s->prev_match+1, s->prev_length-1);
check_match(s, s->strstart, s->prev_match + 1, s->prev_length - 1);
} else {
check_match(s, s->strstart-1, s->prev_match, s->prev_length);
check_match(s, s->strstart - 1, s->prev_match, s->prev_length);
}
_tr_tally_dist(s, s->strstart -1 - s->prev_match,
_tr_tally_dist(s, s->strstart - 1 - s->prev_match,
s->prev_length - MIN_MATCH, bflush);
/* Insert in hash table all strings up to the end of the match.
* strstart-1 and strstart are already inserted. If there is not
* strstart - 1 and strstart are already inserted. If there is not
* enough lookahead, the last two strings are not inserted in
* the hash table.
*/
s->lookahead -= s->prev_length-1;
s->lookahead -= s->prev_length - 1;
s->prev_length -= 2;
do {
if (++s->strstart <= max_insert) {
@@ -2116,8 +2136,8 @@ local block_state deflate_slow(s, flush)
* single literal. If there was a match but the current match
* is longer, truncate the previous match to a single literal.
*/
Tracevv(("%c", s->window[s->strstart-1]));
_tr_tally_lit(s, s->window[s->strstart-1], bflush);
Tracevv(("%c", s->window[s->strstart - 1]));
_tr_tally_lit(s, s->window[s->strstart - 1], bflush);
if (bflush) {
FLUSH_BLOCK_ONLY(s, 0);
}
@@ -2135,8 +2155,8 @@ local block_state deflate_slow(s, flush)
}
Assert (flush != Z_NO_FLUSH, "no flush?");
if (s->match_available) {
Tracevv(("%c", s->window[s->strstart-1]));
_tr_tally_lit(s, s->window[s->strstart-1], bflush);
Tracevv(("%c", s->window[s->strstart - 1]));
_tr_tally_lit(s, s->window[s->strstart - 1], bflush);
s->match_available = 0;
}
s->insert = s->strstart < MIN_MATCH-1 ? s->strstart : MIN_MATCH-1;
@@ -2193,7 +2213,8 @@ local block_state deflate_rle(s, flush)
if (s->match_length > s->lookahead)
s->match_length = s->lookahead;
}
Assert(scan <= s->window+(uInt)(s->window_size-1), "wild scan");
Assert(scan <= s->window + (uInt)(s->window_size - 1),
"wild scan");
}
/* Emit match if have run of MIN_MATCH or longer, else emit literal */
@@ -2208,7 +2229,7 @@ local block_state deflate_rle(s, flush)
} else {
/* No match, output a literal byte */
Tracevv(("%c", s->window[s->strstart]));
_tr_tally_lit (s, s->window[s->strstart], bflush);
_tr_tally_lit(s, s->window[s->strstart], bflush);
s->lookahead--;
s->strstart++;
}
@@ -2248,7 +2269,7 @@ local block_state deflate_huff(s, flush)
/* Output a literal byte */
s->match_length = 0;
Tracevv(("%c", s->window[s->strstart]));
_tr_tally_lit (s, s->window[s->strstart], bflush);
_tr_tally_lit(s, s->window[s->strstart], bflush);
s->lookahead--;
s->strstart++;
if (bflush) FLUSH_BLOCK(s, 0);

View file

@@ -7,11 +7,6 @@
*/
#include "third_party/zlib/gz/gzguts.inc"
#include "third_party/zlib/macros.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* gzclose() is in a separate file so that it is linked in only if it is used.

View file

@@ -14,15 +14,10 @@
#include "third_party/zlib/gz/gzguts.inc"
#include "third_party/zlib/zlib.h"
#include "third_party/zlib/zutil.internal.h"
// clang-format off
#define LSEEK lseek
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* Local functions */
static void gz_reset(gz_statep);
static gzFile gz_open(const void *, int, const char *);

View file

@@ -12,11 +12,6 @@
#include "libc/str/str.h"
#include "third_party/zlib/gz/gzguts.inc"
#include "third_party/zlib/zlib.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* Local functions */

View file

@@ -9,11 +9,6 @@
#include "libc/fmt/fmt.h"
#include "libc/mem/mem.h"
#include "third_party/zlib/gz/gzguts.inc"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* Local functions */

View file

@@ -11,11 +11,6 @@
#include "third_party/zlib/internal.h"
#include "third_party/zlib/macros.internal.h"
#include "third_party/zlib/zutil.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/*

View file

@@ -5,11 +5,6 @@
* Copyright (C) 1995-2017 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
#include "third_party/zlib/zutil.internal.h"

387
third_party/zlib/inffast_chunk.c vendored Normal file
View file

@ -0,0 +1,387 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
*/
/* inffast_chunk.c -- fast decoding
* Copyright (C) 1995-2017 Mark Adler
* Copyright 2023 The Chromium Authors
* For conditions of distribution and use, see copyright notice in zlib.h
*/
asm(".ident\t\"\\n\\n\
Chromium (BSD-3 License)\\n\
Copyright 2017 The Chromium Authors\"");
// clang-format off
#include "third_party/zlib/zutil.internal.h"
#include "third_party/zlib/inftrees.internal.h"
#include "third_party/zlib/inflate.internal.h"
#include "third_party/zlib/inffast_chunk.internal.h"
#include "third_party/zlib/chunkcopy.inc"
#ifdef ASMINF
# pragma message("Assembler code may have bugs -- use at your own risk")
#else
/*
Decode literal, length, and distance codes and write out the resulting
literal and match bytes until either not enough input or output is
available, an end-of-block is encountered, or a data error is encountered.
When large enough input and output buffers are supplied to inflate(), for
example, a 16K input buffer and a 64K output buffer, more than 95% of the
inflate() execution time is spent in this routine.
Entry assumptions:
state->mode == LEN
strm->avail_in >= INFLATE_FAST_MIN_INPUT (6 or 8 bytes + 7 bytes)
strm->avail_out >= INFLATE_FAST_MIN_OUTPUT (258 bytes + 2 bytes)
start >= strm->avail_out
state->bits < 8
(state->hold >> state->bits) == 0
strm->next_out[0..strm->avail_out] does not overlap with
strm->next_in[0..strm->avail_in]
strm->state->window is allocated with an additional
CHUNKCOPY_CHUNK_SIZE-1 bytes of padding beyond strm->state->wsize
On return, state->mode is one of:
LEN -- ran out of enough output space or enough available input
TYPE -- reached end of block code, inflate() to interpret next block
BAD -- error in block data
Notes:
INFLATE_FAST_MIN_INPUT: 6 or 8 bytes + 7 bytes
- The maximum input bits used by a length/distance pair is 15 bits for the
length code, 5 bits for the length extra, 15 bits for the distance code,
and 13 bits for the distance extra. This totals 48 bits, or six bytes.
Therefore if strm->avail_in >= 6, then there is enough input to avoid
checking for available input while decoding.
- The wide input data reading option reads 64 input bits at a time. Thus,
if strm->avail_in >= 8, then there is enough input to avoid checking for
available input while decoding. Reading consumes the input with:
hold |= read64le(in) << bits;
in += 6;
bits += 48;
reporting 6 bytes of new input because |bits| is 0..15 (2 bytes rounded
up, worst case) and 6 bytes is enough to decode as noted above. At exit,
hold &= (1U << bits) - 1 drops excess input to keep the invariant:
(state->hold >> state->bits) == 0
INFLATE_FAST_MIN_OUTPUT: 258 bytes + 2 bytes for literals = 260 bytes
- The maximum bytes that a single length/distance pair can output is 258
bytes, which is the maximum length that can be coded. inflate_fast()
requires strm->avail_out >= 260 for each loop to avoid checking for
available output space while decoding.
*/
void ZLIB_INTERNAL inflate_fast_chunk_(strm, start)
z_streamp strm;
unsigned start; /* inflate()'s starting value for strm->avail_out */
{
struct inflate_state FAR *state;
z_const unsigned char FAR *in; /* local strm->next_in */
z_const unsigned char FAR *last; /* have enough input while in < last */
unsigned char FAR *out; /* local strm->next_out */
unsigned char FAR *beg; /* inflate()'s initial strm->next_out */
unsigned char FAR *end; /* while out < end, enough space available */
unsigned char FAR *limit; /* safety limit for chunky copies */
#ifdef INFLATE_STRICT
unsigned dmax; /* maximum distance from zlib header */
#endif
unsigned wsize; /* window size or zero if not using window */
unsigned whave; /* valid bytes in the window */
unsigned wnext; /* window write index */
unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */
inflate_holder_t hold; /* local strm->hold */
unsigned bits; /* local strm->bits */
code const FAR *lcode; /* local strm->lencode */
code const FAR *dcode; /* local strm->distcode */
unsigned lmask; /* mask for first level of length codes */
unsigned dmask; /* mask for first level of distance codes */
code const *here; /* retrieved table entry */
unsigned op; /* code bits, operation, extra bits, or */
/* window position, window bytes to copy */
unsigned len; /* match length, unused bytes */
unsigned dist; /* match distance */
unsigned char FAR *from; /* where to copy match from */
/* copy state to local variables */
state = (struct inflate_state FAR *)strm->state;
in = strm->next_in;
last = in + (strm->avail_in - (INFLATE_FAST_MIN_INPUT - 1));
out = strm->next_out;
beg = out - (start - strm->avail_out);
end = out + (strm->avail_out - (INFLATE_FAST_MIN_OUTPUT - 1));
limit = out + strm->avail_out;
#ifdef INFLATE_STRICT
dmax = state->dmax;
#endif
wsize = state->wsize;
whave = state->whave;
wnext = (state->wnext == 0 && whave >= wsize) ? wsize : state->wnext;
window = state->window;
hold = state->hold;
bits = state->bits;
lcode = state->lencode;
dcode = state->distcode;
lmask = (1U << state->lenbits) - 1;
dmask = (1U << state->distbits) - 1;
#ifdef INFLATE_CHUNK_READ_64LE
#define REFILL() do { \
Assert(bits < 64, "### Too many bits in inflate_fast."); \
hold |= read64le(in) << bits; \
in += 7; \
in -= bits >> 3; \
bits |= 56; \
} while (0)
#endif
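/*
* REFILL() accounting, worked through (for illustration): since 56 is
* 0b111000, bits |= 56 is the same as bits = 56 + (bits % 8). With
* bits == 5 the pointer advances 7 - (5 >> 3) = 7 bytes (56 fresh bits,
* bits -> 61); with bits == 13 it advances only 6 bytes (48 fresh bits,
* bits -> 61). Either way at least 56 valid bits are buffered afterwards,
* which is what lets the decode loop below skip per-symbol input checks.
*/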
/* decode literals and length/distances until end-of-block or not enough
input data or output space */
do {
#ifdef INFLATE_CHUNK_READ_64LE
REFILL();
#else
if (bits < 15) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
#endif
here = lcode + (hold & lmask);
#ifdef INFLATE_CHUNK_READ_64LE
if (here->op == 0) { /* literal */
Tracevv((here->val >= 0x20 && here->val < 0x7f ?
"inflate: literal '%c'\n" :
"inflate: literal 0x%02x\n", here->val));
*out++ = (unsigned char)(here->val);
hold >>= here->bits;
bits -= here->bits;
here = lcode + (hold & lmask);
if (here->op == 0) { /* literal */
Tracevv((here->val >= 0x20 && here->val < 0x7f ?
"inflate: 2nd literal '%c'\n" :
"inflate: 2nd literal 0x%02x\n", here->val));
*out++ = (unsigned char)(here->val);
hold >>= here->bits;
bits -= here->bits;
here = lcode + (hold & lmask);
}
}
#endif
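/* The two back-to-back literal decodes above are why the 64-bit variant
raises INFLATE_FAST_MIN_OUTPUT to 258 + 2: each root-table literal costs
at most 10 bits, which the 56-bit refill comfortably covers. */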
dolen:
op = (unsigned)(here->bits);
hold >>= op;
bits -= op;
op = (unsigned)(here->op);
if (op == 0) { /* literal */
Tracevv((here->val >= 0x20 && here->val < 0x7f ?
"inflate: literal '%c'\n" :
"inflate: literal 0x%02x\n", here->val));
*out++ = (unsigned char)(here->val);
}
else if (op & 16) { /* length base */
len = (unsigned)(here->val);
op &= 15; /* number of extra bits */
if (op) {
#ifndef INFLATE_CHUNK_READ_64LE
if (bits < op) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
#endif
len += (unsigned)hold & ((1U << op) - 1);
hold >>= op;
bits -= op;
}
Tracevv(("inflate: length %u\n", len));
#ifndef INFLATE_CHUNK_READ_64LE
if (bits < 15) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
#endif
here = dcode + (hold & dmask);
dodist:
op = (unsigned)(here->bits);
hold >>= op;
bits -= op;
op = (unsigned)(here->op);
if (op & 16) { /* distance base */
dist = (unsigned)(here->val);
op &= 15; /* number of extra bits */
/* we have two fast-path loads: 10+10 + 15+5 + 15 = 55,
but we may need to refill here in the worst case */
if (bits < op) {
#ifdef INFLATE_CHUNK_READ_64LE
REFILL();
#else
hold += (unsigned long)(*in++) << bits;
bits += 8;
if (bits < op) {
hold += (unsigned long)(*in++) << bits;
bits += 8;
}
#endif
}
dist += (unsigned)hold & ((1U << op) - 1);
#ifdef INFLATE_STRICT
if (dist > dmax) {
strm->msg = (char *)"invalid distance too far back";
state->mode = BAD;
break;
}
#endif
hold >>= op;
bits -= op;
Tracevv(("inflate: distance %u\n", dist));
op = (unsigned)(out - beg); /* max distance in output */
if (dist > op) { /* see if copy from window */
op = dist - op; /* distance back in window */
if (op > whave) {
if (state->sane) {
strm->msg =
(char *)"invalid distance too far back";
state->mode = BAD;
break;
}
#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR
if (len <= op - whave) {
do {
*out++ = 0;
} while (--len);
continue;
}
len -= op - whave;
do {
*out++ = 0;
} while (--op > whave);
if (op == 0) {
from = out - dist;
do {
*out++ = *from++;
} while (--len);
continue;
}
#endif
}
from = window;
if (wnext >= op) { /* contiguous in window */
from += wnext - op;
}
else { /* wrap around window */
op -= wnext;
from += wsize - op;
if (op < len) { /* some from end of window */
len -= op;
out = chunkcopy_safe(out, from, op, limit);
from = window; /* more from start of window */
op = wnext;
/* This (rare) case can create a situation where
the first chunkcopy below must be checked.
*/
}
}
if (op < len) { /* still need some from output */
out = chunkcopy_safe(out, from, op, limit);
len -= op;
/* When dist is small the amount of data that can be
copied from the window is also small, and progress
towards the dangerous end of the output buffer is
also small. This means that for trivial memsets and
for chunkunroll_relaxed() a safety check is
unnecessary. However, these conditions may not be
entered at all, and in that case it's possible that
the main copy is near the end.
*/
out = chunkunroll_relaxed(out, &dist, &len);
out = chunkcopy_safe_ugly(out, dist, len, limit);
} else {
/* from points to window, so there is no risk of
overlapping pointers requiring memset-like behaviour
*/
out = chunkcopy_safe(out, from, len, limit);
}
}
else {
/* Whole reference is in range of current output. No
range checks are necessary because we start with room
for at least 258 bytes of output, so unroll and roundoff
operations can write beyond `out+len` so long as they
stay within 258 bytes of `out`.
*/
out = chunkcopy_lapped_relaxed(out, dist, len);
}
}
else if ((op & 64) == 0) { /* 2nd level distance code */
here = dcode + here->val + (hold & ((1U << op) - 1));
goto dodist;
}
else {
strm->msg = (char *)"invalid distance code";
state->mode = BAD;
break;
}
}
else if ((op & 64) == 0) { /* 2nd level length code */
here = lcode + here->val + (hold & ((1U << op) - 1));
goto dolen;
}
else if (op & 32) { /* end-of-block */
Tracevv(("inflate: end of block\n"));
state->mode = TYPE;
break;
}
else {
strm->msg = (char *)"invalid literal/length code";
state->mode = BAD;
break;
}
} while (in < last && out < end);
/* return unused bytes (on entry, bits < 8, so in won't go too far back) */
len = bits >> 3;
in -= len;
bits -= len << 3;
hold &= (1U << bits) - 1;
/* update state and return */
strm->next_in = in;
strm->next_out = out;
strm->avail_in = (unsigned)(in < last ?
(INFLATE_FAST_MIN_INPUT - 1) + (last - in) :
(INFLATE_FAST_MIN_INPUT - 1) - (in - last));
strm->avail_out = (unsigned)(out < end ?
(INFLATE_FAST_MIN_OUTPUT - 1) + (end - out) :
(INFLATE_FAST_MIN_OUTPUT - 1) - (out - end));
state->hold = hold;
state->bits = bits;
Assert((state->hold >> state->bits) == 0, "invalid input data state");
}
/*
inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe):
- Using bit fields for code structure
- Different op definition to avoid & for extra bits (do & for table bits)
- Three separate decoding do-loops for direct, window, and wnext == 0
- Special case for distance > 1 copies to do overlapped load and store copy
- Explicit branch predictions (based on measured branch probabilities)
 - Deferring match copy and interspersing it with decoding subsequent codes
- Swapping literal/length else
- Swapping window/direct else
- Larger unrolled copy loops (three is about right)
- Moving len -= 3 statement into middle of loop
*/
#endif /* !ASMINF */
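A note on the pattern above, since every branch funnels into one of the chunkcopy helpers: rather than copying exactly `len` bytes, they copy in fixed vector-width chunks and deliberately overshoot the end. That is only legal because inflate over-allocates the window by CHUNKCOPY_CHUNK_SIZE and the fast loop guarantees at least 258 spare output bytes. A minimal sketch of the idea (illustrative, not the vendored code; `CHUNK` stands in for CHUNKCOPY_CHUNK_SIZE, and overlapped copies with dist < CHUNK would first be widened, as chunkunroll_relaxed() does):

    #include <string.h>

    #define CHUNK 16 /* stand-in for CHUNKCOPY_CHUNK_SIZE */

    static unsigned char *chunkcopy_relaxed_sketch(unsigned char *out,
                                                   const unsigned char *from,
                                                   unsigned len) {
        /* requires: len >= 1, |out - from| >= CHUNK, and CHUNK - 1 bytes
           of writable slack past out + len (the fast loop's invariants) */
        unsigned char *end = out + len;
        do {
            memcpy(out, from, CHUNK); /* one unaligned vector load/store */
            out += CHUNK;
            from += CHUNK;
        } while (out < end);
        return end; /* logical end; stores may have landed past it */
    }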

View file

@@ -0,0 +1,42 @@
/* inffast_chunk.h -- header to use inffast_chunk.c
* Copyright (C) 1995-2003, 2010 Mark Adler
* Copyright (C) 2017 ARM, Inc.
* Copyright 2023 The Chromium Authors
* For conditions of distribution and use, see copyright notice in zlib.h
*/
/* WARNING: this file should *not* be used by applications. It is
part of the implementation of the compression library and is
subject to change. Applications should only use zlib.h.
*/
#include "third_party/zlib/inffast.internal.h"
/* INFLATE_FAST_MIN_INPUT:
The minimum number of input bytes needed so that we can safely call
inflate_fast() with only one up-front bounds check. One
length/distance code pair (15 bits for the length code, 5 bits for length
extra, 15 bits for the distance code, 13 bits for distance extra) requires
   reading up to 48 input bits. Additionally, in the same iteration, we may
decode two literals from the root-table (requiring MIN_OUTPUT = 258 + 2).
Each root-table entry is up to 10 bits, for a total of 68 input bits each
   iteration.
The refill variant reads 8 bytes from the buffer at a time, and advances
   the input pointer by up to 7 bytes, ensuring there are at least 56 bits
available in the bit-buffer. The technique was documented by Fabian Giesen
on his blog as variant 4 in the article 'Reading bits in far too many ways':
https://fgiesen.wordpress.com/2018/02/20/
   In the worst case, we may refill twice in the same iteration, requiring
MIN_INPUT = 8 + 7.
*/
#ifdef INFLATE_CHUNK_READ_64LE
#undef INFLATE_FAST_MIN_INPUT
#define INFLATE_FAST_MIN_INPUT 15
#undef INFLATE_FAST_MIN_OUTPUT
#define INFLATE_FAST_MIN_OUTPUT 260
#endif
void inflate_fast_chunk_(z_streamp strm, unsigned start);
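The refill trick described above is easier to follow in code. This is a sketch of Giesen's variant 4 under the INFLATE_CHUNK_READ_64LE assumptions (little-endian target, unaligned loads allowed); the shape matches the description, but the exact vendored macro may differ:

    #include <stdint.h>
    #include <string.h>

    static inline uint64_t read64le(const unsigned char *p) {
        uint64_t v;
        memcpy(&v, p, sizeof(v)); /* unaligned-safe 8-byte load */
        return v;
    }

    /* Top up the bit buffer so at least 56 bits are valid:
       - OR eight fresh bytes in above the `bits` already held;
       - advance `in` by 7 minus the whole bytes already buffered,
         so at most 7 bytes per refill (hence MIN_INPUT = 8 + 7);
       - bits |= 56 records the new validity floor. */
    #define REFILL()                        \
        do {                                \
            hold |= read64le(in) << bits;   \
            in += 7;                        \
            in -= (bits >> 3) & 7;          \
            bits |= 56;                     \
        } while (0)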

View file

@@ -5,15 +5,11 @@
* Copyright (C) 1995-2022 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "third_party/zlib/inffast.internal.h"
#include "third_party/zlib/inflate.internal.h"
#include "third_party/zlib/inftrees.internal.h"
#include "third_party/zlib/internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
zlib 1.2.13 (zlib License)\\n\
Copyright 1995-2022 Jean-loup Gailly and Mark Adler\\n\
Invented 1990 Phillip Walter Katz\"");
// clang-format off
/*
@@ -93,6 +89,13 @@ asm(".include \"libc/disclaimer.inc\"");
* The history for versions after 1.2.0 are in ChangeLog in zlib distribution.
*/
#include "third_party/zlib/zutil.internal.h"
#include "third_party/zlib/inftrees.internal.h"
#include "third_party/zlib/inflate.internal.h"
#include "third_party/zlib/inffast_chunk.internal.h"
#include "third_party/zlib/internal.h"
#include "third_party/zlib/chunkcopy.inc"
#ifdef MAKEFIXED
# ifndef BUILDFIXED
# define BUILDFIXED
@@ -176,6 +179,8 @@ int windowBits;
/* extract wrap request from windowBits parameter */
if (windowBits < 0) {
if (windowBits < -15)
return Z_STREAM_ERROR;
wrap = 0;
windowBits = -windowBits;
}
@@ -322,7 +327,14 @@ struct inflate_state FAR *state;
}
#ifdef MAKEFIXED
#include <stdio.h>
#include "libc/calls/calls.h"
#include "libc/calls/dprintf.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/mem/fmt.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
/*
Write out the inffixed.h that is #include'd above. Defining MAKEFIXED also
@@ -408,10 +420,20 @@ unsigned copy;
/* if it hasn't been done already, allocate space for the window */
if (state->window == Z_NULL) {
unsigned wsize = 1U << state->wbits;
state->window = (unsigned char FAR *)
ZALLOC(strm, 1U << state->wbits,
ZALLOC(strm, wsize + CHUNKCOPY_CHUNK_SIZE,
sizeof(unsigned char));
if (state->window == Z_NULL) return 1;
#ifdef INFLATE_CLEAR_UNUSED_UNDEFINED
/* Copies from the overflow portion of this buffer are undefined and
may cause analysis tools to raise a warning if we don't initialize
it. However, this undefined data overwrites other undefined data
and is subsequently either overwritten or left deliberately
undefined at the end of decode; so there's really no point.
*/
zmemzero(state->window + wsize, CHUNKCOPY_CHUNK_SIZE);
#endif
}
/* if window not in use yet, initialize */
@@ -1065,7 +1087,7 @@ int flush;
if (have >= INFLATE_FAST_MIN_INPUT &&
left >= INFLATE_FAST_MIN_OUTPUT) {
RESTORE();
inflate_fast(strm, out);
inflate_fast_chunk_(strm, out);
LOAD();
if (state->mode == TYPE)
state->back = -1;
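                /* (editor's note) with INFLATE_CHUNK_READ_64LE in effect,
                   these thresholds are INFLATE_FAST_MIN_INPUT = 15 bytes
                   and INFLATE_FAST_MIN_OUTPUT = 260 bytes, per the
                   inffast_chunk.internal.h header added earlier */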
@@ -1200,17 +1222,16 @@ int flush;
else
from = state->window + (state->wnext - copy);
if (copy > state->length) copy = state->length;
if (copy > left) copy = left;
put = chunkcopy_safe(put, from, copy, put + left);
}
else { /* copy from output */
from = put - state->offset;
copy = state->length;
if (copy > left) copy = left;
put = chunkcopy_lapped_safe(put, state->offset, copy, put + left);
}
if (copy > left) copy = left;
left -= copy;
state->length -= copy;
do {
*put++ = *from++;
} while (--copy);
if (state->length == 0) state->mode = LEN;
break;
case LIT:
@@ -1279,6 +1300,29 @@ int flush;
Note: a memory error from inflate() is non-recoverable.
*/
inf_leave:
#if defined(ZLIB_DEBUG)
/* XXX(cavalcantii): I put this in place back in 2017 to help debug faulty
* client code relying on undefined behavior when chunk_copy first landed.
*
 * It is safe to say after all these years that Chromium code is well
* behaved and works fine with the optimization, therefore we can enable
* this only for DEBUG builds.
*
* We write a defined value in the unused space to help mark
* where the stream has ended. We don't use zeros as that can
* mislead clients relying on undefined behavior (i.e. assuming
* that the data is over when the buffer has a zero/null value).
*
* The basic idea is that if client code is not relying on the zlib context
 * to report the amount of decompressed data, but instead reads the output
 * buffer until a zero/null is found, it will fail faster and harder
 * when the remainder of the buffer is marked with a symbol (e.g. 0x55).
*/
if (left >= CHUNKCOPY_CHUNK_SIZE)
memset(put, 0x55, CHUNKCOPY_CHUNK_SIZE);
else
memset(put, 0x55, left);
#endif
RESTORE();
if (state->wsize || (out != strm->avail_out && state->mode < BAD &&
(state->mode < CHECK || flush != Z_FINISH)))

View file

@@ -1,3 +1,4 @@
// clang-format off
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
*/
@@ -6,17 +7,16 @@
* For conditions of distribution and use, see copyright notice in zlib.h
*/
#include "third_party/zlib/inftrees.internal.h"
#include "third_party/zlib/zutil.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
zlib 1.2.13 (zlib License)\\n\
Copyright 1995-2022 Jean-loup Gailly and Mark Adler\\n\
Invented 1990 Phillip Walter Katz\"");
// clang-format off
#define MAXBITS 15
const char inflate_copyright[] =
" inflate 1.2.12.1 Copyright 1995-2022 Mark Adler ";
/*
If you use the zlib library in a product, an acknowledgment is welcome
in the documentation of your product. If for some reason you cannot
@@ -69,7 +69,7 @@ unsigned short FAR *work;
35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
static const unsigned short lext[31] = { /* Length codes 257..285 extra */
16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18,
19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 76, 202};
19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 194, 65};
static const unsigned short dbase[32] = { /* Distance codes 0..29 base */
1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,

View file

@@ -5,7 +5,6 @@
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
/* clang-format off */
/* inftrees.h -- header to use inftrees.c
* Copyright (C) 1995-2005, 2010 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h

View file

@@ -1,11 +1,13 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_ZLIB_INSERT_STRING_H_
#define COSMOPOLITAN_THIRD_PARTY_ZLIB_INSERT_STRING_H_
#include "libc/nexgen32e/x86feature.h"
#include "third_party/zlib/deflate.internal.h"
#include "third_party/zlib/zutil.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
/* clang-format off */
// clang-format off
/* insert_string.h
*
* Copyright 2019 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
#ifndef INSERT_STRING_H
#define INSERT_STRING_H
#ifndef INLINE
#if defined(_MSC_VER) && !defined(__clang__)
@@ -15,9 +17,11 @@ COSMOPOLITAN_C_START_
#endif
#endif
#include "third_party/zlib/cpu_features.internal.h"
// clang-format off
#if defined(CRC32_SIMD_SSE42_PCLMUL)
// #include <smmintrin.h> /* Required to make MSVC bot build pass. */
#include "third_party/intel/smmintrin.internal.h" /* Required to make MSVC bot build pass. */
#if defined(__clang__) || defined(__GNUC__)
#define TARGET_CPU_WITH_CRC __attribute__((target("sse4.2")))
@@ -51,19 +55,17 @@ COSMOPOLITAN_C_START_
#if defined(TARGET_CPU_WITH_CRC)
// TODO(jart): Why does this fail alignment check?
noubsan local INLINE Pos insert_string_simd(deflate_state* const s,
const Pos str) {
TARGET_CPU_WITH_CRC
local INLINE Pos insert_string_simd(deflate_state* const s, const Pos str) {
Pos ret;
unsigned *ip, val, h = 0;
unsigned val, h = 0;
ip = (unsigned*)&s->window[str];
val = *ip;
zmemcpy(&val, &s->window[str], sizeof(val));
if (s->level >= 6) val &= 0xFFFFFF;
/* Compute hash from the CRC32C of |val|. */
asm("crc32l\t%1,%0" : "+r"(h) : "rm"(val));
h = _cpu_crc32c_hash_u32(h, val);
ret = s->head[h & s->hash_mask];
s->head[h & s->hash_mask] = str;
@@ -131,13 +133,11 @@ local INLINE Pos insert_string(deflate_state* const s, const Pos str) {
* the Rabin-Karp hash instead.
*/ /* FALLTHROUGH Rabin-Karp */
#elif defined(TARGET_CPU_WITH_CRC) && defined(CRC32_SIMD_SSE42_PCLMUL)
if (X86_HAVE(SSE4_2)) return insert_string_simd(s, str);
if (x86_cpu_enable_simd) return insert_string_simd(s, str);
#elif defined(TARGET_CPU_WITH_CRC) && defined(CRC32_ARMV8_CRC32)
if (arm_cpu_enable_crc32) return insert_string_simd(s, str);
#endif
return insert_string_c(s, str); /* Rabin-Karp */
}
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_ZLIB_INSERT_STRING_H_ */
#endif /* INSERT_STRING_H */
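To make the fast path concrete: the hash is a single hardware CRC32C step over four window bytes. A standalone sketch assuming the SSE4.2 intrinsic _mm_crc32_u32 (presumably what the _cpu_crc32c_hash_u32 helper wraps on x86); the function name and parameter list here are illustrative, not the library's API:

    #include <nmmintrin.h> /* SSE4.2: _mm_crc32_u32; compile with -msse4.2 */
    #include <string.h>

    static unsigned insert_string_crc_sketch(const unsigned char *window,
                                             unsigned short *head,
                                             unsigned hash_mask,
                                             unsigned str, int level) {
        unsigned val, h = 0, ret;
        memcpy(&val, window + str, sizeof(val)); /* 4 bytes, no alignment UB */
        if (level >= 6) val &= 0xFFFFFF; /* hash only 3 bytes at high levels */
        h = _mm_crc32_u32(h, val);       /* CRC32C mixes all input bits */
        ret = head[h & hash_mask];       /* prior occupant = match candidate */
        head[h & hash_mask] = (unsigned short)str; /* str is the new head */
        return ret;
    }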

117
third_party/zlib/slide_hash_simd.inc vendored Normal file
View file

@@ -0,0 +1,117 @@
// clang-format off
/* slide_hash_simd.h
*
* Copyright 2022 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
#ifndef SLIDE_HASH_SIMD_H
#define SLIDE_HASH_SIMD_H
#include "third_party/zlib/deflate.internal.h"
#ifndef INLINE
#if defined(_MSC_VER) && !defined(__clang__)
#define INLINE __inline
#else
#define INLINE inline
#endif
#endif
#if defined(CPU_NO_SIMD)
#error SIMD has been disabled for your build target
#elif defined(DEFLATE_SLIDE_HASH_SSE2)
#include "third_party/intel/emmintrin.internal.h" /* SSE2 */
#define Z_SLIDE_INIT_SIMD(wsize) _mm_set1_epi16((ush)(wsize))
#define Z_SLIDE_HASH_SIMD(table, size, vector_wsize) \
for (const Posf* const end = table + size; table != end;) { \
        __m128i v0 = _mm_loadu_si128((__m128i *)(table + 0)); \
        v0 = _mm_subs_epu16(v0, vector_wsize); \
        _mm_storeu_si128((__m128i *)(table + 0), v0); \
table += 8; \
}
typedef __m128i z_vec128i_u16x8_t;
#elif defined(DEFLATE_SLIDE_HASH_NEON)
#include "third_party/aarch64/arm_neon.h" /* NEON */
#define Z_SLIDE_INIT_SIMD(wsize) vdupq_n_u16((ush)(wsize))
#define Z_SLIDE_HASH_SIMD(table, size, vector_wsize) \
for (const Posf* const end = table + size; table != end;) { \
        uint16x8_t v0 = vld1q_u16(table + 0); \
        uint16x8_t v8 = vld1q_u16(table + 8); \
        v0 = vqsubq_u16(v0, vector_wsize); \
        v8 = vqsubq_u16(v8, vector_wsize); \
        vst1q_u16(table + 0, v0); \
        vst1q_u16(table + 8, v8); \
table += 8 + 8; \
}
typedef uint16x8_t z_vec128i_u16x8_t;
#else
#error slide_hash_simd is not defined for your build target
#endif
/* ===========================================================================
* Slide the hash table when sliding the window down (could be avoided with 32
* bit values at the expense of memory usage). We slide even when level == 0 to
* keep the hash table consistent if we switch back to level > 0 later.
*/
local INLINE void slide_hash_simd(
Posf *head, Posf *prev, const uInt w_size, const uInt hash_size) {
/*
* The SIMD implementation of the hash table slider assumes:
*
* 1. hash chain offset is 2 bytes. Should be true as Pos is "ush" type.
*/
Assert(sizeof(Pos) == 2, "Pos type size error: should be 2 bytes");
Assert(sizeof(ush) == 2, "ush type size error: should be 2 bytes");
Assert(hash_size <= (1 << 16), "Hash table maximum size error");
Assert(hash_size >= (1 << 8), "Hash table minimum size error");
Assert(w_size == (ush)w_size, "Prev table size error");
/*
* 2. The hash & prev table sizes are a multiple of 32 bytes (256 bits),
* since the NEON table slider moves two 128-bit items per loop (loop is
* unrolled on NEON for performance, see http://crbug.com/863257).
*/
Assert(!((hash_size * sizeof(head[0])) & (32 - 1)),
"Hash table size error: should be a multiple of 32 bytes");
Assert(!((w_size * sizeof(prev[0])) & (32 - 1)),
"Prev table size error: should be a multiple of 32 bytes");
/*
* Duplicate (ush)w_size in each uint16_t component of a 128-bit vector.
*/
const z_vec128i_u16x8_t vec_wsize = Z_SLIDE_INIT_SIMD(w_size);
/*
* Slide {head,prev} hash chain values: subtracts (ush)w_size from every
 * value with a saturating SIMD subtract, to clamp the result to 0 (NIL),
* to implement slide_hash() `(m >= wsize ? m - wsize : NIL);` code.
*/
Z_SLIDE_HASH_SIMD(head, hash_size, vec_wsize);
#ifndef FASTEST
Z_SLIDE_HASH_SIMD(prev, w_size, vec_wsize);
#endif
}
#undef z_vec128i_u16x8_t
#undef Z_SLIDE_HASH_SIMD
#undef Z_SLIDE_INIT_SIMD
#endif /* SLIDE_HASH_SIMD_H */
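For reference, this is the scalar loop the saturating subtract replaces, essentially zlib's generic slide_hash() reshaped to the signature above (a comparison sketch, not part of the commit):

    local void slide_hash_scalar(Posf *table, uInt size, uInt wsize) {
        uInt n;
        for (n = 0; n < size; n++) {
            Pos m = table[n];
            /* entries >= wsize slide down by wsize; older ones clamp to 0 */
            table[n] = (Pos)(m >= wsize ? m - wsize : 0);
        }
    }

Each SIMD iteration performs 8 (SSE2) or 16 (NEON, unrolled) of these subtract-and-clamp steps at once; _mm_subs_epu16 and vqsubq_u16 provide the clamp to zero for free.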

View file

@@ -14,11 +14,6 @@
#include "libc/stdio/temp.h"
#include "libc/str/str.h"
#include "third_party/zlib/deflate.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/*
@@ -174,7 +169,7 @@ local void gen_trees_header OF((void));
#else /* !ZLIB_DEBUG */
# define send_code(s, c, tree) \
{ if (z_verbose>2) kprintf("\ncd %3d ",(c)); \
{ if (z_verbose>2) kprintf("\ncd %3d ",(c)); \
send_bits(s, tree[c].Code, tree[c].Len); }
#endif
@@ -204,7 +199,7 @@ local void send_bits(s, value, length)
s->bits_sent += (ulg)length;
/* If not enough room in bi_buf, use (valid) bits from bi_buf and
* (16 - bi_valid) bits from value, leaving (width - (16-bi_valid))
* (16 - bi_valid) bits from value, leaving (width - (16 - bi_valid))
* unused bits in value.
*/
if (s->bi_valid > (int)Buf_size - length) {
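    /* (editor's note) a sketch of the packing the comment above describes,
       with Buf_size == 16 as in deflate.h; the release-build send_bits
       macro boils down to the same thing:

           if (s->bi_valid > Buf_size - length) {
               s->bi_buf |= (ush)value << s->bi_valid;
               put_short(s, s->bi_buf);
               s->bi_buf = (ush)value >> (Buf_size - s->bi_valid);
               s->bi_valid += length - Buf_size;
           } else {
               s->bi_buf |= (ush)value << s->bi_valid;
               s->bi_valid += length;
           }
    */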
@@ -267,7 +262,7 @@ local void tr_static_init()
length = 0;
for (code = 0; code < LENGTH_CODES-1; code++) {
base_length[code] = length;
for (n = 0; n < (1<<extra_lbits[code]); n++) {
for (n = 0; n < (1 << extra_lbits[code]); n++) {
_length_code[length++] = (uch)code;
}
}
@@ -276,13 +271,13 @@ local void tr_static_init()
* in two different ways: code 284 + 5 bits or code 285, so we
* overwrite length_code[255] to use the best encoding:
*/
_length_code[length-1] = (uch)code;
_length_code[length - 1] = (uch)code;
/* Initialize the mapping dist (0..32K) -> dist code (0..29) */
dist = 0;
for (code = 0 ; code < 16; code++) {
base_dist[code] = dist;
for (n = 0; n < (1<<extra_dbits[code]); n++) {
for (n = 0; n < (1 << extra_dbits[code]); n++) {
_dist_code[dist++] = (uch)code;
}
}
@@ -290,11 +285,11 @@ local void tr_static_init()
dist >>= 7; /* from now on, all distances are divided by 128 */
for ( ; code < D_CODES; code++) {
base_dist[code] = dist << 7;
for (n = 0; n < (1<<(extra_dbits[code]-7)); n++) {
for (n = 0; n < (1 << (extra_dbits[code] - 7)); n++) {
_dist_code[256 + dist++] = (uch)code;
}
}
Assert (dist == 256, "tr_static_init: 256+dist != 512");
Assert (dist == 256, "tr_static_init: 256 + dist != 512");
/* Construct the codes of the static literal tree */
for (bits = 0; bits <= MAX_BITS; bits++) bl_count[bits] = 0;
@@ -327,11 +322,12 @@ local void tr_static_init()
*/
#ifdef GEN_TREES_H
# ifndef ZLIB_DEBUG
//# include <stdio.h>
# endif
# define SEPARATOR(i, last, width) \
((i) == (last)? "\n};\n\n" : \
((i) % (width) == (width)-1 ? ",\n" : ", "))
((i) % (width) == (width) - 1 ? ",\n" : ", "))
void gen_trees_header()
{
@@ -468,7 +464,7 @@ local void pqdownheap(s, tree, k)
while (j <= s->heap_len) {
/* Set j to the smallest of the two sons: */
if (j < s->heap_len &&
smaller(tree, s->heap[j+1], s->heap[j], s->depth)) {
smaller(tree, s->heap[j + 1], s->heap[j], s->depth)) {
j++;
}
/* Exit if v is smaller than both sons */
@@ -517,7 +513,7 @@ local void gen_bitlen(s, desc)
*/
tree[s->heap[s->heap_max]].Len = 0; /* root of the heap */
for (h = s->heap_max+1; h < HEAP_SIZE; h++) {
for (h = s->heap_max + 1; h < HEAP_SIZE; h++) {
n = s->heap[h];
bits = tree[tree[n].Dad].Len + 1;
if (bits > max_length) bits = max_length, overflow++;
@@ -528,7 +524,7 @@ local void gen_bitlen(s, desc)
s->bl_count[bits]++;
xbits = 0;
if (n >= base) xbits = extra[n-base];
if (n >= base) xbits = extra[n - base];
f = tree[n].Freq;
s->opt_len += (ulg)f * (unsigned)(bits + xbits);
if (stree) s->static_len += (ulg)f * (unsigned)(stree[n].Len + xbits);
@@ -540,10 +536,10 @@ local void gen_bitlen(s, desc)
/* Find the first bit length which could increase: */
do {
bits = max_length-1;
bits = max_length - 1;
while (s->bl_count[bits] == 0) bits--;
s->bl_count[bits]--; /* move one leaf down the tree */
s->bl_count[bits+1] += 2; /* move one overflow item as its brother */
s->bl_count[bits]--; /* move one leaf down the tree */
s->bl_count[bits + 1] += 2; /* move one overflow item as its brother */
s->bl_count[max_length]--;
/* The brother of the overflow item also moves one step up,
* but this does not affect bl_count[max_length]
@@ -579,7 +575,7 @@ local void gen_bitlen(s, desc)
* OUT assertion: the field code is set for all tree elements of non
* zero code length.
*/
local void gen_codes (tree, max_code, bl_count)
local void gen_codes(tree, max_code, bl_count)
ct_data *tree; /* the tree to decorate */
int max_code; /* largest code with non zero frequency */
ushf *bl_count; /* number of codes at each bit length */
@@ -593,13 +589,13 @@ local void gen_codes (tree, max_code, bl_count)
* without bit reversal.
*/
for (bits = 1; bits <= MAX_BITS; bits++) {
code = (code + bl_count[bits-1]) << 1;
code = (code + bl_count[bits - 1]) << 1;
next_code[bits] = (ush)code;
}
/* Check that the bit counts in bl_count are consistent. The last code
* must be all ones.
*/
Assert (code + bl_count[MAX_BITS]-1 == (1<<MAX_BITS)-1,
Assert (code + bl_count[MAX_BITS] - 1 == (1 << MAX_BITS) - 1,
"inconsistent bit counts");
Tracev(("\ngen_codes: max_code %d ", max_code));
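    /* (editor's note) toy walkthrough of the loop above: for code lengths
       {1, 2, 3, 3}, bl_count[1..3] = {1, 1, 2}, so
           bits=1: code = (0 + 0) << 1 = 0  ->  next_code[1] = 0
           bits=2: code = (0 + 1) << 1 = 2  ->  next_code[2] = 10 (binary)
           bits=3: code = (2 + 1) << 1 = 6  ->  next_code[3] = 110 (binary)
       yielding canonical codes 0, 10, 110, 111; the last 3-bit code is all
       ones, exactly what the Assert above verifies. */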
@@ -610,7 +606,7 @@ local void gen_codes (tree, max_code, bl_count)
tree[n].Code = (ush)bi_reverse(next_code[len]++, len);
Tracecv(tree != static_ltree, ("\nn %3d %c l %2d c %4x (%x) ",
n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len] - 1));
}
}
@@ -634,7 +630,7 @@ local void build_tree(s, desc)
int node; /* new node being created */
/* Construct the initial heap, with least frequent element in
* heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n+1].
* heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n + 1].
* heap[0] is not used.
*/
s->heap_len = 0, s->heap_max = HEAP_SIZE;
@@ -662,7 +658,7 @@ local void build_tree(s, desc)
}
desc->max_code = max_code;
/* The elements heap[heap_len/2+1 .. heap_len] are leaves of the tree,
/* The elements heap[heap_len/2 + 1 .. heap_len] are leaves of the tree,
* establish sub-heaps of increasing lengths:
*/
for (n = s->heap_len/2; n >= 1; n--) pqdownheap(s, tree, n);
@@ -710,7 +706,7 @@ local void build_tree(s, desc)
* Scan a literal or distance tree to determine the frequencies of the codes
* in the bit length tree.
*/
local void scan_tree (s, tree, max_code)
local void scan_tree(s, tree, max_code)
deflate_state *s;
ct_data *tree; /* the tree to be scanned */
int max_code; /* and its largest code of non zero frequency */
@@ -724,10 +720,10 @@ local void scan_tree (s, tree, max_code)
int min_count = 4; /* min repeat count */
if (nextlen == 0) max_count = 138, min_count = 3;
tree[max_code+1].Len = (ush)0xffff; /* guard */
tree[max_code + 1].Len = (ush)0xffff; /* guard */
for (n = 0; n <= max_code; n++) {
curlen = nextlen; nextlen = tree[n+1].Len;
curlen = nextlen; nextlen = tree[n + 1].Len;
if (++count < max_count && curlen == nextlen) {
continue;
} else if (count < min_count) {
@@ -755,7 +751,7 @@ local void scan_tree (s, tree, max_code)
* Send a literal or distance tree in compressed form, using the codes in
* bl_tree.
*/
local void send_tree (s, tree, max_code)
local void send_tree(s, tree, max_code)
deflate_state *s;
ct_data *tree; /* the tree to be scanned */
int max_code; /* and its largest code of non zero frequency */
@@ -768,11 +764,11 @@ local void send_tree (s, tree, max_code)
int max_count = 7; /* max repeat count */
int min_count = 4; /* min repeat count */
/* tree[max_code+1].Len = -1; */ /* guard already set */
/* tree[max_code + 1].Len = -1; */ /* guard already set */
if (nextlen == 0) max_count = 138, min_count = 3;
for (n = 0; n <= max_code; n++) {
curlen = nextlen; nextlen = tree[n+1].Len;
curlen = nextlen; nextlen = tree[n + 1].Len;
if (++count < max_count && curlen == nextlen) {
continue;
} else if (count < min_count) {
@@ -783,13 +779,13 @@ local void send_tree (s, tree, max_code)
send_code(s, curlen, s->bl_tree); count--;
}
Assert(count >= 3 && count <= 6, " 3_6?");
send_code(s, REP_3_6, s->bl_tree); send_bits(s, count-3, 2);
send_code(s, REP_3_6, s->bl_tree); send_bits(s, count - 3, 2);
} else if (count <= 10) {
send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count-3, 3);
send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count - 3, 3);
} else {
send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count-11, 7);
send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count - 11, 7);
}
count = 0; prevlen = curlen;
if (nextlen == 0) {
@@ -817,8 +813,8 @@ local int build_bl_tree(s)
/* Build the bit length tree: */
build_tree(s, (tree_desc *)(&(s->bl_desc)));
/* opt_len now includes the length of the tree representations, except
* the lengths of the bit lengths codes and the 5+5+4 bits for the counts.
/* opt_len now includes the length of the tree representations, except the
* lengths of the bit lengths codes and the 5 + 5 + 4 bits for the counts.
*/
/* Determine the number of bit length codes to send. The pkzip format
@@ -829,8 +825,8 @@ local int build_bl_tree(s)
if (s->bl_tree[bl_order[max_blindex]].Len != 0) break;
}
/* Update opt_len to include the bit length tree and counts */
s->opt_len += 3*((ulg)max_blindex+1) + 5+5+4;
Tracev(("\ndyn trees: dyn %ld, stat %ld",
s->opt_len += 3*((ulg)max_blindex + 1) + 5 + 5 + 4;
Tracev(( "\ndyn trees: dyn %ld, stat %ld",
s->opt_len, s->static_len));
return max_blindex;
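    /* (editor's note) concretely: if all 19 bit-length codes must be sent
       (max_blindex == 18), the header costs 3*(18 + 1) + 5 + 5 + 4 = 71 bits
       before the compressed tree descriptions themselves. */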
@@ -850,21 +846,21 @@ local void send_all_trees(s, lcodes, dcodes, blcodes)
Assert (lcodes >= 257 && dcodes >= 1 && blcodes >= 4, "not enough codes");
Assert (lcodes <= L_CODES && dcodes <= D_CODES && blcodes <= BL_CODES,
"too many codes");
Tracev(("\nbl counts: "));
send_bits(s, lcodes-257, 5); /* not +255 as stated in appnote.txt */
send_bits(s, dcodes-1, 5);
send_bits(s, blcodes-4, 4); /* not -3 as stated in appnote.txt */
Tracev(( "\nbl counts: "));
send_bits(s, lcodes - 257, 5); /* not +255 as stated in appnote.txt */
send_bits(s, dcodes - 1, 5);
send_bits(s, blcodes - 4, 4); /* not -3 as stated in appnote.txt */
for (rank = 0; rank < blcodes; rank++) {
Tracev(("\nbl code %2d ", bl_order[rank]));
Tracev(( "\nbl code %2d ", bl_order[rank]));
send_bits(s, s->bl_tree[bl_order[rank]].Len, 3);
}
Tracev(("\nbl tree: sent %ld", s->bits_sent));
Tracev(( "\nbl tree: sent %ld", s->bits_sent));
send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1); /* literal tree */
Tracev(("\nlit tree: sent %ld", s->bits_sent));
send_tree(s, (ct_data *)s->dyn_ltree, lcodes - 1); /* literal tree */
Tracev(( "\nlit tree: sent %ld", s->bits_sent));
send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1); /* distance tree */
Tracev(("\ndist tree: sent %ld", s->bits_sent));
send_tree(s, (ct_data *)s->dyn_dtree, dcodes - 1); /* distance tree */
Tracev(( "\ndist tree: sent %ld", s->bits_sent));
}
/* ===========================================================================
@@ -876,7 +872,7 @@ void ZLIB_INTERNAL _tr_stored_block(s, buf, stored_len, last)
ulg stored_len; /* length of input block */
int last; /* one if this is the last block for a file */
{
send_bits(s, (STORED_BLOCK<<1)+last, 3); /* send block type */
send_bits(s, (STORED_BLOCK<<1) + last, 3); /* send block type */
bi_windup(s); /* align on byte boundary */
put_short(s, (ush)stored_len);
put_short(s, (ush)~stored_len);
@@ -887,7 +883,7 @@ void ZLIB_INTERNAL _tr_stored_block(s, buf, stored_len, last)
s->compressed_len = (s->compressed_len + 3 + 7) & (ulg)~7L;
s->compressed_len += (stored_len + 4) << 3;
s->bits_sent += 2*16;
s->bits_sent += stored_len<<3;
s->bits_sent += stored_len << 3;
#endif
}
@@ -937,11 +933,11 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
/* Construct the literal and distance trees */
build_tree(s, (tree_desc *)(&(s->l_desc)));
Tracev(("\nlit data: dyn %ld, stat %ld", s->opt_len,
Tracev(( "\nlit data: dyn %ld, stat %ld", s->opt_len,
s->static_len));
build_tree(s, (tree_desc *)(&(s->d_desc)));
Tracev(("\ndist data: dyn %ld, stat %ld", s->opt_len,
Tracev(( "\ndist data: dyn %ld, stat %ld", s->opt_len,
s->static_len));
/* At this point, opt_len and static_len are the total bit lengths of
* the compressed block data, excluding the tree representations.
@@ -953,14 +949,17 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
max_blindex = build_bl_tree(s);
/* Determine the best encoding. Compute the block lengths in bytes. */
opt_lenb = (s->opt_len+3+7)>>3;
static_lenb = (s->static_len+3+7)>>3;
opt_lenb = (s->opt_len + 3 + 7) >> 3;
static_lenb = (s->static_len + 3 + 7) >> 3;
Tracev(("\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ",
Tracev(( "\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ",
opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len,
s->sym_next / 3));
if (static_lenb <= opt_lenb) opt_lenb = static_lenb;
#ifndef FORCE_STATIC
if (static_lenb <= opt_lenb || s->strategy == Z_FIXED)
#endif
opt_lenb = static_lenb;
} else {
Assert(buf != (char*)0, "lost buf");
@@ -970,7 +969,7 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
#ifdef FORCE_STORED
if (buf != (char*)0) { /* force stored block */
#else
if (stored_len+4 <= opt_lenb && buf != (char*)0) {
if (stored_len + 4 <= opt_lenb && buf != (char*)0) {
/* 4: two words for the lengths */
#endif
/* The test buf != NULL is only necessary if LIT_BUFSIZE > WSIZE.
@@ -981,21 +980,17 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
*/
_tr_stored_block(s, buf, stored_len, last);
#ifdef FORCE_STATIC
} else if (static_lenb >= 0) { /* force static trees */
#else
} else if (s->strategy == Z_FIXED || static_lenb == opt_lenb) {
#endif
send_bits(s, (STATIC_TREES<<1)+last, 3);
} else if (static_lenb == opt_lenb) {
send_bits(s, (STATIC_TREES<<1) + last, 3);
compress_block(s, (const ct_data *)static_ltree,
(const ct_data *)static_dtree);
#ifdef ZLIB_DEBUG
s->compressed_len += 3 + s->static_len;
#endif
} else {
send_bits(s, (DYN_TREES<<1)+last, 3);
send_all_trees(s, s->l_desc.max_code+1, s->d_desc.max_code+1,
max_blindex+1);
send_bits(s, (DYN_TREES<<1) + last, 3);
send_all_trees(s, s->l_desc.max_code + 1, s->d_desc.max_code + 1,
max_blindex + 1);
compress_block(s, (const ct_data *)s->dyn_ltree,
(const ct_data *)s->dyn_dtree);
#ifdef ZLIB_DEBUG
@@ -1014,18 +1009,18 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
s->compressed_len += 7; /* align on byte boundary */
#endif
}
Tracev(("\ncomprlen %lu(%lu) ", s->compressed_len>>3,
s->compressed_len-7*last));
Tracev(("\ncomprlen %lu(%lu) ", s->compressed_len >> 3,
s->compressed_len - 7*last));
}
/* ===========================================================================
* Save the match info and tally the frequency counts. Return true if
* the current block must be flushed.
*/
int ZLIB_INTERNAL _tr_tally (s, dist, lc)
int ZLIB_INTERNAL _tr_tally(s, dist, lc)
deflate_state *s;
unsigned dist; /* distance of matched string */
unsigned lc; /* match length-MIN_MATCH or unmatched char (if dist==0) */
unsigned lc; /* match length - MIN_MATCH or unmatched char (dist==0) */
{
s->sym_buf[s->sym_next++] = (uch)dist;
s->sym_buf[s->sym_next++] = (uch)(dist >> 8);
@@ -1041,7 +1036,7 @@ int ZLIB_INTERNAL _tr_tally (s, dist, lc)
(ush)lc <= (ush)(MAX_MATCH-MIN_MATCH) &&
(ush)d_code(dist) < (ush)D_CODES, "_tr_tally: bad match");
s->dyn_ltree[_length_code[lc]+LITERALS+1].Freq++;
s->dyn_ltree[_length_code[lc] + LITERALS + 1].Freq++;
s->dyn_dtree[d_code(dist)].Freq++;
}
return (s->sym_next == s->sym_end);
@@ -1071,7 +1066,7 @@ local void compress_block(s, ltree, dtree)
} else {
/* Here, lc is the match length - MIN_MATCH */
code = _length_code[lc];
send_code(s, code+LITERALS+1, ltree); /* send the length code */
send_code(s, code + LITERALS + 1, ltree); /* send length code */
extra = extra_lbits[code];
if (extra != 0) {
lc -= base_length[code];
@@ -1187,6 +1182,6 @@ local void bi_windup(s)
s->bi_buf = 0;
s->bi_valid = 0;
#ifdef ZLIB_DEBUG
s->bits_sent = (s->bits_sent+7) & ~7;
s->bits_sent = (s->bits_sent + 7) & ~7;
#endif
}

View file

@@ -125,4 +125,3 @@ local const int base_dist[D_CODES] = {
32, 48, 64, 96, 128, 192, 256, 384, 512, 768,
1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576
};

View file

@@ -9,11 +9,6 @@
#include "third_party/zlib/internal.h"
#include "third_party/zlib/macros.internal.h"
#include "third_party/zlib/zlib.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* @(#) $Id$ */

View file

@@ -1696,6 +1696,11 @@ uLong adler32_combine(uLong adler1, uLong adler2, int64_t len2);
*/
uLong crc32(uLong crc, const Bytef *buf, uInt len);
/**
* Same as crc32(), but with a size_t length.
*/
uint32_t crc32_z(uint32_t crc, const void *buf, size_t len);
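/**
 * (Editor's note) Usage sketch, not part of this header: the size_t length
 * lets one call cover buffers larger than crc32()'s uInt argument allows:
 *
 *     uint32_t c = crc32_z(0, 0, 0);  // fetch the initial CRC value
 *     c = crc32_z(c, buf, bufsize);   // bufsize may be any size_t
 */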
/**
* Combine two CRC-32 check values into one. For two sequences of bytes,
* seq1 and seq2 with lengths len1 and len2, CRC-32 check values were

View file

@@ -9,6 +9,7 @@ THIRD_PARTY_ZLIB_A = o/$(MODE)/third_party/zlib/zlib.a
THIRD_PARTY_ZLIB_A_FILES := $(wildcard third_party/zlib/*)
THIRD_PARTY_ZLIB_A_HDRS = $(filter %.h,$(THIRD_PARTY_ZLIB_A_FILES))
THIRD_PARTY_ZLIB_A_SRCS = $(filter %.c,$(THIRD_PARTY_ZLIB_A_FILES))
THIRD_PARTY_ZLIB_A_INCS = $(filter %.inc,$(THIRD_PARTY_ZLIB_A_FILES))
THIRD_PARTY_ZLIB_A_OBJS = $(THIRD_PARTY_ZLIB_A_SRCS:%.c=o/$(MODE)/%.o)
THIRD_PARTY_ZLIB_A_CHECKS = \
@@ -18,6 +19,7 @@ THIRD_PARTY_ZLIB_A_CHECKS = \
THIRD_PARTY_ZLIB_A_DIRECTDEPS = \
LIBC_INTRIN \
LIBC_NEXGEN32E \
LIBC_SYSV \
LIBC_STR \
LIBC_STUBS
@@ -34,18 +36,35 @@ $(THIRD_PARTY_ZLIB_A).pkg: \
$(foreach x,$(THIRD_PARTY_ZLIB_A_DIRECTDEPS),$($(x)_A).pkg)
ifeq ($(ARCH), x86_64)
o/$(MODE)/third_party/zlib/adler32simd.o: private \
OVERRIDE_CFLAGS += \
o/$(MODE)/third_party/zlib/adler32_simd.o: private \
TARGET_ARCH += \
-mssse3
o/$(MODE)/third_party/zlib/adler32simd.o: private \
o/$(MODE)/third_party/zlib/crc_folding.o \
o/$(MODE)/third_party/zlib/crc32_simd.o: private \
TARGET_ARCH += \
-msse4.2 \
-mpclmul
$(THIRD_PARTY_ZLIB_A_OBJS): private \
OVERRIDE_CPPFLAGS += \
-DADLER32_SIMD_SSSE3
o/$(MODE)/third_party/zlib/adler32.o: private \
-DADLER32_SIMD_SSSE3 \
-DCRC32_SIMD_SSE42_PCLMUL \
-DDEFLATE_SLIDE_HASH_SSE2 \
-DINFLATE_CHUNK_SIMD_SSE2 \
-DINFLATE_CHUNK_READ_64LE
endif
ifeq ($(ARCH), aarch64)
o/$(MODE)/third_party/zlib/deflate.o \
o/$(MODE)/third_party/zlib/crc32_simd.o: private \
TARGET_ARCH += \
-march=armv8-a+aes+crc
$(THIRD_PARTY_ZLIB_A_OBJS): private \
OVERRIDE_CPPFLAGS += \
-DADLER32_SIMD_SSSE3
o/$(MODE)/third_party/zlib/deflate.o: private \
OVERRIDE_CPPFLAGS += \
-DCRC32_SIMD_SSE42_PCLMUL
-DADLER32_SIMD_NEON \
-DCRC32_ARMV8_CRC32 \
-DDEFLATE_SLIDE_HASH_NEON \
-DINFLATE_CHUNK_SIMD_NEON \
-DINFLATE_CHUNK_READ_64LE
endif
$(THIRD_PARTY_ZLIB_A_OBJS): private \
@@ -56,6 +75,7 @@ $(THIRD_PARTY_ZLIB_A_OBJS): private \
THIRD_PARTY_ZLIB_LIBS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)))
THIRD_PARTY_ZLIB_SRCS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_SRCS))
THIRD_PARTY_ZLIB_HDRS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_HDRS))
THIRD_PARTY_ZLIB_INCS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_INCS))
THIRD_PARTY_ZLIB_BINS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_BINS))
THIRD_PARTY_ZLIB_CHECKS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_CHECKS))
THIRD_PARTY_ZLIB_OBJS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_OBJS))

View file

@@ -10,11 +10,6 @@
#include "libc/mem/mem.h"
#include "third_party/zlib/internal.h"
#include "third_party/zlib/zutil.internal.h"
asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
/* @(#) $Id$ */

View file

@@ -128,6 +128,12 @@ typedef unsigned long ulg;
#endif
#endif
#ifdef _MSC_VER
#define zalign(x) __declspec(align(x))
#else
#define zalign(x) __attribute__((aligned((x))))
#endif
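/* (editor's note) usage sketch: zalign(16) requests 16-byte alignment
   portably, e.g. a hypothetical scratch buffer:
       static zalign(16) unsigned char crc_scratch[64];
   expanding to __declspec(align(16)) under MSVC and
   __attribute__((aligned(16))) elsewhere */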
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* ZUTIL_H */