Mirror of https://github.com/jart/cosmopolitan.git

Make AARCH64 harder, better, faster, stronger

- Perform some housekeeping on scalar math function code
- Import ARM's Optimized Routines for SIMD string processing
- Upgrade to latest Chromium zlib and enable more SIMD optimizations

parent 550b52abf6
commit cc1732bc42
143 changed files with 15661 additions and 1329 deletions
third_party/radpajama/common-gptneox.h (vendored, 1 line changed)

@@ -1,3 +1,4 @@
// -*- c++ -*-
#ifndef COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
#define COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
#include "libc/macros.internal.h"
third_party/radpajama/gptneox-util.h (vendored, 1 line changed)

@@ -1,3 +1,4 @@
// -*- c++ -*-
#ifndef GPTNEOX_UTIL_H
#define GPTNEOX_UTIL_H
#include "libc/calls/calls.h"
third_party/radpajama/gptneox.h (vendored, 1 line changed)

@@ -1,3 +1,4 @@
// -*- c++ -*-
#ifndef GPTNEOX_H
#define GPTNEOX_H
// clang-format off
third_party/zip/crc32.c (vendored, 2 lines changed)

@@ -687,8 +687,6 @@ ulg crc32(crc, buf, len)
   pointer, then initialize the crc shift register contents instead.
   Return the current crc in either case. */
{
  return crc32_z(crc,buf,len);

  register z_uint4 c;
  register ZCONST ulg near *crc_32_tab;
third_party/zlib/LICENSE.chromium (vendored, new file, 27 lines)

@@ -0,0 +1,27 @@
// Copyright 2015 The Chromium Authors
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//    * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//    * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//    * Neither the name of Google LLC nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
third_party/zlib/LICENSE.zlib (vendored, new file, 22 lines)

@@ -0,0 +1,22 @@
Copyright notice:

(C) 1995-2022 Jean-loup Gailly and Mark Adler

This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not
   claim that you wrote the original software. If you use this software
   in a product, an acknowledgment in the product documentation would be
   appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
   misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.

Jean-loup Gailly        Mark Adler
jloup@gzip.org          madler@alumni.caltech.edu
third_party/zlib/README.cosmo (vendored, 22 lines changed)

@@ -11,23 +11,9 @@ ORIGIN
The zlib sources were obtained from Chromium's zlib fork.

  https://chromium.googlesource.com/chromium/src/third_party/zlib
  commit 8f22e90f007a7dd466b426513725c13191248315
  Author: Hans Wennborg <hans@chromium.org>
  Date:   Fri Sep 16 16:14:51 2022 +0000

      [zlib][fuzz] Cap the input size for zlib_inflate_with_header_fuzzer

      To prevent timeouts when processing large inputs with small chunk sizes.

      Bug: 1362206
      Change-Id: Ie21ea48abf85ee49897243857bf84b0f32d24bd5
      Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3899099
      Reviewed-by: Adenilson Cavalcanti <cavalcantii@chromium.org>
      Auto-Submit: Hans Wennborg <hans@chromium.org>
      Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
      Cr-Commit-Position: refs/heads/main@{#1048044}
      NOKEYCHECK=True
      GitOrigin-RevId: fd75b8c2768e7cc3a3e7a06bc563bb03c5ba0ec2
  commit 14dd4c4455602c9b71a1a89b5cafd1f4030d2e3f
  Author: Adenilson Cavalcanti <cavalcantii@chromium.org>
  Date:   Tue Apr 11 17:40:40 2023 +0000

The source code for puff was obtained from zlib itself:

@@ -42,7 +28,7 @@ LOCAL CHANGES
  - Changed Trace(stderr) calls to use kprintf()

  - We use our own crc32() implementation from LIBC_STR
  - Made the type signature of crc32_z() less obnoxious

  - Fix a Chromium Zlib regression where malloc() failures inside
    deflateInit2() will result in a segmentation fault
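For context, the crc32_z() signature change mentioned above appears in the crc32.c hunks later in this commit. A short sketch of the before/after prototypes, rewritten here as ANSI declarations (the source uses K&R-style definitions, and the _old suffix is purely illustrative):

#include <stddef.h>
#include <stdint.h>

typedef size_t z_size_t;

/* before (stock zlib) */
unsigned long crc32_z_old(unsigned long crc, const unsigned char *buf,
                          z_size_t len);

/* after (this commit): plain fixed-width and void pointer types */
uint32_t crc32_z(uint32_t crc, const void *buf, size_t len);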
third_party/zlib/adler32.c (vendored, 37 lines changed)

@@ -1,22 +1,15 @@
// clang-format off
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
╚─────────────────────────────────────────────────────────────────────────────*/
// clang-format off
/* adler32.c -- compute the Adler-32 checksum of a data stream
 * Copyright (C) 1995-2011, 2016 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
#include "libc/nexgen32e/x86feature.h"
#include "third_party/zlib/macros.internal.h"
#include "third_party/zlib/zconf.h"

#include "third_party/zlib/zutil.internal.h"

asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off

/* @(#) $Id$ */
#include "third_party/zlib/macros.internal.h"

local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));

@@ -70,7 +63,10 @@ local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
#  define MOD63(a) a %= BASE
#endif

uint32_t ZLIB_INTERNAL adler32_simd_(uint32_t, const unsigned char *, z_size_t);
#include "third_party/zlib/cpu_features.internal.h"
#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON)
#include "third_party/zlib/adler32_simd.inc"
#endif

/* ========================================================================= */
uLong ZEXPORT adler32_z(adler, buf, len)

@@ -82,7 +78,7 @@ uLong ZEXPORT adler32_z(adler, buf, len)
    unsigned n;

#if defined(ADLER32_SIMD_SSSE3)
    if (buf != Z_NULL && len >= 64 && X86_HAVE(SSSE3))
    if (buf != Z_NULL && len >= 64 && x86_cpu_enable_ssse3)
        return adler32_simd_(adler, buf, len);
#elif defined(ADLER32_SIMD_NEON)
    if (buf != Z_NULL && len >= 64)

@@ -104,9 +100,24 @@ uLong ZEXPORT adler32_z(adler, buf, len)
        return adler | (sum2 << 16);
    }

#if defined(ADLER32_SIMD_SSSE3)
    /*
     * Use SSSE3 to compute the adler32. Since this routine can be
     * freely used, check CPU features here. zlib convention is to
     * call adler32(0, NULL, 0), before making calls to adler32().
     * So this is a good early (and infrequent) place to cache CPU
     * features for those later, more interesting adler32() calls.
     */
    if (buf == Z_NULL) {
        if (!len) /* Assume user is calling adler32(0, NULL, 0); */
            cpu_check_features();
        return 1L;
    }
#else
    /* initial Adler-32 value (deferred check for len == 1 speed) */
    if (buf == Z_NULL)
        return 1L;
#endif

    /* in case short lengths are provided, keep it somewhat fast */
    if (len < 16) {
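For reference, a minimal usage sketch of the convention the comment above relies on, assuming the standard zlib API (the header lives at third_party/zlib/zlib.h in this tree; <zlib.h> and the buffer contents are illustrative only):

#include <stdio.h>
#include <zlib.h>

int main(void) {
  unsigned char data[1024] = {0};              /* illustrative payload */
  uLong adler = adler32(0, Z_NULL, 0);         /* returns 1 and, in this port,
                                                  primes cpu_check_features() */
  adler = adler32(adler, data, sizeof(data));  /* bulk update may take the
                                                  SSSE3/NEON path */
  printf("adler32 = %08lx\n", adler);
  return 0;
}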
third_party/zlib/adler32_simd.c (vendored, new file, 371 lines)

@@ -0,0 +1,371 @@
asm(".ident\t\"\\n\\n\
|
||||
Chromium (BSD-3 License)\\n\
|
||||
Copyright 2017 The Chromium Authors\"");
|
||||
// clang-format off
|
||||
|
||||
/* adler32_simd.c
|
||||
*
|
||||
* Copyright 2017 The Chromium Authors
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the Chromium source repository LICENSE file.
|
||||
*
|
||||
* Per http://en.wikipedia.org/wiki/Adler-32 the adler32 A value (aka s1) is
|
||||
* the sum of N input data bytes D1 ... DN,
|
||||
*
|
||||
* A = A0 + D1 + D2 + ... + DN
|
||||
*
|
||||
* where A0 is the initial value.
|
||||
*
|
||||
* SSE2 _mm_sad_epu8() can be used for byte sums (see http://bit.ly/2wpUOeD,
|
||||
* for example) and accumulating the byte sums can use SSE shuffle-adds (see
|
||||
* the "Integer" section of http://bit.ly/2erPT8t for details). Arm NEON has
|
||||
* similar instructions.
|
||||
*
|
||||
* The adler32 B value (aka s2) sums the A values from each step:
|
||||
*
|
||||
* B0 + (A0 + D1) + (A0 + D1 + D2) + ... + (A0 + D1 + D2 + ... + DN) or
|
||||
*
|
||||
* B0 + N.A0 + N.D1 + (N-1).D2 + (N-2).D3 + ... + (N-(N-1)).DN
|
||||
*
|
||||
* B0 being the initial value. For 32 bytes (ideal for garden-variety SIMD):
|
||||
*
|
||||
* B = B0 + 32.A0 + [D1 D2 D3 ... D32] x [32 31 30 ... 1].
|
||||
*
|
||||
* Adjacent blocks of 32 input bytes can be iterated with the expressions to
|
||||
* compute the adler32 s1 s2 of M >> 32 input bytes [1].
|
||||
*
|
||||
* As M grows, the s1 s2 sums grow. If left unchecked, they would eventually
|
||||
* overflow the precision of their integer representation (bad). However, s1
|
||||
* and s2 also need to be computed modulo the adler BASE value (reduced). If
|
||||
* at most NMAX bytes are processed before a reduce, s1 s2 _cannot_ overflow
|
||||
* a uint32_t type (the NMAX constraint) [2].
|
||||
*
|
||||
* [1] the iterative equations for s2 contain constant factors; these can be
|
||||
* hoisted from the n-blocks do loop of the SIMD code.
|
||||
*
|
||||
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
|
||||
* of the adler s1 s2 of uint32_t type (see adler32.c).
|
||||
*/
|
||||
|
||||
#include "third_party/zlib/adler32_simd.inc"
|
||||
|
||||
/* Definitions from adler32.c: largest prime smaller than 65536 */
|
||||
#define BASE 65521U
|
||||
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
|
||||
#define NMAX 5552
|
||||
|
||||
#if defined(ADLER32_SIMD_SSSE3)
|
||||
|
||||
#include "third_party/intel/tmmintrin.internal.h"
|
||||
|
||||
uint32_t ZLIB_INTERNAL adler32_simd_( /* SSSE3 */
|
||||
uint32_t adler,
|
||||
const unsigned char *buf,
|
||||
z_size_t len)
|
||||
{
|
||||
/*
|
||||
* Split Adler-32 into component sums.
|
||||
*/
|
||||
uint32_t s1 = adler & 0xffff;
|
||||
uint32_t s2 = adler >> 16;
|
||||
|
||||
/*
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
|
||||
z_size_t blocks = len / BLOCK_SIZE;
|
||||
len -= blocks * BLOCK_SIZE;
|
||||
|
||||
while (blocks)
|
||||
{
|
||||
unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
|
||||
if (n > blocks)
|
||||
n = (unsigned) blocks;
|
||||
blocks -= n;
|
||||
|
||||
const __m128i tap1 =
|
||||
_mm_setr_epi8(32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17);
|
||||
const __m128i tap2 =
|
||||
_mm_setr_epi8(16,15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
|
||||
const __m128i zero =
|
||||
_mm_setr_epi8( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
const __m128i ones =
|
||||
_mm_set_epi16( 1, 1, 1, 1, 1, 1, 1, 1);
|
||||
|
||||
/*
|
||||
* Process n blocks of data. At most NMAX data bytes can be
|
||||
* processed before s2 must be reduced modulo BASE.
|
||||
*/
|
||||
__m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
|
||||
__m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
|
||||
__m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);
|
||||
|
||||
do {
|
||||
/*
|
||||
* Load 32 input bytes.
|
||||
*/
|
||||
const __m128i bytes1 = _mm_loadu_si128((__m128i*)(buf));
|
||||
const __m128i bytes2 = _mm_loadu_si128((__m128i*)(buf + 16));
|
||||
|
||||
/*
|
||||
* Add previous block byte sum to v_ps.
|
||||
*/
|
||||
v_ps = _mm_add_epi32(v_ps, v_s1);
|
||||
|
||||
/*
|
||||
* Horizontally add the bytes for s1, multiply-adds the
|
||||
* bytes by [ 32, 31, 30, ... ] for s2.
|
||||
*/
|
||||
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero));
|
||||
const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1);
|
||||
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones));
|
||||
|
||||
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero));
|
||||
const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
|
||||
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));
|
||||
|
||||
buf += BLOCK_SIZE;
|
||||
|
||||
} while (--n);
|
||||
|
||||
v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));
|
||||
|
||||
/*
|
||||
* Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
|
||||
*/
|
||||
|
||||
#define S23O1 _MM_SHUFFLE(2,3,0,1) /* A B C D -> B A D C */
|
||||
#define S1O32 _MM_SHUFFLE(1,0,3,2) /* A B C D -> C D A B */
|
||||
|
||||
v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1));
|
||||
v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32));
|
||||
|
||||
s1 += _mm_cvtsi128_si32(v_s1);
|
||||
|
||||
v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1));
|
||||
v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32));
|
||||
|
||||
s2 = _mm_cvtsi128_si32(v_s2);
|
||||
|
||||
#undef S23O1
|
||||
#undef S1O32
|
||||
|
||||
/*
|
||||
* Reduce.
|
||||
*/
|
||||
s1 %= BASE;
|
||||
s2 %= BASE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle leftover data.
|
||||
*/
|
||||
if (len) {
|
||||
if (len >= 16) {
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
|
||||
len -= 16;
|
||||
}
|
||||
|
||||
while (len--) {
|
||||
s2 += (s1 += *buf++);
|
||||
}
|
||||
|
||||
if (s1 >= BASE)
|
||||
s1 -= BASE;
|
||||
s2 %= BASE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the recombined sums.
|
||||
*/
|
||||
return s1 | (s2 << 16);
|
||||
}
|
||||
|
||||
#elif defined(ADLER32_SIMD_NEON)
|
||||
|
||||
#include "third_party/aarch64/arm_neon.h"
|
||||
|
||||
uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
|
||||
uint32_t adler,
|
||||
const unsigned char *buf,
|
||||
z_size_t len)
|
||||
{
|
||||
/*
|
||||
* Split Adler-32 into component sums.
|
||||
*/
|
||||
uint32_t s1 = adler & 0xffff;
|
||||
uint32_t s2 = adler >> 16;
|
||||
|
||||
/*
|
||||
* Serially compute s1 & s2, until the data is 16-byte aligned.
|
||||
*/
|
||||
if ((uintptr_t)buf & 15) {
|
||||
while ((uintptr_t)buf & 15) {
|
||||
s2 += (s1 += *buf++);
|
||||
--len;
|
||||
}
|
||||
|
||||
if (s1 >= BASE)
|
||||
s1 -= BASE;
|
||||
s2 %= BASE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
|
||||
z_size_t blocks = len / BLOCK_SIZE;
|
||||
len -= blocks * BLOCK_SIZE;
|
||||
|
||||
while (blocks)
|
||||
{
|
||||
unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
|
||||
if (n > blocks)
|
||||
n = (unsigned) blocks;
|
||||
blocks -= n;
|
||||
|
||||
/*
|
||||
* Process n blocks of data. At most NMAX data bytes can be
|
||||
* processed before s2 must be reduced modulo BASE.
|
||||
*/
|
||||
uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, s1 * n };
|
||||
uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
|
||||
|
||||
uint16x8_t v_column_sum_1 = vdupq_n_u16(0);
|
||||
uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
|
||||
uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
|
||||
uint16x8_t v_column_sum_4 = vdupq_n_u16(0);
|
||||
|
||||
do {
|
||||
/*
|
||||
* Load 32 input bytes.
|
||||
*/
|
||||
const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(buf));
|
||||
const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(buf + 16));
|
||||
|
||||
/*
|
||||
* Add previous block byte sum to v_s2.
|
||||
*/
|
||||
v_s2 = vaddq_u32(v_s2, v_s1);
|
||||
|
||||
/*
|
||||
* Horizontally add the bytes for s1.
|
||||
*/
|
||||
v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2));
|
||||
|
||||
/*
|
||||
* Vertically add the bytes for s2.
|
||||
*/
|
||||
v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8 (bytes1));
|
||||
v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1));
|
||||
v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8 (bytes2));
|
||||
v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));
|
||||
|
||||
buf += BLOCK_SIZE;
|
||||
|
||||
} while (--n);
|
||||
|
||||
v_s2 = vshlq_n_u32(v_s2, 5);
|
||||
|
||||
/*
|
||||
* Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
|
||||
*/
|
||||
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_1),
|
||||
(uint16x4_t) { 32, 31, 30, 29 });
|
||||
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1),
|
||||
(uint16x4_t) { 28, 27, 26, 25 });
|
||||
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_2),
|
||||
(uint16x4_t) { 24, 23, 22, 21 });
|
||||
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2),
|
||||
(uint16x4_t) { 20, 19, 18, 17 });
|
||||
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_3),
|
||||
(uint16x4_t) { 16, 15, 14, 13 });
|
||||
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3),
|
||||
(uint16x4_t) { 12, 11, 10, 9 });
|
||||
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_4),
|
||||
(uint16x4_t) { 8, 7, 6, 5 });
|
||||
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4),
|
||||
(uint16x4_t) { 4, 3, 2, 1 });
|
||||
|
||||
/*
|
||||
* Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
|
||||
*/
|
||||
uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1));
|
||||
uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2));
|
||||
uint32x2_t s1s2 = vpadd_u32(sum1, sum2);
|
||||
|
||||
s1 += vget_lane_u32(s1s2, 0);
|
||||
s2 += vget_lane_u32(s1s2, 1);
|
||||
|
||||
/*
|
||||
* Reduce.
|
||||
*/
|
||||
s1 %= BASE;
|
||||
s2 %= BASE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle leftover data.
|
||||
*/
|
||||
if (len) {
|
||||
if (len >= 16) {
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
|
||||
len -= 16;
|
||||
}
|
||||
|
||||
while (len--) {
|
||||
s2 += (s1 += *buf++);
|
||||
}
|
||||
|
||||
if (s1 >= BASE)
|
||||
s1 -= BASE;
|
||||
s2 %= BASE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the recombined sums.
|
||||
*/
|
||||
return s1 | (s2 << 16);
|
||||
}
|
||||
|
||||
#endif /* ADLER32_SIMD_SSSE3 */
|
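The header comment of adler32_simd.c above describes the 32-byte block recurrence that both the SSSE3 and NEON kernels implement. A scalar sketch of that recurrence (a hypothetical helper, not part of the port; the SIMD code computes the same s1/s2 values with _mm_sad_epu8/_mm_maddubs_epi16 or vpadal/vmlal):

#include <stdint.h>

#define BASE 65521U  /* largest prime smaller than 65536, as in adler32.c */

static uint32_t adler32_block32_scalar(uint32_t adler,
                                       const unsigned char d[32]) {
  uint32_t s1 = adler & 0xffff, s2 = adler >> 16;
  /* B = B0 + 32.A0 + [D1..D32] x [32..1],  A = A0 + D1 + ... + D32 */
  s2 += 32 * s1;
  for (int i = 0; i < 32; i++) {
    s1 += d[i];
    s2 += (32 - i) * d[i];   /* weights 32, 31, ..., 1 */
  }
  s1 %= BASE;                /* reduce before the sums can overflow */
  s2 %= BASE;
  return s1 | (s2 << 16);
}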
third_party/zlib/adler32_simd.inc (vendored, new file, 15 lines)

@@ -0,0 +1,15 @@
/* adler32_simd.h
 *
 * Copyright 2017 The Chromium Authors
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 */

#include "libc/inttypes.h"
#include "libc/limits.h"
#include "libc/literal.h"
#include "third_party/zlib/zconf.h"
#include "third_party/zlib/zutil.internal.h"

uint32_t ZLIB_INTERNAL adler32_simd_(uint32_t adler, const unsigned char *buf,
                                     z_size_t len);
third_party/zlib/adler32simd.c (vendored, deleted, 198 lines)

@@ -1,198 +0,0 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2017 The Chromium Authors │
|
||||
│ Use of this source code is governed by the BSD-style licenses that can │
|
||||
│ be found in the third_party/zlib/LICENSE file. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "third_party/intel/emmintrin.internal.h"
|
||||
#include "third_party/intel/tmmintrin.internal.h"
|
||||
#include "third_party/zlib/internal.h"
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
Chromium (BSD-3 License)\\n\
|
||||
Copyright 2017 The Chromium Authors\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
|
||||
/**
|
||||
* Per http://en.wikipedia.org/wiki/Adler-32 the adler32 A value (aka s1) is
|
||||
* the sum of N input data bytes D1 ... DN,
|
||||
*
|
||||
* A = A0 + D1 + D2 + ... + DN
|
||||
*
|
||||
* where A0 is the initial value.
|
||||
*
|
||||
* SSE2 _mm_sad_epu8() can be used for byte sums (see http://bit.ly/2wpUOeD,
|
||||
* for example) and accumulating the byte sums can use SSE shuffle-adds (see
|
||||
* the "Integer" section of http://bit.ly/2erPT8t for details). Arm NEON has
|
||||
* similar instructions.
|
||||
*
|
||||
* The adler32 B value (aka s2) sums the A values from each step:
|
||||
*
|
||||
* B0 + (A0 + D1) + (A0 + D1 + D2) + ... + (A0 + D1 + D2 + ... + DN) or
|
||||
*
|
||||
* B0 + N.A0 + N.D1 + (N-1).D2 + (N-2).D3 + ... + (N-(N-1)).DN
|
||||
*
|
||||
* B0 being the initial value. For 32 bytes (ideal for garden-variety SIMD):
|
||||
*
|
||||
* B = B0 + 32.A0 + [D1 D2 D3 ... D32] x [32 31 30 ... 1].
|
||||
*
|
||||
* Adjacent blocks of 32 input bytes can be iterated with the expressions to
|
||||
* compute the adler32 s1 s2 of M >> 32 input bytes [1].
|
||||
*
|
||||
* As M grows, the s1 s2 sums grow. If left unchecked, they would eventually
|
||||
* overflow the precision of their integer representation (bad). However, s1
|
||||
* and s2 also need to be computed modulo the adler BASE value (reduced). If
|
||||
* at most NMAX bytes are processed before a reduce, s1 s2 _cannot_ overflow
|
||||
* a uint32_t type (the NMAX constraint) [2].
|
||||
*
|
||||
* [1] the iterative equations for s2 contain constant factors; these can be
|
||||
* hoisted from the n-blocks do loop of the SIMD code.
|
||||
*
|
||||
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
|
||||
* of the adler s1 s2 of uint32_t type (see adler32.c).
|
||||
*/
|
||||
|
||||
/* Definitions from adler32.c: largest prime smaller than 65536 */
|
||||
#define BASE 65521U
|
||||
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
|
||||
#define NMAX 5552
|
||||
|
||||
#ifdef ADLER32_SIMD_SSSE3
|
||||
|
||||
uint32_t adler32_simd_(uint32_t adler, const unsigned char *buf, size_t len) {
|
||||
/*
|
||||
* Split Adler-32 into component sums.
|
||||
*/
|
||||
uint32_t s1 = adler & 0xffff;
|
||||
uint32_t s2 = adler >> 16;
|
||||
|
||||
/*
|
||||
* Process the data in blocks.
|
||||
*/
|
||||
const unsigned BLOCK_SIZE = 1 << 5;
|
||||
|
||||
size_t blocks = len / BLOCK_SIZE;
|
||||
len -= blocks * BLOCK_SIZE;
|
||||
|
||||
while (blocks) {
|
||||
unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
|
||||
if (n > blocks) n = (unsigned)blocks;
|
||||
blocks -= n;
|
||||
|
||||
const __m128i tap1 = _mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23,
|
||||
22, 21, 20, 19, 18, 17);
|
||||
const __m128i tap2 =
|
||||
_mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
|
||||
const __m128i zero =
|
||||
_mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
|
||||
const __m128i ones = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1);
|
||||
|
||||
/*
|
||||
* Process n blocks of data. At most NMAX data bytes can be
|
||||
* processed before s2 must be reduced modulo BASE.
|
||||
*/
|
||||
__m128i v_ps = _mm_set_epi32(0, 0, 0, s1 * n);
|
||||
__m128i v_s2 = _mm_set_epi32(0, 0, 0, s2);
|
||||
__m128i v_s1 = _mm_set_epi32(0, 0, 0, 0);
|
||||
|
||||
do {
|
||||
/*
|
||||
* Load 32 input bytes.
|
||||
*/
|
||||
const __m128i bytes1 = _mm_loadu_si128((__m128i *)(buf));
|
||||
const __m128i bytes2 = _mm_loadu_si128((__m128i *)(buf + 16));
|
||||
|
||||
/*
|
||||
* Add previous block byte sum to v_ps.
|
||||
*/
|
||||
v_ps = _mm_add_epi32(v_ps, v_s1);
|
||||
|
||||
/*
|
||||
* Horizontally add the bytes for s1, multiply-adds the
|
||||
* bytes by [ 32, 31, 30, ... ] for s2.
|
||||
*/
|
||||
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zero));
|
||||
const __m128i mad1 = _mm_maddubs_epi16(bytes1, tap1);
|
||||
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad1, ones));
|
||||
|
||||
v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zero));
|
||||
const __m128i mad2 = _mm_maddubs_epi16(bytes2, tap2);
|
||||
v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(mad2, ones));
|
||||
|
||||
buf += BLOCK_SIZE;
|
||||
|
||||
} while (--n);
|
||||
|
||||
v_s2 = _mm_add_epi32(v_s2, _mm_slli_epi32(v_ps, 5));
|
||||
|
||||
/*
|
||||
* Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
|
||||
*/
|
||||
|
||||
#define S23O1 _MM_SHUFFLE(2, 3, 0, 1) /* A B C D -> B A D C */
|
||||
#define S1O32 _MM_SHUFFLE(1, 0, 3, 2) /* A B C D -> C D A B */
|
||||
|
||||
v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S23O1));
|
||||
v_s1 = _mm_add_epi32(v_s1, _mm_shuffle_epi32(v_s1, S1O32));
|
||||
|
||||
s1 += _mm_cvtsi128_si32(v_s1);
|
||||
|
||||
v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S23O1));
|
||||
v_s2 = _mm_add_epi32(v_s2, _mm_shuffle_epi32(v_s2, S1O32));
|
||||
|
||||
s2 = _mm_cvtsi128_si32(v_s2);
|
||||
|
||||
#undef S23O1
|
||||
#undef S1O32
|
||||
|
||||
/*
|
||||
* Reduce.
|
||||
*/
|
||||
s1 %= BASE;
|
||||
s2 %= BASE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle leftover data.
|
||||
*/
|
||||
if (len) {
|
||||
if (len >= 16) {
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
s2 += (s1 += *buf++);
|
||||
|
||||
len -= 16;
|
||||
}
|
||||
|
||||
while (len--) {
|
||||
s2 += (s1 += *buf++);
|
||||
}
|
||||
|
||||
if (s1 >= BASE) s1 -= BASE;
|
||||
s2 %= BASE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return the recombined sums.
|
||||
*/
|
||||
return s1 | (s2 << 16);
|
||||
}
|
||||
|
||||
#endif /* ADLER32_SIMD_SSSE3 */
|
third_party/zlib/chunkcopy.inc (vendored, new file, 491 lines)

@@ -0,0 +1,491 @@
// clang-format off
|
||||
/* chunkcopy.h -- fast chunk copy and set operations
|
||||
* Copyright (C) 2017 ARM, Inc.
|
||||
* Copyright 2017 The Chromium Authors
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the Chromium source repository LICENSE file.
|
||||
*/
|
||||
|
||||
#ifndef CHUNKCOPY_H
|
||||
#define CHUNKCOPY_H
|
||||
#include "libc/inttypes.h"
|
||||
#include "libc/limits.h"
|
||||
#include "libc/literal.h"
|
||||
#include "third_party/zlib/zutil.internal.h"
|
||||
|
||||
#define Z_STATIC_ASSERT(name, assert) typedef char name[(assert) ? 1 : -1]
|
||||
|
||||
#if __STDC_VERSION__ >= 199901L
|
||||
#define Z_RESTRICT restrict
|
||||
#else
|
||||
#define Z_RESTRICT
|
||||
#endif
|
||||
|
||||
#if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
|
||||
#define Z_BUILTIN_MEMCPY __builtin_memcpy
|
||||
#else
|
||||
#define Z_BUILTIN_MEMCPY zmemcpy
|
||||
#endif
|
||||
|
||||
#if defined(INFLATE_CHUNK_SIMD_NEON)
|
||||
#include "third_party/aarch64/arm_neon.h"
|
||||
typedef uint8x16_t z_vec128i_t;
|
||||
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
|
||||
#include "third_party/intel/emmintrin.internal.h"
|
||||
typedef __m128i z_vec128i_t;
|
||||
#else
|
||||
#error chunkcopy.h inflate chunk SIMD is not defined for your build target
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Suppress MSan errors about copying uninitialized bytes (crbug.com/1376033).
|
||||
*/
|
||||
#define Z_DISABLE_MSAN
|
||||
#if defined(__has_feature)
|
||||
#if __has_feature(memory_sanitizer)
|
||||
#undef Z_DISABLE_MSAN
|
||||
#define Z_DISABLE_MSAN __attribute__((no_sanitize("memory")))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
* chunk copy type: the z_vec128i_t type size should be exactly 128-bits
|
||||
* and equal to CHUNKCOPY_CHUNK_SIZE.
|
||||
*/
|
||||
#define CHUNKCOPY_CHUNK_SIZE sizeof(z_vec128i_t)
|
||||
|
||||
Z_STATIC_ASSERT(vector_128_bits_wide,
|
||||
CHUNKCOPY_CHUNK_SIZE == sizeof(int8_t) * 16);
|
||||
|
||||
/*
|
||||
* Ask the compiler to perform a wide, unaligned load with a machine
|
||||
* instruction appropriate for the z_vec128i_t type.
|
||||
*/
|
||||
static inline z_vec128i_t loadchunk(
|
||||
const unsigned char FAR* s) Z_DISABLE_MSAN {
|
||||
z_vec128i_t v;
|
||||
Z_BUILTIN_MEMCPY(&v, s, sizeof(v));
|
||||
return v;
|
||||
}
|
||||
|
||||
/*
|
||||
* Ask the compiler to perform a wide, unaligned store with a machine
|
||||
* instruction appropriate for the z_vec128i_t type.
|
||||
*/
|
||||
static inline void storechunk(
|
||||
unsigned char FAR* d,
|
||||
const z_vec128i_t v) {
|
||||
Z_BUILTIN_MEMCPY(d, &v, sizeof(v));
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform a memcpy-like operation, assuming that length is non-zero and that
|
||||
* it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
|
||||
* the length is shorter than this.
|
||||
*
|
||||
* It also guarantees that it will properly unroll the data if the distance
|
||||
* between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on
|
||||
* in chunkcopy_relaxed().
|
||||
*
|
||||
* Aside from better memory bus utilisation, this means that short copies
|
||||
* (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop
|
||||
* without iteration, which will hopefully make the branch prediction more
|
||||
* reliable.
|
||||
*/
|
||||
static inline unsigned char FAR* chunkcopy_core(
|
||||
unsigned char FAR* out,
|
||||
const unsigned char FAR* from,
|
||||
unsigned len) Z_DISABLE_MSAN {
|
||||
const int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
|
||||
storechunk(out, loadchunk(from));
|
||||
out += bump;
|
||||
from += bump;
|
||||
len /= CHUNKCOPY_CHUNK_SIZE;
|
||||
while (len-- > 0) {
|
||||
storechunk(out, loadchunk(from));
|
||||
out += CHUNKCOPY_CHUNK_SIZE;
|
||||
from += CHUNKCOPY_CHUNK_SIZE;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Like chunkcopy_core(), but avoid writing beyond of legal output.
|
||||
*
|
||||
* Accepts an additional pointer to the end of safe output. A generic safe
|
||||
* copy would use (out + len), but it's normally the case that the end of the
|
||||
* output buffer is beyond the end of the current copy, and this can still be
|
||||
* exploited.
|
||||
*/
|
||||
static inline unsigned char FAR* chunkcopy_core_safe(
|
||||
unsigned char FAR* out,
|
||||
const unsigned char FAR* from,
|
||||
unsigned len,
|
||||
unsigned char FAR* limit) {
|
||||
Assert(out + len <= limit, "chunk copy exceeds safety limit");
|
||||
if ((limit - out) < (ptrdiff_t)CHUNKCOPY_CHUNK_SIZE) {
|
||||
const unsigned char FAR* Z_RESTRICT rfrom = from;
|
||||
Assert((uintptr_t)out - (uintptr_t)from >= len,
|
||||
"invalid restrict in chunkcopy_core_safe");
|
||||
Assert((uintptr_t)from - (uintptr_t)out >= len,
|
||||
"invalid restrict in chunkcopy_core_safe");
|
||||
if (len & 8) {
|
||||
Z_BUILTIN_MEMCPY(out, rfrom, 8);
|
||||
out += 8;
|
||||
rfrom += 8;
|
||||
}
|
||||
if (len & 4) {
|
||||
Z_BUILTIN_MEMCPY(out, rfrom, 4);
|
||||
out += 4;
|
||||
rfrom += 4;
|
||||
}
|
||||
if (len & 2) {
|
||||
Z_BUILTIN_MEMCPY(out, rfrom, 2);
|
||||
out += 2;
|
||||
rfrom += 2;
|
||||
}
|
||||
if (len & 1) {
|
||||
*out++ = *rfrom++;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
return chunkcopy_core(out, from, len);
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform short copies until distance can be rewritten as being at least
|
||||
* CHUNKCOPY_CHUNK_SIZE.
|
||||
*
|
||||
* Assumes it's OK to overwrite at least the first 2*CHUNKCOPY_CHUNK_SIZE
|
||||
* bytes of output even if the copy is shorter than this. This assumption
|
||||
* holds within zlib inflate_fast(), which starts every iteration with at
|
||||
* least 258 bytes of output space available (258 being the maximum length
|
||||
* output from a single token; see inffast.c).
|
||||
*/
|
||||
static inline unsigned char FAR* chunkunroll_relaxed(
|
||||
unsigned char FAR* out,
|
||||
unsigned FAR* dist,
|
||||
unsigned FAR* len) Z_DISABLE_MSAN {
|
||||
const unsigned char FAR* from = out - *dist;
|
||||
while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) {
|
||||
storechunk(out, loadchunk(from));
|
||||
out += *dist;
|
||||
*len -= *dist;
|
||||
*dist += *dist;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
#if defined(INFLATE_CHUNK_SIMD_NEON)
|
||||
/*
|
||||
* v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
|
||||
* every 64-bit component of the 128-bit result (64-bit int splat).
|
||||
*/
|
||||
static inline z_vec128i_t v_load64_dup(const void* src) {
|
||||
return vcombine_u8(vld1_u8(src), vld1_u8(src));
|
||||
}
|
||||
|
||||
/*
|
||||
* v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
|
||||
* every 32-bit component of the 128-bit result (32-bit int splat).
|
||||
*/
|
||||
static inline z_vec128i_t v_load32_dup(const void* src) {
|
||||
int32_t i32;
|
||||
Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
|
||||
return vreinterpretq_u8_s32(vdupq_n_s32(i32));
|
||||
}
|
||||
|
||||
/*
|
||||
* v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
|
||||
* every 16-bit component of the 128-bit result (16-bit int splat).
|
||||
*/
|
||||
static inline z_vec128i_t v_load16_dup(const void* src) {
|
||||
int16_t i16;
|
||||
Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
|
||||
return vreinterpretq_u8_s16(vdupq_n_s16(i16));
|
||||
}
|
||||
|
||||
/*
|
||||
* v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
|
||||
* component of the 128-bit result (8-bit int splat).
|
||||
*/
|
||||
static inline z_vec128i_t v_load8_dup(const void* src) {
|
||||
return vld1q_dup_u8((const uint8_t*)src);
|
||||
}
|
||||
|
||||
/*
|
||||
* v_store_128(): store the 128-bit vec in a memory destination (that might
|
||||
* not be 16-byte aligned) void* out.
|
||||
*/
|
||||
static inline void v_store_128(void* out, const z_vec128i_t vec) {
|
||||
vst1q_u8(out, vec);
|
||||
}
|
||||
|
||||
#elif defined(INFLATE_CHUNK_SIMD_SSE2)
|
||||
/*
|
||||
* v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
|
||||
* every 64-bit component of the 128-bit result (64-bit int splat).
|
||||
*/
|
||||
static inline z_vec128i_t v_load64_dup(const void* src) {
|
||||
int64_t i64;
|
||||
Z_BUILTIN_MEMCPY(&i64, src, sizeof(i64));
|
||||
return _mm_set1_epi64x(i64);
|
||||
}
|
||||
|
||||
/*
|
||||
* v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
|
||||
* every 32-bit component of the 128-bit result (32-bit int splat).
|
||||
*/
|
||||
static inline z_vec128i_t v_load32_dup(const void* src) {
|
||||
int32_t i32;
|
||||
Z_BUILTIN_MEMCPY(&i32, src, sizeof(i32));
|
||||
return _mm_set1_epi32(i32);
|
||||
}
|
||||
|
||||
/*
|
||||
* v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
|
||||
* every 16-bit component of the 128-bit result (16-bit int splat).
|
||||
*/
|
||||
static inline z_vec128i_t v_load16_dup(const void* src) {
|
||||
int16_t i16;
|
||||
Z_BUILTIN_MEMCPY(&i16, src, sizeof(i16));
|
||||
return _mm_set1_epi16(i16);
|
||||
}
|
||||
|
||||
/*
|
||||
* v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
|
||||
* component of the 128-bit result (8-bit int splat).
|
||||
*/
|
||||
static inline z_vec128i_t v_load8_dup(const void* src) {
|
||||
return _mm_set1_epi8(*(const char*)src);
|
||||
}
|
||||
|
||||
/*
|
||||
* v_store_128(): store the 128-bit vec in a memory destination (that might
|
||||
* not be 16-byte aligned) void* out.
|
||||
*/
|
||||
static inline void v_store_128(void* out, const z_vec128i_t vec) {
|
||||
_mm_storeu_si128((__m128i*)out, vec);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Perform an overlapping copy which behaves as a memset() operation, but
|
||||
* supporting periods other than one, and assume that length is non-zero and
|
||||
* that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output
|
||||
* even if the length is shorter than this.
|
||||
*/
|
||||
static inline unsigned char FAR* chunkset_core(
|
||||
unsigned char FAR* out,
|
||||
unsigned period,
|
||||
unsigned len) {
|
||||
z_vec128i_t v;
|
||||
const int bump = ((len - 1) % sizeof(v)) + 1;
|
||||
|
||||
switch (period) {
|
||||
case 1:
|
||||
v = v_load8_dup(out - 1);
|
||||
v_store_128(out, v);
|
||||
out += bump;
|
||||
len -= bump;
|
||||
while (len > 0) {
|
||||
v_store_128(out, v);
|
||||
out += sizeof(v);
|
||||
len -= sizeof(v);
|
||||
}
|
||||
return out;
|
||||
case 2:
|
||||
v = v_load16_dup(out - 2);
|
||||
v_store_128(out, v);
|
||||
out += bump;
|
||||
len -= bump;
|
||||
if (len > 0) {
|
||||
v = v_load16_dup(out - 2);
|
||||
do {
|
||||
v_store_128(out, v);
|
||||
out += sizeof(v);
|
||||
len -= sizeof(v);
|
||||
} while (len > 0);
|
||||
}
|
||||
return out;
|
||||
case 4:
|
||||
v = v_load32_dup(out - 4);
|
||||
v_store_128(out, v);
|
||||
out += bump;
|
||||
len -= bump;
|
||||
if (len > 0) {
|
||||
v = v_load32_dup(out - 4);
|
||||
do {
|
||||
v_store_128(out, v);
|
||||
out += sizeof(v);
|
||||
len -= sizeof(v);
|
||||
} while (len > 0);
|
||||
}
|
||||
return out;
|
||||
case 8:
|
||||
v = v_load64_dup(out - 8);
|
||||
v_store_128(out, v);
|
||||
out += bump;
|
||||
len -= bump;
|
||||
if (len > 0) {
|
||||
v = v_load64_dup(out - 8);
|
||||
do {
|
||||
v_store_128(out, v);
|
||||
out += sizeof(v);
|
||||
len -= sizeof(v);
|
||||
} while (len > 0);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
out = chunkunroll_relaxed(out, &period, &len);
|
||||
return chunkcopy_core(out, out - period, len);
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform a memcpy-like operation, but assume that length is non-zero and that
|
||||
* it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
|
||||
* the length is shorter than this.
|
||||
*
|
||||
* Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour
|
||||
* of overlapping buffers, regardless of the distance between the pointers.
|
||||
* This is reflected in the `restrict`-qualified pointers, allowing the
|
||||
* compiler to re-order loads and stores.
|
||||
*/
|
||||
static inline unsigned char FAR* chunkcopy_relaxed(
|
||||
unsigned char FAR* Z_RESTRICT out,
|
||||
const unsigned char FAR* Z_RESTRICT from,
|
||||
unsigned len) {
|
||||
Assert((uintptr_t)out - (uintptr_t)from >= len,
|
||||
"invalid restrict in chunkcopy_relaxed");
|
||||
Assert((uintptr_t)from - (uintptr_t)out >= len,
|
||||
"invalid restrict in chunkcopy_relaxed");
|
||||
return chunkcopy_core(out, from, len);
|
||||
}
|
||||
|
||||
/*
|
||||
* Like chunkcopy_relaxed(), but avoid writing beyond of legal output.
|
||||
*
|
||||
* Unlike chunkcopy_core_safe() above, no guarantee is made regarding the
|
||||
* behaviour of overlapping buffers, regardless of the distance between the
|
||||
* pointers. This is reflected in the `restrict`-qualified pointers, allowing
|
||||
* the compiler to re-order loads and stores.
|
||||
*
|
||||
* Accepts an additional pointer to the end of safe output. A generic safe
|
||||
* copy would use (out + len), but it's normally the case that the end of the
|
||||
* output buffer is beyond the end of the current copy, and this can still be
|
||||
* exploited.
|
||||
*/
|
||||
static inline unsigned char FAR* chunkcopy_safe(
|
||||
unsigned char FAR* out,
|
||||
const unsigned char FAR* Z_RESTRICT from,
|
||||
unsigned len,
|
||||
unsigned char FAR* limit) {
|
||||
Assert(out + len <= limit, "chunk copy exceeds safety limit");
|
||||
Assert((uintptr_t)out - (uintptr_t)from >= len,
|
||||
"invalid restrict in chunkcopy_safe");
|
||||
Assert((uintptr_t)from - (uintptr_t)out >= len,
|
||||
"invalid restrict in chunkcopy_safe");
|
||||
|
||||
return chunkcopy_core_safe(out, from, len, limit);
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform chunky copy within the same buffer, where the source and destination
|
||||
* may potentially overlap.
|
||||
*
|
||||
* Assumes that len > 0 on entry, and that it's safe to write at least
|
||||
* CHUNKCOPY_CHUNK_SIZE*3 bytes to the output.
|
||||
*/
|
||||
static inline unsigned char FAR* chunkcopy_lapped_relaxed(
|
||||
unsigned char FAR* out,
|
||||
unsigned dist,
|
||||
unsigned len) {
|
||||
if (dist < len && dist < CHUNKCOPY_CHUNK_SIZE) {
|
||||
return chunkset_core(out, dist, len);
|
||||
}
|
||||
return chunkcopy_core(out, out - dist, len);
|
||||
}
|
||||
|
||||
/*
|
||||
* Behave like chunkcopy_lapped_relaxed(), but avoid writing beyond of legal
|
||||
* output.
|
||||
*
|
||||
* Accepts an additional pointer to the end of safe output. A generic safe
|
||||
* copy would use (out + len), but it's normally the case that the end of the
|
||||
* output buffer is beyond the end of the current copy, and this can still be
|
||||
* exploited.
|
||||
*/
|
||||
static inline unsigned char FAR* chunkcopy_lapped_safe(
|
||||
unsigned char FAR* out,
|
||||
unsigned dist,
|
||||
unsigned len,
|
||||
unsigned char FAR* limit) {
|
||||
Assert(out + len <= limit, "chunk copy exceeds safety limit");
|
||||
if ((limit - out) < (ptrdiff_t)(3 * CHUNKCOPY_CHUNK_SIZE)) {
|
||||
/* TODO(cavalcantii): try harder to optimise this */
|
||||
while (len-- > 0) {
|
||||
*out = *(out - dist);
|
||||
out++;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
return chunkcopy_lapped_relaxed(out, dist, len);
|
||||
}
|
||||
|
||||
/* TODO(cavalcanti): see crbug.com/1110083. */
|
||||
static inline unsigned char FAR* chunkcopy_safe_ugly(unsigned char FAR* out,
|
||||
unsigned dist,
|
||||
unsigned len,
|
||||
unsigned char FAR* limit) {
|
||||
#if defined(__GNUC__) && !defined(__clang__)
|
||||
/* Speed is the same as using chunkcopy_safe
|
||||
w/ GCC on ARM (tested gcc 6.3 and 7.5) and avoids
|
||||
undefined behavior.
|
||||
*/
|
||||
return chunkcopy_core_safe(out, out - dist, len, limit);
|
||||
#elif defined(__clang__) && defined(ARMV8_OS_ANDROID) && !defined(__aarch64__)
|
||||
/* Seems to perform better on 32bit (i.e. Android). */
|
||||
return chunkcopy_core_safe(out, out - dist, len, limit);
|
||||
#else
|
||||
/* Seems to perform better on 64bit. */
|
||||
return chunkcopy_lapped_safe(out, dist, len, limit);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* The chunk-copy code above deals with writing the decoded DEFLATE data to
|
||||
* the output with SIMD methods to increase decode speed. Reading the input
|
||||
* to the DEFLATE decoder with a wide, SIMD method can also increase decode
|
||||
* speed. This option is supported on little endian machines, and reads the
|
||||
* input data in 64-bit (8 byte) chunks.
|
||||
*/
|
||||
|
||||
#ifdef INFLATE_CHUNK_READ_64LE
|
||||
/*
|
||||
* Buffer the input in a uint64_t (8 bytes) in the wide input reading case.
|
||||
*/
|
||||
typedef uint64_t inflate_holder_t;
|
||||
|
||||
/*
|
||||
* Ask the compiler to perform a wide, unaligned load of a uint64_t using a
|
||||
* machine instruction appropriate for the uint64_t type.
|
||||
*/
|
||||
static inline inflate_holder_t read64le(const unsigned char FAR *in) {
|
||||
inflate_holder_t input;
|
||||
Z_BUILTIN_MEMCPY(&input, in, sizeof(input));
|
||||
return input;
|
||||
}
|
||||
#else
|
||||
/*
|
||||
* Otherwise, buffer the input bits using zlib's default input buffer type.
|
||||
*/
|
||||
typedef unsigned long inflate_holder_t;
|
||||
|
||||
#endif /* INFLATE_CHUNK_READ_64LE */
|
||||
|
||||
#undef Z_STATIC_ASSERT
|
||||
#undef Z_RESTRICT
|
||||
#undef Z_BUILTIN_MEMCPY
|
||||
#undef Z_DISABLE_MSAN
|
||||
|
||||
#endif /* CHUNKCOPY_H */
|
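The core idiom in chunkcopy.inc above is loadchunk()/storechunk(): an unaligned 128-bit load or store expressed through __builtin_memcpy so the compiler emits a single wide vector move. A minimal sketch of that idiom, assuming the SSE2 variant (INFLATE_CHUNK_SIMD_SSE2) and using the host <emmintrin.h> instead of the port's internal header; copy_chunks() is an illustrative caller, not a function from the file:

#include <string.h>
#include <emmintrin.h>

typedef __m128i z_vec128i_t;

static inline z_vec128i_t loadchunk(const unsigned char *s) {
  z_vec128i_t v;
  __builtin_memcpy(&v, s, sizeof(v));   /* wide unaligned load */
  return v;
}

static inline void storechunk(unsigned char *d, const z_vec128i_t v) {
  __builtin_memcpy(d, &v, sizeof(v));   /* wide unaligned store */
}

/* Copy len bytes in 16-byte chunks; the real code instead relies on being
 * allowed to overwrite up to CHUNKCOPY_CHUNK_SIZE bytes past the copy end. */
static unsigned char *copy_chunks(unsigned char *out,
                                  const unsigned char *from, unsigned len) {
  while (len >= sizeof(z_vec128i_t)) {
    storechunk(out, loadchunk(from));
    out += sizeof(z_vec128i_t);
    from += sizeof(z_vec128i_t);
    len -= sizeof(z_vec128i_t);
  }
  if (len) memcpy(out, from, len);      /* scalar tail for this sketch */
  return out;
}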
third_party/zlib/compress.c (vendored, 5 lines changed)

@@ -8,11 +8,6 @@
#include "third_party/zlib/internal.h"
#include "third_party/zlib/macros.internal.h"
#include "third_party/zlib/zlib.h"

asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off

/* @(#) $Id$ */
third_party/zlib/cpu_features.c (vendored, new file, 37 lines)

@@ -0,0 +1,37 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ Permission to use, copy, modify, and/or distribute this software for        │
│ any purpose with or without fee is hereby granted, provided that the        │
│ above copyright notice and this permission notice appear in all copies.     │
│                                                                              │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL               │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED               │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE            │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL        │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR       │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER              │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR            │
│ PERFORMANCE OF THIS SOFTWARE.                                               │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/dce.h"
#include "libc/runtime/runtime.h"
#include "libc/sysv/consts/auxv.h"
#include "libc/sysv/consts/hwap.h"
#include "third_party/zlib/cpu_features.internal.h"
#include "third_party/zlib/zlib.h"

int arm_cpu_enable_crc32;
int arm_cpu_enable_pmull;

void(cpu_check_features)(void) {
#if defined(__aarch64__) && defined(__ARM_NEON)
  if (IsLinux()) {
    unsigned long features = getauxval(AT_HWCAP);
    arm_cpu_enable_crc32 = !!(features & HWCAP_CRC32);
    arm_cpu_enable_pmull = !!(features & HWCAP_PMULL);
  }
#endif
}
third_party/zlib/cpu_features.internal.h (vendored, new file, 24 lines)

@@ -0,0 +1,24 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_ZLIB_CPU_FEATURES_H_
#define COSMOPOLITAN_THIRD_PARTY_ZLIB_CPU_FEATURES_H_
#include "libc/nexgen32e/x86feature.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#ifdef __x86_64__

#define x86_cpu_enable_sse2   X86_HAVE(SSE2)
#define x86_cpu_enable_ssse3  X86_HAVE(SSSE3)
#define x86_cpu_enable_simd   (X86_HAVE(SSE4_2) && X86_HAVE(PCLMUL))
#define x86_cpu_enable_avx512 X86_HAVE(AVX512F)
#define cpu_check_features()  (void)0

#elif defined(__aarch64__)

#define cpu_check_features zlib_cpu_check_features
_Hide extern int arm_cpu_enable_crc32;
_Hide extern int arm_cpu_enable_pmull;
_Hide void cpu_check_features(void);

#endif
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_ZLIB_CPU_FEATURES_H_ */
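A hypothetical sketch of how a call site consumes these flags: on x86_64 the macros expand to X86_HAVE() bit tests, while on aarch64 the globals are filled in by cpu_check_features() from getauxval(AT_HWCAP). The sum_* functions below are illustrative stand-ins, not part of the port, and the real zlib code primes the flags once via adler32(0, NULL, 0) / crc32(0, NULL, 0) rather than on every call:

#include <stddef.h>
#include <stdint.h>
#include "third_party/zlib/cpu_features.internal.h"

static uint32_t sum_scalar(uint32_t s, const unsigned char *p, size_t n) {
  while (n--) s += *p++;               /* stand-in for the portable loop */
  return s;
}

static uint32_t sum_simd(uint32_t s, const unsigned char *p, size_t n) {
  return sum_scalar(s, p, n);          /* stand-in for an SSSE3/NEON kernel */
}

uint32_t sum_dispatch(uint32_t s, const unsigned char *p, size_t n) {
#if defined(__x86_64__)
  if (x86_cpu_enable_ssse3 && n >= 64) /* expands to X86_HAVE(SSSE3) */
    return sum_simd(s, p, n);
#elif defined(__aarch64__)
  cpu_check_features();                /* fills arm_cpu_enable_crc32/pmull */
  if (arm_cpu_enable_crc32 && n >= 64)
    return sum_simd(s, p, n);
#endif
  return sum_scalar(s, p, n);          /* portable fallback */
}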
third_party/zlib/crc32.c (vendored, 75 lines changed)

@@ -13,11 +13,6 @@
#include "third_party/zlib/deflate.internal.h"
#include "third_party/zlib/internal.h"
#include "third_party/zlib/macros.internal.h"

asm(".ident\t\"\\n\\n\
zlib (zlib License)\\n\
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off

/* @(#) $Id$ */

@@ -33,8 +28,19 @@ asm(".include \"libc/disclaimer.inc\"");
   produced, so that this one source file can be compiled to an executable.
 */

#ifdef MAKECRCH
//#  include <stdio.h>
#  ifndef DYNAMIC_CRC_TABLE
#    define DYNAMIC_CRC_TABLE
#  endif /* !DYNAMIC_CRC_TABLE */
#endif /* MAKECRCH */

#include "third_party/zlib/deflate.internal.h"
#include "third_party/zlib/cpu_features.internal.h"
#include "third_party/zlib/zutil.internal.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */

#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
#  include "crc32_simd.h"
#include "third_party/zlib/crc32_simd.internal.h"
#endif

/*

@@ -106,16 +112,25 @@ asm(".include \"libc/disclaimer.inc\"");
#  endif
#endif

/* Local functions. */
local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));

/* If available, use the ARM processor CRC32 instruction. */
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8 \
    && defined(USE_CANONICAL_ARMV8_CRC32)
#  define ARMCRC32_CANONICAL_ZLIB
#endif

/* Local functions. */
local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));

#if defined(W) && (!defined(ARMCRC32_CANONICAL_ZLIB) || defined(DYNAMIC_CRC_TABLE))
local z_word_t byte_swap OF((z_word_t word));
#endif

#if defined(W) && !defined(ARMCRC32_CANONICAL_ZLIB)
local z_crc_t crc_word OF((z_word_t data));
local z_word_t crc_word_big OF((z_word_t data));
#endif

#if defined(W) && (!defined(ARMCRC32_CANONICAL_ZLIB) || defined(DYNAMIC_CRC_TABLE))
/*
  Swap the bytes in a z_word_t to convert between little and big endian. Any

@@ -149,7 +164,6 @@ local z_word_t byte_swap(word)
/* CRC polynomial. */
#define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */

#define DYNAMIC_CRC_TABLE
#ifdef DYNAMIC_CRC_TABLE

local z_crc_t FAR crc_table[256];

@@ -534,7 +548,7 @@ local void braid(ltl, big, n, w)
 * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
 * of x for combining CRC-32s, all made by make_crc_table().
 */
#  include "crc32.h"
#include "third_party/zlib/crc32.inc"
#endif /* DYNAMIC_CRC_TABLE */

/* ========================================================================

@@ -617,15 +631,15 @@ const z_crc_t FAR * ZEXPORT get_crc_table()
#define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 */
#define Z_BATCH_MIN 800 /* fewest words in a final batch */

#error this is arm?
unsigned long ZEXPORT crc32_z(crc, buf, len)
    unsigned long crc;
    const unsigned char FAR *buf;
    z_size_t len;
uint32_t ZEXPORT crc32_z(crc, buf_, len)
    uint32_t crc;
    const void FAR *buf_;
    size_t len;
{
    z_crc_t val;
    z_word_t crc1, crc2;
    const z_word_t *word;
    const unsigned char FAR *buf = buf_;
    z_word_t val0, val1, val2;
    z_size_t last, last2, i;
    z_size_t num;

@@ -743,13 +757,13 @@ local z_word_t crc_word_big(data)

#endif

#if 0 /* [jart] favor LIBC_STR crc32() */
/* ========================================================================= */
unsigned long ZEXPORT crc32_z(crc, buf, len)
    unsigned long crc;
    const unsigned char FAR *buf;
    z_size_t len;
uint32_t ZEXPORT crc32_z(crc, buf_, len)
    uint32_t crc;
    const void FAR *buf_;
    size_t len;
{
    const unsigned char FAR *buf = buf_;
    /*
     * zlib convention is to call crc32(0, NULL, 0); before making
     * calls to crc32(). So this is a good, early (and infrequent)

@@ -767,7 +781,19 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
    }

#endif
#if defined(CRC32_SIMD_SSE42_PCLMUL)
#if defined(CRC32_SIMD_AVX512_PCLMUL)
    if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) {
        /* crc32 64-byte chunks */
        z_size_t chunk_size = len & ~Z_CRC32_AVX512_CHUNKSIZE_MASK;
        crc = ~crc32_avx512_simd_(buf, chunk_size, ~(uint32_t)crc);
        /* check remaining data */
        len -= chunk_size;
        if (!len)
            return crc;
        /* Fall into the default crc32 for the remaining data. */
        buf += chunk_size;
    }
#elif defined(CRC32_SIMD_SSE42_PCLMUL)
    if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
        /* crc32 16-byte chunks */
        z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;

@@ -1114,11 +1140,9 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
    /* Return the CRC, post-conditioned. */
    return crc ^ 0xffffffff;
}
#endif

#endif

#if 0 /* [jart] favor LIBC_STR crc32() */
/* ========================================================================= */
unsigned long ZEXPORT crc32(crc, buf, len)
    unsigned long crc;

@@ -1162,7 +1186,6 @@ unsigned long ZEXPORT crc32(crc, buf, len)
#endif
    return crc32_z(crc, buf, len); /* Armv7 or Armv8 w/o crypto extensions. */
}
#endif

/* ========================================================================= */
uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
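The hunks above dispatch the bulk of the buffer to a SIMD kernel and finish the tail with the table-driven loop. A minimal sketch of that pattern, with hypothetical stand-ins (the real code calls crc32_avx512_simd_()/crc32_sse42_simd_() and takes the chunk mask and minimum length from crc32_simd.internal.h; the SIMD kernel operates on the pre-conditioned ~crc value, hence the double complement):

#include <stddef.h>
#include <stdint.h>

static uint32_t crc32_bitwise(uint32_t crc, const unsigned char *p, size_t n) {
  crc = ~crc;                                   /* pre-condition */
  while (n--) {
    crc ^= *p++;
    for (int k = 0; k < 8; k++)
      crc = (crc >> 1) ^ (0xedb88320u & -(crc & 1));  /* POLY from crc32.c */
  }
  return ~crc;                                  /* post-condition */
}

/* stand-in for the PCLMUL kernel: takes/returns pre-conditioned values */
static uint32_t simd_kernel_stub(const unsigned char *p, size_t n, uint32_t c) {
  return ~crc32_bitwise(~c, p, n);
}

uint32_t crc32_chunked(uint32_t crc, const unsigned char *buf, size_t len) {
  if (len >= 256) {                             /* illustrative minimum */
    size_t chunk = len & ~(size_t)63;           /* largest multiple of 64 */
    crc = ~simd_kernel_stub(buf, chunk, ~crc);  /* SIMD for the bulk */
    buf += chunk;
    len -= chunk;
    if (!len) return crc;
  }
  return crc32_bitwise(crc, buf, len);          /* table-driven tail in zlib */
}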
|
9448
third_party/zlib/crc32.inc
vendored
Normal file
9448
third_party/zlib/crc32.inc
vendored
Normal file
File diff suppressed because it is too large
Load diff
third_party/zlib/crc32_simd.c (vendored, new file, 626 lines)

@@ -0,0 +1,626 @@
/* crc32_simd.c
|
||||
*
|
||||
* Copyright 2017 The Chromium Authors
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the Chromium source repository LICENSE file.
|
||||
*/
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
Chromium (BSD-3 License)\\n\
|
||||
Copyright 2017 The Chromium Authors\"");
|
||||
// clang-format off
|
||||
|
||||
#include "third_party/zlib/crc32_simd.internal.h"
|
||||
#if defined(CRC32_SIMD_AVX512_PCLMUL)
|
||||
|
||||
/*
|
||||
* crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer
|
||||
* length must be at least 256, and a multiple of 64. Based on:
|
||||
*
|
||||
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
|
||||
* V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
|
||||
*/
|
||||
|
||||
#include "third_party/intel/emmintrin.internal.h"
|
||||
#include "third_party/intel/smmintrin.internal.h"
|
||||
#include "third_party/intel/wmmintrin.internal.h"
|
||||
#include "third_party/intel/immintrin.internal.h"
|
||||
|
||||
uint32_t ZLIB_INTERNAL crc32_avx512_simd_( /* AVX512+PCLMUL */
|
||||
const unsigned char *buf,
|
||||
z_size_t len,
|
||||
uint32_t crc)
|
||||
{
|
||||
/*
|
||||
* Definitions of the bit-reflected domain constants k1,k2,k3,k4
|
||||
* are similar to those given at the end of the paper, and remaining
|
||||
* constants and CRC32+Barrett polynomials remain unchanged.
|
||||
*
|
||||
* Replace the index of x from 128 to 512. As follows:
|
||||
* k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a
|
||||
* k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430
|
||||
* k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4
|
||||
* k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596
|
||||
*/
|
||||
static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430,
|
||||
0x011542778a, 0x01322d1430,
|
||||
0x011542778a, 0x01322d1430,
|
||||
0x011542778a, 0x01322d1430 };
|
||||
static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596,
|
||||
0x0154442bd4, 0x01c6e41596,
|
||||
0x0154442bd4, 0x01c6e41596,
|
||||
0x0154442bd4, 0x01c6e41596 };
|
||||
static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e };
|
||||
static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 };
|
||||
static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
|
||||
__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
|
||||
__m128i a0, a1, a2, a3;
|
||||
|
||||
/*
|
||||
* There's at least one block of 256.
|
||||
*/
|
||||
x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
|
||||
x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
|
||||
x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
|
||||
x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
|
||||
|
||||
x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
|
||||
|
||||
x0 = _mm512_load_si512((__m512i *)k1k2);
|
||||
|
||||
buf += 256;
|
||||
len -= 256;
|
||||
|
||||
/*
|
||||
* Parallel fold blocks of 256, if any.
|
||||
*/
|
||||
while (len >= 256)
|
||||
{
|
||||
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
|
||||
x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
|
||||
x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
|
||||
x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
|
||||
|
||||
|
||||
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
|
||||
x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
|
||||
x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
|
||||
x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
|
||||
|
||||
y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
|
||||
y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
|
||||
y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
|
||||
y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
|
||||
|
||||
x1 = _mm512_xor_si512(x1, x5);
|
||||
x2 = _mm512_xor_si512(x2, x6);
|
||||
x3 = _mm512_xor_si512(x3, x7);
|
||||
x4 = _mm512_xor_si512(x4, x8);
|
||||
|
||||
x1 = _mm512_xor_si512(x1, y5);
|
||||
x2 = _mm512_xor_si512(x2, y6);
|
||||
x3 = _mm512_xor_si512(x3, y7);
|
||||
x4 = _mm512_xor_si512(x4, y8);
|
||||
|
||||
buf += 256;
|
||||
len -= 256;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fold into 512-bits.
|
||||
*/
|
||||
x0 = _mm512_load_si512((__m512i *)k3k4);
|
||||
|
||||
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
|
||||
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
|
||||
x1 = _mm512_xor_si512(x1, x2);
|
||||
x1 = _mm512_xor_si512(x1, x5);
|
||||
|
||||
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
|
||||
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
|
||||
x1 = _mm512_xor_si512(x1, x3);
|
||||
x1 = _mm512_xor_si512(x1, x5);
|
||||
|
||||
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
|
||||
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
|
||||
x1 = _mm512_xor_si512(x1, x4);
|
||||
x1 = _mm512_xor_si512(x1, x5);
|
||||
|
||||
/*
|
||||
* Single fold blocks of 64, if any.
|
||||
*/
|
||||
while (len >= 64)
|
||||
{
|
||||
x2 = _mm512_loadu_si512((__m512i *)buf);
|
||||
|
||||
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
|
||||
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
|
||||
x1 = _mm512_xor_si512(x1, x2);
|
||||
x1 = _mm512_xor_si512(x1, x5);
|
||||
|
||||
buf += 64;
|
||||
len -= 64;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fold 512-bits to 384-bits.
|
||||
*/
|
||||
a0 = _mm_load_si128((__m128i *)k5k6);
|
||||
|
||||
a1 = _mm512_extracti32x4_epi32(x1, 0);
|
||||
a2 = _mm512_extracti32x4_epi32(x1, 1);
|
||||
|
||||
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
|
||||
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
|
||||
|
||||
a1 = _mm_xor_si128(a1, a3);
|
||||
a1 = _mm_xor_si128(a1, a2);
|
||||
|
||||
/*
|
||||
* Fold 384-bits to 256-bits.
|
||||
*/
|
||||
a2 = _mm512_extracti32x4_epi32(x1, 2);
|
||||
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
|
||||
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
|
||||
a1 = _mm_xor_si128(a1, a3);
|
||||
a1 = _mm_xor_si128(a1, a2);
|
||||
|
||||
/*
|
||||
* Fold 256-bits to 128-bits.
|
||||
*/
|
||||
a2 = _mm512_extracti32x4_epi32(x1, 3);
|
||||
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
|
||||
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
|
||||
a1 = _mm_xor_si128(a1, a3);
|
||||
a1 = _mm_xor_si128(a1, a2);
|
||||
|
||||
/*
|
||||
* Fold 128-bits to 64-bits.
|
||||
*/
|
||||
a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
|
||||
a3 = _mm_setr_epi32(~0, 0, ~0, 0);
|
||||
a1 = _mm_srli_si128(a1, 8);
|
||||
a1 = _mm_xor_si128(a1, a2);
|
||||
|
||||
a0 = _mm_loadl_epi64((__m128i*)k7k8);
|
||||
a2 = _mm_srli_si128(a1, 4);
|
||||
a1 = _mm_and_si128(a1, a3);
|
||||
a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
|
||||
a1 = _mm_xor_si128(a1, a2);
|
||||
|
||||
/*
|
||||
* Barrett reduce to 32 bits.
|
||||
*/
|
||||
a0 = _mm_load_si128((__m128i*)poly);
|
||||
|
||||
a2 = _mm_and_si128(a1, a3);
|
||||
a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
|
||||
a2 = _mm_and_si128(a2, a3);
|
||||
a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
|
||||
a1 = _mm_xor_si128(a1, a2);
|
||||
|
||||
/*
|
||||
* Return the crc32.
|
||||
*/
|
||||
return _mm_extract_epi32(a1, 1);
|
||||
}
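Because the folding constants above are easy to get subtly wrong, a bit-at-a-time reference for the same reflected CRC-32 (polynomial 0xEDB88320) makes a convenient cross-check. This sketch is purely illustrative and is not part of the imported file:

#include <stddef.h>
#include <stdint.h>

/* Reference reflected CRC-32, one bit at a time. Slow, but any buffer fed
 * through the folded kernels above must produce the same value. */
static uint32_t crc32_reference(uint32_t crc, const unsigned char *buf,
                                size_t len) {
    crc = ~crc;
    while (len--) {
        crc ^= *buf++;
        for (int k = 0; k < 8; k++)
            crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1));
    }
    return ~crc;
}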
|
||||
|
||||
#elif defined(CRC32_SIMD_SSE42_PCLMUL)
|
||||
|
||||
/*
|
||||
* crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
|
||||
* length must be at least 64, and a multiple of 16.
|
||||
*/
|
||||
|
||||
#include "third_party/intel/emmintrin.internal.h"
|
||||
#include "third_party/intel/smmintrin.internal.h"
|
||||
#include "third_party/intel/wmmintrin.internal.h"
|
||||
|
||||
uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */
|
||||
const unsigned char *buf,
|
||||
z_size_t len,
|
||||
uint32_t crc)
|
||||
{
|
||||
/*
|
||||
* Definitions of the bit-reflected domain constants k1,k2,k3, etc and
|
||||
* the CRC32+Barrett polynomials given at the end of the paper.
|
||||
*/
|
||||
static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
|
||||
static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
|
||||
static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
|
||||
static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
|
||||
|
||||
__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
|
||||
|
||||
/*
|
||||
* There's at least one block of 64.
|
||||
*/
|
||||
x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
|
||||
x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
|
||||
x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
|
||||
x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
|
||||
|
||||
x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
|
||||
|
||||
x0 = _mm_load_si128((__m128i *)k1k2);
|
||||
|
||||
buf += 64;
|
||||
len -= 64;
|
||||
|
||||
/*
|
||||
* Parallel fold blocks of 64, if any.
|
||||
*/
|
||||
while (len >= 64)
|
||||
{
|
||||
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
|
||||
x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
|
||||
x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
|
||||
x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
|
||||
|
||||
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
|
||||
x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
|
||||
x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
|
||||
x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
|
||||
|
||||
y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
|
||||
y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
|
||||
y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
|
||||
y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
|
||||
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
x2 = _mm_xor_si128(x2, x6);
|
||||
x3 = _mm_xor_si128(x3, x7);
|
||||
x4 = _mm_xor_si128(x4, x8);
|
||||
|
||||
x1 = _mm_xor_si128(x1, y5);
|
||||
x2 = _mm_xor_si128(x2, y6);
|
||||
x3 = _mm_xor_si128(x3, y7);
|
||||
x4 = _mm_xor_si128(x4, y8);
|
||||
|
||||
buf += 64;
|
||||
len -= 64;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fold into 128-bits.
|
||||
*/
|
||||
x0 = _mm_load_si128((__m128i *)k3k4);
|
||||
|
||||
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
|
||||
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
|
||||
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
|
||||
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
|
||||
x1 = _mm_xor_si128(x1, x3);
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
|
||||
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
|
||||
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
|
||||
x1 = _mm_xor_si128(x1, x4);
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
|
||||
/*
|
||||
* Single fold blocks of 16, if any.
|
||||
*/
|
||||
while (len >= 16)
|
||||
{
|
||||
x2 = _mm_loadu_si128((__m128i *)buf);
|
||||
|
||||
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
|
||||
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
|
||||
buf += 16;
|
||||
len -= 16;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fold 128-bits to 64-bits.
|
||||
*/
|
||||
x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
|
||||
x3 = _mm_setr_epi32(~0, 0, ~0, 0);
|
||||
x1 = _mm_srli_si128(x1, 8);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
|
||||
x0 = _mm_loadl_epi64((__m128i*)k5k0);
|
||||
|
||||
x2 = _mm_srli_si128(x1, 4);
|
||||
x1 = _mm_and_si128(x1, x3);
|
||||
x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
|
||||
/*
|
||||
* Barrett reduce to 32 bits.
|
||||
*/
|
||||
x0 = _mm_load_si128((__m128i*)poly);
|
||||
|
||||
x2 = _mm_and_si128(x1, x3);
|
||||
x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
|
||||
x2 = _mm_and_si128(x2, x3);
|
||||
x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
|
||||
/*
|
||||
* Return the crc32.
|
||||
*/
|
||||
return _mm_extract_epi32(x1, 1);
|
||||
}
|
||||
|
||||
#elif defined(CRC32_ARMV8_CRC32)
|
||||
|
||||
/* CRC32 checksums using ARMv8-a crypto instructions.
|
||||
*/
|
||||
|
||||
#if defined(__clang__)
|
||||
/* We need some extra types for using PMULL.
|
||||
*/
|
||||
#if defined(__aarch64__)
|
||||
#include "third_party/aarch64/arm_neon.h"
|
||||
#include "third_party/aarch64/arm_acle.h"
|
||||
#endif
|
||||
|
||||
/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
|
||||
* armv8 target, which is incompatible with ThinLTO optimizations on Android.
|
||||
* (Namely, mixing and matching different module-level targets makes ThinLTO
|
||||
* warn, and Android defaults to armv7-a. This restriction does not apply to
|
||||
* function-level `target`s, however.)
|
||||
*
|
||||
* Since we only need four crc intrinsics, and since clang's implementation of
|
||||
* those are just wrappers around compiler builtins, it's simplest to #define
|
||||
* those builtins directly. If this #define list grows too much (or we depend on
|
||||
* an intrinsic that isn't a trivial wrapper), we may have to find a better way
|
||||
* to go about this.
|
||||
*
|
||||
* NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
|
||||
* feature for this target (ignoring feature)." This appears to be a harmless
|
||||
* bug in clang.
|
||||
*
|
||||
* These definitions must appear *after* including arm_acle.h otherwise that
|
||||
* header may end up defining functions named __builtin_arm_crc32* that call
|
||||
* themselves, creating an infinite loop when the intrinsic is called.
|
||||
*/
|
||||
/* XXX: Cannot hook into builtins with XCode for arm64. */
|
||||
#if !defined(ARMV8_OS_MACOS)
|
||||
#define __crc32b __builtin_arm_crc32b
|
||||
#define __crc32d __builtin_arm_crc32d
|
||||
#define __crc32w __builtin_arm_crc32w
|
||||
#define __crc32cw __builtin_arm_crc32cw
|
||||
#endif
|
||||
|
||||
#if defined(__aarch64__)
|
||||
#define TARGET_ARMV8_WITH_CRC __attribute__((target("aes,crc")))
|
||||
#else // !defined(__aarch64__)
|
||||
#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
|
||||
#endif // defined(__aarch64__)
|
||||
|
||||
#elif defined(__GNUC__)
|
||||
/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
|
||||
* allowed. We can just include arm_acle.h.
|
||||
*/
|
||||
#include "third_party/aarch64/arm_neon.h"
|
||||
#include "third_party/aarch64/arm_acle.h"
|
||||
#define TARGET_ARMV8_WITH_CRC
|
||||
#else // !defined(__GNUC__) && !defined(_aarch64__)
|
||||
#error ARM CRC32 SIMD extensions only supported for Clang and GCC
|
||||
#endif
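As a concrete illustration of the per-function target approach described in the comment above, a helper built on the names this file defines (TARGET_ARMV8_WITH_CRC, __crc32b) might look like this; the function itself is hypothetical and only meant to show the shape:

#if defined(__aarch64__)
/* Fold one byte into a CRC with the CRC32B instruction, enabling the
 * crc/crypto features for this function only. */
TARGET_ARMV8_WITH_CRC
static inline uint32_t crc32_one_byte(uint32_t crc, unsigned char b) {
    return __crc32b(crc, b);
}
#endif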
|
||||
|
||||
TARGET_ARMV8_WITH_CRC
|
||||
uint32_t ZLIB_INTERNAL armv8_crc32_little(
|
||||
const unsigned char *buf,
|
||||
z_size_t len,
|
||||
uint32_t crc)
|
||||
{
|
||||
uint32_t c = (uint32_t) ~crc;
|
||||
|
||||
while (len && ((uintptr_t)buf & 7)) {
|
||||
c = __crc32b(c, *buf++);
|
||||
--len;
|
||||
}
|
||||
|
||||
const uint64_t *buf8 = (const uint64_t *)buf;
|
||||
|
||||
while (len >= 64) {
|
||||
c = __crc32d(c, *buf8++);
|
||||
c = __crc32d(c, *buf8++);
|
||||
c = __crc32d(c, *buf8++);
|
||||
c = __crc32d(c, *buf8++);
|
||||
|
||||
c = __crc32d(c, *buf8++);
|
||||
c = __crc32d(c, *buf8++);
|
||||
c = __crc32d(c, *buf8++);
|
||||
c = __crc32d(c, *buf8++);
|
||||
len -= 64;
|
||||
}
|
||||
|
||||
while (len >= 8) {
|
||||
c = __crc32d(c, *buf8++);
|
||||
len -= 8;
|
||||
}
|
||||
|
||||
buf = (const unsigned char *)buf8;
|
||||
|
||||
while (len--) {
|
||||
c = __crc32b(c, *buf++);
|
||||
}
|
||||
|
||||
return ~c;
|
||||
}
|
||||
|
||||
#if defined(__aarch64__) || defined(ARMV8_OS_MACOS) /* aarch64 specific code. */
|
||||
|
||||
/*
|
||||
* crc32_pmull_simd_(): compute the crc32 of the buffer, where the buffer
|
||||
* length must be at least 64, and a multiple of 16. Based on:
|
||||
*
|
||||
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
|
||||
* V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
|
||||
*/
|
||||
TARGET_ARMV8_WITH_CRC
|
||||
static inline uint8x16_t pmull_lo(const uint64x2_t a, const uint64x2_t b)
|
||||
{
|
||||
uint8x16_t r;
|
||||
__asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
|
||||
: "=w" (r) : "w" (a), "w" (b) );
|
||||
return r;
|
||||
}
|
||||
|
||||
TARGET_ARMV8_WITH_CRC
|
||||
static inline uint8x16_t pmull_01(const uint64x2_t a, const uint64x2_t b)
|
||||
{
|
||||
uint8x16_t r;
|
||||
__asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
|
||||
: "=w" (r) : "w" (a), "w" (vgetq_lane_u64(b, 1)) );
|
||||
return r;
|
||||
}
|
||||
|
||||
TARGET_ARMV8_WITH_CRC
|
||||
static inline uint8x16_t pmull_hi(const uint64x2_t a, const uint64x2_t b)
|
||||
{
|
||||
uint8x16_t r;
|
||||
__asm__ __volatile__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
|
||||
: "=w" (r) : "w" (a), "w" (b) );
|
||||
return r;
|
||||
}
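The inline-asm wrappers above exist because the PMULL intrinsics are awkward to reach portably; assuming a toolchain that exposes the ACLE intrinsic vmull_p64 under the same target attribute, an equivalent (hypothetical) formulation of pmull_lo would be:

TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_lo_intrin(const uint64x2_t a, const uint64x2_t b)
{
    /* Multiply the low 64-bit lanes as polynomials over GF(2). */
    return vreinterpretq_u8_p128(
        vmull_p64((poly64_t)vgetq_lane_u64(a, 0),
                  (poly64_t)vgetq_lane_u64(b, 0)));
}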
|
||||
|
||||
TARGET_ARMV8_WITH_CRC
|
||||
uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(
|
||||
const unsigned char *buf,
|
||||
z_size_t len,
|
||||
uint32_t crc)
|
||||
{
|
||||
/*
|
||||
* Definitions of the bit-reflected domain constants k1,k2,k3, etc and
|
||||
* the CRC32+Barrett polynomials given at the end of the paper.
|
||||
*/
|
||||
static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
|
||||
static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
|
||||
static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
|
||||
static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
|
||||
|
||||
uint64x2_t x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
|
||||
|
||||
/*
|
||||
* There's at least one block of 64.
|
||||
*/
|
||||
x1 = vld1q_u64((const uint64_t *)(buf + 0x00));
|
||||
x2 = vld1q_u64((const uint64_t *)(buf + 0x10));
|
||||
x3 = vld1q_u64((const uint64_t *)(buf + 0x20));
|
||||
x4 = vld1q_u64((const uint64_t *)(buf + 0x30));
|
||||
|
||||
x1 = veorq_u64(x1, (uint64x2_t) vsetq_lane_u32(crc, vdupq_n_u32(0), 0));
|
||||
|
||||
x0 = vld1q_u64(k1k2);
|
||||
|
||||
buf += 64;
|
||||
len -= 64;
|
||||
|
||||
/*
|
||||
* Parallel fold blocks of 64, if any.
|
||||
*/
|
||||
while (len >= 64)
|
||||
{
|
||||
x5 = (uint64x2_t) pmull_lo(x1, x0);
|
||||
x6 = (uint64x2_t) pmull_lo(x2, x0);
|
||||
x7 = (uint64x2_t) pmull_lo(x3, x0);
|
||||
x8 = (uint64x2_t) pmull_lo(x4, x0);
|
||||
|
||||
y5 = vld1q_u64((const uint64_t *)(buf + 0x00));
|
||||
y6 = vld1q_u64((const uint64_t *)(buf + 0x10));
|
||||
y7 = vld1q_u64((const uint64_t *)(buf + 0x20));
|
||||
y8 = vld1q_u64((const uint64_t *)(buf + 0x30));
|
||||
|
||||
x1 = (uint64x2_t) pmull_hi(x1, x0);
|
||||
x2 = (uint64x2_t) pmull_hi(x2, x0);
|
||||
x3 = (uint64x2_t) pmull_hi(x3, x0);
|
||||
x4 = (uint64x2_t) pmull_hi(x4, x0);
|
||||
|
||||
x1 = veorq_u64(x1, x5);
|
||||
x2 = veorq_u64(x2, x6);
|
||||
x3 = veorq_u64(x3, x7);
|
||||
x4 = veorq_u64(x4, x8);
|
||||
|
||||
x1 = veorq_u64(x1, y5);
|
||||
x2 = veorq_u64(x2, y6);
|
||||
x3 = veorq_u64(x3, y7);
|
||||
x4 = veorq_u64(x4, y8);
|
||||
|
||||
buf += 64;
|
||||
len -= 64;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fold into 128-bits.
|
||||
*/
|
||||
x0 = vld1q_u64(k3k4);
|
||||
|
||||
x5 = (uint64x2_t) pmull_lo(x1, x0);
|
||||
x1 = (uint64x2_t) pmull_hi(x1, x0);
|
||||
x1 = veorq_u64(x1, x2);
|
||||
x1 = veorq_u64(x1, x5);
|
||||
|
||||
x5 = (uint64x2_t) pmull_lo(x1, x0);
|
||||
x1 = (uint64x2_t) pmull_hi(x1, x0);
|
||||
x1 = veorq_u64(x1, x3);
|
||||
x1 = veorq_u64(x1, x5);
|
||||
|
||||
x5 = (uint64x2_t) pmull_lo(x1, x0);
|
||||
x1 = (uint64x2_t) pmull_hi(x1, x0);
|
||||
x1 = veorq_u64(x1, x4);
|
||||
x1 = veorq_u64(x1, x5);
|
||||
|
||||
/*
|
||||
* Single fold blocks of 16, if any.
|
||||
*/
|
||||
while (len >= 16)
|
||||
{
|
||||
x2 = vld1q_u64((const uint64_t *)buf);
|
||||
|
||||
x5 = (uint64x2_t) pmull_lo(x1, x0);
|
||||
x1 = (uint64x2_t) pmull_hi(x1, x0);
|
||||
x1 = veorq_u64(x1, x2);
|
||||
x1 = veorq_u64(x1, x5);
|
||||
|
||||
buf += 16;
|
||||
len -= 16;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fold 128-bits to 64-bits.
|
||||
*/
|
||||
static uint32_t zalign(16) mask[] = { ~0u, 0u, ~0u, 0u };
|
||||
|
||||
x2 = (uint64x2_t) pmull_01(x1, x0);
|
||||
x1 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 8);
|
||||
x3 = (uint64x2_t) vld1q_u32(mask);
|
||||
x1 = veorq_u64(x1, x2);
|
||||
|
||||
x0 = vld1q_u64(k5k0);
|
||||
|
||||
x2 = (uint64x2_t) pmull_01(x2, x0);
|
||||
x2 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 4);
|
||||
x1 = vandq_u64(x1, x3);
|
||||
x1 = (uint64x2_t) pmull_lo(x1, x0);
|
||||
x1 = veorq_u64(x1, x2);
|
||||
|
||||
/*
|
||||
* Barrett reduce to 32 bits.
|
||||
*/
|
||||
x0 = vld1q_u64(poly);
|
||||
|
||||
x2 = vandq_u64(x1, x3);
|
||||
x2 = (uint64x2_t) pmull_01(x2, x0);
|
||||
x2 = vandq_u64(x2, x3);
|
||||
x2 = (uint64x2_t) pmull_lo(x2, x0);
|
||||
x1 = veorq_u64(x1, x2);
|
||||
|
||||
/*
|
||||
* Return the crc32.
|
||||
*/
|
||||
return vgetq_lane_u32(vreinterpretq_u32_u64(x1), 1);
|
||||
}
|
||||
#endif /* aarch64 specific code. */
|
||||
|
||||
#endif
|
57 third_party/zlib/crc32_simd.internal.h vendored Executable file
@@ -0,0 +1,57 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_ZLIB_CRC32_SIMD_INTERNAL_H_
#define COSMOPOLITAN_THIRD_PARTY_ZLIB_CRC32_SIMD_INTERNAL_H_
#include "third_party/zlib/deflate.internal.h"
#include "third_party/zlib/zconf.h"
#include "third_party/zlib/zutil.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
// clang-format off

/*
 * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
 * length must be at least 64, and a multiple of 16.
 */
uint32_t ZLIB_INTERNAL crc32_sse42_simd_(const unsigned char* buf,
                                         z_size_t len,
                                         uint32_t crc);

uint32_t ZLIB_INTERNAL crc32_avx512_simd_(const unsigned char* buf,
                                          z_size_t len,
                                          uint32_t crc);

/*
 * crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c
 * for computing the crc32 of an arbitrary length buffer.
 */
#define Z_CRC32_SSE42_MINIMUM_LENGTH 64
#define Z_CRC32_SSE42_CHUNKSIZE_MASK 15
#define Z_CRC32_AVX512_MINIMUM_LENGTH 256
#define Z_CRC32_AVX512_CHUNKSIZE_MASK 63

/*
 * CRC32 checksums using ARMv8-a crypto instructions.
 */
uint32_t ZLIB_INTERNAL armv8_crc32_little(const unsigned char* buf,
                                          z_size_t len,
                                          uint32_t crc);

/* aarch64 specific code. */
#if defined(__aarch64__)

/* 128 is the sweet spot at the time of coding (late 2020). */
#define Z_CRC32_PMULL_MINIMUM_LENGTH 128
#define Z_CRC32_PMULL_CHUNKSIZE_MASK 15

/*
 * CRC32 checksums using ARMv8-a PMULL instructions, where the buffer
 * length must be at least 64, and a multiple of 16.
 */
uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(const unsigned char* buf,
                                                z_size_t len,
                                                uint32_t crc);

#endif

COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_ZLIB_CRC32_SIMD_INTERNAL_H_ */
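To make the chunking constants above concrete: the dispatcher in zlib's crc32 source (crc32.inc in this tree) rounds len down to a multiple of the kernel's block size by masking, and leaves the tail for the portable table-driven code. A small worked example, with the length 1000 chosen purely for illustration:

/* len = 1000:
 *   SSE4.2  kernel: chunk = 1000 & ~15 = 992 bytes, tail =  8 bytes
 *   AVX-512 kernel: chunk = 1000 & ~63 = 960 bytes, tail = 40 bytes */
z_size_t sse42_chunk  = len & ~(z_size_t)Z_CRC32_SSE42_CHUNKSIZE_MASK;
z_size_t avx512_chunk = len & ~(z_size_t)Z_CRC32_AVX512_CHUNKSIZE_MASK;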
503 third_party/zlib/crc_folding.c vendored Normal file
@ -0,0 +1,503 @@
|
|||
#include "libc/fmt/conv.h"
|
||||
#include "libc/inttypes.h"
|
||||
#include "libc/limits.h"
|
||||
#include "libc/literal.h"
|
||||
#include "libc/str/str.h"
|
||||
#include "third_party/intel/emmintrin.internal.h"
|
||||
#include "third_party/intel/immintrin.internal.h"
|
||||
#include "third_party/intel/wmmintrin.internal.h"
|
||||
#include "third_party/zlib/deflate.internal.h"
|
||||
// clang-format off
|
||||
|
||||
/*
|
||||
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
|
||||
* instruction.
|
||||
*
|
||||
* A white paper describing this algorithm can be found at:
|
||||
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
|
||||
*
|
||||
* Copyright (C) 2013 Intel Corporation. All rights reserved.
|
||||
* Authors:
|
||||
* Wajdi Feghali <wajdi.k.feghali@intel.com>
|
||||
* Jim Guilford <james.guilford@intel.com>
|
||||
* Vinodh Gopal <vinodh.gopal@intel.com>
|
||||
* Erdinc Ozturk <erdinc.ozturk@intel.com>
|
||||
* Jim Kukunas <james.t.kukunas@linux.intel.com>
|
||||
*
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
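The folding state lives inside deflate_state and is driven by three entry points defined below: crc_fold_init(), crc_fold_copy(), and crc_fold_512to32(). A plausible usage sketch, with the deflate_state plumbing assumed rather than shown (roughly how the deflate side checksums gzip input while copying it into the window):

/* Hedged sketch only: names are the helpers defined in this file. */
void checksum_while_copying(deflate_state *s, unsigned char *dst,
                            const unsigned char *src, long n) {
    crc_fold_init(s);               /* reset the four xmm accumulators */
    crc_fold_copy(s, dst, src, n);  /* copy bytes and fold them into the CRC */
    s->strm->adler = crc_fold_512to32(s);  /* reduce 512 bits to the final CRC */
}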
|
||||
|
||||
#ifdef CRC32_SIMD_SSE42_PCLMUL
|
||||
|
||||
#define CRC_LOAD(s) \
|
||||
do { \
|
||||
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);\
|
||||
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);\
|
||||
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);\
|
||||
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);\
|
||||
__m128i xmm_crc_part = _mm_loadu_si128((__m128i *)s->crc0 + 4);
|
||||
|
||||
#define CRC_SAVE(s) \
|
||||
_mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);\
|
||||
_mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);\
|
||||
_mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);\
|
||||
_mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);\
|
||||
_mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);\
|
||||
} while (0);
|
||||
|
||||
ZLIB_INTERNAL void crc_fold_init(deflate_state *const s)
|
||||
{
|
||||
CRC_LOAD(s)
|
||||
|
||||
xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
|
||||
xmm_crc1 = _mm_setzero_si128();
|
||||
xmm_crc2 = _mm_setzero_si128();
|
||||
xmm_crc3 = _mm_setzero_si128();
|
||||
|
||||
CRC_SAVE(s)
|
||||
|
||||
s->strm->adler = 0;
|
||||
}
|
||||
|
||||
local void fold_1(deflate_state *const s,
|
||||
__m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
__m128i *xmm_crc2, __m128i *xmm_crc3)
|
||||
{
|
||||
const __m128i xmm_fold4 = _mm_set_epi32(
|
||||
0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
|
||||
__m128i x_tmp3;
|
||||
__m128 ps_crc0, ps_crc3, ps_res;
|
||||
|
||||
x_tmp3 = *xmm_crc3;
|
||||
|
||||
*xmm_crc3 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
|
||||
|
||||
*xmm_crc0 = *xmm_crc1;
|
||||
*xmm_crc1 = *xmm_crc2;
|
||||
*xmm_crc2 = x_tmp3;
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res);
|
||||
}
|
||||
|
||||
local void fold_2(deflate_state *const s,
|
||||
__m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
__m128i *xmm_crc2, __m128i *xmm_crc3)
|
||||
{
|
||||
const __m128i xmm_fold4 = _mm_set_epi32(
|
||||
0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
|
||||
__m128i x_tmp3, x_tmp2;
|
||||
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
|
||||
|
||||
x_tmp3 = *xmm_crc3;
|
||||
x_tmp2 = *xmm_crc2;
|
||||
|
||||
*xmm_crc3 = *xmm_crc1;
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_res31= _mm_xor_ps(ps_crc3, ps_crc1);
|
||||
|
||||
*xmm_crc2 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_res20= _mm_xor_ps(ps_crc0, ps_crc2);
|
||||
|
||||
*xmm_crc0 = x_tmp2;
|
||||
*xmm_crc1 = x_tmp3;
|
||||
*xmm_crc2 = _mm_castps_si128(ps_res20);
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res31);
|
||||
}
|
||||
|
||||
local void fold_3(deflate_state *const s,
|
||||
__m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
__m128i *xmm_crc2, __m128i *xmm_crc3)
|
||||
{
|
||||
const __m128i xmm_fold4 = _mm_set_epi32(
|
||||
0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
|
||||
__m128i x_tmp3;
|
||||
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
|
||||
|
||||
x_tmp3 = *xmm_crc3;
|
||||
|
||||
*xmm_crc3 = *xmm_crc2;
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
|
||||
|
||||
*xmm_crc2 = *xmm_crc1;
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_res21= _mm_xor_ps(ps_crc1, ps_crc2);
|
||||
|
||||
*xmm_crc1 = *xmm_crc0;
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_res10= _mm_xor_ps(ps_crc0, ps_crc1);
|
||||
|
||||
*xmm_crc0 = x_tmp3;
|
||||
*xmm_crc1 = _mm_castps_si128(ps_res10);
|
||||
*xmm_crc2 = _mm_castps_si128(ps_res21);
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res32);
|
||||
}
|
||||
|
||||
local void fold_4(deflate_state *const s,
|
||||
__m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
__m128i *xmm_crc2, __m128i *xmm_crc3)
|
||||
{
|
||||
const __m128i xmm_fold4 = _mm_set_epi32(
|
||||
0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
|
||||
__m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
|
||||
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
|
||||
__m128 ps_t0, ps_t1, ps_t2, ps_t3;
|
||||
__m128 ps_res0, ps_res1, ps_res2, ps_res3;
|
||||
|
||||
x_tmp0 = *xmm_crc0;
|
||||
x_tmp1 = *xmm_crc1;
|
||||
x_tmp2 = *xmm_crc2;
|
||||
x_tmp3 = *xmm_crc3;
|
||||
|
||||
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
|
||||
x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
|
||||
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
|
||||
ps_t0 = _mm_castsi128_ps(x_tmp0);
|
||||
ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
|
||||
|
||||
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
|
||||
x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
|
||||
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
|
||||
ps_t1 = _mm_castsi128_ps(x_tmp1);
|
||||
ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
|
||||
|
||||
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
|
||||
x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
|
||||
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
|
||||
ps_t2 = _mm_castsi128_ps(x_tmp2);
|
||||
ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
|
||||
|
||||
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
|
||||
x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
ps_t3 = _mm_castsi128_ps(x_tmp3);
|
||||
ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
|
||||
|
||||
*xmm_crc0 = _mm_castps_si128(ps_res0);
|
||||
*xmm_crc1 = _mm_castps_si128(ps_res1);
|
||||
*xmm_crc2 = _mm_castps_si128(ps_res2);
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res3);
|
||||
}
|
||||
|
||||
local const unsigned zalign(32) pshufb_shf_table[60] = {
|
||||
0x84838281,0x88878685,0x8c8b8a89,0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
|
||||
0x85848382,0x89888786,0x8d8c8b8a,0x01008f8e, /* shl 14 (16 - 3)/shr2 */
|
||||
0x86858483,0x8a898887,0x8e8d8c8b,0x0201008f, /* shl 13 (16 - 4)/shr3 */
|
||||
0x87868584,0x8b8a8988,0x8f8e8d8c,0x03020100, /* shl 12 (16 - 4)/shr4 */
|
||||
0x88878685,0x8c8b8a89,0x008f8e8d,0x04030201, /* shl 11 (16 - 5)/shr5 */
|
||||
0x89888786,0x8d8c8b8a,0x01008f8e,0x05040302, /* shl 10 (16 - 6)/shr6 */
|
||||
0x8a898887,0x8e8d8c8b,0x0201008f,0x06050403, /* shl 9 (16 - 7)/shr7 */
|
||||
0x8b8a8988,0x8f8e8d8c,0x03020100,0x07060504, /* shl 8 (16 - 8)/shr8 */
|
||||
0x8c8b8a89,0x008f8e8d,0x04030201,0x08070605, /* shl 7 (16 - 9)/shr9 */
|
||||
0x8d8c8b8a,0x01008f8e,0x05040302,0x09080706, /* shl 6 (16 -10)/shr10*/
|
||||
0x8e8d8c8b,0x0201008f,0x06050403,0x0a090807, /* shl 5 (16 -11)/shr11*/
|
||||
0x8f8e8d8c,0x03020100,0x07060504,0x0b0a0908, /* shl 4 (16 -12)/shr12*/
|
||||
0x008f8e8d,0x04030201,0x08070605,0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
|
||||
0x01008f8e,0x05040302,0x09080706,0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
|
||||
0x0201008f,0x06050403,0x0a090807,0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
|
||||
};
|
||||
|
||||
local void partial_fold(deflate_state *const s, const size_t len,
|
||||
__m128i *xmm_crc0, __m128i *xmm_crc1,
|
||||
__m128i *xmm_crc2, __m128i *xmm_crc3,
|
||||
__m128i *xmm_crc_part)
|
||||
{
|
||||
|
||||
const __m128i xmm_fold4 = _mm_set_epi32(
|
||||
0x00000001, 0x54442bd4,
|
||||
0x00000001, 0xc6e41596);
|
||||
const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
|
||||
|
||||
__m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
|
||||
__m128i xmm_a0_0, xmm_a0_1;
|
||||
__m128 ps_crc3, psa0_0, psa0_1, ps_res;
|
||||
|
||||
xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
|
||||
xmm_shr = xmm_shl;
|
||||
xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
|
||||
|
||||
xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
|
||||
|
||||
*xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
|
||||
xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
|
||||
*xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
|
||||
|
||||
*xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
|
||||
xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
|
||||
*xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
|
||||
|
||||
*xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
|
||||
xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
|
||||
*xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
|
||||
|
||||
*xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
|
||||
*xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
|
||||
*xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
|
||||
|
||||
xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
|
||||
xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
|
||||
|
||||
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
|
||||
psa0_0 = _mm_castsi128_ps(xmm_a0_0);
|
||||
psa0_1 = _mm_castsi128_ps(xmm_a0_1);
|
||||
|
||||
ps_res = _mm_xor_ps(ps_crc3, psa0_0);
|
||||
ps_res = _mm_xor_ps(ps_res, psa0_1);
|
||||
|
||||
*xmm_crc3 = _mm_castps_si128(ps_res);
|
||||
}
|
||||
|
||||
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s,
|
||||
unsigned char *dst, const unsigned char *src, long len)
|
||||
{
|
||||
unsigned long algn_diff;
|
||||
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
|
||||
|
||||
CRC_LOAD(s)
|
||||
|
||||
if (len < 16) {
|
||||
if (len == 0)
|
||||
return;
|
||||
goto partial;
|
||||
}
|
||||
|
||||
algn_diff = (0 - (uintptr_t)src) & 0xF;
|
||||
if (algn_diff) {
|
||||
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
|
||||
|
||||
dst += algn_diff;
|
||||
src += algn_diff;
|
||||
len -= algn_diff;
|
||||
|
||||
partial_fold(s, algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
|
||||
&xmm_crc_part);
|
||||
}
|
||||
|
||||
while ((len -= 64) >= 0) {
|
||||
xmm_t0 = _mm_load_si128((__m128i *)src);
|
||||
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
||||
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
|
||||
xmm_t3 = _mm_load_si128((__m128i *)src + 3);
|
||||
|
||||
fold_4(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
||||
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
|
||||
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
|
||||
_mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
|
||||
|
||||
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
|
||||
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
|
||||
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
|
||||
|
||||
src += 64;
|
||||
dst += 64;
|
||||
}
|
||||
|
||||
/*
|
||||
* len = num bytes left - 64
|
||||
*/
|
||||
if (len + 16 >= 0) {
|
||||
len += 16;
|
||||
|
||||
xmm_t0 = _mm_load_si128((__m128i *)src);
|
||||
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
||||
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
|
||||
|
||||
fold_3(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
||||
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
|
||||
_mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
|
||||
|
||||
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
|
||||
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
|
||||
|
||||
if (len == 0)
|
||||
goto done;
|
||||
|
||||
dst += 48;
|
||||
src += 48;
|
||||
} else if (len + 32 >= 0) {
|
||||
len += 32;
|
||||
|
||||
xmm_t0 = _mm_load_si128((__m128i *)src);
|
||||
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
|
||||
|
||||
fold_2(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
||||
_mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
|
||||
|
||||
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
|
||||
|
||||
if (len == 0)
|
||||
goto done;
|
||||
|
||||
dst += 32;
|
||||
src += 32;
|
||||
} else if (len + 48 >= 0) {
|
||||
len += 48;
|
||||
|
||||
xmm_t0 = _mm_load_si128((__m128i *)src);
|
||||
|
||||
fold_1(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
|
||||
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_t0);
|
||||
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
|
||||
|
||||
if (len == 0)
|
||||
goto done;
|
||||
|
||||
dst += 16;
|
||||
src += 16;
|
||||
} else {
|
||||
len += 64;
|
||||
if (len == 0)
|
||||
goto done;
|
||||
}
|
||||
|
||||
partial:
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
/* VS does not permit the use of _mm_set_epi64x in 32-bit builds */
|
||||
{
|
||||
int32_t parts[4] = {0, 0, 0, 0};
|
||||
memcpy(&parts, src, len);
|
||||
xmm_crc_part = _mm_set_epi32(parts[3], parts[2], parts[1], parts[0]);
|
||||
}
|
||||
#else
|
||||
{
|
||||
int64_t parts[2] = {0, 0};
|
||||
memcpy(&parts, src, len);
|
||||
xmm_crc_part = _mm_set_epi64x(parts[1], parts[0]);
|
||||
}
|
||||
#endif
|
||||
|
||||
_mm_storeu_si128((__m128i *)dst, xmm_crc_part);
|
||||
partial_fold(s, len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
|
||||
&xmm_crc_part);
|
||||
done:
|
||||
CRC_SAVE(s)
|
||||
}
|
||||
|
||||
local const unsigned zalign(16) crc_k[] = {
|
||||
0xccaa009e, 0x00000000, /* rk1 */
|
||||
0x751997d0, 0x00000001, /* rk2 */
|
||||
0xccaa009e, 0x00000000, /* rk5 */
|
||||
0x63cd6124, 0x00000001, /* rk6 */
|
||||
0xf7011640, 0x00000001, /* rk7 */
|
||||
0xdb710640, 0x00000001 /* rk8 */
|
||||
};
|
||||
|
||||
local const unsigned zalign(16) crc_mask[4] = {
|
||||
0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
|
||||
};
|
||||
|
||||
local const unsigned zalign(16) crc_mask2[4] = {
|
||||
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
|
||||
};
|
||||
|
||||
unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s)
|
||||
{
|
||||
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
|
||||
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
|
||||
|
||||
unsigned crc;
|
||||
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
|
||||
|
||||
__m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
|
||||
__m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);
|
||||
__m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);
|
||||
__m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);
|
||||
|
||||
/*
|
||||
* k1
|
||||
*/
|
||||
crc_fold = _mm_load_si128((__m128i *)crc_k);
|
||||
|
||||
x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
|
||||
xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
|
||||
xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
|
||||
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
|
||||
|
||||
x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
|
||||
xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
|
||||
xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
|
||||
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
|
||||
|
||||
x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
|
||||
xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
|
||||
|
||||
/*
|
||||
* k5
|
||||
*/
|
||||
crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
|
||||
|
||||
xmm_crc0 = xmm_crc3;
|
||||
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
|
||||
xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
|
||||
|
||||
xmm_crc0 = xmm_crc3;
|
||||
xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
|
||||
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
|
||||
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
|
||||
|
||||
/*
|
||||
* k7
|
||||
*/
|
||||
xmm_crc1 = xmm_crc3;
|
||||
xmm_crc2 = xmm_crc3;
|
||||
crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
|
||||
|
||||
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
|
||||
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
|
||||
|
||||
xmm_crc2 = xmm_crc3;
|
||||
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
|
||||
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
|
||||
|
||||
crc = _mm_extract_epi32(xmm_crc3, 2);
|
||||
return ~crc;
|
||||
}
|
||||
|
||||
#endif /* CRC32_SIMD_SSE42_PCLMUL */
|
233 third_party/zlib/deflate.c vendored
@ -5,13 +5,15 @@
|
|||
* Copyright (C) 1995-2022 Jean-loup Gailly and Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
#include "libc/assert.h"
|
||||
#include "third_party/zlib/cpu_features.internal.h"
|
||||
#include "third_party/zlib/deflate.internal.h"
|
||||
#include "third_party/zlib/insert_string.internal.h"
|
||||
#include "third_party/zlib/internal.h"
|
||||
#include "third_party/zlib/zutil.internal.h"
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
deflate 1.2.12.1 (zlib License)\\n\
|
||||
zlib 1.2.13 (zlib License)\\n\
|
||||
Copyright 1995-2022 Jean-loup Gailly and Mark Adler\\n\
|
||||
Invented 1990 Phillip Walter Katz\"");
|
||||
// clang-format off
|
||||
|
@ -62,6 +64,12 @@ Invented 1990 Phillip Walter Katz\"");
|
|||
|
||||
/* @(#) $Id$ */
|
||||
|
||||
#if defined(DEFLATE_SLIDE_HASH_SSE2) || defined(DEFLATE_SLIDE_HASH_NEON)
|
||||
#include "third_party/zlib/slide_hash_simd.inc"
|
||||
#endif
|
||||
|
||||
#include "third_party/zlib/insert_string.inc"
|
||||
|
||||
#ifdef FASTEST
|
||||
/* See http://crbug.com/1113596 */
|
||||
#error "FASTEST is not supported in Chromium's zlib."
|
||||
|
@ -101,13 +109,7 @@ local void lm_init OF((deflate_state *s));
|
|||
local void putShortMSB OF((deflate_state *s, uInt b));
|
||||
local void flush_pending OF((z_streamp strm));
|
||||
local unsigned read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
|
||||
#ifdef ASMV
|
||||
# pragma message("Assembler code may have bugs -- use at your own risk")
|
||||
void match_init OF((void)); /* asm code initialization */
|
||||
uInt longest_match OF((deflate_state *s, IPos cur_match));
|
||||
#else
|
||||
local uInt longest_match OF((deflate_state *s, IPos cur_match));
|
||||
#endif
|
||||
|
||||
#ifdef ZLIB_DEBUG
|
||||
local void check_match OF((deflate_state *s, IPos start, IPos match,
|
||||
|
@ -181,9 +183,9 @@ local const config configuration_table[10] = {
|
|||
*/
|
||||
#define CLEAR_HASH(s) \
|
||||
do { \
|
||||
s->head[s->hash_size-1] = NIL; \
|
||||
s->head[s->hash_size - 1] = NIL; \
|
||||
zmemzero((Bytef *)s->head, \
|
||||
(unsigned)(s->hash_size-1)*sizeof(*s->head)); \
|
||||
(unsigned)(s->hash_size - 1)*sizeof(*s->head)); \
|
||||
} while (0)
|
||||
|
||||
/* ===========================================================================
|
||||
|
@ -245,6 +247,14 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
|
|||
deflate_state *s;
|
||||
int wrap = 1;
|
||||
|
||||
// Needed to activate optimized insert_string() that helps compression
|
||||
// for all wrapper formats (e.g. RAW, ZLIB, GZIP).
|
||||
// Feature detection is not triggered while using RAW mode (i.e. we never
|
||||
// call crc32() with a NULL buffer).
|
||||
#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL)
|
||||
cpu_check_features();
|
||||
#endif
|
||||
|
||||
if (strm == Z_NULL) return Z_STREAM_ERROR;
|
||||
|
||||
strm->msg = Z_NULL;
|
||||
|
@ -271,6 +281,8 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
|
|||
|
||||
if (windowBits < 0) { /* suppress zlib wrapper */
|
||||
wrap = 0;
|
||||
if (windowBits < -15)
|
||||
return Z_STREAM_ERROR;
|
||||
windowBits = -windowBits;
|
||||
}
|
||||
#ifdef GZIP
|
||||
|
@ -300,7 +312,7 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
|
|||
s->chromium_zlib_hash = 0;
|
||||
#if !defined(USE_ZLIB_RABIN_KARP_ROLLING_HASH)
|
||||
#if defined(TARGET_CPU_WITH_CRC) && defined(CRC32_SIMD_SSE42_PCLMUL)
|
||||
if (X86_HAVE(SSE4_2))
|
||||
if (x86_cpu_enable_simd)
|
||||
s->chromium_zlib_hash = 1;
|
||||
#elif defined(TARGET_CPU_WITH_CRC) && defined(CRC32_ARMV8_CRC32)
|
||||
if (arm_cpu_enable_crc32)
|
||||
|
@ -315,16 +327,15 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
|
|||
|
||||
s->hash_size = 1 << s->hash_bits;
|
||||
s->hash_mask = s->hash_size - 1;
|
||||
s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);
|
||||
s->hash_shift = ((s->hash_bits + MIN_MATCH-1) / MIN_MATCH);
|
||||
|
||||
s->window = (Bytef *) ZALLOC(strm,
|
||||
s->w_size + window_padding,
|
||||
2*sizeof(Byte));
|
||||
|
||||
/* Avoid use of uninitialized values in the window, see crbug.com/1137613 and
|
||||
* crbug.com/1144420 */
|
||||
if (s->window) { /* [jart] fix regression in malloc failure checking */
|
||||
zmemzero(s->window, (s->w_size + window_padding) * (2 * sizeof(Byte)));
|
||||
zmemzero(s->window, (s->w_size + window_padding) * (2 * sizeof(Byte)));
|
||||
}
|
||||
s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos));
|
||||
/* Avoid use of uninitialized value, see:
|
||||
|
@ -355,11 +366,11 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
|
|||
* sym_buf value to read moves forward three bytes. From that symbol, up to
|
||||
* 31 bits are written to pending_buf. The closest the written pending_buf
|
||||
* bits gets to the next sym_buf symbol to read is just before the last
|
||||
* code is written. At that time, 31*(n-2) bits have been written, just
|
||||
* after 24*(n-2) bits have been consumed from sym_buf. sym_buf starts at
|
||||
* 8*n bits into pending_buf. (Note that the symbol buffer fills when n-1
|
||||
* code is written. At that time, 31*(n - 2) bits have been written, just
|
||||
* after 24*(n - 2) bits have been consumed from sym_buf. sym_buf starts at
|
||||
* 8*n bits into pending_buf. (Note that the symbol buffer fills when n - 1
|
||||
* symbols are written.) The closest the writing gets to what is unread is
|
||||
* then n+14 bits. Here n is lit_bufsize, which is 16384 by default, and
|
||||
* then n + 14 bits. Here n is lit_bufsize, which is 16384 by default, and
|
||||
* can range from 128 to 32768.
|
||||
*
|
||||
* Therefore, at a minimum, there are 142 bits of space between what is
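The arithmetic behind that claim follows directly from the figures in this comment: after n - 2 symbols, the reader has advanced 8*n + 24*(n - 2) bits into pending_buf while the writer has produced 31*(n - 2) bits, so the gap between them is

    8*n + 24*(n - 2) - 31*(n - 2) = 8*n - 7*(n - 2) = n + 14 bits,

and with the minimum lit_bufsize of n = 128 that gives the 142 bits quoted here.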
|
||||
|
@ -404,7 +415,7 @@ int ZEXPORT deflateInit2(strm, level, method, windowBits, memLevel, strategy)
|
|||
/* =========================================================================
|
||||
* Check for a valid deflate stream state. Return 0 if ok, 1 if not.
|
||||
*/
|
||||
local int deflateStateCheck (strm)
|
||||
local int deflateStateCheck(strm)
|
||||
z_streamp strm;
|
||||
{
|
||||
deflate_state *s;
|
||||
|
@ -427,7 +438,7 @@ local int deflateStateCheck (strm)
|
|||
}
|
||||
|
||||
/* ========================================================================= */
|
||||
int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
|
||||
int ZEXPORT deflateSetDictionary(strm, dictionary, dictLength)
|
||||
z_streamp strm;
|
||||
const Bytef *dictionary;
|
||||
uInt dictLength;
|
||||
|
@ -492,7 +503,7 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
|
|||
}
|
||||
|
||||
/* ========================================================================= */
|
||||
int ZEXPORT deflateGetDictionary (strm, dictionary, dictLength)
|
||||
int ZEXPORT deflateGetDictionary(strm, dictionary, dictLength)
|
||||
z_streamp strm;
|
||||
Bytef *dictionary;
|
||||
uInt *dictLength;
|
||||
|
@ -514,7 +525,7 @@ int ZEXPORT deflateGetDictionary (strm, dictionary, dictLength)
|
|||
}
|
||||
|
||||
/* ========================================================================= */
|
||||
int ZEXPORT deflateResetKeep (strm)
|
||||
int ZEXPORT deflateResetKeep(strm)
|
||||
z_streamp strm;
|
||||
{
|
||||
deflate_state *s;
|
||||
|
@ -552,7 +563,7 @@ int ZEXPORT deflateResetKeep (strm)
|
|||
}
|
||||
|
||||
/* ========================================================================= */
|
||||
int ZEXPORT deflateReset (strm)
|
||||
int ZEXPORT deflateReset(strm)
|
||||
z_streamp strm;
|
||||
{
|
||||
int ret;
|
||||
|
@ -564,7 +575,7 @@ int ZEXPORT deflateReset (strm)
|
|||
}
|
||||
|
||||
/* ========================================================================= */
|
||||
int ZEXPORT deflateSetHeader (strm, head)
|
||||
int ZEXPORT deflateSetHeader(strm, head)
|
||||
z_streamp strm;
|
||||
gz_headerp head;
|
||||
{
|
||||
|
@ -575,7 +586,7 @@ int ZEXPORT deflateSetHeader (strm, head)
|
|||
}
|
||||
|
||||
/* ========================================================================= */
|
||||
int ZEXPORT deflatePending (strm, pending, bits)
|
||||
int ZEXPORT deflatePending(strm, pending, bits)
|
||||
unsigned *pending;
|
||||
int *bits;
|
||||
z_streamp strm;
|
||||
|
@ -589,7 +600,7 @@ int ZEXPORT deflatePending (strm, pending, bits)
|
|||
}
|
||||
|
||||
/* ========================================================================= */
|
||||
int ZEXPORT deflatePrime (strm, bits, value)
|
||||
int ZEXPORT deflatePrime(strm, bits, value)
|
||||
z_streamp strm;
|
||||
int bits;
|
||||
int value;
|
||||
|
@ -684,36 +695,50 @@ int ZEXPORT deflateTune(strm, good_length, max_lazy, nice_length, max_chain)
|
|||
}
|
||||
|
||||
/* =========================================================================
|
||||
* For the default windowBits of 15 and memLevel of 8, this function returns
|
||||
* a close to exact, as well as small, upper bound on the compressed size.
|
||||
* They are coded as constants here for a reason--if the #define's are
|
||||
* changed, then this function needs to be changed as well. The return
|
||||
* value for 15 and 8 only works for those exact settings.
|
||||
* For the default windowBits of 15 and memLevel of 8, this function returns a
|
||||
* close to exact, as well as small, upper bound on the compressed size. This
|
||||
* is an expansion of ~0.03%, plus a small constant.
|
||||
*
|
||||
* For any setting other than those defaults for windowBits and memLevel,
|
||||
* the value returned is a conservative worst case for the maximum expansion
|
||||
* resulting from using fixed blocks instead of stored blocks, which deflate
|
||||
* can emit on compressed data for some combinations of the parameters.
|
||||
* For any setting other than those defaults for windowBits and memLevel, one
|
||||
* of two worst case bounds is returned. This is at most an expansion of ~4% or
|
||||
* ~13%, plus a small constant.
|
||||
*
|
||||
* This function could be more sophisticated to provide closer upper bounds for
|
||||
* every combination of windowBits and memLevel. But even the conservative
|
||||
* upper bound of about 14% expansion does not seem onerous for output buffer
|
||||
* allocation.
|
||||
* Both the 0.03% and 4% derive from the overhead of stored blocks. The first
|
||||
* one is for stored blocks of 16383 bytes (memLevel == 8), whereas the second
|
||||
* is for stored blocks of 127 bytes (the worst case memLevel == 1). The
|
||||
* expansion results from five bytes of header for each stored block.
|
||||
*
|
||||
* The larger expansion of 13% results from a window size less than or equal to
|
||||
* the symbols buffer size (windowBits <= memLevel + 7). In that case some of
|
||||
* the data being compressed may have slid out of the sliding window, impeding
|
||||
* a stored block from being emitted. Then the only choice is a fixed or
|
||||
* dynamic block, where a fixed block limits the maximum expansion to 9 bits
|
||||
* per 8-bit byte, plus 10 bits for every block. The smallest block size for
|
||||
* which this can occur is 255 (memLevel == 2).
|
||||
*
|
||||
* Shifts are used to approximate divisions, for speed.
|
||||
*/
|
||||
uLong ZEXPORT deflateBound(strm, sourceLen)
|
||||
z_streamp strm;
|
||||
uLong sourceLen;
|
||||
{
|
||||
deflate_state *s;
|
||||
uLong complen, wraplen;
|
||||
uLong fixedlen, storelen, wraplen;
|
||||
|
||||
/* conservative upper bound for compressed data */
|
||||
complen = sourceLen +
|
||||
((sourceLen + 7) >> 3) + ((sourceLen + 63) >> 6) + 5;
|
||||
/* upper bound for fixed blocks with 9-bit literals and length 255
|
||||
(memLevel == 2, which is the lowest that may not use stored blocks) --
|
||||
~13% overhead plus a small constant */
|
||||
fixedlen = sourceLen + (sourceLen >> 3) + (sourceLen >> 8) +
|
||||
(sourceLen >> 9) + 4;
|
||||
|
||||
/* if can't get parameters, return conservative bound plus zlib wrapper */
|
||||
/* upper bound for stored blocks with length 127 (memLevel == 1) --
|
||||
~4% overhead plus a small constant */
|
||||
storelen = sourceLen + (sourceLen >> 5) + (sourceLen >> 7) +
|
||||
(sourceLen >> 11) + 7;
|
||||
|
||||
/* if can't get parameters, return larger bound plus a zlib wrapper */
|
||||
if (deflateStateCheck(strm))
|
||||
return complen + 6;
|
||||
return (fixedlen > storelen ? fixedlen : storelen) + 6;
|
||||
|
||||
/* compute wrapper length */
|
||||
s = strm->state;
|
||||
|
@ -750,11 +775,12 @@ uLong ZEXPORT deflateBound(strm, sourceLen)
|
|||
wraplen = 6;
|
||||
}
|
||||
|
||||
/* if not default parameters, return conservative bound */
|
||||
/* if not default parameters, return one of the conservative bounds */
|
||||
if (s->w_bits != 15 || s->hash_bits != 8 + 7)
|
||||
return complen + wraplen;
|
||||
return (s->w_bits <= s->hash_bits ? fixedlen : storelen) + wraplen;
|
||||
|
||||
/* default settings: return tight bound for that case */
|
||||
/* default settings: return tight bound for that case -- ~0.03% overhead
|
||||
plus a small constant */
|
||||
return sourceLen + (sourceLen >> 12) + (sourceLen >> 14) +
|
||||
(sourceLen >> 25) + 13 - 6 + wraplen;
|
||||
}
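The practical point of the bounds discussed above is output-buffer sizing: if the destination holds at least deflateBound() bytes, a single deflate() call with Z_FINISH cannot run out of space. A minimal usage sketch with the standard zlib API (the include path is assumed from this tree's layout; the helper name is hypothetical):

#include "third_party/zlib/zlib.h"

/* Compress src into dst in one shot; dst must hold deflateBound() bytes. */
int compress_with_bound(unsigned char *dst, uLong dstcap,
                        const unsigned char *src, uLong srclen) {
    z_stream strm = {0};
    if (deflateInit(&strm, Z_DEFAULT_COMPRESSION) != Z_OK) return -1;
    if (dstcap < deflateBound(&strm, srclen)) {  /* worst-case output size */
        deflateEnd(&strm);
        return -1;
    }
    strm.next_in   = (Bytef *)src;
    strm.avail_in  = (uInt)srclen;
    strm.next_out  = dst;
    strm.avail_out = (uInt)dstcap;
    int rc = deflate(&strm, Z_FINISH);  /* cannot run out of output space */
    deflateEnd(&strm);
    return rc == Z_STREAM_END ? 0 : -1;
}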
|
||||
|
@ -764,7 +790,7 @@ uLong ZEXPORT deflateBound(strm, sourceLen)
|
|||
* IN assertion: the stream state is correct and there is enough room in
|
||||
* pending_buf.
|
||||
*/
|
||||
local void putShortMSB (s, b)
|
||||
local void putShortMSB(s, b)
|
||||
deflate_state *s;
|
||||
uInt b;
|
||||
{
|
||||
|
@ -811,7 +837,7 @@ local void flush_pending(strm)
|
|||
} while (0)
|
||||
|
||||
/* ========================================================================= */
|
||||
int ZEXPORT deflate (strm, flush)
|
||||
int ZEXPORT deflate(strm, flush)
|
||||
z_streamp strm;
|
||||
int flush;
|
||||
{
|
||||
|
@ -866,7 +892,7 @@ int ZEXPORT deflate (strm, flush)
|
|||
s->status = BUSY_STATE;
|
||||
if (s->status == INIT_STATE) {
|
||||
/* zlib header */
|
||||
uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8;
|
||||
uInt header = (Z_DEFLATED + ((s->w_bits - 8) << 4)) << 8;
|
||||
uInt level_flags;
|
||||
|
||||
if (s->strategy >= Z_HUFFMAN_ONLY || s->level < 2)
|
||||
|
@ -1127,7 +1153,7 @@ int ZEXPORT deflate (strm, flush)
|
|||
}
|
||||
|
||||
/* ========================================================================= */
|
||||
int ZEXPORT deflateEnd (strm)
|
||||
int ZEXPORT deflateEnd(strm)
|
||||
z_streamp strm;
|
||||
{
|
||||
int status;
|
||||
|
@ -1153,7 +1179,7 @@ int ZEXPORT deflateEnd (strm)
|
|||
* To simplify the source, this is not supported for 16-bit MSDOS (which
|
||||
* doesn't have enough memory anyway to duplicate compression states).
|
||||
*/
|
||||
int ZEXPORT deflateCopy (dest, source)
|
||||
int ZEXPORT deflateCopy(dest, source)
|
||||
z_streamp dest;
|
||||
z_streamp source;
|
||||
{
|
||||
|
@ -1243,7 +1269,7 @@ local unsigned read_buf(strm, buf, size)
|
|||
/* ===========================================================================
|
||||
* Initialize the "longest match" routines for a new zlib stream
|
||||
*/
|
||||
local void lm_init (s)
|
||||
local void lm_init(s)
|
||||
deflate_state *s;
|
||||
{
|
||||
s->window_size = (ulg)2L*s->w_size;
|
||||
|
@ -1264,11 +1290,6 @@ local void lm_init (s)
|
|||
s->match_length = s->prev_length = MIN_MATCH-1;
|
||||
s->match_available = 0;
|
||||
s->ins_h = 0;
|
||||
#ifndef FASTEST
|
||||
#ifdef ASMV
|
||||
match_init(); /* initialize the asm code */
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef FASTEST
|
||||
|
@ -1281,10 +1302,6 @@ local void lm_init (s)
|
|||
* string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
|
||||
* OUT assertion: the match length is not greater than s->lookahead.
|
||||
*/
|
||||
#ifndef ASMV
|
||||
/* For 80x86 and 680x0, an optimized version will be provided in match.asm or
|
||||
* match.S. The code will be functionally equivalent.
|
||||
*/
|
||||
local uInt longest_match(s, cur_match)
|
||||
deflate_state *s;
|
||||
IPos cur_match; /* current match */
|
||||
|
@ -1309,10 +1326,10 @@ local uInt longest_match(s, cur_match)
|
|||
*/
|
||||
register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
|
||||
register ush scan_start = *(ushf*)scan;
|
||||
register ush scan_end = *(ushf*)(scan+best_len-1);
|
||||
register ush scan_end = *(ushf*)(scan + best_len - 1);
|
||||
#else
|
||||
register Bytef *strend = s->window + s->strstart + MAX_MATCH;
|
||||
register Byte scan_end1 = scan[best_len-1];
|
||||
register Byte scan_end1 = scan[best_len - 1];
|
||||
register Byte scan_end = scan[best_len];
|
||||
#endif
|
||||
|
||||
|
@ -1330,7 +1347,8 @@ local uInt longest_match(s, cur_match)
|
|||
*/
|
||||
if ((uInt)nice_match > s->lookahead) nice_match = (int)s->lookahead;
|
||||
|
||||
Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
|
||||
Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
|
||||
"need lookahead");
|
||||
|
||||
do {
|
||||
Assert(cur_match < s->strstart, "no future");
|
||||
|
@ -1348,15 +1366,15 @@ local uInt longest_match(s, cur_match)
|
|||
/* This code assumes sizeof(unsigned short) == 2. Do not use
|
||||
* UNALIGNED_OK if your compiler uses a different size.
|
||||
*/
|
||||
if (*(ushf*)(match+best_len-1) != scan_end ||
|
||||
if (*(ushf*)(match + best_len - 1) != scan_end ||
|
||||
*(ushf*)match != scan_start) continue;
|
||||
|
||||
/* It is not necessary to compare scan[2] and match[2] since they are
|
||||
* always equal when the other bytes match, given that the hash keys
|
||||
* are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
|
||||
* strstart+3, +5, ... up to strstart+257. We check for insufficient
|
||||
* strstart + 3, + 5, up to strstart + 257. We check for insufficient
|
||||
* lookahead only every 4th comparison; the 128th check will be made
|
||||
* at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
|
||||
* at strstart + 257. If MAX_MATCH-2 is not a multiple of 8, it is
|
||||
* necessary to put more guard bytes at the end of the window, or
|
||||
* to check more often for insufficient lookahead.
|
||||
*/
|
||||
|
@ -1372,28 +1390,29 @@ local uInt longest_match(s, cur_match)
|
|||
}
|
||||
scan++, match++;
|
||||
do {
|
||||
} while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
|
||||
*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
|
||||
*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
|
||||
*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
|
||||
} while (*(ushf*)(scan += 2) == *(ushf*)(match += 2) &&
|
||||
*(ushf*)(scan += 2) == *(ushf*)(match += 2) &&
|
||||
*(ushf*)(scan += 2) == *(ushf*)(match += 2) &&
|
||||
*(ushf*)(scan += 2) == *(ushf*)(match += 2) &&
|
||||
scan < strend);
|
||||
/* The funny "do {}" generates better code on most compilers */
|
||||
|
||||
/* Here, scan <= window+strstart+257 */
|
||||
Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
|
||||
/* Here, scan <= window + strstart + 257 */
|
||||
Assert(scan <= s->window+(unsigned)(s->window_size - 1),
|
||||
"wild scan");
|
||||
if (*scan == *match) scan++;
|
||||
|
||||
len = (MAX_MATCH - 1) - (int)(strend-scan);
|
||||
len = (MAX_MATCH - 1) - (int)(strend - scan);
|
||||
scan = strend - (MAX_MATCH-1);
|
||||
|
||||
#else /* UNALIGNED_OK */
|
||||
|
||||
if (match[best_len] != scan_end ||
|
||||
match[best_len-1] != scan_end1 ||
|
||||
match[best_len - 1] != scan_end1 ||
|
||||
*match != *scan ||
|
||||
*++match != scan[1]) continue;
|
||||
|
||||
/* The check at best_len-1 can be removed because it will be made
|
||||
/* The check at best_len - 1 can be removed because it will be made
|
||||
* again later. (This heuristic is not always a win.)
|
||||
* It is not necessary to compare scan[2] and match[2] since they
|
||||
* are always equal when the other bytes match, given that
|
||||
|
@ -1412,7 +1431,7 @@ local uInt longest_match(s, cur_match)
|
|||
}
|
||||
|
||||
/* We check for insufficient lookahead only every 8th comparison;
|
||||
* the 256th check will be made at strstart+258.
|
||||
* the 256th check will be made at strstart + 258.
|
||||
*/
|
||||
do {
|
||||
} while (*++scan == *++match && *++scan == *++match &&
|
||||
|
@ -1421,7 +1440,8 @@ local uInt longest_match(s, cur_match)
|
|||
*++scan == *++match && *++scan == *++match &&
|
||||
scan < strend);
|
||||
|
||||
Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
|
||||
Assert(scan <= s->window + (unsigned)(s->window_size - 1),
|
||||
"wild scan");
|
||||
|
||||
len = MAX_MATCH - (int)(strend - scan);
|
||||
scan = strend - MAX_MATCH;
|
||||
|
@ -1433,9 +1453,9 @@ local uInt longest_match(s, cur_match)
|
|||
best_len = len;
|
||||
if (len >= nice_match) break;
|
||||
#ifdef UNALIGNED_OK
|
||||
scan_end = *(ushf*)(scan+best_len-1);
|
||||
scan_end = *(ushf*)(scan + best_len - 1);
|
||||
#else
|
||||
scan_end1 = scan[best_len-1];
|
||||
scan_end1 = scan[best_len - 1];
|
||||
scan_end = scan[best_len];
|
||||
#endif
|
||||
}
|
||||
|
@ -1445,7 +1465,6 @@ local uInt longest_match(s, cur_match)
|
|||
if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
|
||||
return s->lookahead;
|
||||
}
|
||||
#endif /* ASMV */
|
||||
|
||||
#else /* FASTEST */
|
||||
|
||||
|
@ -1466,7 +1485,8 @@ local uInt longest_match(s, cur_match)
|
|||
*/
|
||||
Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");
|
||||
|
||||
Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");
|
||||
Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
|
||||
"need lookahead");
|
||||
|
||||
Assert(cur_match < s->strstart, "no future");
|
||||
|
||||
|
@ -1476,7 +1496,7 @@ local uInt longest_match(s, cur_match)
|
|||
*/
|
||||
if (match[0] != scan[0] || match[1] != scan[1]) return MIN_MATCH-1;
|
||||
|
||||
/* The check at best_len-1 can be removed because it will be made
|
||||
/* The check at best_len - 1 can be removed because it will be made
|
||||
* again later. (This heuristic is not always a win.)
|
||||
* It is not necessary to compare scan[2] and match[2] since they
|
||||
* are always equal when the other bytes match, given that
|
||||
|
@ -1486,7 +1506,7 @@ local uInt longest_match(s, cur_match)
|
|||
Assert(*scan == *match, "match[2]?");
|
||||
|
||||
/* We check for insufficient lookahead only every 8th comparison;
|
||||
* the 256th check will be made at strstart+258.
|
||||
* the 256th check will be made at strstart + 258.
|
||||
*/
|
||||
do {
|
||||
} while (*++scan == *++match && *++scan == *++match &&
|
||||
|
@ -1495,7 +1515,7 @@ local uInt longest_match(s, cur_match)
|
|||
*++scan == *++match && *++scan == *++match &&
|
||||
scan < strend);
|
||||
|
||||
Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
|
||||
Assert(scan <= s->window + (unsigned)(s->window_size - 1), "wild scan");
|
||||
|
||||
len = MAX_MATCH - (int)(strend - scan);
|
||||
|
||||
|
@ -1531,7 +1551,7 @@ local void check_match(s, start, match, length)
|
|||
z_error(__FILE__, __LINE__, "invalid match");
|
||||
}
|
||||
if (z_verbose > 1) {
|
||||
kprintf("\\[%d,%d]", start-match, length);
|
||||
kprintf("\\[%d,%d]", start - match, length);
|
||||
do { kprintf("%c", s->window[start++]); } while (--length != 0);
|
||||
}
|
||||
}
|
||||
|
@ -1577,9 +1597,9 @@ local void fill_window(s)
|
|||
/* If the window is almost full and there is insufficient lookahead,
|
||||
* move the upper half to the lower one to make room in the upper half.
|
||||
*/
|
||||
if (s->strstart >= wsize+MAX_DIST(s)) {
|
||||
if (s->strstart >= wsize + MAX_DIST(s)) {
|
||||
|
||||
zmemcpy(s->window, s->window+wsize, (unsigned)wsize - more);
|
||||
zmemcpy(s->window, s->window + wsize, (unsigned)wsize - more);
|
||||
s->match_start -= wsize;
|
||||
s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
|
||||
s->block_start -= (long) wsize;
|
||||
|
@ -1934,7 +1954,7 @@ local block_state deflate_fast(s, flush)
|
|||
if (s->lookahead == 0) break; /* flush the current block */
|
||||
}
|
||||
|
||||
/* Insert the string window[strstart .. strstart+2] in the
|
||||
/* Insert the string window[strstart .. strstart + 2] in the
|
||||
* dictionary, and set hash_head to the head of the hash chain:
|
||||
*/
|
||||
hash_head = NIL;
|
||||
|
@ -1984,7 +2004,7 @@ local block_state deflate_fast(s, flush)
|
|||
|
||||
if (!s->chromium_zlib_hash) {
|
||||
s->ins_h = s->window[s->strstart];
|
||||
UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
|
||||
UPDATE_HASH(s, s->ins_h, s->window[s->strstart + 1]);
|
||||
#if MIN_MATCH != 3
|
||||
Call UPDATE_HASH() MIN_MATCH-3 more times
|
||||
#endif
|
||||
|
@ -1996,7 +2016,7 @@ local block_state deflate_fast(s, flush)
|
|||
} else {
|
||||
/* No match, output a literal byte */
|
||||
Tracevv(("%c", s->window[s->strstart]));
|
||||
_tr_tally_lit (s, s->window[s->strstart], bflush);
|
||||
_tr_tally_lit(s, s->window[s->strstart], bflush);
|
||||
s->lookahead--;
|
||||
s->strstart++;
|
||||
}
|
||||
|
@ -2040,7 +2060,7 @@ local block_state deflate_slow(s, flush)
|
|||
if (s->lookahead == 0) break; /* flush the current block */
|
||||
}
|
||||
|
||||
/* Insert the string window[strstart .. strstart+2] in the
|
||||
/* Insert the string window[strstart .. strstart + 2] in the
|
||||
* dictionary, and set hash_head to the head of the hash chain:
|
||||
*/
|
||||
hash_head = NIL;
|
||||
|
@ -2085,20 +2105,20 @@ local block_state deflate_slow(s, flush)
|
|||
if (s->prev_match == -1) {
|
||||
/* The window has slid one byte past the previous match,
|
||||
* so the first byte cannot be compared. */
|
||||
check_match(s, s->strstart, s->prev_match+1, s->prev_length-1);
|
||||
check_match(s, s->strstart, s->prev_match + 1, s->prev_length - 1);
|
||||
} else {
|
||||
check_match(s, s->strstart-1, s->prev_match, s->prev_length);
|
||||
check_match(s, s->strstart - 1, s->prev_match, s->prev_length);
|
||||
}
|
||||
|
||||
_tr_tally_dist(s, s->strstart -1 - s->prev_match,
|
||||
_tr_tally_dist(s, s->strstart - 1 - s->prev_match,
|
||||
s->prev_length - MIN_MATCH, bflush);
|
||||
|
||||
/* Insert in hash table all strings up to the end of the match.
|
||||
* strstart-1 and strstart are already inserted. If there is not
|
||||
* strstart - 1 and strstart are already inserted. If there is not
|
||||
* enough lookahead, the last two strings are not inserted in
|
||||
* the hash table.
|
||||
*/
|
||||
s->lookahead -= s->prev_length-1;
|
||||
s->lookahead -= s->prev_length - 1;
|
||||
s->prev_length -= 2;
|
||||
do {
|
||||
if (++s->strstart <= max_insert) {
|
||||
|
@ -2116,8 +2136,8 @@ local block_state deflate_slow(s, flush)
|
|||
* single literal. If there was a match but the current match
|
||||
* is longer, truncate the previous match to a single literal.
|
||||
*/
|
||||
Tracevv(("%c", s->window[s->strstart-1]));
|
||||
_tr_tally_lit(s, s->window[s->strstart-1], bflush);
|
||||
Tracevv(("%c", s->window[s->strstart - 1]));
|
||||
_tr_tally_lit(s, s->window[s->strstart - 1], bflush);
|
||||
if (bflush) {
|
||||
FLUSH_BLOCK_ONLY(s, 0);
|
||||
}
|
||||
|
@ -2135,8 +2155,8 @@ local block_state deflate_slow(s, flush)
|
|||
}
|
||||
Assert (flush != Z_NO_FLUSH, "no flush?");
|
||||
if (s->match_available) {
|
||||
Tracevv(("%c", s->window[s->strstart-1]));
|
||||
_tr_tally_lit(s, s->window[s->strstart-1], bflush);
|
||||
Tracevv(("%c", s->window[s->strstart - 1]));
|
||||
_tr_tally_lit(s, s->window[s->strstart - 1], bflush);
|
||||
s->match_available = 0;
|
||||
}
|
||||
s->insert = s->strstart < MIN_MATCH-1 ? s->strstart : MIN_MATCH-1;
|
||||
|
@ -2193,7 +2213,8 @@ local block_state deflate_rle(s, flush)
|
|||
if (s->match_length > s->lookahead)
|
||||
s->match_length = s->lookahead;
|
||||
}
|
||||
Assert(scan <= s->window+(uInt)(s->window_size-1), "wild scan");
|
||||
Assert(scan <= s->window + (uInt)(s->window_size - 1),
|
||||
"wild scan");
|
||||
}
|
||||
|
||||
/* Emit match if have run of MIN_MATCH or longer, else emit literal */
|
||||
|
@ -2208,7 +2229,7 @@ local block_state deflate_rle(s, flush)
|
|||
} else {
|
||||
/* No match, output a literal byte */
|
||||
Tracevv(("%c", s->window[s->strstart]));
|
||||
_tr_tally_lit (s, s->window[s->strstart], bflush);
|
||||
_tr_tally_lit(s, s->window[s->strstart], bflush);
|
||||
s->lookahead--;
|
||||
s->strstart++;
|
||||
}
|
||||
|
@ -2248,7 +2269,7 @@ local block_state deflate_huff(s, flush)
|
|||
/* Output a literal byte */
|
||||
s->match_length = 0;
|
||||
Tracevv(("%c", s->window[s->strstart]));
|
||||
_tr_tally_lit (s, s->window[s->strstart], bflush);
|
||||
_tr_tally_lit(s, s->window[s->strstart], bflush);
|
||||
s->lookahead--;
|
||||
s->strstart++;
|
||||
if (bflush) FLUSH_BLOCK(s, 0);
|
||||
|
|
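The longest_match() hunks above are mostly spacing changes, but the unrolled scan loop they touch is worth seeing in isolation. Below is a minimal, hedged sketch of the byte-wise variant (the !UNALIGNED_OK branch); it assumes, as deflate does, that the caller guarantees guard bytes past strend so the eight unrolled comparisons cannot run off the window. Names and the return value are illustrative, not the library's exact interface.

/* Illustrative only: mirrors the empty do{}while structure reformatted
 * above, which the original comment notes generates better code on most
 * compilers. On exit, scan points at the first mismatching byte or at or
 * just past strend. */
static unsigned scan_match(const unsigned char *scan,
                           const unsigned char *match,
                           const unsigned char *strend) {
    const unsigned char *start = scan;
    do {
    } while (*++scan == *++match && *++scan == *++match &&
             *++scan == *++match && *++scan == *++match &&
             *++scan == *++match && *++scan == *++match &&
             *++scan == *++match && *++scan == *++match &&
             scan < strend);
    return (unsigned)(scan - start);  /* bytes advanced before the mismatch */
}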
5
third_party/zlib/gz/gzclose.c
vendored
|
@ -7,11 +7,6 @@
|
|||
*/
|
||||
#include "third_party/zlib/gz/gzguts.inc"
|
||||
#include "third_party/zlib/macros.internal.h"
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
zlib (zlib License)\\n\
|
||||
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
// clang-format off
|
||||
|
||||
/* gzclose() is in a separate file so that it is linked in only if it is used.
|
||||
|
|
7
third_party/zlib/gz/gzlib.c
vendored
|
@ -14,15 +14,10 @@
|
|||
#include "third_party/zlib/gz/gzguts.inc"
|
||||
#include "third_party/zlib/zlib.h"
|
||||
#include "third_party/zlib/zutil.internal.h"
|
||||
// clang-format off
|
||||
|
||||
#define LSEEK lseek
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
zlib (zlib License)\\n\
|
||||
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
// clang-format off
|
||||
|
||||
/* Local functions */
|
||||
static void gz_reset(gz_statep);
|
||||
static gzFile gz_open(const void *, int, const char *);
|
||||
|
|
5
third_party/zlib/gz/gzread.c
vendored
|
@ -12,11 +12,6 @@
|
|||
#include "libc/str/str.h"
|
||||
#include "third_party/zlib/gz/gzguts.inc"
|
||||
#include "third_party/zlib/zlib.h"
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
zlib (zlib License)\\n\
|
||||
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
// clang-format off
|
||||
|
||||
/* Local functions */
|
||||
|
|
5
third_party/zlib/gz/gzwrite.c
vendored
|
@ -9,11 +9,6 @@
|
|||
#include "libc/fmt/fmt.h"
|
||||
#include "libc/mem/mem.h"
|
||||
#include "third_party/zlib/gz/gzguts.inc"
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
zlib (zlib License)\\n\
|
||||
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
// clang-format off
|
||||
|
||||
/* Local functions */
|
||||
|
|
5
third_party/zlib/infback.c
vendored
|
@ -11,11 +11,6 @@
|
|||
#include "third_party/zlib/internal.h"
|
||||
#include "third_party/zlib/macros.internal.h"
|
||||
#include "third_party/zlib/zutil.internal.h"
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
zlib (zlib License)\\n\
|
||||
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
// clang-format off
|
||||
|
||||
/*
|
||||
|
|
5
third_party/zlib/inffast.c
vendored
|
@ -5,11 +5,6 @@
|
|||
* Copyright (C) 1995-2017 Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
zlib (zlib License)\\n\
|
||||
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
// clang-format off
|
||||
|
||||
#include "third_party/zlib/zutil.internal.h"
|
||||
|
|
387
third_party/zlib/inffast_chunk.c
vendored
Normal file
|
@ -0,0 +1,387 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
/* inffast_chunk.c -- fast decoding
|
||||
* Copyright (C) 1995-2017 Mark Adler
|
||||
* Copyright 2023 The Chromium Authors
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
Chromium (BSD-3 License)\\n\
|
||||
Copyright 2017 The Chromium Authors\"");
|
||||
// clang-format off
|
||||
|
||||
#include "third_party/zlib/zutil.internal.h"
|
||||
#include "third_party/zlib/inftrees.internal.h"
|
||||
#include "third_party/zlib/inflate.internal.h"
|
||||
#include "third_party/zlib/inffast_chunk.internal.h"
|
||||
#include "third_party/zlib/chunkcopy.inc"
|
||||
|
||||
#ifdef ASMINF
|
||||
# pragma message("Assembler code may have bugs -- use at your own risk")
|
||||
#else
|
||||
|
||||
/*
|
||||
Decode literal, length, and distance codes and write out the resulting
|
||||
literal and match bytes until either not enough input or output is
|
||||
available, an end-of-block is encountered, or a data error is encountered.
|
||||
When large enough input and output buffers are supplied to inflate(), for
|
||||
example, a 16K input buffer and a 64K output buffer, more than 95% of the
|
||||
inflate() execution time is spent in this routine.
|
||||
|
||||
Entry assumptions:
|
||||
|
||||
state->mode == LEN
|
||||
strm->avail_in >= INFLATE_FAST_MIN_INPUT (6 or 8 bytes + 7 bytes)
|
||||
strm->avail_out >= INFLATE_FAST_MIN_OUTPUT (258 bytes + 2 bytes)
|
||||
start >= strm->avail_out
|
||||
state->bits < 8
|
||||
(state->hold >> state->bits) == 0
|
||||
strm->next_out[0..strm->avail_out] does not overlap with
|
||||
strm->next_in[0..strm->avail_in]
|
||||
strm->state->window is allocated with an additional
|
||||
CHUNKCOPY_CHUNK_SIZE-1 bytes of padding beyond strm->state->wsize
|
||||
|
||||
On return, state->mode is one of:
|
||||
|
||||
LEN -- ran out of enough output space or enough available input
|
||||
TYPE -- reached end of block code, inflate() to interpret next block
|
||||
BAD -- error in block data
|
||||
|
||||
Notes:
|
||||
|
||||
INFLATE_FAST_MIN_INPUT: 6 or 8 bytes + 7 bytes
|
||||
|
||||
- The maximum input bits used by a length/distance pair is 15 bits for the
|
||||
length code, 5 bits for the length extra, 15 bits for the distance code,
|
||||
and 13 bits for the distance extra. This totals 48 bits, or six bytes.
|
||||
Therefore if strm->avail_in >= 6, then there is enough input to avoid
|
||||
checking for available input while decoding.
|
||||
|
||||
- The wide input data reading option reads 64 input bits at a time. Thus,
|
||||
if strm->avail_in >= 8, then there is enough input to avoid checking for
|
||||
available input while decoding. Reading consumes the input with:
|
||||
|
||||
hold |= read64le(in) << bits;
|
||||
in += 6;
|
||||
bits += 48;
|
||||
|
||||
reporting 6 bytes of new input because |bits| is 0..15 (2 bytes rounded
|
||||
up, worst case) and 6 bytes is enough to decode as noted above. At exit,
|
||||
hold &= (1U << bits) - 1 drops excess input to keep the invariant:
|
||||
|
||||
(state->hold >> state->bits) == 0
|
||||
|
||||
INFLATE_FAST_MIN_OUTPUT: 258 bytes + 2 bytes for literals = 260 bytes
|
||||
|
||||
- The maximum bytes that a single length/distance pair can output is 258
|
||||
bytes, which is the maximum length that can be coded. inflate_fast()
|
||||
requires strm->avail_out >= 260 for each loop to avoid checking for
|
||||
available output space while decoding.
|
||||
*/
|
||||
void ZLIB_INTERNAL inflate_fast_chunk_(strm, start)
|
||||
z_streamp strm;
|
||||
unsigned start; /* inflate()'s starting value for strm->avail_out */
|
||||
{
|
||||
struct inflate_state FAR *state;
|
||||
z_const unsigned char FAR *in; /* local strm->next_in */
|
||||
z_const unsigned char FAR *last; /* have enough input while in < last */
|
||||
unsigned char FAR *out; /* local strm->next_out */
|
||||
unsigned char FAR *beg; /* inflate()'s initial strm->next_out */
|
||||
unsigned char FAR *end; /* while out < end, enough space available */
|
||||
unsigned char FAR *limit; /* safety limit for chunky copies */
|
||||
#ifdef INFLATE_STRICT
|
||||
unsigned dmax; /* maximum distance from zlib header */
|
||||
#endif
|
||||
unsigned wsize; /* window size or zero if not using window */
|
||||
unsigned whave; /* valid bytes in the window */
|
||||
unsigned wnext; /* window write index */
|
||||
unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */
|
||||
inflate_holder_t hold; /* local strm->hold */
|
||||
unsigned bits; /* local strm->bits */
|
||||
code const FAR *lcode; /* local strm->lencode */
|
||||
code const FAR *dcode; /* local strm->distcode */
|
||||
unsigned lmask; /* mask for first level of length codes */
|
||||
unsigned dmask; /* mask for first level of distance codes */
|
||||
code const *here; /* retrieved table entry */
|
||||
unsigned op; /* code bits, operation, extra bits, or */
|
||||
/* window position, window bytes to copy */
|
||||
unsigned len; /* match length, unused bytes */
|
||||
unsigned dist; /* match distance */
|
||||
unsigned char FAR *from; /* where to copy match from */
|
||||
|
||||
/* copy state to local variables */
|
||||
state = (struct inflate_state FAR *)strm->state;
|
||||
in = strm->next_in;
|
||||
last = in + (strm->avail_in - (INFLATE_FAST_MIN_INPUT - 1));
|
||||
out = strm->next_out;
|
||||
beg = out - (start - strm->avail_out);
|
||||
end = out + (strm->avail_out - (INFLATE_FAST_MIN_OUTPUT - 1));
|
||||
limit = out + strm->avail_out;
|
||||
#ifdef INFLATE_STRICT
|
||||
dmax = state->dmax;
|
||||
#endif
|
||||
wsize = state->wsize;
|
||||
whave = state->whave;
|
||||
wnext = (state->wnext == 0 && whave >= wsize) ? wsize : state->wnext;
|
||||
window = state->window;
|
||||
hold = state->hold;
|
||||
bits = state->bits;
|
||||
lcode = state->lencode;
|
||||
dcode = state->distcode;
|
||||
lmask = (1U << state->lenbits) - 1;
|
||||
dmask = (1U << state->distbits) - 1;
|
||||
|
||||
#ifdef INFLATE_CHUNK_READ_64LE
|
||||
#define REFILL() do { \
|
||||
Assert(bits < 64, "### Too many bits in inflate_fast."); \
|
||||
hold |= read64le(in) << bits; \
|
||||
in += 7; \
|
||||
in -= bits >> 3; \
|
||||
bits |= 56; \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
/* decode literals and length/distances until end-of-block or not enough
|
||||
input data or output space */
|
||||
do {
|
||||
#ifdef INFLATE_CHUNK_READ_64LE
|
||||
REFILL();
|
||||
#else
|
||||
if (bits < 15) {
|
||||
hold += (unsigned long)(*in++) << bits;
|
||||
bits += 8;
|
||||
hold += (unsigned long)(*in++) << bits;
|
||||
bits += 8;
|
||||
}
|
||||
#endif
|
||||
here = lcode + (hold & lmask);
|
||||
#ifdef INFLATE_CHUNK_READ_64LE
|
||||
if (here->op == 0) { /* literal */
|
||||
Tracevv((here->val >= 0x20 && here->val < 0x7f ?
|
||||
"inflate: literal '%c'\n" :
|
||||
"inflate: literal 0x%02x\n", here->val));
|
||||
*out++ = (unsigned char)(here->val);
|
||||
hold >>= here->bits;
|
||||
bits -= here->bits;
|
||||
here = lcode + (hold & lmask);
|
||||
if (here->op == 0) { /* literal */
|
||||
Tracevv((here->val >= 0x20 && here->val < 0x7f ?
|
||||
"inflate: 2nd literal '%c'\n" :
|
||||
"inflate: 2nd literal 0x%02x\n", here->val));
|
||||
*out++ = (unsigned char)(here->val);
|
||||
hold >>= here->bits;
|
||||
bits -= here->bits;
|
||||
here = lcode + (hold & lmask);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
dolen:
|
||||
op = (unsigned)(here->bits);
|
||||
hold >>= op;
|
||||
bits -= op;
|
||||
op = (unsigned)(here->op);
|
||||
if (op == 0) { /* literal */
|
||||
Tracevv((here->val >= 0x20 && here->val < 0x7f ?
|
||||
"inflate: literal '%c'\n" :
|
||||
"inflate: literal 0x%02x\n", here->val));
|
||||
*out++ = (unsigned char)(here->val);
|
||||
}
|
||||
else if (op & 16) { /* length base */
|
||||
len = (unsigned)(here->val);
|
||||
op &= 15; /* number of extra bits */
|
||||
if (op) {
|
||||
#ifndef INFLATE_CHUNK_READ_64LE
|
||||
if (bits < op) {
|
||||
hold += (unsigned long)(*in++) << bits;
|
||||
bits += 8;
|
||||
}
|
||||
#endif
|
||||
len += (unsigned)hold & ((1U << op) - 1);
|
||||
hold >>= op;
|
||||
bits -= op;
|
||||
}
|
||||
Tracevv(("inflate: length %u\n", len));
|
||||
#ifndef INFLATE_CHUNK_READ_64LE
|
||||
if (bits < 15) {
|
||||
hold += (unsigned long)(*in++) << bits;
|
||||
bits += 8;
|
||||
hold += (unsigned long)(*in++) << bits;
|
||||
bits += 8;
|
||||
}
|
||||
#endif
|
||||
here = dcode + (hold & dmask);
|
||||
dodist:
|
||||
op = (unsigned)(here->bits);
|
||||
hold >>= op;
|
||||
bits -= op;
|
||||
op = (unsigned)(here->op);
|
||||
if (op & 16) { /* distance base */
|
||||
dist = (unsigned)(here->val);
|
||||
op &= 15; /* number of extra bits */
|
||||
/* we have two fast-path loads: 10+10 + 15+5 + 15 = 55,
|
||||
but we may need to refill here in the worst case */
|
||||
if (bits < op) {
|
||||
#ifdef INFLATE_CHUNK_READ_64LE
|
||||
REFILL();
|
||||
#else
|
||||
hold += (unsigned long)(*in++) << bits;
|
||||
bits += 8;
|
||||
if (bits < op) {
|
||||
hold += (unsigned long)(*in++) << bits;
|
||||
bits += 8;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
dist += (unsigned)hold & ((1U << op) - 1);
|
||||
#ifdef INFLATE_STRICT
|
||||
if (dist > dmax) {
|
||||
strm->msg = (char *)"invalid distance too far back";
|
||||
state->mode = BAD;
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
hold >>= op;
|
||||
bits -= op;
|
||||
Tracevv(("inflate: distance %u\n", dist));
|
||||
op = (unsigned)(out - beg); /* max distance in output */
|
||||
if (dist > op) { /* see if copy from window */
|
||||
op = dist - op; /* distance back in window */
|
||||
if (op > whave) {
|
||||
if (state->sane) {
|
||||
strm->msg =
|
||||
(char *)"invalid distance too far back";
|
||||
state->mode = BAD;
|
||||
break;
|
||||
}
|
||||
#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR
|
||||
if (len <= op - whave) {
|
||||
do {
|
||||
*out++ = 0;
|
||||
} while (--len);
|
||||
continue;
|
||||
}
|
||||
len -= op - whave;
|
||||
do {
|
||||
*out++ = 0;
|
||||
} while (--op > whave);
|
||||
if (op == 0) {
|
||||
from = out - dist;
|
||||
do {
|
||||
*out++ = *from++;
|
||||
} while (--len);
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
from = window;
|
||||
if (wnext >= op) { /* contiguous in window */
|
||||
from += wnext - op;
|
||||
}
|
||||
else { /* wrap around window */
|
||||
op -= wnext;
|
||||
from += wsize - op;
|
||||
if (op < len) { /* some from end of window */
|
||||
len -= op;
|
||||
out = chunkcopy_safe(out, from, op, limit);
|
||||
from = window; /* more from start of window */
|
||||
op = wnext;
|
||||
/* This (rare) case can create a situation where
|
||||
the first chunkcopy below must be checked.
|
||||
*/
|
||||
}
|
||||
}
|
||||
if (op < len) { /* still need some from output */
|
||||
out = chunkcopy_safe(out, from, op, limit);
|
||||
len -= op;
|
||||
/* When dist is small the amount of data that can be
|
||||
copied from the window is also small, and progress
|
||||
towards the dangerous end of the output buffer is
|
||||
also small. This means that for trivial memsets and
|
||||
for chunkunroll_relaxed() a safety check is
|
||||
unnecessary. However, these conditions may not be
|
||||
entered at all, and in that case it's possible that
|
||||
the main copy is near the end.
|
||||
*/
|
||||
out = chunkunroll_relaxed(out, &dist, &len);
|
||||
out = chunkcopy_safe_ugly(out, dist, len, limit);
|
||||
} else {
|
||||
/* from points to window, so there is no risk of
|
||||
overlapping pointers requiring memset-like behaviour
|
||||
*/
|
||||
out = chunkcopy_safe(out, from, len, limit);
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* Whole reference is in range of current output. No
|
||||
range checks are necessary because we start with room
|
||||
for at least 258 bytes of output, so unroll and roundoff
|
||||
operations can write beyond `out+len` so long as they
|
||||
stay within 258 bytes of `out`.
|
||||
*/
|
||||
out = chunkcopy_lapped_relaxed(out, dist, len);
|
||||
}
|
||||
}
|
||||
else if ((op & 64) == 0) { /* 2nd level distance code */
|
||||
here = dcode + here->val + (hold & ((1U << op) - 1));
|
||||
goto dodist;
|
||||
}
|
||||
else {
|
||||
strm->msg = (char *)"invalid distance code";
|
||||
state->mode = BAD;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if ((op & 64) == 0) { /* 2nd level length code */
|
||||
here = lcode + here->val + (hold & ((1U << op) - 1));
|
||||
goto dolen;
|
||||
}
|
||||
else if (op & 32) { /* end-of-block */
|
||||
Tracevv(("inflate: end of block\n"));
|
||||
state->mode = TYPE;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
strm->msg = (char *)"invalid literal/length code";
|
||||
state->mode = BAD;
|
||||
break;
|
||||
}
|
||||
} while (in < last && out < end);
|
||||
|
||||
/* return unused bytes (on entry, bits < 8, so in won't go too far back) */
|
||||
len = bits >> 3;
|
||||
in -= len;
|
||||
bits -= len << 3;
|
||||
hold &= (1U << bits) - 1;
|
||||
|
||||
/* update state and return */
|
||||
strm->next_in = in;
|
||||
strm->next_out = out;
|
||||
strm->avail_in = (unsigned)(in < last ?
|
||||
(INFLATE_FAST_MIN_INPUT - 1) + (last - in) :
|
||||
(INFLATE_FAST_MIN_INPUT - 1) - (in - last));
|
||||
strm->avail_out = (unsigned)(out < end ?
|
||||
(INFLATE_FAST_MIN_OUTPUT - 1) + (end - out) :
|
||||
(INFLATE_FAST_MIN_OUTPUT - 1) - (out - end));
|
||||
state->hold = hold;
|
||||
state->bits = bits;
|
||||
|
||||
Assert((state->hold >> state->bits) == 0, "invalid input data state");
|
||||
}
|
||||
|
||||
/*
|
||||
inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe):
|
||||
- Using bit fields for code structure
|
||||
- Different op definition to avoid & for extra bits (do & for table bits)
|
||||
- Three separate decoding do-loops for direct, window, and wnext == 0
|
||||
- Special case for distance > 1 copies to do overlapped load and store copy
|
||||
- Explicit branch predictions (based on measured branch probabilities)
|
||||
- Deferring match copy and interspersed it with decoding subsequent codes
|
||||
- Swapping literal/length else
|
||||
- Swapping window/direct else
|
||||
- Larger unrolled copy loops (three is about right)
|
||||
- Moving len -= 3 statement into middle of loop
|
||||
*/
|
||||
|
||||
#endif /* !ASMINF */
|
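The wide-read path in inflate_fast_chunk_() leans on one helper pattern: top up a 64-bit bit buffer with a single unaligned little-endian load, then advance the input pointer by however many whole bytes were actually consumed. A self-contained sketch of that REFILL() step follows; read64le_ and the parameter layout are illustrative stand-ins, not the library's real helpers.

#include <stdint.h>
#include <string.h>

/* Unaligned 64-bit load, assuming a little-endian target. */
static uint64_t read64le_(const unsigned char *p) {
    uint64_t v;
    memcpy(&v, p, sizeof(v));
    return v;
}

/* One refill step: afterwards at least 56 bits are valid in *hold, and the
 * input pointer has advanced by 7 bytes minus the bytes still buffered. */
static void refill_(const unsigned char **in, uint64_t *hold, unsigned *bits) {
    *hold |= read64le_(*in) << *bits;  /* top up the bit buffer */
    *in += 7;                          /* advance a full 7 bytes... */
    *in -= *bits >> 3;                 /* ...minus bytes already buffered */
    *bits |= 56;                       /* now 56..63 valid bits */
}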
42
third_party/zlib/inffast_chunk.internal.h
vendored
Normal file
|
@ -0,0 +1,42 @@
|
|||
/* inffast_chunk.h -- header to use inffast_chunk.c
|
||||
* Copyright (C) 1995-2003, 2010 Mark Adler
|
||||
* Copyright (C) 2017 ARM, Inc.
|
||||
* Copyright 2023 The Chromium Authors
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
|
||||
/* WARNING: this file should *not* be used by applications. It is
|
||||
part of the implementation of the compression library and is
|
||||
subject to change. Applications should only use zlib.h.
|
||||
*/
|
||||
|
||||
#include "third_party/zlib/inffast.internal.h"
|
||||
|
||||
/* INFLATE_FAST_MIN_INPUT:
|
||||
The minimum number of input bytes needed so that we can safely call
|
||||
inflate_fast() with only one up-front bounds check. One
|
||||
length/distance code pair (15 bits for the length code, 5 bits for length
|
||||
extra, 15 bits for the distance code, 13 bits for distance extra) requires
|
||||
reading up to 48 input bits. Additionally, in the same iteraction, we may
|
||||
decode two literals from the root-table (requiring MIN_OUTPUT = 258 + 2).
|
||||
|
||||
Each root-table entry is up to 10 bits, for a total of 68 input bits each
|
||||
iteration.
|
||||
|
||||
The refill variant reads 8 bytes from the buffer at a time, and advances
|
||||
the input pointer by up to 7 bytes, ensuring there are at least 56-bits
|
||||
available in the bit-buffer. The technique was documented by Fabian Giesen
|
||||
on his blog as variant 4 in the article 'Reading bits in far too many ways':
|
||||
https://fgiesen.wordpress.com/2018/02/20/
|
||||
|
||||
In the worst case, we may refill twice in the same iteration, requiring
|
||||
MIN_INPUT = 8 + 7.
|
||||
*/
|
||||
#ifdef INFLATE_CHUNK_READ_64LE
|
||||
#undef INFLATE_FAST_MIN_INPUT
|
||||
#define INFLATE_FAST_MIN_INPUT 15
|
||||
#undef INFLATE_FAST_MIN_OUTPUT
|
||||
#define INFLATE_FAST_MIN_OUTPUT 260
|
||||
#endif
|
||||
|
||||
void inflate_fast_chunk_(z_streamp strm, unsigned start);
|
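In short, the arithmetic behind those two constants: a single length/distance pair consumes at most 15 + 5 + 15 + 13 = 48 bits, but the 64-bit refill variant reads 8 bytes at a time and advances the pointer by up to 7, and one iteration may refill twice, so INFLATE_FAST_MIN_INPUT = 8 + 7 = 15 bytes. On the output side, an iteration can emit the two root-table literals plus a maximum-length 258-byte match, so INFLATE_FAST_MIN_OUTPUT = 258 + 2 = 260 bytes.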
74
third_party/zlib/inflate.c
vendored
|
@ -5,15 +5,11 @@
|
|||
* Copyright (C) 1995-2022 Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "third_party/zlib/inffast.internal.h"
|
||||
#include "third_party/zlib/inflate.internal.h"
|
||||
#include "third_party/zlib/inftrees.internal.h"
|
||||
#include "third_party/zlib/internal.h"
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
zlib (zlib License)\\n\
|
||||
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
zlib 1.2.13 (zlib License)\\n\
|
||||
Copyright 1995-2022 Jean-loup Gailly and Mark Adler\\n\
|
||||
Invented 1990 Phillip Walter Katz\"");
|
||||
// clang-format off
|
||||
|
||||
/*
|
||||
|
@ -93,6 +89,13 @@ asm(".include \"libc/disclaimer.inc\"");
|
|||
* The history for versions after 1.2.0 are in ChangeLog in zlib distribution.
|
||||
*/
|
||||
|
||||
#include "third_party/zlib/zutil.internal.h"
|
||||
#include "third_party/zlib/inftrees.internal.h"
|
||||
#include "third_party/zlib/inflate.internal.h"
|
||||
#include "third_party/zlib/inffast_chunk.internal.h"
|
||||
#include "third_party/zlib/internal.h"
|
||||
#include "third_party/zlib/chunkcopy.inc"
|
||||
|
||||
#ifdef MAKEFIXED
|
||||
# ifndef BUILDFIXED
|
||||
# define BUILDFIXED
|
||||
|
@ -176,6 +179,8 @@ int windowBits;
|
|||
|
||||
/* extract wrap request from windowBits parameter */
|
||||
if (windowBits < 0) {
|
||||
if (windowBits < -15)
|
||||
return Z_STREAM_ERROR;
|
||||
wrap = 0;
|
||||
windowBits = -windowBits;
|
||||
}
|
||||
|
@ -322,7 +327,14 @@ struct inflate_state FAR *state;
|
|||
}
|
||||
|
||||
#ifdef MAKEFIXED
|
||||
#include <stdio.h>
|
||||
#include "libc/calls/calls.h"
|
||||
#include "libc/calls/dprintf.h"
|
||||
#include "libc/calls/weirdtypes.h"
|
||||
#include "libc/fmt/fmt.h"
|
||||
#include "libc/mem/fmt.h"
|
||||
#include "libc/stdio/stdio.h"
|
||||
#include "libc/stdio/temp.h"
|
||||
#include "third_party/musl/tempnam.h"
|
||||
|
||||
/*
|
||||
Write out the inffixed.h that is #include'd above. Defining MAKEFIXED also
|
||||
|
@ -408,10 +420,20 @@ unsigned copy;
|
|||
|
||||
/* if it hasn't been done already, allocate space for the window */
|
||||
if (state->window == Z_NULL) {
|
||||
unsigned wsize = 1U << state->wbits;
|
||||
state->window = (unsigned char FAR *)
|
||||
ZALLOC(strm, 1U << state->wbits,
|
||||
ZALLOC(strm, wsize + CHUNKCOPY_CHUNK_SIZE,
|
||||
sizeof(unsigned char));
|
||||
if (state->window == Z_NULL) return 1;
|
||||
#ifdef INFLATE_CLEAR_UNUSED_UNDEFINED
|
||||
/* Copies from the overflow portion of this buffer are undefined and
|
||||
may cause analysis tools to raise a warning if we don't initialize
|
||||
it. However, this undefined data overwrites other undefined data
|
||||
and is subsequently either overwritten or left deliberately
|
||||
undefined at the end of decode; so there's really no point.
|
||||
*/
|
||||
zmemzero(state->window + wsize, CHUNKCOPY_CHUNK_SIZE);
|
||||
#endif
|
||||
}
|
||||
|
||||
/* if window not in use yet, initialize */
|
||||
|
@ -1065,7 +1087,7 @@ int flush;
|
|||
if (have >= INFLATE_FAST_MIN_INPUT &&
|
||||
left >= INFLATE_FAST_MIN_OUTPUT) {
|
||||
RESTORE();
|
||||
inflate_fast(strm, out);
|
||||
inflate_fast_chunk_(strm, out);
|
||||
LOAD();
|
||||
if (state->mode == TYPE)
|
||||
state->back = -1;
|
||||
|
@ -1200,17 +1222,16 @@ int flush;
|
|||
else
|
||||
from = state->window + (state->wnext - copy);
|
||||
if (copy > state->length) copy = state->length;
|
||||
if (copy > left) copy = left;
|
||||
put = chunkcopy_safe(put, from, copy, put + left);
|
||||
}
|
||||
else { /* copy from output */
|
||||
from = put - state->offset;
|
||||
copy = state->length;
|
||||
if (copy > left) copy = left;
|
||||
put = chunkcopy_lapped_safe(put, state->offset, copy, put + left);
|
||||
}
|
||||
if (copy > left) copy = left;
|
||||
left -= copy;
|
||||
state->length -= copy;
|
||||
do {
|
||||
*put++ = *from++;
|
||||
} while (--copy);
|
||||
if (state->length == 0) state->mode = LEN;
|
||||
break;
|
||||
case LIT:
|
||||
|
@ -1279,6 +1300,29 @@ int flush;
|
|||
Note: a memory error from inflate() is non-recoverable.
|
||||
*/
|
||||
inf_leave:
|
||||
#if defined(ZLIB_DEBUG)
|
||||
/* XXX(cavalcantii): I put this in place back in 2017 to help debug faulty
|
||||
* client code relying on undefined behavior when chunk_copy first landed.
|
||||
*
|
||||
* It is safe to say after all these years that Chromium code is well
|
||||
* behaved and works fine with the optimization, therefore we can enable
|
||||
* this only for DEBUG builds.
|
||||
*
|
||||
* We write a defined value in the unused space to help mark
|
||||
* where the stream has ended. We don't use zeros as that can
|
||||
* mislead clients relying on undefined behavior (i.e. assuming
|
||||
* that the data is over when the buffer has a zero/null value).
|
||||
*
|
||||
* The basic idea is that if client code is not relying on the zlib context
|
||||
* to inform the amount of decompressed data, but instead reads the output
|
||||
* buffer until a zero/null is found, it will fail faster and harder
|
||||
* when the remaining of the buffer is marked with a symbol (e.g. 0x55).
|
||||
*/
|
||||
if (left >= CHUNKCOPY_CHUNK_SIZE)
|
||||
memset(put, 0x55, CHUNKCOPY_CHUNK_SIZE);
|
||||
else
|
||||
memset(put, 0x55, left);
|
||||
#endif
|
||||
RESTORE();
|
||||
if (state->wsize || (out != strm->avail_out && state->mode < BAD &&
|
||||
(state->mode < CHECK || flush != Z_FINISH)))
|
||||
|
|
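The window-allocation hunk above (the ZALLOC of wsize + CHUNKCOPY_CHUNK_SIZE) is what makes the chunked copies safe: the sliding window is over-allocated so vector-width stores may run past wsize without touching unowned memory. A minimal sketch of that allocation, with an illustrative pad size rather than the real configured CHUNKCOPY_CHUNK_SIZE:

#include <stdlib.h>

#define CHUNK_PAD 32  /* illustrative; the real value is CHUNKCOPY_CHUNK_SIZE */

static unsigned char *alloc_padded_window(unsigned wbits) {
    size_t wsize = (size_t)1 << wbits;
    /* calloc keeps the padding defined, mirroring what the optional
     * INFLATE_CLEAR_UNUSED_UNDEFINED zmemzero() would otherwise do. */
    return (unsigned char *)calloc(wsize + CHUNK_PAD, 1);
}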
12
third_party/zlib/inftrees.c
vendored
|
@ -1,3 +1,4 @@
|
|||
// clang-format off
|
||||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
|
@ -6,17 +7,16 @@
|
|||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
*/
|
||||
#include "third_party/zlib/inftrees.internal.h"
|
||||
#include "third_party/zlib/zutil.internal.h"
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
zlib (zlib License)\\n\
|
||||
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
zlib 1.2.13 (zlib License)\\n\
|
||||
Copyright 1995-2022 Jean-loup Gailly and Mark Adler\\n\
|
||||
Invented 1990 Phillip Walter Katz\"");
|
||||
// clang-format off
|
||||
|
||||
#define MAXBITS 15
|
||||
|
||||
const char inflate_copyright[] =
|
||||
" inflate 1.2.12.1 Copyright 1995-2022 Mark Adler ";
|
||||
/*
|
||||
If you use the zlib library in a product, an acknowledgment is welcome
|
||||
in the documentation of your product. If for some reason you cannot
|
||||
|
@ -69,7 +69,7 @@ unsigned short FAR *work;
|
|||
35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
|
||||
static const unsigned short lext[31] = { /* Length codes 257..285 extra */
|
||||
16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18,
|
||||
19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 76, 202};
|
||||
19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 194, 65};
|
||||
static const unsigned short dbase[32] = { /* Distance codes 0..29 base */
|
||||
1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
|
||||
257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
|
||||
|
|
1
third_party/zlib/inftrees.internal.h
vendored
|
@ -5,7 +5,6 @@
|
|||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
COSMOPOLITAN_C_START_
|
||||
/* clang-format off */
|
||||
|
||||
/* inftrees.h -- header to use inftrees.c
|
||||
* Copyright (C) 1995-2005, 2010 Mark Adler
|
||||
* For conditions of distribution and use, see copyright notice in zlib.h
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
#ifndef COSMOPOLITAN_THIRD_PARTY_ZLIB_INSERT_STRING_H_
|
||||
#define COSMOPOLITAN_THIRD_PARTY_ZLIB_INSERT_STRING_H_
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "third_party/zlib/deflate.internal.h"
|
||||
#include "third_party/zlib/zutil.internal.h"
|
||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
COSMOPOLITAN_C_START_
|
||||
/* clang-format off */
|
||||
// clang-format off
|
||||
/* insert_string.h
|
||||
*
|
||||
* Copyright 2019 The Chromium Authors
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the Chromium source repository LICENSE file.
|
||||
*/
|
||||
|
||||
#ifndef INSERT_STRING_H
|
||||
#define INSERT_STRING_H
|
||||
|
||||
#ifndef INLINE
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
|
@ -15,9 +17,11 @@ COSMOPOLITAN_C_START_
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#include "third_party/zlib/cpu_features.internal.h"
|
||||
|
||||
// clang-format off
|
||||
#if defined(CRC32_SIMD_SSE42_PCLMUL)
|
||||
// #include <smmintrin.h> /* Required to make MSVC bot build pass. */
|
||||
#include "third_party/intel/smmintrin.internal.h" /* Required to make MSVC bot build pass. */
|
||||
|
||||
#if defined(__clang__) || defined(__GNUC__)
|
||||
#define TARGET_CPU_WITH_CRC __attribute__((target("sse4.2")))
|
||||
|
@ -51,19 +55,17 @@ COSMOPOLITAN_C_START_
|
|||
|
||||
#if defined(TARGET_CPU_WITH_CRC)
|
||||
|
||||
// TODO(jart): Why does this fail alignment check?
|
||||
noubsan local INLINE Pos insert_string_simd(deflate_state* const s,
|
||||
const Pos str) {
|
||||
TARGET_CPU_WITH_CRC
|
||||
local INLINE Pos insert_string_simd(deflate_state* const s, const Pos str) {
|
||||
Pos ret;
|
||||
unsigned *ip, val, h = 0;
|
||||
unsigned val, h = 0;
|
||||
|
||||
ip = (unsigned*)&s->window[str];
|
||||
val = *ip;
|
||||
zmemcpy(&val, &s->window[str], sizeof(val));
|
||||
|
||||
if (s->level >= 6) val &= 0xFFFFFF;
|
||||
|
||||
/* Compute hash from the CRC32C of |val|. */
|
||||
asm("crc32l\t%1,%0" : "+r"(h) : "rm"(val));
|
||||
h = _cpu_crc32c_hash_u32(h, val);
|
||||
|
||||
ret = s->head[h & s->hash_mask];
|
||||
s->head[h & s->hash_mask] = str;
|
||||
|
@ -131,13 +133,11 @@ local INLINE Pos insert_string(deflate_state* const s, const Pos str) {
|
|||
* the Rabin-Karp hash instead.
|
||||
*/ /* FALLTHROUGH Rabin-Karp */
|
||||
#elif defined(TARGET_CPU_WITH_CRC) && defined(CRC32_SIMD_SSE42_PCLMUL)
|
||||
if (X86_HAVE(SSE4_2)) return insert_string_simd(s, str);
|
||||
if (x86_cpu_enable_simd) return insert_string_simd(s, str);
|
||||
#elif defined(TARGET_CPU_WITH_CRC) && defined(CRC32_ARMV8_CRC32)
|
||||
if (arm_cpu_enable_crc32) return insert_string_simd(s, str);
|
||||
#endif
|
||||
return insert_string_c(s, str); /* Rabin-Karp */
|
||||
}
|
||||
|
||||
COSMOPOLITAN_C_END_
|
||||
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
||||
#endif /* COSMOPOLITAN_THIRD_PARTY_ZLIB_INSERT_STRING_H_ */
|
||||
#endif /* INSERT_STRING_H */
|
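The insert_string_simd() change above swaps a raw pointer dereference and inline asm for a memcpy load plus a _cpu_crc32c_hash_u32() helper. The sketch below shows the same idea with the plain SSE4.2 intrinsic; the include path and function name are generic stand-ins, since this tree routes intrinsics through its own internal headers, and the code assumes it is compiled with SSE4.2 enabled (e.g. -msse4.2 or a target attribute).

#include <stdint.h>
#include <string.h>
#include <nmmintrin.h>  /* SSE4.2 _mm_crc32_u32 */

static unsigned crc32c_hash_window(const unsigned char *window, unsigned str,
                                   int level) {
    unsigned val, h = 0;
    memcpy(&val, window + str, sizeof(val));  /* alignment-safe 4-byte load */
    if (level >= 6) val &= 0xFFFFFF;          /* higher levels hash only 3 bytes */
    return _mm_crc32_u32(h, val);             /* CRC32C mixes the bytes into a hash */
}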
117
third_party/zlib/slide_hash_simd.inc
vendored
Normal file
|
@ -0,0 +1,117 @@
|
|||
// clang-format off
|
||||
/* slide_hash_simd.h
|
||||
*
|
||||
* Copyright 2022 The Chromium Authors
|
||||
* Use of this source code is governed by a BSD-style license that can be
|
||||
* found in the Chromium source repository LICENSE file.
|
||||
*/
|
||||
|
||||
#ifndef SLIDE_HASH_SIMD_H
|
||||
#define SLIDE_HASH_SIMD_H
|
||||
|
||||
#include "third_party/zlib/deflate.internal.h"
|
||||
|
||||
#ifndef INLINE
|
||||
#if defined(_MSC_VER) && !defined(__clang__)
|
||||
#define INLINE __inline
|
||||
#else
|
||||
#define INLINE inline
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(CPU_NO_SIMD)
|
||||
|
||||
#error SIMD has been disabled for your build target
|
||||
|
||||
#elif defined(DEFLATE_SLIDE_HASH_SSE2)
|
||||
|
||||
#include "third_party/intel/emmintrin.internal.h" /* SSE2 */
|
||||
|
||||
#define Z_SLIDE_INIT_SIMD(wsize) _mm_set1_epi16((ush)(wsize))
|
||||
|
||||
#define Z_SLIDE_HASH_SIMD(table, size, vector_wsize) \
|
||||
for (const Posf* const end = table + size; table != end;) { \
|
||||
__m128i vO = _mm_loadu_si128((__m128i *)(table + 0)); \
|
||||
vO = _mm_subs_epu16(vO, vector_wsize); \
|
||||
_mm_storeu_si128((__m128i *)(table + 0), vO); \
|
||||
table += 8; \
|
||||
}
|
||||
|
||||
typedef __m128i z_vec128i_u16x8_t;
|
||||
|
||||
#elif defined(DEFLATE_SLIDE_HASH_NEON)
|
||||
|
||||
#include "third_party/aarch64/arm_neon.h" /* NEON */
|
||||
|
||||
#define Z_SLIDE_INIT_SIMD(wsize) vdupq_n_u16((ush)(wsize))
|
||||
|
||||
#define Z_SLIDE_HASH_SIMD(table, size, vector_wsize) \
|
||||
for (const Posf* const end = table + size; table != end;) { \
|
||||
uint16x8_t vO = vld1q_u16(table + 0); \
|
||||
uint16x8_t v8 = vld1q_u16(table + 8); \
|
||||
vO = vqsubq_u16(vO, vector_wsize); \
|
||||
v8 = vqsubq_u16(v8, vector_wsize); \
|
||||
vst1q_u16(table + 0, vO); \
|
||||
vst1q_u16(table + 8, v8); \
|
||||
table += 8 + 8; \
|
||||
}
|
||||
|
||||
typedef uint16x8_t z_vec128i_u16x8_t;
|
||||
|
||||
#else
|
||||
|
||||
#error slide_hash_simd is not defined for your build target
|
||||
|
||||
#endif
|
||||
|
||||
/* ===========================================================================
|
||||
* Slide the hash table when sliding the window down (could be avoided with 32
|
||||
* bit values at the expense of memory usage). We slide even when level == 0 to
|
||||
* keep the hash table consistent if we switch back to level > 0 later.
|
||||
*/
|
||||
local INLINE void slide_hash_simd(
|
||||
Posf *head, Posf *prev, const uInt w_size, const uInt hash_size) {
|
||||
/*
|
||||
* The SIMD implementation of the hash table slider assumes:
|
||||
*
|
||||
* 1. hash chain offset is 2 bytes. Should be true as Pos is "ush" type.
|
||||
*/
|
||||
Assert(sizeof(Pos) == 2, "Pos type size error: should be 2 bytes");
|
||||
Assert(sizeof(ush) == 2, "ush type size error: should be 2 bytes");
|
||||
|
||||
Assert(hash_size <= (1 << 16), "Hash table maximum size error");
|
||||
Assert(hash_size >= (1 << 8), "Hash table minimum size error");
|
||||
Assert(w_size == (ush)w_size, "Prev table size error");
|
||||
|
||||
/*
|
||||
* 2. The hash & prev table sizes are a multiple of 32 bytes (256 bits),
|
||||
* since the NEON table slider moves two 128-bit items per loop (loop is
|
||||
* unrolled on NEON for performance, see http://crbug.com/863257).
|
||||
*/
|
||||
Assert(!((hash_size * sizeof(head[0])) & (32 - 1)),
|
||||
"Hash table size error: should be a multiple of 32 bytes");
|
||||
Assert(!((w_size * sizeof(prev[0])) & (32 - 1)),
|
||||
"Prev table size error: should be a multiple of 32 bytes");
|
||||
|
||||
/*
|
||||
* Duplicate (ush)w_size in each uint16_t component of a 128-bit vector.
|
||||
*/
|
||||
const z_vec128i_u16x8_t vec_wsize = Z_SLIDE_INIT_SIMD(w_size);
|
||||
|
||||
/*
|
||||
* Slide {head,prev} hash chain values: subtracts (ush)w_size from every
|
||||
* value with a saturating SIMD subtract, to clamp the result to 0(NIL),
|
||||
* to implement slide_hash() `(m >= wsize ? m - wsize : NIL);` code.
|
||||
*/
|
||||
Z_SLIDE_HASH_SIMD(head, hash_size, vec_wsize);
|
||||
#ifndef FASTEST
|
||||
Z_SLIDE_HASH_SIMD(prev, w_size, vec_wsize);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#undef z_vec128i_u16x8_t
|
||||
#undef Z_SLIDE_HASH_SIMD
|
||||
#undef Z_SLIDE_INIT_SIMD
|
||||
|
||||
#endif /* SLIDE_HASH_SIMD_H */
|
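For contrast with the saturating SIMD subtract above, here is a scalar sketch of the slide it replaces, i.e. the `(m >= wsize ? m - wsize : NIL)` clamp quoted in the comment. The Pos/Posf/uInt types come from deflate.internal.h as in the rest of this tree; the function name is illustrative.

#include "third_party/zlib/deflate.internal.h"

static void slide_hash_scalar(Posf *table, uInt size, uInt wsize) {
    uInt n;
    for (n = 0; n < size; n++) {
        unsigned m = table[n];
        /* saturate at 0 (NIL) instead of wrapping below zero */
        table[n] = (Pos)(m >= wsize ? m - wsize : 0);
    }
}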
141
third_party/zlib/trees.c
vendored
|
@ -14,11 +14,6 @@
|
|||
#include "libc/stdio/temp.h"
|
||||
#include "libc/str/str.h"
|
||||
#include "third_party/zlib/deflate.internal.h"
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
zlib (zlib License)\\n\
|
||||
Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
// clang-format off
|
||||
|
||||
/*
|
||||
|
@ -174,7 +169,7 @@ local void gen_trees_header OF((void));
|
|||
|
||||
#else /* !ZLIB_DEBUG */
|
||||
# define send_code(s, c, tree) \
|
||||
{ if (z_verbose>2) kprintf("\ncd %3d ",(c)); \
|
||||
{ if (z_verbose>2) kprintf("\ncd %3d ",(c)); \
|
||||
send_bits(s, tree[c].Code, tree[c].Len); }
|
||||
#endif
|
||||
|
||||
|
@ -204,7 +199,7 @@ local void send_bits(s, value, length)
|
|||
s->bits_sent += (ulg)length;
|
||||
|
||||
/* If not enough room in bi_buf, use (valid) bits from bi_buf and
|
||||
* (16 - bi_valid) bits from value, leaving (width - (16-bi_valid))
|
||||
* (16 - bi_valid) bits from value, leaving (width - (16 - bi_valid))
|
||||
* unused bits in value.
|
||||
*/
|
||||
if (s->bi_valid > (int)Buf_size - length) {
|
||||
|
@ -267,7 +262,7 @@ local void tr_static_init()
|
|||
length = 0;
|
||||
for (code = 0; code < LENGTH_CODES-1; code++) {
|
||||
base_length[code] = length;
|
||||
for (n = 0; n < (1<<extra_lbits[code]); n++) {
|
||||
for (n = 0; n < (1 << extra_lbits[code]); n++) {
|
||||
_length_code[length++] = (uch)code;
|
||||
}
|
||||
}
|
||||
|
@ -276,13 +271,13 @@ local void tr_static_init()
|
|||
* in two different ways: code 284 + 5 bits or code 285, so we
|
||||
* overwrite length_code[255] to use the best encoding:
|
||||
*/
|
||||
_length_code[length-1] = (uch)code;
|
||||
_length_code[length - 1] = (uch)code;
|
||||
|
||||
/* Initialize the mapping dist (0..32K) -> dist code (0..29) */
|
||||
dist = 0;
|
||||
for (code = 0 ; code < 16; code++) {
|
||||
base_dist[code] = dist;
|
||||
for (n = 0; n < (1<<extra_dbits[code]); n++) {
|
||||
for (n = 0; n < (1 << extra_dbits[code]); n++) {
|
||||
_dist_code[dist++] = (uch)code;
|
||||
}
|
||||
}
|
||||
|
@ -290,11 +285,11 @@ local void tr_static_init()
|
|||
dist >>= 7; /* from now on, all distances are divided by 128 */
|
||||
for ( ; code < D_CODES; code++) {
|
||||
base_dist[code] = dist << 7;
|
||||
for (n = 0; n < (1<<(extra_dbits[code]-7)); n++) {
|
||||
for (n = 0; n < (1 << (extra_dbits[code] - 7)); n++) {
|
||||
_dist_code[256 + dist++] = (uch)code;
|
||||
}
|
||||
}
|
||||
Assert (dist == 256, "tr_static_init: 256+dist != 512");
|
||||
Assert (dist == 256, "tr_static_init: 256 + dist != 512");
|
||||
|
||||
/* Construct the codes of the static literal tree */
|
||||
for (bits = 0; bits <= MAX_BITS; bits++) bl_count[bits] = 0;
|
||||
|
@ -327,11 +322,12 @@ local void tr_static_init()
|
|||
*/
|
||||
#ifdef GEN_TREES_H
|
||||
# ifndef ZLIB_DEBUG
|
||||
//# include <stdio.h>
|
||||
# endif
|
||||
|
||||
# define SEPARATOR(i, last, width) \
|
||||
((i) == (last)? "\n};\n\n" : \
|
||||
((i) % (width) == (width)-1 ? ",\n" : ", "))
|
||||
((i) % (width) == (width) - 1 ? ",\n" : ", "))
|
||||
|
||||
void gen_trees_header()
|
||||
{
|
||||
|
@ -468,7 +464,7 @@ local void pqdownheap(s, tree, k)
|
|||
while (j <= s->heap_len) {
|
||||
/* Set j to the smallest of the two sons: */
|
||||
if (j < s->heap_len &&
|
||||
smaller(tree, s->heap[j+1], s->heap[j], s->depth)) {
|
||||
smaller(tree, s->heap[j + 1], s->heap[j], s->depth)) {
|
||||
j++;
|
||||
}
|
||||
/* Exit if v is smaller than both sons */
|
||||
|
@ -517,7 +513,7 @@ local void gen_bitlen(s, desc)
|
|||
*/
|
||||
tree[s->heap[s->heap_max]].Len = 0; /* root of the heap */
|
||||
|
||||
for (h = s->heap_max+1; h < HEAP_SIZE; h++) {
|
||||
for (h = s->heap_max + 1; h < HEAP_SIZE; h++) {
|
||||
n = s->heap[h];
|
||||
bits = tree[tree[n].Dad].Len + 1;
|
||||
if (bits > max_length) bits = max_length, overflow++;
|
||||
|
@ -528,7 +524,7 @@ local void gen_bitlen(s, desc)
|
|||
|
||||
s->bl_count[bits]++;
|
||||
xbits = 0;
|
||||
if (n >= base) xbits = extra[n-base];
|
||||
if (n >= base) xbits = extra[n - base];
|
||||
f = tree[n].Freq;
|
||||
s->opt_len += (ulg)f * (unsigned)(bits + xbits);
|
||||
if (stree) s->static_len += (ulg)f * (unsigned)(stree[n].Len + xbits);
|
||||
|
@ -540,10 +536,10 @@ local void gen_bitlen(s, desc)
|
|||
|
||||
/* Find the first bit length which could increase: */
|
||||
do {
|
||||
bits = max_length-1;
|
||||
bits = max_length - 1;
|
||||
while (s->bl_count[bits] == 0) bits--;
|
||||
s->bl_count[bits]--; /* move one leaf down the tree */
|
||||
s->bl_count[bits+1] += 2; /* move one overflow item as its brother */
|
||||
s->bl_count[bits]--; /* move one leaf down the tree */
|
||||
s->bl_count[bits + 1] += 2; /* move one overflow item as its brother */
|
||||
s->bl_count[max_length]--;
|
||||
/* The brother of the overflow item also moves one step up,
|
||||
* but this does not affect bl_count[max_length]
|
||||
|
@ -579,7 +575,7 @@ local void gen_bitlen(s, desc)
|
|||
* OUT assertion: the field code is set for all tree elements of non
|
||||
* zero code length.
|
||||
*/
|
||||
local void gen_codes (tree, max_code, bl_count)
|
||||
local void gen_codes(tree, max_code, bl_count)
|
||||
ct_data *tree; /* the tree to decorate */
|
||||
int max_code; /* largest code with non zero frequency */
|
||||
ushf *bl_count; /* number of codes at each bit length */
|
||||
|
@ -593,13 +589,13 @@ local void gen_codes (tree, max_code, bl_count)
|
|||
* without bit reversal.
|
||||
*/
|
||||
for (bits = 1; bits <= MAX_BITS; bits++) {
|
||||
code = (code + bl_count[bits-1]) << 1;
|
||||
code = (code + bl_count[bits - 1]) << 1;
|
||||
next_code[bits] = (ush)code;
|
||||
}
|
||||
/* Check that the bit counts in bl_count are consistent. The last code
|
||||
* must be all ones.
|
||||
*/
|
||||
Assert (code + bl_count[MAX_BITS]-1 == (1<<MAX_BITS)-1,
|
||||
Assert (code + bl_count[MAX_BITS] - 1 == (1 << MAX_BITS) - 1,
|
||||
"inconsistent bit counts");
|
||||
Tracev(("\ngen_codes: max_code %d ", max_code));
|
||||
|
||||
|
@ -610,7 +606,7 @@ local void gen_codes (tree, max_code, bl_count)
|
|||
tree[n].Code = (ush)bi_reverse(next_code[len]++, len);
|
||||
|
||||
Tracecv(tree != static_ltree, ("\nn %3d %c l %2d c %4x (%x) ",
|
||||
n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len]-1));
|
||||
+                 n, (isgraph(n) ? n : ' '), len, tree[n].Code, next_code[len] - 1));
     }
 }

@@ -634,7 +630,7 @@ local void build_tree(s, desc)
     int node;          /* new node being created */
 
     /* Construct the initial heap, with least frequent element in
-     * heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n+1].
+     * heap[SMALLEST]. The sons of heap[n] are heap[2*n] and heap[2*n + 1].
      * heap[0] is not used.
      */
     s->heap_len = 0, s->heap_max = HEAP_SIZE;
@@ -662,7 +658,7 @@ local void build_tree(s, desc)
     }
     desc->max_code = max_code;
 
-    /* The elements heap[heap_len/2+1 .. heap_len] are leaves of the tree,
+    /* The elements heap[heap_len/2 + 1 .. heap_len] are leaves of the tree,
      * establish sub-heaps of increasing lengths:
      */
     for (n = s->heap_len/2; n >= 1; n--) pqdownheap(s, tree, n);
@@ -710,7 +706,7 @@ local void build_tree(s, desc)
  * Scan a literal or distance tree to determine the frequencies of the codes
  * in the bit length tree.
  */
-local void scan_tree (s, tree, max_code)
+local void scan_tree(s, tree, max_code)
     deflate_state *s;
     ct_data *tree;   /* the tree to be scanned */
     int max_code;    /* and its largest code of non zero frequency */
@@ -724,10 +720,10 @@ local void scan_tree(s, tree, max_code)
     int min_count = 4;         /* min repeat count */
 
     if (nextlen == 0) max_count = 138, min_count = 3;
-    tree[max_code+1].Len = (ush)0xffff; /* guard */
+    tree[max_code + 1].Len = (ush)0xffff; /* guard */
 
     for (n = 0; n <= max_code; n++) {
-        curlen = nextlen; nextlen = tree[n+1].Len;
+        curlen = nextlen; nextlen = tree[n + 1].Len;
         if (++count < max_count && curlen == nextlen) {
             continue;
         } else if (count < min_count) {
@@ -755,7 +751,7 @@ local void scan_tree(s, tree, max_code)
  * Send a literal or distance tree in compressed form, using the codes in
  * bl_tree.
  */
-local void send_tree (s, tree, max_code)
+local void send_tree(s, tree, max_code)
     deflate_state *s;
     ct_data *tree;   /* the tree to be scanned */
     int max_code;    /* and its largest code of non zero frequency */
@@ -768,11 +764,11 @@ local void send_tree(s, tree, max_code)
     int max_count = 7;         /* max repeat count */
     int min_count = 4;         /* min repeat count */
 
-    /* tree[max_code+1].Len = -1; */ /* guard already set */
+    /* tree[max_code + 1].Len = -1; */ /* guard already set */
     if (nextlen == 0) max_count = 138, min_count = 3;
 
     for (n = 0; n <= max_code; n++) {
-        curlen = nextlen; nextlen = tree[n+1].Len;
+        curlen = nextlen; nextlen = tree[n + 1].Len;
         if (++count < max_count && curlen == nextlen) {
             continue;
         } else if (count < min_count) {
@@ -783,13 +779,13 @@ local void send_tree(s, tree, max_code)
             send_code(s, curlen, s->bl_tree); count--;
         }
         Assert(count >= 3 && count <= 6, " 3_6?");
-        send_code(s, REP_3_6, s->bl_tree); send_bits(s, count-3, 2);
+        send_code(s, REP_3_6, s->bl_tree); send_bits(s, count - 3, 2);
 
     } else if (count <= 10) {
-        send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count-3, 3);
+        send_code(s, REPZ_3_10, s->bl_tree); send_bits(s, count - 3, 3);
 
     } else {
-        send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count-11, 7);
+        send_code(s, REPZ_11_138, s->bl_tree); send_bits(s, count - 11, 7);
     }
     count = 0; prevlen = curlen;
     if (nextlen == 0) {
@@ -817,8 +813,8 @@ local int build_bl_tree(s)
 
     /* Build the bit length tree: */
     build_tree(s, (tree_desc *)(&(s->bl_desc)));
-    /* opt_len now includes the length of the tree representations, except
-     * the lengths of the bit lengths codes and the 5+5+4 bits for the counts.
+    /* opt_len now includes the length of the tree representations, except the
+     * lengths of the bit lengths codes and the 5 + 5 + 4 bits for the counts.
      */
 
     /* Determine the number of bit length codes to send. The pkzip format
@@ -829,8 +825,8 @@ local int build_bl_tree(s)
         if (s->bl_tree[bl_order[max_blindex]].Len != 0) break;
     }
     /* Update opt_len to include the bit length tree and counts */
-    s->opt_len += 3*((ulg)max_blindex+1) + 5+5+4;
-    Tracev(("\ndyn trees: dyn %ld, stat %ld",
+    s->opt_len += 3*((ulg)max_blindex + 1) + 5 + 5 + 4;
+    Tracev(( "\ndyn trees: dyn %ld, stat %ld",
             s->opt_len, s->static_len));
 
     return max_blindex;
@@ -850,21 +846,21 @@ local void send_all_trees(s, lcodes, dcodes, blcodes)
     Assert (lcodes >= 257 && dcodes >= 1 && blcodes >= 4, "not enough codes");
     Assert (lcodes <= L_CODES && dcodes <= D_CODES && blcodes <= BL_CODES,
             "too many codes");
-    Tracev(("\nbl counts: "));
-    send_bits(s, lcodes-257, 5);  /* not +255 as stated in appnote.txt */
-    send_bits(s, dcodes-1, 5);
-    send_bits(s, blcodes-4, 4);   /* not -3 as stated in appnote.txt */
+    Tracev(( "\nbl counts: "));
+    send_bits(s, lcodes - 257, 5);  /* not +255 as stated in appnote.txt */
+    send_bits(s, dcodes - 1, 5);
+    send_bits(s, blcodes - 4, 4);   /* not -3 as stated in appnote.txt */
     for (rank = 0; rank < blcodes; rank++) {
-        Tracev(("\nbl code %2d ", bl_order[rank]));
+        Tracev(( "\nbl code %2d ", bl_order[rank]));
         send_bits(s, s->bl_tree[bl_order[rank]].Len, 3);
     }
-    Tracev(("\nbl tree: sent %ld", s->bits_sent));
+    Tracev(( "\nbl tree: sent %ld", s->bits_sent));
 
-    send_tree(s, (ct_data *)s->dyn_ltree, lcodes-1);  /* literal tree */
-    Tracev(("\nlit tree: sent %ld", s->bits_sent));
+    send_tree(s, (ct_data *)s->dyn_ltree, lcodes - 1);  /* literal tree */
+    Tracev(( "\nlit tree: sent %ld", s->bits_sent));
 
-    send_tree(s, (ct_data *)s->dyn_dtree, dcodes-1);  /* distance tree */
-    Tracev(("\ndist tree: sent %ld", s->bits_sent));
+    send_tree(s, (ct_data *)s->dyn_dtree, dcodes - 1);  /* distance tree */
+    Tracev(( "\ndist tree: sent %ld", s->bits_sent));
 }
 
 /* ===========================================================================
@@ -876,7 +872,7 @@ void ZLIB_INTERNAL _tr_stored_block(s, buf, stored_len, last)
     ulg stored_len;   /* length of input block */
     int last;         /* one if this is the last block for a file */
 {
-    send_bits(s, (STORED_BLOCK<<1)+last, 3);    /* send block type */
+    send_bits(s, (STORED_BLOCK<<1) + last, 3);  /* send block type */
     bi_windup(s);        /* align on byte boundary */
     put_short(s, (ush)stored_len);
     put_short(s, (ush)~stored_len);
@@ -887,7 +883,7 @@ void ZLIB_INTERNAL _tr_stored_block(s, buf, stored_len, last)
     s->compressed_len = (s->compressed_len + 3 + 7) & (ulg)~7L;
     s->compressed_len += (stored_len + 4) << 3;
     s->bits_sent += 2*16;
-    s->bits_sent += stored_len<<3;
+    s->bits_sent += stored_len << 3;
 #endif
 }
 
@@ -937,11 +933,11 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
 
         /* Construct the literal and distance trees */
         build_tree(s, (tree_desc *)(&(s->l_desc)));
-        Tracev(("\nlit data: dyn %ld, stat %ld", s->opt_len,
+        Tracev(( "\nlit data: dyn %ld, stat %ld", s->opt_len,
                 s->static_len));
 
         build_tree(s, (tree_desc *)(&(s->d_desc)));
-        Tracev(("\ndist data: dyn %ld, stat %ld", s->opt_len,
+        Tracev(( "\ndist data: dyn %ld, stat %ld", s->opt_len,
                 s->static_len));
         /* At this point, opt_len and static_len are the total bit lengths of
          * the compressed block data, excluding the tree representations.
@@ -953,14 +949,17 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
         max_blindex = build_bl_tree(s);
 
         /* Determine the best encoding. Compute the block lengths in bytes. */
-        opt_lenb = (s->opt_len+3+7)>>3;
-        static_lenb = (s->static_len+3+7)>>3;
+        opt_lenb = (s->opt_len + 3 + 7) >> 3;
+        static_lenb = (s->static_len + 3 + 7) >> 3;
 
-        Tracev(("\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ",
+        Tracev(( "\nopt %lu(%lu) stat %lu(%lu) stored %lu lit %u ",
                 opt_lenb, s->opt_len, static_lenb, s->static_len, stored_len,
                 s->sym_next / 3));
 
-        if (static_lenb <= opt_lenb) opt_lenb = static_lenb;
+#ifndef FORCE_STATIC
+        if (static_lenb <= opt_lenb || s->strategy == Z_FIXED)
+#endif
+            opt_lenb = static_lenb;
 
     } else {
         Assert(buf != (char*)0, "lost buf");
@@ -970,7 +969,7 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
 #ifdef FORCE_STORED
     if (buf != (char*)0) { /* force stored block */
 #else
-    if (stored_len+4 <= opt_lenb && buf != (char*)0) {
+    if (stored_len + 4 <= opt_lenb && buf != (char*)0) {
                        /* 4: two words for the lengths */
 #endif
         /* The test buf != NULL is only necessary if LIT_BUFSIZE > WSIZE.
@@ -981,21 +980,17 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
          */
         _tr_stored_block(s, buf, stored_len, last);
 
-#ifdef FORCE_STATIC
-    } else if (static_lenb >= 0) { /* force static trees */
-#else
-    } else if (s->strategy == Z_FIXED || static_lenb == opt_lenb) {
-#endif
-        send_bits(s, (STATIC_TREES<<1)+last, 3);
+    } else if (static_lenb == opt_lenb) {
+        send_bits(s, (STATIC_TREES<<1) + last, 3);
         compress_block(s, (const ct_data *)static_ltree,
                        (const ct_data *)static_dtree);
 #ifdef ZLIB_DEBUG
         s->compressed_len += 3 + s->static_len;
 #endif
     } else {
-        send_bits(s, (DYN_TREES<<1)+last, 3);
-        send_all_trees(s, s->l_desc.max_code+1, s->d_desc.max_code+1,
-                       max_blindex+1);
+        send_bits(s, (DYN_TREES<<1) + last, 3);
+        send_all_trees(s, s->l_desc.max_code + 1, s->d_desc.max_code + 1,
+                       max_blindex + 1);
         compress_block(s, (const ct_data *)s->dyn_ltree,
                        (const ct_data *)s->dyn_dtree);
 #ifdef ZLIB_DEBUG
@@ -1014,18 +1009,18 @@ void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
         s->compressed_len += 7;  /* align on byte boundary */
 #endif
     }
-    Tracev(("\ncomprlen %lu(%lu) ", s->compressed_len>>3,
-           s->compressed_len-7*last));
+    Tracev(("\ncomprlen %lu(%lu) ", s->compressed_len >> 3,
+           s->compressed_len - 7*last));
 }
 
 /* ===========================================================================
  * Save the match info and tally the frequency counts. Return true if
  * the current block must be flushed.
  */
-int ZLIB_INTERNAL _tr_tally (s, dist, lc)
+int ZLIB_INTERNAL _tr_tally(s, dist, lc)
     deflate_state *s;
     unsigned dist;  /* distance of matched string */
-    unsigned lc;    /* match length-MIN_MATCH or unmatched char (if dist==0) */
+    unsigned lc;    /* match length - MIN_MATCH or unmatched char (dist==0) */
 {
     s->sym_buf[s->sym_next++] = (uch)dist;
     s->sym_buf[s->sym_next++] = (uch)(dist >> 8);
@@ -1041,7 +1036,7 @@ int ZLIB_INTERNAL _tr_tally(s, dist, lc)
                (ush)lc <= (ush)(MAX_MATCH-MIN_MATCH) &&
                (ush)d_code(dist) < (ush)D_CODES, "_tr_tally: bad match");
 
-        s->dyn_ltree[_length_code[lc]+LITERALS+1].Freq++;
+        s->dyn_ltree[_length_code[lc] + LITERALS + 1].Freq++;
         s->dyn_dtree[d_code(dist)].Freq++;
     }
     return (s->sym_next == s->sym_end);
@@ -1071,7 +1066,7 @@ local void compress_block(s, ltree, dtree)
         } else {
             /* Here, lc is the match length - MIN_MATCH */
             code = _length_code[lc];
-            send_code(s, code+LITERALS+1, ltree);   /* send the length code */
+            send_code(s, code + LITERALS + 1, ltree);   /* send length code */
             extra = extra_lbits[code];
             if (extra != 0) {
                 lc -= base_length[code];
@@ -1187,6 +1182,6 @@ local void bi_windup(s)
     s->bi_buf = 0;
     s->bi_valid = 0;
 #ifdef ZLIB_DEBUG
-    s->bits_sent = (s->bits_sent+7) & ~7;
+    s->bits_sent = (s->bits_sent + 7) & ~7;
 #endif
 }
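Aside, not part of the diff: in the send_all_trees() hunk above, the three send_bits() calls emit the HLIT, HDIST and HCLEN fields of a DEFLATE dynamic-block header, which RFC 1951 defines as the code counts minus 257, 1 and 4. A minimal self-contained sketch of that framing follows; put_bits() and write_dynamic_block_header() are hypothetical stand-ins for zlib's internal send_bits() path, not code from this commit.

#include <assert.h>
#include <stdio.h>

/* hypothetical bit writer standing in for zlib's send_bits(); it only
 * prints each field so the sketch compiles and runs on its own */
static void put_bits(unsigned value, int nbits) {
    printf("%u (%d bits)\n", value, nbits);
}

static void write_dynamic_block_header(int lcodes, int dcodes, int blcodes) {
    assert(lcodes >= 257 && dcodes >= 1 && blcodes >= 4);
    put_bits(lcodes - 257, 5);  /* HLIT:  number of literal/length codes - 257 */
    put_bits(dcodes - 1, 5);    /* HDIST: number of distance codes - 1 */
    put_bits(blcodes - 4, 4);   /* HCLEN: number of code length codes - 4 */
}

int main(void) {
    write_dynamic_block_header(257, 1, 4);  /* smallest legal header: 0, 0, 0 */
    return 0;
}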
1 third_party/zlib/trees.inc vendored

@@ -125,4 +125,3 @@ local const int base_dist[D_CODES] = {
     32, 48, 64, 96, 128, 192, 256, 384, 512, 768,
   1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 24576
 };
-
5 third_party/zlib/uncompr.c vendored

@@ -9,11 +9,6 @@
 #include "third_party/zlib/internal.h"
 #include "third_party/zlib/macros.internal.h"
 #include "third_party/zlib/zlib.h"
-
-asm(".ident\t\"\\n\\n\
-zlib (zlib License)\\n\
-Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
-asm(".include \"libc/disclaimer.inc\"");
 // clang-format off
 
 /* @(#) $Id$ */
5 third_party/zlib/zlib.h vendored

@@ -1696,6 +1696,11 @@ uLong adler32_combine(uLong adler1, uLong adler2, int64_t len2);
  */
 uLong crc32(uLong crc, const Bytef *buf, uInt len);
 
+/**
+ * Same as crc32(), but with a size_t length.
+ */
+uint32_t crc32_z(uint32_t crc, const void *buf, size_t len);
+
 /**
  * Combine two CRC-32 check values into one. For two sequences of bytes,
  * seq1 and seq2 with lengths len1 and len2, CRC-32 check values were
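Aside, not part of the diff: the zlib.h hunk above documents crc32_z(), which behaves like crc32() but takes a size_t length. A minimal usage sketch under the declarations shown in the hunk; the input string and the expected CRC constant are only illustrative.

#include <stdint.h>
#include <string.h>
#include "third_party/zlib/zlib.h"

int main(void) {
    const char msg[] = "hello";              /* illustrative input */
    uint32_t crc = crc32_z(0, 0, 0);         /* zero-length call yields the initial CRC */
    crc = crc32_z(crc, msg, strlen(msg));    /* size_t length, unlike crc32()'s uInt */
    return crc == 0x3610a686 ? 0 : 1;        /* expected CRC-32 of "hello" */
}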
38 third_party/zlib/zlib.mk vendored

@@ -9,6 +9,7 @@ THIRD_PARTY_ZLIB_A = o/$(MODE)/third_party/zlib/zlib.a
THIRD_PARTY_ZLIB_A_FILES := $(wildcard third_party/zlib/*)
THIRD_PARTY_ZLIB_A_HDRS = $(filter %.h,$(THIRD_PARTY_ZLIB_A_FILES))
THIRD_PARTY_ZLIB_A_SRCS = $(filter %.c,$(THIRD_PARTY_ZLIB_A_FILES))
THIRD_PARTY_ZLIB_A_INCS = $(filter %.inc,$(THIRD_PARTY_ZLIB_A_FILES))
THIRD_PARTY_ZLIB_A_OBJS = $(THIRD_PARTY_ZLIB_A_SRCS:%.c=o/$(MODE)/%.o)

THIRD_PARTY_ZLIB_A_CHECKS = \
@@ -18,6 +19,7 @@ THIRD_PARTY_ZLIB_A_CHECKS = \
THIRD_PARTY_ZLIB_A_DIRECTDEPS = \
	LIBC_INTRIN \
	LIBC_NEXGEN32E \
	LIBC_SYSV \
	LIBC_STR \
	LIBC_STUBS

@@ -34,18 +36,35 @@ $(THIRD_PARTY_ZLIB_A).pkg: \
	$(foreach x,$(THIRD_PARTY_ZLIB_A_DIRECTDEPS),$($(x)_A).pkg)

ifeq ($(ARCH), x86_64)
o/$(MODE)/third_party/zlib/adler32simd.o: private \
		OVERRIDE_CFLAGS += \
o/$(MODE)/third_party/zlib/adler32_simd.o: private \
		TARGET_ARCH += \
			-mssse3
o/$(MODE)/third_party/zlib/adler32simd.o: private \
o/$(MODE)/third_party/zlib/crc_folding.o \
o/$(MODE)/third_party/zlib/crc32_simd.o: private \
		TARGET_ARCH += \
			-msse4.2 \
			-mpclmul
$(THIRD_PARTY_ZLIB_A_OBJS): private \
		OVERRIDE_CPPFLAGS += \
			-DADLER32_SIMD_SSSE3
o/$(MODE)/third_party/zlib/adler32.o: private \
			-DADLER32_SIMD_SSSE3 \
			-DCRC32_SIMD_SSE42_PCLMUL \
			-DDEFLATE_SLIDE_HASH_SSE2 \
			-DINFLATE_CHUNK_SIMD_SSE2 \
			-DINFLATE_CHUNK_READ_64LE
endif

ifeq ($(ARCH), aarch64)
o/$(MODE)/third_party/zlib/deflate.o \
o/$(MODE)/third_party/zlib/crc32_simd.o: private \
		TARGET_ARCH += \
			-march=armv8-a+aes+crc
$(THIRD_PARTY_ZLIB_A_OBJS): private \
		OVERRIDE_CPPFLAGS += \
			-DADLER32_SIMD_SSSE3
o/$(MODE)/third_party/zlib/deflate.o: private \
		OVERRIDE_CPPFLAGS += \
			-DCRC32_SIMD_SSE42_PCLMUL
			-DADLER32_SIMD_NEON \
			-DCRC32_ARMV8_CRC32 \
			-DDEFLATE_SLIDE_HASH_NEON \
			-DINFLATE_CHUNK_SIMD_NEON \
			-DINFLATE_CHUNK_READ_64LE
endif

$(THIRD_PARTY_ZLIB_A_OBJS): private \
@@ -56,6 +75,7 @@ $(THIRD_PARTY_ZLIB_A_OBJS): private \
THIRD_PARTY_ZLIB_LIBS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)))
THIRD_PARTY_ZLIB_SRCS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_SRCS))
THIRD_PARTY_ZLIB_HDRS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_HDRS))
THIRD_PARTY_ZLIB_INCS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_INCS))
THIRD_PARTY_ZLIB_BINS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_BINS))
THIRD_PARTY_ZLIB_CHECKS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_CHECKS))
THIRD_PARTY_ZLIB_OBJS = $(foreach x,$(THIRD_PARTY_ZLIB_ARTIFACTS),$($(x)_OBJS))
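Aside, not part of the diff: the -D macros wired up in the zlib.mk hunks above are how the imported Chromium zlib sources choose their SIMD paths at compile time, while the TARGET_ARCH flags let the compiler emit the matching instructions. The sketch below illustrates only the gating pattern under a stated assumption that ADLER32_SIMD_NEON is defined on aarch64 builds, as in the makefile; byte_sum(), simd_sum() and scalar_sum() are made-up names, not the actual Chromium zlib routines.

#include <stddef.h>

static unsigned scalar_sum(const unsigned char *p, size_t n) {
    unsigned s = 0;
    while (n--) s += *p++;          /* portable fallback path */
    return s;
}

#ifdef ADLER32_SIMD_NEON            /* set by zlib.mk when $(ARCH) is aarch64 */
#include <arm_neon.h>
static unsigned simd_sum(const unsigned char *p, size_t n) {
    uint32x4_t acc = vdupq_n_u32(0);
    while (n >= 16) {                                        /* 16 bytes per step */
        uint8x16_t v = vld1q_u8(p);
        acc = vaddq_u32(acc, vpaddlq_u16(vpaddlq_u8(v)));    /* widen and accumulate */
        p += 16, n -= 16;
    }
    return vaddvq_u32(acc) + scalar_sum(p, n);               /* handle the tail */
}
#endif

unsigned byte_sum(const unsigned char *p, size_t n) {
#ifdef ADLER32_SIMD_NEON
    if (n >= 16) return simd_sum(p, n);    /* vector path when compiled in */
#endif
    return scalar_sum(p, n);
}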
5 third_party/zlib/zutil.c vendored

@@ -10,11 +10,6 @@
 #include "libc/mem/mem.h"
 #include "third_party/zlib/internal.h"
 #include "third_party/zlib/zutil.internal.h"
-
-asm(".ident\t\"\\n\\n\
-zlib (zlib License)\\n\
-Copyright 1995-2017 Jean-loup Gailly and Mark Adler\"");
-asm(".include \"libc/disclaimer.inc\"");
 // clang-format off
 
 /* @(#) $Id$ */
6 third_party/zlib/zutil.internal.h vendored

@@ -128,6 +128,12 @@ typedef unsigned long ulg;
 #endif
 #endif
 
+#ifdef _MSC_VER
+#define zalign(x) __declspec(align(x))
+#else
+#define zalign(x) __attribute__((aligned((x))))
+#endif
+
 COSMOPOLITAN_C_END_
 #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
 #endif /* ZUTIL_H */
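Aside, not part of the diff: the zalign() macro added above is a portable way to request aligned storage, mapping to the MSVC spelling under _MSC_VER and to the GCC/Clang attribute otherwise. A one-line usage sketch; the buffer name and the 16-byte figure are only illustrative.

/* as it might appear inside a zlib source that already includes
 * zutil.internal.h; "scratch" is a hypothetical buffer name */
static zalign(16) unsigned char scratch[4096];  /* 16-byte aligned storage for SIMD loads/stores */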