cosmopolitan/third_party/zlib/crc32_simd.inc

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

624 lines
18 KiB
PHP
Raw Normal View History

/* crc32_simd.c
*
* Copyright 2017 The Chromium Authors
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
Release Cosmopolitan v3.3 This change upgrades to GCC 12.3 and GNU binutils 2.42. The GNU linker appears to have changed things so that only a single de-duplicated str table is present in the binary, and it gets placed wherever the linker wants, regardless of what the linker script says. To cope with that we need to stop using .ident to embed licenses. As such, this change does significant work to revamp how third party licenses are defined in the codebase, using `.section .notice,"aR",@progbits`. This new GCC 12.3 toolchain has support for GNU indirect functions. It lets us support __target_clones__ for the first time. This is used for optimizing the performance of libc string functions such as strlen and friends so far on x86, by ensuring AVX systems favor a second codepath that uses VEX encoding. It shaves some latency off certain operations. It's a useful feature to have for scientific computing for the reasons explained by the test/libcxx/openmp_test.cc example which compiles for fifteen different microarchitectures. Thanks to the upgrades, it's now also possible to use newer instruction sets, such as AVX512FP16, VNNI. Cosmo now uses the %gs register on x86 by default for TLS. Doing it is helpful for any program that links `cosmo_dlopen()`. Such programs had to recompile their binaries at startup to change the TLS instructions. That's not great, since it means every page in the executable needs to be faulted. The work of rewriting TLS-related x86 opcodes, is moved to fixupobj.com instead. This is great news for MacOS x86 users, since we previously needed to morph the binary every time for that platform but now that's no longer necessary. The only platforms where we need fixup of TLS x86 opcodes at runtime are now Windows, OpenBSD, and NetBSD. On Windows we morph TLS to point deeper into the TIB, based on a TlsAlloc assignment, and on OpenBSD/NetBSD we morph %gs back into %fs since the kernels do not allow us to specify a value for the %gs register. OpenBSD users are now required to use APE Loader to run Cosmo binaries and assimilation is no longer possible. OpenBSD kernel needs to change to allow programs to specify a value for the %gs register, or it needs to stop marking executable pages loaded by the kernel as mimmutable(). This release fixes __constructor__, .ctor, .init_array, and lastly the .preinit_array so they behave the exact same way as glibc. We no longer use hex constants to define math.h symbols like M_PI.
2024-02-20 19:12:09 +00:00
__static_yoink("chromium_notice");
#include "third_party/intel/x86gprintrin.internal.h"
#include "third_party/zlib/crc32_simd.internal.h"
#if defined(CRC32_SIMD_AVX512_PCLMUL)
/*
* crc32_avx512_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 256, and a multiple of 64. Based on:
*
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
*/
#include "third_party/intel/emmintrin.internal.h"
#include "third_party/intel/smmintrin.internal.h"
#include "third_party/intel/wmmintrin.internal.h"
#include "third_party/intel/immintrin.internal.h"
uint32_t ZLIB_INTERNAL crc32_avx512_simd_( /* AVX512+PCLMUL */
const unsigned char *buf,
z_size_t len,
uint32_t crc)
{
/*
* Definitions of the bit-reflected domain constants k1,k2,k3,k4
* are similar to those given at the end of the paper, and remaining
* constants and CRC32+Barrett polynomials remain unchanged.
*
* Replace the index of x from 128 to 512. As follows:
* k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 = 0x011542778a
* k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 = 0x01322d1430
* k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 = 0x0154442bd4
* k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 = 0x01c6e41596
*/
static const uint64_t zalign(64) k1k2[] = { 0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430 };
static const uint64_t zalign(64) k3k4[] = { 0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596 };
static const uint64_t zalign(16) k5k6[] = { 0x01751997d0, 0x00ccaa009e };
static const uint64_t zalign(16) k7k8[] = { 0x0163cd6124, 0x0000000000 };
static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
__m128i a0, a1, a2, a3;
/*
* There's at least one block of 256.
*/
x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
x0 = _mm512_load_si512((__m512i *)k1k2);
buf += 256;
len -= 256;
/*
* Parallel fold blocks of 256, if any.
*/
while (len >= 256)
{
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, x5);
x2 = _mm512_xor_si512(x2, x6);
x3 = _mm512_xor_si512(x3, x7);
x4 = _mm512_xor_si512(x4, x8);
x1 = _mm512_xor_si512(x1, y5);
x2 = _mm512_xor_si512(x2, y6);
x3 = _mm512_xor_si512(x3, y7);
x4 = _mm512_xor_si512(x4, y8);
buf += 256;
len -= 256;
}
/*
* Fold into 512-bits.
*/
x0 = _mm512_load_si512((__m512i *)k3k4);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x3);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x4);
x1 = _mm512_xor_si512(x1, x5);
/*
* Single fold blocks of 64, if any.
*/
while (len >= 64)
{
x2 = _mm512_loadu_si512((__m512i *)buf);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
buf += 64;
len -= 64;
}
/*
* Fold 512-bits to 384-bits.
*/
a0 = _mm_load_si128((__m128i *)k5k6);
a1 = _mm512_extracti32x4_epi32(x1, 0);
a2 = _mm512_extracti32x4_epi32(x1, 1);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
/*
* Fold 384-bits to 256-bits.
*/
a2 = _mm512_extracti32x4_epi32(x1, 2);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
/*
* Fold 256-bits to 128-bits.
*/
a2 = _mm512_extracti32x4_epi32(x1, 3);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
/*
* Fold 128-bits to 64-bits.
*/
a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
a3 = _mm_setr_epi32(~0, 0, ~0, 0);
a1 = _mm_srli_si128(a1, 8);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_loadl_epi64((__m128i*)k7k8);
a2 = _mm_srli_si128(a1, 4);
a1 = _mm_and_si128(a1, a3);
a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
/*
* Barret reduce to 32-bits.
*/
a0 = _mm_load_si128((__m128i*)poly);
a2 = _mm_and_si128(a1, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
a2 = _mm_and_si128(a2, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
/*
* Return the crc32.
*/
return _mm_extract_epi32(a1, 1);
}
#elif defined(CRC32_SIMD_SSE42_PCLMUL)
/*
* crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 64, and a multiple of 16.
*/
#include "third_party/intel/emmintrin.internal.h"
#include "third_party/intel/smmintrin.internal.h"
#include "third_party/intel/wmmintrin.internal.h"
uint32_t ZLIB_INTERNAL crc32_sse42_simd_( /* SSE4.2+PCLMUL */
const unsigned char *buf,
z_size_t len,
uint32_t crc)
{
/*
* Definitions of the bit-reflected domain constants k1,k2,k3, etc and
* the CRC32+Barrett polynomials given at the end of the paper.
*/
static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
/*
* There's at least one block of 64.
*/
x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
x0 = _mm_load_si128((__m128i *)k1k2);
buf += 64;
len -= 64;
/*
* Parallel fold blocks of 64, if any.
*/
while (len >= 64)
{
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_xor_si128(x4, x8);
x1 = _mm_xor_si128(x1, y5);
x2 = _mm_xor_si128(x2, y6);
x3 = _mm_xor_si128(x3, y7);
x4 = _mm_xor_si128(x4, y8);
buf += 64;
len -= 64;
}
/*
* Fold into 128-bits.
*/
x0 = _mm_load_si128((__m128i *)k3k4);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x3);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x4);
x1 = _mm_xor_si128(x1, x5);
/*
* Single fold blocks of 16, if any.
*/
while (len >= 16)
{
x2 = _mm_loadu_si128((__m128i *)buf);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
buf += 16;
len -= 16;
}
/*
* Fold 128-bits to 64-bits.
*/
x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
x3 = _mm_setr_epi32(~0, 0, ~0, 0);
x1 = _mm_srli_si128(x1, 8);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_loadl_epi64((__m128i*)k5k0);
x2 = _mm_srli_si128(x1, 4);
x1 = _mm_and_si128(x1, x3);
x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
/*
* Barret reduce to 32-bits.
*/
x0 = _mm_load_si128((__m128i*)poly);
x2 = _mm_and_si128(x1, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
x2 = _mm_and_si128(x2, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
/*
* Return the crc32.
*/
return _mm_extract_epi32(x1, 1);
}
#elif defined(CRC32_ARMV8_CRC32)
/* CRC32 checksums using ARMv8-a crypto instructions.
*/
#if defined(__clang__)
/* We need some extra types for using PMULL.
*/
#if defined(__aarch64__)
#include "third_party/aarch64/arm_neon.internal.h"
#include "third_party/aarch64/arm_acle.internal.h"
#endif
/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
* armv8 target, which is incompatible with ThinLTO optimizations on Android.
* (Namely, mixing and matching different module-level targets makes ThinLTO
* warn, and Android defaults to armv7-a. This restriction does not apply to
* function-level `target`s, however.)
*
* Since we only need four crc intrinsics, and since clang's implementation of
* those are just wrappers around compiler builtins, it's simplest to #define
* those builtins directly. If this #define list grows too much (or we depend on
* an intrinsic that isn't a trivial wrapper), we may have to find a better way
* to go about this.
*
* NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
* feature for this target (ignoring feature)." This appears to be a harmless
* bug in clang.
*
* These definitions must appear *after* including arm_acle.h otherwise that
* header may end up defining functions named __builtin_arm_crc32* that call
* themselves, creating an infinite loop when the intrinsic is called.
*/
/* XXX: Cannot hook into builtins with XCode for arm64. */
#if !defined(ARMV8_OS_MACOS)
#define __crc32b __builtin_arm_crc32b
#define __crc32d __builtin_arm_crc32d
#define __crc32w __builtin_arm_crc32w
#define __crc32cw __builtin_arm_crc32cw
#endif
#if defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("aes,crc")))
#else // !defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
#endif // defined(__aarch64__)
#elif defined(__GNUC__)
/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
* allowed. We can just include arm_acle.h.
*/
#include "third_party/aarch64/arm_neon.internal.h"
#include "third_party/aarch64/arm_acle.internal.h"
#define TARGET_ARMV8_WITH_CRC
#else // !defined(__GNUC__) && !defined(_aarch64__)
#error ARM CRC32 SIMD extensions only supported for Clang and GCC
#endif
TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_little(
const unsigned char *buf,
z_size_t len,
uint32_t crc)
{
uint32_t c = (uint32_t) ~crc;
while (len && ((uintptr_t)buf & 7)) {
c = __crc32b(c, *buf++);
--len;
}
const uint64_t *buf8 = (const uint64_t *)buf;
while (len >= 64) {
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
c = __crc32d(c, *buf8++);
len -= 64;
}
while (len >= 8) {
c = __crc32d(c, *buf8++);
len -= 8;
}
buf = (const unsigned char *)buf8;
while (len--) {
c = __crc32b(c, *buf++);
}
return ~c;
}
#if defined(__aarch64__) || defined(ARMV8_OS_MACOS) /* aarch64 specific code. */
/*
* crc32_pmull_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 64, and a multiple of 16. Based on:
*
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
*/
TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_lo(const uint64x2_t a, const uint64x2_t b)
{
uint8x16_t r;
__asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
: "=w" (r) : "w" (a), "w" (b) );
return r;
}
TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_01(const uint64x2_t a, const uint64x2_t b)
{
uint8x16_t r;
__asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
: "=w" (r) : "w" (a), "w" (vgetq_lane_u64(b, 1)) );
return r;
}
TARGET_ARMV8_WITH_CRC
static inline uint8x16_t pmull_hi(const uint64x2_t a, const uint64x2_t b)
{
uint8x16_t r;
__asm__ __volatile__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
: "=w" (r) : "w" (a), "w" (b) );
return r;
}
TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(
const unsigned char *buf,
z_size_t len,
uint32_t crc)
{
/*
* Definitions of the bit-reflected domain constants k1,k2,k3, etc and
* the CRC32+Barrett polynomials given at the end of the paper.
*/
static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
uint64x2_t x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
/*
* There's at least one block of 64.
*/
x1 = vld1q_u64((const uint64_t *)(buf + 0x00));
x2 = vld1q_u64((const uint64_t *)(buf + 0x10));
x3 = vld1q_u64((const uint64_t *)(buf + 0x20));
x4 = vld1q_u64((const uint64_t *)(buf + 0x30));
x1 = veorq_u64(x1, (uint64x2_t) vsetq_lane_u32(crc, vdupq_n_u32(0), 0));
x0 = vld1q_u64(k1k2);
buf += 64;
len -= 64;
/*
* Parallel fold blocks of 64, if any.
*/
while (len >= 64)
{
x5 = (uint64x2_t) pmull_lo(x1, x0);
x6 = (uint64x2_t) pmull_lo(x2, x0);
x7 = (uint64x2_t) pmull_lo(x3, x0);
x8 = (uint64x2_t) pmull_lo(x4, x0);
y5 = vld1q_u64((const uint64_t *)(buf + 0x00));
y6 = vld1q_u64((const uint64_t *)(buf + 0x10));
y7 = vld1q_u64((const uint64_t *)(buf + 0x20));
y8 = vld1q_u64((const uint64_t *)(buf + 0x30));
x1 = (uint64x2_t) pmull_hi(x1, x0);
x2 = (uint64x2_t) pmull_hi(x2, x0);
x3 = (uint64x2_t) pmull_hi(x3, x0);
x4 = (uint64x2_t) pmull_hi(x4, x0);
x1 = veorq_u64(x1, x5);
x2 = veorq_u64(x2, x6);
x3 = veorq_u64(x3, x7);
x4 = veorq_u64(x4, x8);
x1 = veorq_u64(x1, y5);
x2 = veorq_u64(x2, y6);
x3 = veorq_u64(x3, y7);
x4 = veorq_u64(x4, y8);
buf += 64;
len -= 64;
}
/*
* Fold into 128-bits.
*/
x0 = vld1q_u64(k3k4);
x5 = (uint64x2_t) pmull_lo(x1, x0);
x1 = (uint64x2_t) pmull_hi(x1, x0);
x1 = veorq_u64(x1, x2);
x1 = veorq_u64(x1, x5);
x5 = (uint64x2_t) pmull_lo(x1, x0);
x1 = (uint64x2_t) pmull_hi(x1, x0);
x1 = veorq_u64(x1, x3);
x1 = veorq_u64(x1, x5);
x5 = (uint64x2_t) pmull_lo(x1, x0);
x1 = (uint64x2_t) pmull_hi(x1, x0);
x1 = veorq_u64(x1, x4);
x1 = veorq_u64(x1, x5);
/*
* Single fold blocks of 16, if any.
*/
while (len >= 16)
{
x2 = vld1q_u64((const uint64_t *)buf);
x5 = (uint64x2_t) pmull_lo(x1, x0);
x1 = (uint64x2_t) pmull_hi(x1, x0);
x1 = veorq_u64(x1, x2);
x1 = veorq_u64(x1, x5);
buf += 16;
len -= 16;
}
/*
* Fold 128-bits to 64-bits.
*/
static uint32_t zalign(16) mask[] = { ~0u, 0u, ~0u, 0u };
x2 = (uint64x2_t) pmull_01(x1, x0);
x1 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 8);
x3 = (uint64x2_t) vld1q_u32(mask);
x1 = veorq_u64(x1, x2);
x0 = vld1q_u64(k5k0);
x2 = (uint64x2_t) pmull_01(x2, x0);
x2 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 4);
x1 = vandq_u64(x1, x3);
x1 = (uint64x2_t) pmull_lo(x1, x0);
x1 = veorq_u64(x1, x2);
/*
* Barret reduce to 32-bits.
*/
x0 = vld1q_u64(poly);
x2 = vandq_u64(x1, x3);
x2 = (uint64x2_t) pmull_01(x2, x0);
x2 = vandq_u64(x2, x3);
x2 = (uint64x2_t) pmull_lo(x2, x0);
x1 = veorq_u64(x1, x2);
/*
* Return the crc32.
*/
return vgetq_lane_u32(vreinterpretq_u32_u64(x1), 1);
}
#endif /* aarch64 specific code. */
#endif