Support avx512f + vpclmulqdq crc32() acceleration

Cosmo's _Cz_crc32() function now runs at 73 GiB/s on Threadripper. This
will significantly improve the performance of the PKZIP file format.
This algorithm is also used by apelink to create deterministic IDs.
This commit is contained in:
Justine Tunney 2024-05-29 10:13:37 -07:00
parent 7c8df05042
commit a05ce3ad9d
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
8 changed files with 385 additions and 8 deletions

View file

@ -38,16 +38,31 @@ $(THIRD_PARTY_ZLIB_A).pkg: \
ifeq ($(ARCH), x86_64)
o/$(MODE)/third_party/zlib/adler32_simd.o: private \
TARGET_ARCH += \
-O3 \
-mssse3
o/$(MODE)/third_party/zlib/crc_folding.o \
o/$(MODE)/third_party/zlib/crc32_simd.o: private \
o/$(MODE)/third_party/zlib/crc32_simd_sse42.o: private \
TARGET_ARCH += \
-O3 \
-msse4.2 \
-mpclmul
-mpclmul \
-UCRC32_SIMD_AVX512_PCLMUL \
-DCRC32_SIMD_SSE42_PCLMUL \
-DBUILD_SSE42
o/$(MODE)/third_party/zlib/crc32_simd_avx512.o: private \
TARGET_ARCH += \
-O3 \
-mpclmul \
-mavx512f \
-mvpclmulqdq \
-UCRC32_SIMD_SSE42_PCLMUL \
-DCRC32_SIMD_AVX512_PCLMUL \
-DBUILD_AVX512
$(THIRD_PARTY_ZLIB_A_OBJS): private \
CPPFLAGS += \
-DADLER32_SIMD_SSSE3 \
-DCRC32_SIMD_SSE42_PCLMUL \
-DCRC32_SIMD_AVX512_PCLMUL \
-DDEFLATE_SLIDE_HASH_SSE2 \
-DINFLATE_CHUNK_SIMD_SSE2 \
-DINFLATE_CHUNK_READ_64LE
@ -55,8 +70,10 @@ endif
ifeq ($(ARCH), aarch64)
o/$(MODE)/third_party/zlib/deflate.o \
o/$(MODE)/third_party/zlib/crc32_simd.o: private \
o/$(MODE)/third_party/zlib/crc32_simd_neon.o: private \
TARGET_ARCH += \
-O3 \
-DBUILD_NEON \
-march=armv8-a+aes+crc
$(THIRD_PARTY_ZLIB_A_OBJS): private \
CPPFLAGS += \

View file

@ -16,7 +16,7 @@ COSMOPOLITAN_C_START_
#define x86_cpu_enable_sse2 X86_HAVE(SSE2)
#define x86_cpu_enable_ssse3 X86_HAVE(SSSE3)
#define x86_cpu_enable_simd (X86_HAVE(SSE4_2) && X86_HAVE(PCLMUL))
#define x86_cpu_enable_avx512 X86_HAVE(AVX512F)
#define x86_cpu_enable_avx512 (X86_HAVE(AVX512F) && X86_HAVE(PCLMUL) && X86_HAVE(VPCLMULQDQ))
#define cpu_check_features() ((void)0)
#elif defined(__aarch64__)

View file

@ -780,6 +780,7 @@ uint32_t ZEXPORT crc32_z(crc, buf_, len)
}
#endif
#if defined(__x86_64__)
#if defined(CRC32_SIMD_AVX512_PCLMUL)
if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) {
/* crc32 64-byte chunks */
@ -792,7 +793,8 @@ uint32_t ZEXPORT crc32_z(crc, buf_, len)
/* Fall into the default crc32 for the remaining data. */
buf += chunk_size;
}
#elif defined(CRC32_SIMD_SSE42_PCLMUL)
#endif
#if defined(CRC32_SIMD_SSE42_PCLMUL)
if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
/* crc32 16-byte chunks */
z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
@ -804,6 +806,7 @@ uint32_t ZEXPORT crc32_z(crc, buf_, len)
/* Fall into the default crc32 for the remaining data. */
buf += chunk_size;
}
#endif
#elif defined(CRC32_ARMV8_CRC32)
if (arm_cpu_enable_crc32) {
#if defined(__aarch64__)

3
third_party/zlib/crc32_simd_avx512.c vendored Normal file
View file

@ -0,0 +1,3 @@
/* AVX-512 flavor of the shared CRC-32 SIMD source. The implementation in
   crc32_simd.inc is pulled in only when BUILD_AVX512 is defined (the build
   rule for crc32_simd_avx512.o passes -DBUILD_AVX512); otherwise this
   file contributes nothing. */
#ifdef BUILD_AVX512
#include "third_party/zlib/crc32_simd.inc"
#endif

3
third_party/zlib/crc32_simd_neon.c vendored Normal file
View file

@ -0,0 +1,3 @@
/* NEON flavor of the shared CRC-32 SIMD source. The implementation in
   crc32_simd.inc is pulled in only when BUILD_NEON is defined (the aarch64
   build rule for crc32_simd_neon.o passes -DBUILD_NEON); otherwise this
   file contributes nothing. */
#ifdef BUILD_NEON
#include "third_party/zlib/crc32_simd.inc"
#endif

3
third_party/zlib/crc32_simd_sse42.c vendored Normal file
View file

@ -0,0 +1,3 @@
/* SSE4.2 flavor of the shared CRC-32 SIMD source. The implementation in
   crc32_simd.inc is pulled in only when BUILD_SSE42 is defined (the x86_64
   build rule for crc32_simd_sse42.o passes -DBUILD_SSE42); otherwise this
   file contributes nothing. */
#ifdef BUILD_SSE42
#include "third_party/zlib/crc32_simd.inc"
#endif