Support AVX512VNNI

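This change makes some quantization formats (e.g. Q4_0, Q8_0) run faster on
some architectures (e.g. AMD Zen 4) by also taking the VNNI dot-product path
in mul_sum_us8_pairs_float when __AVX512VNNI__ is defined, not only __AVXVNNI__.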
This commit is contained in:
Justine Tunney 2024-03-24 08:31:48 -07:00
parent ea279d5609
commit 209df3defb
No known key found for this signature in database
GPG key ID: BE714B4575D6E328

View file

@ -132,7 +132,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
} }
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
#if __AVXVNNI__ #if defined(__AVXVNNI__) || defined(__AVX512VNNI__)
const __m256i zero = _mm256_setzero_si256(); const __m256i zero = _mm256_setzero_si256();
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
return _mm256_cvtepi32_ps(summed_pairs); return _mm256_cvtepi32_ps(summed_pairs);