Mirror of https://github.com/jart/cosmopolitan.git (synced 2025-07-08 04:08:32 +00:00)
Release Cosmopolitan v3.6.0
This release is an atomic upgrade to GCC 14.1.0, which brings C23 and C++23 support.
parent 62ace3623a
commit 5660ec4741
1585 changed files with 117353 additions and 271644 deletions
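As a small aside (not part of the commit itself), here is a hedged sketch of the kind of C23 code the upgraded toolchain can now build; the cosmocc invocation and the -std=c23 flag spelling are assumptions about how one would drive the new GCC 14.1.0.

// build sketch: cosmocc -std=c23 -o demo demo.c
#include <stdio.h>

constexpr int kAnswer = 42;                        // C23 constexpr objects
enum color : unsigned char { RED, GREEN, BLUE };   // C23 enums with a fixed underlying type

int main(void) {
  int *p = nullptr;                                // nullptr is a keyword in C23
  typeof(kAnswer) x = 0b0010'1010;                 // typeof, binary literals, digit separators
  if (p == nullptr && x == kAnswer)
    printf("C23 works: %d\n", x);
}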
23 third_party/intel/amxcomplexintrin.internal.h (vendored, new file)
@@ -0,0 +1,23 @@
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
#if !defined _IMMINTRIN_H_INCLUDED
#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AMXCOMPLEXINTRIN_H_INCLUDED
#define _AMXCOMPLEXINTRIN_H_INCLUDED
#if !defined(__AMX_COMPLEX__)
#pragma GCC push_options
#pragma GCC target("amx-complex")
#define __DISABLE_AMX_COMPLEX__
#endif
#if defined(__x86_64__)
#define _tile_cmmimfp16ps_internal(src1_dst,src2,src3) __asm__ volatile ("{tcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
#define _tile_cmmrlfp16ps_internal(src1_dst,src2,src3) __asm__ volatile ("{tcmmrlfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmrlfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
#define _tile_cmmimfp16ps(src1_dst,src2,src3) _tile_cmmimfp16ps_internal (src1_dst, src2, src3)
#define _tile_cmmrlfp16ps(src1_dst,src2,src3) _tile_cmmrlfp16ps_internal (src1_dst, src2, src3)
#endif
#ifdef __DISABLE_AMX_COMPLEX__
#undef __DISABLE_AMX_COMPLEX__
#pragma GCC pop_options
#endif
#endif
#endif
16 third_party/intel/amxfp16intrin.internal.h (vendored, new file)
@@ -0,0 +1,16 @@
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
#if !defined _IMMINTRIN_H_INCLUDED
#error "Never use <amxfp16intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AMXFP16INTRIN_H_INCLUDED
#define _AMXFP16INTRIN_H_INCLUDED
#if defined(__x86_64__)
#define _tile_dpfp16ps_internal(dst,src1,src2) __asm__ volatile ("{tdpfp16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpfp16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
#define _tile_dpfp16ps(dst,src1,src2) _tile_dpfp16ps_internal (dst,src1,src2)
#endif
#ifdef __DISABLE_AMX_FP16__
#undef __DISABLE_AMX_FP16__
#pragma GCC pop_options
#endif
#endif
#endif
10 third_party/intel/amxtileintrin.internal.h (vendored)
@@ -14,13 +14,13 @@ extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tile_loadconfig (const void *__config)
{
__asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
__builtin_ia32_ldtilecfg (__config);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tile_storeconfig (void *__config)
{
__asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config)));
__builtin_ia32_sttilecfg (__config);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -29,11 +29,11 @@ _tile_release (void)
__asm__ volatile ("tilerelease" ::);
}
#define _tile_loadd(dst,base,stride) _tile_loadd_internal (dst, base, stride)
#define _tile_loadd_internal(dst,base,stride) __asm__ volatile ("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" :: "r" ((const void*) (base)), "r" ((long) (stride)))
#define _tile_loadd_internal(dst,base,stride) __asm__ volatile ("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
#define _tile_stream_loadd(dst,base,stride) _tile_stream_loadd_internal (dst, base, stride)
#define _tile_stream_loadd_internal(dst,base,stride) __asm__ volatile ("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", [%0+%1*1]}" :: "r" ((const void*) (base)), "r" ((long) (stride)))
#define _tile_stream_loadd_internal(dst,base,stride) __asm__ volatile ("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", [%0+%1*1]}" :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
#define _tile_stored(dst,base,stride) _tile_stored_internal (dst, base, stride)
#define _tile_stored_internal(src,base,stride) __asm__ volatile ("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], %%tmm"#src"}" :: "r" ((void*) (base)), "r" ((long) (stride)) : "memory")
#define _tile_stored_internal(src,base,stride) __asm__ volatile ("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], %%tmm"#src"}" :: "r" ((void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)) : "memory")
#define _tile_zero(dst) _tile_zero_internal (dst)
#define _tile_zero_internal(dst) __asm__ volatile ("tilezero\t%%tmm"#dst ::)
#endif
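A hedged usage sketch, not taken from the diff, showing how the AMX tile intrinsics above combine with the new _tile_dpfp16ps from amxfp16intrin.h; the tile_config struct, its field offsets, and the matrix shapes are illustrative assumptions, and AMX-capable hardware plus -mamx-tile -mamx-fp16 are presumed.

#include <immintrin.h>
#include <string.h>

/* Hypothetical 64-byte tile configuration block consumed by ldtilecfg;
   field offsets here are an assumption based on the documented layout. */
struct __attribute__((aligned(64))) tile_config {
  unsigned char palette;        /* palette 1 selects the standard tile layout */
  unsigned char start_row;
  unsigned char reserved[14];
  unsigned short colsb[16];     /* bytes per row for each tile register */
  unsigned char rows[16];       /* row count for each tile register */
};

void fp16_matmul_sketch(float *c, const _Float16 *a, const _Float16 *b) {
  struct tile_config cfg;
  memset(&cfg, 0, sizeof(cfg));
  cfg.palette = 1;
  cfg.rows[0] = 16; cfg.colsb[0] = 64;   /* tmm0: 16x16 float accumulator */
  cfg.rows[1] = 16; cfg.colsb[1] = 64;   /* tmm1: 16x32 _Float16 operand */
  cfg.rows[2] = 16; cfg.colsb[2] = 64;   /* tmm2: 16x32 _Float16 operand */
  _tile_loadconfig(&cfg);                /* program the tile register file */
  _tile_loadd(1, a, 64);                 /* stride is given in bytes */
  _tile_loadd(2, b, 64);
  _tile_zero(0);
  _tile_dpfp16ps(0, 1, 2);               /* FP16 dot products accumulated into FP32 */
  _tile_stored(0, c, 64);
  _tile_release();                       /* return the tile state to the OS */
}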
200 third_party/intel/avx2intrin.internal.h (vendored)
@@ -1443,6 +1443,206 @@ _mm256_mask_i64gather_epi32 (__m128i __src, int const *__base,
#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), (int const *) (BASE), (__v4di)(__m256i) (INDEX), (__v4si)_mm_set1_epi32(-1), (int) (SCALE))
|
||||
#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i) (SRC), (int const *) (BASE), (__v4di)(__m256i) (INDEX), (__v4si)(__m128i) (MASK), (int) (SCALE))
|
||||
#endif
|
||||
#define _MM_REDUCE_OPERATOR_BASIC_EPI16(op) __v8hi __T1 = (__v8hi)__W; __v8hi __T2 = __builtin_shufflevector (__T1, __T1, 4, 5, 6, 7, 4, 5, 6, 7); __v8hi __T3 = __T1 op __T2; __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 2, 3, 2, 3, 4, 5, 6, 7); __v8hi __T5 = __T3 op __T4; __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 1, 1, 2, 3, 4, 5, 6, 7); __v8hi __T7 = __T5 op __T6; return __T7[0]
|
||||
extern __inline short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_add_epi16 (__m128i __W)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_BASIC_EPI16 (+);
|
||||
}
|
||||
extern __inline short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_mul_epi16 (__m128i __W)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_BASIC_EPI16 (*);
|
||||
}
|
||||
extern __inline short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_and_epi16 (__m128i __W)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_BASIC_EPI16 (&);
|
||||
}
|
||||
extern __inline short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_or_epi16 (__m128i __W)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_BASIC_EPI16 (|);
|
||||
}
|
||||
#define _MM_REDUCE_OPERATOR_MAX_MIN_EP16(op) __m128i __T1 = (__m128i)__builtin_shufflevector ((__v8hi)__V, (__v8hi)__V, 4, 5, 6, 7, 4, 5, 6, 7); __m128i __T2 = _mm_##op (__V, __T1); __m128i __T3 = (__m128i)__builtin_shufflevector ((__v8hi)__T2, (__v8hi)__T2, 2, 3, 2, 3, 4, 5, 6, 7); __m128i __T4 = _mm_##op (__T2, __T3); __m128i __T5 = (__m128i)__builtin_shufflevector ((__v8hi)__T4, (__v8hi)__T4, 1, 1, 2, 3, 4, 5, 6, 7); __v8hi __T6 = (__v8hi)_mm_##op (__T4, __T5); return __T6[0]
|
||||
extern __inline short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_max_epi16 (__m128i __V)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
|
||||
}
|
||||
extern __inline unsigned short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_max_epu16 (__m128i __V)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
|
||||
}
|
||||
extern __inline short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_min_epi16 (__m128i __V)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
|
||||
}
|
||||
extern __inline unsigned short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_min_epu16 (__m128i __V)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
|
||||
}
|
||||
#define _MM256_REDUCE_OPERATOR_BASIC_EPI16(op) __v8hi __T1 = (__v8hi)_mm256_extracti128_si256 (__W, 0); __v8hi __T2 = (__v8hi)_mm256_extracti128_si256 (__W, 1); __v8hi __T3 = __T1 op __T2; __v8hi __T4 = __builtin_shufflevector (__T3, __T3, 4, 5, 6, 7, 4, 5, 6, 7); __v8hi __T5 = __T3 op __T4; __v8hi __T6 = __builtin_shufflevector (__T5, __T5, 2, 3, 2, 3, 4, 5, 6, 7); __v8hi __T7 = __T5 op __T6; __v8hi __T8 = __builtin_shufflevector (__T7, __T7, 1, 1, 2, 3, 4, 5, 6, 7); __v8hi __T9 = __T7 op __T8; return __T9[0]
|
||||
extern __inline short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_add_epi16 (__m256i __W)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_BASIC_EPI16 (+);
|
||||
}
|
||||
extern __inline short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_mul_epi16 (__m256i __W)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_BASIC_EPI16 (*);
|
||||
}
|
||||
extern __inline short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_and_epi16 (__m256i __W)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_BASIC_EPI16 (&);
|
||||
}
|
||||
extern __inline short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_or_epi16 (__m256i __W)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_BASIC_EPI16 (|);
|
||||
}
|
||||
#define _MM256_REDUCE_OPERATOR_MAX_MIN_EP16(op) __m128i __T1 = _mm256_extracti128_si256 (__V, 0); __m128i __T2 = _mm256_extracti128_si256 (__V, 1); __m128i __T3 = _mm_##op (__T1, __T2); __m128i __T4 = (__m128i)__builtin_shufflevector ((__v8hi)__T3, (__v8hi)__T3, 4, 5, 6, 7, 4, 5, 6, 7); __m128i __T5 = _mm_##op (__T3, __T4); __m128i __T6 = (__m128i)__builtin_shufflevector ((__v8hi)__T5, (__v8hi)__T5, 2, 3, 2, 3, 4, 5, 6, 7); __m128i __T7 = _mm_##op (__T5, __T6); __m128i __T8 = (__m128i)__builtin_shufflevector ((__v8hi)__T7, (__v8hi)__T7, 1, 1, 2, 3, 4, 5, 6, 7); __v8hi __T9 = (__v8hi)_mm_##op (__T7, __T8); return __T9[0]
|
||||
extern __inline short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_max_epi16 (__m256i __V)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epi16);
|
||||
}
|
||||
extern __inline unsigned short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_max_epu16 (__m256i __V)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (max_epu16);
|
||||
}
|
||||
extern __inline short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_min_epi16 (__m256i __V)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epi16);
|
||||
}
|
||||
extern __inline unsigned short
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_min_epu16 (__m256i __V)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_MAX_MIN_EP16 (min_epu16);
|
||||
}
|
||||
#define _MM_REDUCE_OPERATOR_BASIC_EPI8(op) __v16qi __T1 = (__v16qi)__W; __v16qi __T2 = __builtin_shufflevector (__T1, __T1, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T3 = __T1 op __T2; __v16qi __T4 = __builtin_shufflevector (__T3, __T3, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T5 = __T3 op __T4; __v16qi __T6 = __builtin_shufflevector (__T5, __T5, 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T7 = __T5 op __T6; __v16qi __T8 = __builtin_shufflevector (__T7, __T7, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T9 = __T7 op __T8; return __T9[0]
|
||||
extern __inline char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_add_epi8 (__m128i __W)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_BASIC_EPI8 (+);
|
||||
}
|
||||
extern __inline char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_mul_epi8 (__m128i __W)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_BASIC_EPI8 (*);
|
||||
}
|
||||
extern __inline char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_and_epi8 (__m128i __W)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_BASIC_EPI8 (&);
|
||||
}
|
||||
extern __inline char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_or_epi8 (__m128i __W)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_BASIC_EPI8 (|);
|
||||
}
|
||||
#define _MM_REDUCE_OPERATOR_MAX_MIN_EP8(op) __m128i __T1 = (__m128i)__builtin_shufflevector ((__v16qi)__V, (__v16qi)__V, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); __m128i __T2 = _mm_##op (__V, __T1); __m128i __T3 = (__m128i)__builtin_shufflevector ((__v16qi)__T2, (__v16qi)__T2, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __m128i __T4 = _mm_##op (__T2, __T3); __m128i __T5 = (__m128i)__builtin_shufflevector ((__v16qi)__T4, (__v16qi)__T4, 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __m128i __T6 = _mm_##op (__T4, __T5); __m128i __T7 = (__m128i)__builtin_shufflevector ((__v16qi)__T6, (__v16qi)__T6, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T8 = (__v16qi)_mm_##op (__T6, __T7); return __T8[0]
|
||||
extern __inline signed char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_max_epi8 (__m128i __V)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_max_epu8 (__m128i __V)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
|
||||
}
|
||||
extern __inline signed char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_min_epi8 (__m128i __V)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_min_epu8 (__m128i __V)
|
||||
{
|
||||
_MM_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
|
||||
}
|
||||
#define _MM256_REDUCE_OPERATOR_BASIC_EPI8(op) __v16qi __T1 = (__v16qi)_mm256_extracti128_si256 (__W, 0); __v16qi __T2 = (__v16qi)_mm256_extracti128_si256 (__W, 1); __v16qi __T3 = __T1 op __T2; __v16qi __T4 = __builtin_shufflevector (__T3, __T3, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T5 = __T3 op __T4; __v16qi __T6 = __builtin_shufflevector (__T5, __T5, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T7 = __T5 op __T6; __v16qi __T8 = __builtin_shufflevector (__T7, __T7, 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T9 = __T7 op __T8; __v16qi __T10 = __builtin_shufflevector (__T9, __T9, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T11 = __T9 op __T10; return __T11[0]
|
||||
extern __inline char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_add_epi8 (__m256i __W)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_BASIC_EPI8 (+);
|
||||
}
|
||||
extern __inline char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_mul_epi8 (__m256i __W)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_BASIC_EPI8 (*);
|
||||
}
|
||||
extern __inline char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_and_epi8 (__m256i __W)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_BASIC_EPI8 (&);
|
||||
}
|
||||
extern __inline char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_or_epi8 (__m256i __W)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_BASIC_EPI8 (|);
|
||||
}
|
||||
#define _MM256_REDUCE_OPERATOR_MAX_MIN_EP8(op) __m128i __T1 = _mm256_extracti128_si256 (__V, 0); __m128i __T2 = _mm256_extracti128_si256 (__V, 1); __m128i __T3 = _mm_##op (__T1, __T2); __m128i __T4 = (__m128i)__builtin_shufflevector ((__v16qi)__T3, (__v16qi)__T3, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); __m128i __T5 = _mm_##op (__T3, __T4); __m128i __T6 = (__m128i)__builtin_shufflevector ((__v16qi)__T5, (__v16qi)__T5, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __m128i __T7 = _mm_##op (__T5, __T6); __m128i __T8 = (__m128i)__builtin_shufflevector ((__v16qi)__T7, (__v16qi)__T5, 2, 3, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __m128i __T9 = _mm_##op (__T7, __T8); __m128i __T10 = (__m128i)__builtin_shufflevector ((__v16qi)__T9, (__v16qi)__T9, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); __v16qi __T11 = (__v16qi)_mm_##op (__T9, __T10); return __T11[0]
|
||||
extern __inline signed char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_max_epi8 (__m256i __V)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epi8);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_max_epu8 (__m256i __V)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (max_epu8);
|
||||
}
|
||||
extern __inline signed char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_min_epi8 (__m256i __V)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epi8);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_min_epu8 (__m256i __V)
|
||||
{
|
||||
_MM256_REDUCE_OPERATOR_MAX_MIN_EP8 (min_epu8);
|
||||
}
|
||||
#ifdef __DISABLE_AVX2__
|
||||
#undef __DISABLE_AVX2__
|
||||
#pragma GCC pop_options
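A brief illustrative sketch (not from the diff) of the AVX2 horizontal-reduction helpers this hunk adds to avx2intrin.h; the input values are arbitrary and -mavx2 at compile time is assumed.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m256i v = _mm256_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8,
                                9, 10, 11, 12, 13, 14, 15, 16);
  short sum = _mm256_reduce_add_epi16(v);   /* horizontal add across all 16 lanes */
  short max = _mm256_reduce_max_epi16(v);   /* horizontal signed maximum */
  printf("sum=%d max=%d\n", sum, max);      /* expect sum=136 max=16 */
}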
@@ -6,7 +6,7 @@
#define _AVX5124FMAPSINTRIN_H_INCLUDED
#ifndef __AVX5124FMAPS__
#pragma GCC push_options
#pragma GCC target("avx5124fmaps")
#pragma GCC target("avx5124fmaps,evex512")
#define __DISABLE_AVX5124FMAPS__
#endif
extern __inline __m512
@@ -6,7 +6,7 @@
#define _AVX5124VNNIWINTRIN_H_INCLUDED
#ifndef __AVX5124VNNIW__
#pragma GCC push_options
#pragma GCC target("avx5124vnniw")
#pragma GCC target("avx5124vnniw,evex512")
#define __DISABLE_AVX5124VNNIW__
#endif
extern __inline __m512i
33 third_party/intel/avx512bf16intrin.internal.h (vendored)
@@ -4,38 +4,45 @@
#endif
|
||||
#ifndef _AVX512BF16INTRIN_H_INCLUDED
|
||||
#define _AVX512BF16INTRIN_H_INCLUDED
|
||||
#ifndef __AVX512BF16__
|
||||
#if !defined (__AVX512BF16__) || defined (__EVEX512__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512bf16")
|
||||
#pragma GCC target("avx512bf16,no-evex512")
|
||||
#define __DISABLE_AVX512BF16__
|
||||
#endif
|
||||
typedef short __v32bh __attribute__ ((__vector_size__ (64)));
|
||||
typedef short __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
|
||||
extern __inline float
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtsbh_ss (__bfloat16 __A)
|
||||
_mm_cvtsbh_ss (__bf16 __A)
|
||||
{
|
||||
union{ float a; unsigned int b;} __tmp;
|
||||
__tmp.b = ((unsigned int)(__A)) << 16;
|
||||
return __tmp.a;
|
||||
return __builtin_ia32_cvtbf2sf (__A);
|
||||
}
|
||||
#ifdef __DISABLE_AVX512BF16__
|
||||
#undef __DISABLE_AVX512BF16__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#if !defined (__AVX512BF16__) || !defined (__EVEX512__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512bf16,evex512")
|
||||
#define __DISABLE_AVX512BF16_512__
|
||||
#endif
|
||||
typedef __bf16 __v32bf __attribute__ ((__vector_size__ (64)));
|
||||
typedef __bf16 __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
|
||||
extern __inline __m512bh
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
|
||||
{
|
||||
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi(__A, __B);
|
||||
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf(__A, __B);
|
||||
}
|
||||
extern __inline __m512bh
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
|
||||
{
|
||||
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_mask(__C, __D, __A, __B);
|
||||
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_mask(__C, __D, __A, __B);
|
||||
}
|
||||
extern __inline __m512bh
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
|
||||
{
|
||||
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32hi_maskz(__B, __C, __A);
|
||||
return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_maskz(__B, __C, __A);
|
||||
}
|
||||
extern __inline __m256bh
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
@@ -96,8 +103,8 @@ _mm512_mask_cvtpbh_ps (__m512 __S, __mmask16 __U, __m256bh __A)
(__m512i)__S, (__mmask16)__U,
|
||||
(__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16)));
|
||||
}
|
||||
#ifdef __DISABLE_AVX512BF16__
|
||||
#undef __DISABLE_AVX512BF16__
|
||||
#ifdef __DISABLE_AVX512BF16_512__
|
||||
#undef __DISABLE_AVX512BF16_512__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
98 third_party/intel/avx512bf16vlintrin.internal.h (vendored)
@@ -4,57 +4,85 @@
#endif
|
||||
#ifndef _AVX512BF16VLINTRIN_H_INCLUDED
|
||||
#define _AVX512BF16VLINTRIN_H_INCLUDED
|
||||
#if !defined(__AVX512VL__) || !defined(__AVX512BF16__)
|
||||
#if !defined(__AVX512VL__) || !defined(__AVX512BF16__) || defined (__EVEX512__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512bf16,avx512vl")
|
||||
#pragma GCC target("avx512bf16,avx512vl,no-evex512")
|
||||
#define __DISABLE_AVX512BF16VL__
|
||||
#endif
|
||||
typedef short __v16bh __attribute__ ((__vector_size__ (32)));
|
||||
typedef short __v8bh __attribute__ ((__vector_size__ (16)));
|
||||
typedef short __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
|
||||
typedef short __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
|
||||
typedef unsigned short __bfloat16;
|
||||
typedef __bf16 __v16bf __attribute__ ((__vector_size__ (32)));
|
||||
typedef __bf16 __v8bf __attribute__ ((__vector_size__ (16)));
|
||||
typedef __bf16 __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
|
||||
typedef __bf16 __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
|
||||
typedef __bf16 __bfloat16;
|
||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_avx512_castsi128_ps(__m128i __A)
|
||||
{
|
||||
return (__m128) __A;
|
||||
}
|
||||
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_avx512_castsi256_ps (__m256i __A)
|
||||
{
|
||||
return (__m256) __A;
|
||||
}
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_avx512_slli_epi32 (__m128i __A, int __B)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_avx512_slli_epi32 (__m256i __A, int __B)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
|
||||
}
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_avx512_cvtepi16_epi32 (__m128i __X)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_avx512_cvtepi16_epi32 (__m128i __X)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
|
||||
}
|
||||
#define _mm256_cvtneps_pbh(A) (__m128bh) __builtin_ia32_cvtneps2bf16_v8sf (A)
|
||||
#define _mm_cvtneps_pbh(A) (__m128bh) __builtin_ia32_cvtneps2bf16_v4sf (A)
|
||||
extern __inline __m256bh
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
|
||||
{
|
||||
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi(__A, __B);
|
||||
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf(__A, __B);
|
||||
}
|
||||
extern __inline __m256bh
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
|
||||
{
|
||||
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_mask(__C, __D, __A, __B);
|
||||
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_mask(__C, __D, __A, __B);
|
||||
}
|
||||
extern __inline __m256bh
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
|
||||
{
|
||||
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16hi_maskz(__B, __C, __A);
|
||||
return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_maskz(__B, __C, __A);
|
||||
}
|
||||
extern __inline __m128bh
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
|
||||
{
|
||||
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi(__A, __B);
|
||||
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf(__A, __B);
|
||||
}
|
||||
extern __inline __m128bh
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
|
||||
{
|
||||
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_mask(__C, __D, __A, __B);
|
||||
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_mask(__C, __D, __A, __B);
|
||||
}
|
||||
extern __inline __m128bh
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
|
||||
{
|
||||
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8hi_maskz(__B, __C, __A);
|
||||
}
|
||||
extern __inline __m128bh
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_cvtneps_pbh (__m256 __A)
|
||||
{
|
||||
return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf(__A);
|
||||
return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_maskz(__B, __C, __A);
|
||||
}
|
||||
extern __inline __m128bh
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
@@ -70,12 +98,6 @@ _mm256_maskz_cvtneps_pbh (__mmask8 __A, __m256 __B)
}
|
||||
extern __inline __m128bh
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtneps_pbh (__m128 __A)
|
||||
{
|
||||
return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf(__A);
|
||||
}
|
||||
extern __inline __m128bh
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m128 __C)
|
||||
{
|
||||
return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_mask(__C, __A, __B);
|
||||
@@ -122,34 +144,34 @@ _mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
{
|
||||
return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
|
||||
}
|
||||
extern __inline __bfloat16
|
||||
extern __inline __bf16
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtness_sbh (float __A)
|
||||
{
|
||||
__v4sf __V = {__A, 0, 0, 0};
|
||||
__v8hi __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
|
||||
(__v8hi)_mm_undefined_si128 (), (__mmask8)-1);
|
||||
__v8bf __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
|
||||
(__v8bf)_mm_avx512_undefined_si128 (), (__mmask8)-1);
|
||||
return __R[0];
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtpbh_ps (__m128bh __A)
|
||||
{
|
||||
return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
|
||||
(__m128i)_mm_cvtepi16_epi32 ((__m128i)__A), 16));
|
||||
return (__m128)_mm_avx512_castsi128_ps ((__m128i)_mm_avx512_slli_epi32 (
|
||||
(__m128i)_mm_avx512_cvtepi16_epi32 ((__m128i)__A), 16));
|
||||
}
|
||||
extern __inline __m256
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_cvtpbh_ps (__m128bh __A)
|
||||
{
|
||||
return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
|
||||
(__m256i)_mm256_cvtepi16_epi32 ((__m128i)__A), 16));
|
||||
return (__m256)_mm256_avx512_castsi256_ps ((__m256i)_mm256_avx512_slli_epi32 (
|
||||
(__m256i)_mm256_avx512_cvtepi16_epi32 ((__m128i)__A), 16));
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
|
||||
{
|
||||
return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
|
||||
return (__m128)_mm_avx512_castsi128_ps ((__m128i)_mm_avx512_slli_epi32 (
|
||||
(__m128i)_mm_maskz_cvtepi16_epi32 (
|
||||
(__mmask8)__U, (__m128i)__A), 16));
|
||||
}
|
||||
@@ -157,7 +179,7 @@ extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
|
||||
{
|
||||
return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
|
||||
return (__m256)_mm256_avx512_castsi256_ps ((__m256i)_mm256_avx512_slli_epi32 (
|
||||
(__m256i)_mm256_maskz_cvtepi16_epi32 (
|
||||
(__mmask8)__U, (__m128i)__A), 16));
|
||||
}
|
||||
@@ -165,16 +187,16 @@ extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_cvtpbh_ps (__m128 __S, __mmask8 __U, __m128bh __A)
|
||||
{
|
||||
return (__m128)_mm_castsi128_ps ((__m128i)_mm_mask_slli_epi32 (
|
||||
(__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32 (
|
||||
return (__m128)_mm_avx512_castsi128_ps ((__m128i)_mm_mask_slli_epi32 (
|
||||
(__m128i)__S, (__mmask8)__U, (__m128i)_mm_avx512_cvtepi16_epi32 (
|
||||
(__m128i)__A), 16));
|
||||
}
|
||||
extern __inline __m256
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_mask_cvtpbh_ps (__m256 __S, __mmask8 __U, __m128bh __A)
|
||||
{
|
||||
return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_mask_slli_epi32 (
|
||||
(__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32 (
|
||||
return (__m256)_mm256_avx512_castsi256_ps ((__m256i)_mm256_mask_slli_epi32 (
|
||||
(__m256i)__S, (__mmask8)__U, (__m256i)_mm256_avx512_cvtepi16_epi32 (
|
||||
(__m128i)__A), 16));
|
||||
}
|
||||
#ifdef __DISABLE_AVX512BF16VL__
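A hedged example, not part of the commit, of the scalar bfloat16 conversion helpers touched above; it assumes AVX512-BF16 and AVX512VL are enabled at compile time (e.g. -mavx512bf16 -mavx512vl).

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __bf16 h = _mm_cvtness_sbh(3.14159f);   /* round a float down to bfloat16 */
  float back = _mm_cvtsbh_ss(h);          /* widen it back to float */
  printf("%f -> %f (bf16 keeps roughly 7 mantissa bits)\n", 3.14159, (double)back);
}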
161 third_party/intel/avx512bitalgintrin.internal.h (vendored)
@@ -1,12 +1,12 @@
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#if !defined _IMMINTRIN_H_INCLUDED
|
||||
# error "Never use <avx512bitalgintrin.h> directly; include <x86intrin.h> instead."
|
||||
# error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
#ifndef _AVX512BITALGINTRIN_H_INCLUDED
|
||||
#define _AVX512BITALGINTRIN_H_INCLUDED
|
||||
#ifndef __AVX512BITALG__
|
||||
#if !defined (__AVX512BITALG__) || !defined (__EVEX512__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512bitalg")
|
||||
#pragma GCC target("avx512bitalg,evex512")
|
||||
#define __DISABLE_AVX512BITALG__
|
||||
#endif
|
||||
extern __inline __m512i
|
||||
@@ -21,15 +21,6 @@ _mm512_popcnt_epi16 (__m512i __A)
{
|
||||
return (__m512i) __builtin_ia32_vpopcountw_v32hi ((__v32hi) __A);
|
||||
}
|
||||
#ifdef __DISABLE_AVX512BITALG__
|
||||
#undef __DISABLE_AVX512BITALG__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#if !defined(__AVX512BITALG__) || !defined(__AVX512BW__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512bitalg,avx512bw")
|
||||
#define __DISABLE_AVX512BITALGBW__
|
||||
#endif
|
||||
extern __inline __m512i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_mask_popcnt_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
|
||||
@@ -80,150 +71,8 @@ _mm512_mask_bitshuffle_epi64_mask (__mmask64 __M, __m512i __A, __m512i __B)
(__v64qi) __B,
|
||||
(__mmask64) __M);
|
||||
}
|
||||
#ifdef __DISABLE_AVX512BITALGBW__
|
||||
#undef __DISABLE_AVX512BITALGBW__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__) || !defined(__AVX512BW__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512bitalg,avx512vl,avx512bw")
|
||||
#define __DISABLE_AVX512BITALGVLBW__
|
||||
#endif
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_mask_popcnt_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpopcountb_v32qi_mask ((__v32qi) __A,
|
||||
(__v32qi) __W,
|
||||
(__mmask32) __U);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_maskz_popcnt_epi8 (__mmask32 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpopcountb_v32qi_mask ((__v32qi) __A,
|
||||
(__v32qi)
|
||||
_mm256_setzero_si256 (),
|
||||
(__mmask32) __U);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_bitshuffle_epi64_mask (__m256i __A, __m256i __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask ((__v32qi) __A,
|
||||
(__v32qi) __B,
|
||||
(__mmask32) -1);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_mask_bitshuffle_epi64_mask (__mmask32 __M, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask ((__v32qi) __A,
|
||||
(__v32qi) __B,
|
||||
(__mmask32) __M);
|
||||
}
|
||||
#ifdef __DISABLE_AVX512BITALGVLBW__
|
||||
#undef __DISABLE_AVX512BITALGVLBW__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512bitalg,avx512vl")
|
||||
#define __DISABLE_AVX512BITALGVL__
|
||||
#endif
|
||||
extern __inline __mmask16
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_bitshuffle_epi64_mask (__m128i __A, __m128i __B)
|
||||
{
|
||||
return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask ((__v16qi) __A,
|
||||
(__v16qi) __B,
|
||||
(__mmask16) -1);
|
||||
}
|
||||
extern __inline __mmask16
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_bitshuffle_epi64_mask (__mmask16 __M, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask ((__v16qi) __A,
|
||||
(__v16qi) __B,
|
||||
(__mmask16) __M);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_popcnt_epi8 (__m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpopcountb_v32qi ((__v32qi) __A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_popcnt_epi16 (__m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpopcountw_v16hi ((__v16hi) __A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_popcnt_epi8 (__m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpopcountb_v16qi ((__v16qi) __A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_popcnt_epi16 (__m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpopcountw_v8hi ((__v8hi) __A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_mask_popcnt_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpopcountw_v16hi_mask ((__v16hi) __A,
|
||||
(__v16hi) __W,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_maskz_popcnt_epi16 (__mmask16 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpopcountw_v16hi_mask ((__v16hi) __A,
|
||||
(__v16hi)
|
||||
_mm256_setzero_si256 (),
|
||||
(__mmask16) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_popcnt_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpopcountb_v16qi_mask ((__v16qi) __A,
|
||||
(__v16qi) __W,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_popcnt_epi8 (__mmask16 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpopcountb_v16qi_mask ((__v16qi) __A,
|
||||
(__v16qi)
|
||||
_mm_setzero_si128 (),
|
||||
(__mmask16) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_popcnt_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpopcountw_v8hi_mask ((__v8hi) __A,
|
||||
(__v8hi) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_popcnt_epi16 (__mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpopcountw_v8hi_mask ((__v8hi) __A,
|
||||
(__v8hi)
|
||||
_mm_setzero_si128 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
#ifdef __DISABLE_AVX512BITALGVL__
|
||||
#undef __DISABLE_AVX512BITALGVL__
|
||||
#ifdef __DISABLE_AVX512BITALG__
|
||||
#undef __DISABLE_AVX512BITALG__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
141 third_party/intel/avx512bitalgvlintrin.internal.h (vendored, new file)
@@ -0,0 +1,141 @@
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#if !defined _IMMINTRIN_H_INCLUDED
|
||||
# error "Never use <avx512bitalgvlintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
#ifndef _AVX512BITALGVLINTRIN_H_INCLUDED
|
||||
#define _AVX512BITALGVLINTRIN_H_INCLUDED
|
||||
#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__) || defined (__EVEX512__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512bitalg,avx512vl,no-evex512")
|
||||
#define __DISABLE_AVX512BITALGVL__
|
||||
#endif
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_mask_popcnt_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpopcountb_v32qi_mask ((__v32qi) __A,
|
||||
(__v32qi) __W,
|
||||
(__mmask32) __U);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_maskz_popcnt_epi8 (__mmask32 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpopcountb_v32qi_mask ((__v32qi) __A,
|
||||
(__v32qi)
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask32) __U);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_bitshuffle_epi64_mask (__m256i __A, __m256i __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask ((__v32qi) __A,
|
||||
(__v32qi) __B,
|
||||
(__mmask32) -1);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_mask_bitshuffle_epi64_mask (__mmask32 __M, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask ((__v32qi) __A,
|
||||
(__v32qi) __B,
|
||||
(__mmask32) __M);
|
||||
}
|
||||
extern __inline __mmask16
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_bitshuffle_epi64_mask (__m128i __A, __m128i __B)
|
||||
{
|
||||
return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask ((__v16qi) __A,
|
||||
(__v16qi) __B,
|
||||
(__mmask16) -1);
|
||||
}
|
||||
extern __inline __mmask16
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_bitshuffle_epi64_mask (__mmask16 __M, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask ((__v16qi) __A,
|
||||
(__v16qi) __B,
|
||||
(__mmask16) __M);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_popcnt_epi8 (__m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpopcountb_v32qi ((__v32qi) __A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_popcnt_epi16 (__m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpopcountw_v16hi ((__v16hi) __A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_popcnt_epi8 (__m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpopcountb_v16qi ((__v16qi) __A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_popcnt_epi16 (__m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpopcountw_v8hi ((__v8hi) __A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_mask_popcnt_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpopcountw_v16hi_mask ((__v16hi) __A,
|
||||
(__v16hi) __W,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_maskz_popcnt_epi16 (__mmask16 __U, __m256i __A)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpopcountw_v16hi_mask ((__v16hi) __A,
|
||||
(__v16hi)
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask16) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_popcnt_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpopcountb_v16qi_mask ((__v16qi) __A,
|
||||
(__v16qi) __W,
|
||||
(__mmask16) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_popcnt_epi8 (__mmask16 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpopcountb_v16qi_mask ((__v16qi) __A,
|
||||
(__v16qi)
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask16) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_popcnt_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpopcountw_v8hi_mask ((__v8hi) __A,
|
||||
(__v8hi) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_popcnt_epi16 (__mmask8 __U, __m128i __A)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpopcountw_v8hi_mask ((__v8hi) __A,
|
||||
(__v8hi)
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
#ifdef __DISABLE_AVX512BITALGVL__
#undef __DISABLE_AVX512BITALGVL__
#pragma GCC pop_options
#endif
#endif
#endif
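A minimal sketch (an illustration under stated assumptions, not from the diff) of the per-byte popcount intrinsic this new header provides; -mavx512bitalg -mavx512vl are assumed at compile time.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_set1_epi8(0x0f);
  __m128i c = _mm_popcnt_epi8(v);            /* per-byte population count */
  printf("%d\n", _mm_extract_epi8(c, 0));    /* each byte holds 4 */
}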
337 third_party/intel/avx512bwintrin.internal.h (vendored)
@@ -4,16 +4,35 @@
#endif
|
||||
#ifndef _AVX512BWINTRIN_H_INCLUDED
|
||||
#define _AVX512BWINTRIN_H_INCLUDED
|
||||
#ifndef __AVX512BW__
|
||||
#if !defined (__AVX512BW__) || defined (__EVEX512__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512bw")
|
||||
#pragma GCC target("avx512bw,no-evex512")
|
||||
#define __DISABLE_AVX512BW__
|
||||
#endif
|
||||
typedef short __v32hi __attribute__ ((__vector_size__ (64)));
|
||||
typedef short __v32hi_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
|
||||
typedef char __v64qi __attribute__ ((__vector_size__ (64)));
|
||||
typedef char __v64qi_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
|
||||
typedef unsigned long long __mmask64;
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_avx512_set_epi32 (int __q3, int __q2, int __q1, int __q0)
|
||||
{
|
||||
return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
|
||||
}
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_avx512_set_epi16 (short __q7, short __q6, short __q5, short __q4,
|
||||
short __q3, short __q2, short __q1, short __q0)
|
||||
{
|
||||
return __extension__ (__m128i)(__v8hi){
|
||||
__q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
|
||||
}
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_avx512_set_epi8 (char __q15, char __q14, char __q13, char __q12,
|
||||
char __q11, char __q10, char __q09, char __q08,
|
||||
char __q07, char __q06, char __q05, char __q04,
|
||||
char __q03, char __q02, char __q01, char __q00)
|
||||
{
|
||||
return __extension__ (__m128i)(__v16qi){
|
||||
__q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
|
||||
__q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
|
||||
};
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_ktest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF)
|
||||
@@ -23,37 +42,18 @@ _ktest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF)
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_ktest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF)
|
||||
{
|
||||
*__CF = (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
|
||||
return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_ktestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (unsigned char) __builtin_ia32_ktestzsi (__A, __B);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_ktestz_mask64_u8 (__mmask64 __A, __mmask64 __B)
|
||||
{
|
||||
return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_ktestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (unsigned char) __builtin_ia32_ktestcsi (__A, __B);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_ktestc_mask64_u8 (__mmask64 __A, __mmask64 __B)
|
||||
{
|
||||
return (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kortest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF)
|
||||
{
|
||||
*__CF = (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
|
||||
@@ -61,6 +61,136 @@ _kortest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF)
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kortestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kadd_mask32 (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kaddsi ((__mmask32) __A, (__mmask32) __B);
|
||||
}
|
||||
extern __inline unsigned int
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_cvtmask32_u32 (__mmask32 __A)
|
||||
{
|
||||
return (unsigned int) __builtin_ia32_kmovd ((__mmask32) __A);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_cvtu32_mask32 (unsigned int __A)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kmovd ((__mmask32) __A);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_load_mask32 (__mmask32 *__A)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kmovd (*__A);
|
||||
}
|
||||
extern __inline void
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_store_mask32 (__mmask32 *__A, __mmask32 __B)
|
||||
{
|
||||
*(__mmask32 *) __A = __builtin_ia32_kmovd (__B);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_knot_mask32 (__mmask32 __A)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_knotsi ((__mmask32) __A);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kor_mask32 (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_korsi ((__mmask32) __A, (__mmask32) __B);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kxnor_mask32 (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kxnorsi ((__mmask32) __A, (__mmask32) __B);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kxor_mask32 (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kxorsi ((__mmask32) __A, (__mmask32) __B);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kand_mask32 (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kandsi ((__mmask32) __A, (__mmask32) __B);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kandn_mask32 (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kandnsi ((__mmask32) __A, (__mmask32) __B);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
|
||||
(__mmask32) __B);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kunpackw_mask32 (__mmask16 __A, __mmask16 __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
|
||||
(__mmask32) __B);
|
||||
}
|
||||
#if __OPTIMIZE__
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kshiftli_mask32 (__mmask32 __A, unsigned int __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kshiftlisi ((__mmask32) __A,
|
||||
(__mmask8) __B);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kshiftri_mask32 (__mmask32 __A, unsigned int __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kshiftrisi ((__mmask32) __A,
|
||||
(__mmask8) __B);
|
||||
}
|
||||
#else
|
||||
#define _kshiftli_mask32(X, Y) ((__mmask32) __builtin_ia32_kshiftlisi ((__mmask32)(X), (__mmask8)(Y)))
|
||||
#define _kshiftri_mask32(X, Y) ((__mmask32) __builtin_ia32_kshiftrisi ((__mmask32)(X), (__mmask8)(Y)))
|
||||
#endif
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_ktest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF)
|
||||
{
|
||||
*__CF = (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
|
||||
return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_ktestz_mask64_u8 (__mmask64 __A, __mmask64 __B)
|
||||
{
|
||||
return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_ktestc_mask64_u8 (__mmask64 __A, __mmask64 __B)
|
||||
{
|
||||
return (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kortest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF)
|
||||
{
|
||||
*__CF = (unsigned char) __builtin_ia32_kortestcdi (__A, __B);
|
||||
@@ -68,70 +198,34 @@ _kortest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF)
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kortestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kortestz_mask64_u8 (__mmask64 __A, __mmask64 __B)
|
||||
{
|
||||
return (unsigned char) __builtin_ia32_kortestzdi (__A, __B);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
|
||||
}
|
||||
extern __inline unsigned char
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kortestc_mask64_u8 (__mmask64 __A, __mmask64 __B)
|
||||
{
|
||||
return (unsigned char) __builtin_ia32_kortestcdi (__A, __B);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kadd_mask32 (__mmask32 __A, __mmask32 __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kaddsi ((__mmask32) __A, (__mmask32) __B);
|
||||
}
|
||||
extern __inline __mmask64
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kadd_mask64 (__mmask64 __A, __mmask64 __B)
|
||||
{
|
||||
return (__mmask64) __builtin_ia32_kadddi ((__mmask64) __A, (__mmask64) __B);
|
||||
}
|
||||
extern __inline unsigned int
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_cvtmask32_u32 (__mmask32 __A)
{
return (unsigned int) __builtin_ia32_kmovd ((__mmask32) __A);
}
extern __inline unsigned long long
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_cvtmask64_u64 (__mmask64 __A)
{
return (unsigned long long) __builtin_ia32_kmovq ((__mmask64) __A);
}
extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_cvtu32_mask32 (unsigned int __A)
{
return (__mmask32) __builtin_ia32_kmovd ((__mmask32) __A);
}
extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_cvtu64_mask64 (unsigned long long __A)
{
return (__mmask64) __builtin_ia32_kmovq ((__mmask64) __A);
}
extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_load_mask32 (__mmask32 *__A)
{
return (__mmask32) __builtin_ia32_kmovd (*__A);
}
extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_load_mask64 (__mmask64 *__A)
@@ -140,88 +234,59 @@ _load_mask64 (__mmask64 *__A)
}
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_store_mask32 (__mmask32 *__A, __mmask32 __B)
{
*(__mmask32 *) __A = __builtin_ia32_kmovd (__B);
}
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_store_mask64 (__mmask64 *__A, __mmask64 __B)
{
*(__mmask64 *) __A = __builtin_ia32_kmovq (__B);
}
extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_knot_mask32 (__mmask32 __A)
{
return (__mmask32) __builtin_ia32_knotsi ((__mmask32) __A);
}
extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_knot_mask64 (__mmask64 __A)
{
return (__mmask64) __builtin_ia32_knotdi ((__mmask64) __A);
}
extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kor_mask32 (__mmask32 __A, __mmask32 __B)
{
return (__mmask32) __builtin_ia32_korsi ((__mmask32) __A, (__mmask32) __B);
}
extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kor_mask64 (__mmask64 __A, __mmask64 __B)
{
return (__mmask64) __builtin_ia32_kordi ((__mmask64) __A, (__mmask64) __B);
}
extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kxnor_mask32 (__mmask32 __A, __mmask32 __B)
{
return (__mmask32) __builtin_ia32_kxnorsi ((__mmask32) __A, (__mmask32) __B);
}
extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kxnor_mask64 (__mmask64 __A, __mmask64 __B)
{
return (__mmask64) __builtin_ia32_kxnordi ((__mmask64) __A, (__mmask64) __B);
}
extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kxor_mask32 (__mmask32 __A, __mmask32 __B)
{
return (__mmask32) __builtin_ia32_kxorsi ((__mmask32) __A, (__mmask32) __B);
}
extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kxor_mask64 (__mmask64 __A, __mmask64 __B)
{
return (__mmask64) __builtin_ia32_kxordi ((__mmask64) __A, (__mmask64) __B);
}
extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kand_mask32 (__mmask32 __A, __mmask32 __B)
{
return (__mmask32) __builtin_ia32_kandsi ((__mmask32) __A, (__mmask32) __B);
}
extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kand_mask64 (__mmask64 __A, __mmask64 __B)
{
return (__mmask64) __builtin_ia32_kanddi ((__mmask64) __A, (__mmask64) __B);
}
extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kandn_mask32 (__mmask32 __A, __mmask32 __B)
{
return (__mmask32) __builtin_ia32_kandnsi ((__mmask32) __A, (__mmask32) __B);
}
extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kandn_mask64 (__mmask64 __A, __mmask64 __B)
{
return (__mmask64) __builtin_ia32_kandndi ((__mmask64) __A, (__mmask64) __B);
}
#ifdef __DISABLE_AVX512BW__
#undef __DISABLE_AVX512BW__
#pragma GCC pop_options
#endif
#if !defined (__AVX512BW__) || !defined (__EVEX512__)
#pragma GCC push_options
#pragma GCC target("avx512bw,evex512")
#define __DISABLE_AVX512BW_512__
#endif
typedef short __v32hi __attribute__ ((__vector_size__ (64)));
typedef short __v32hi_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
typedef char __v64qi __attribute__ ((__vector_size__ (64)));
typedef char __v64qi_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
@@ -293,20 +358,6 @@ _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
_mm512_setzero_si512 (),
(__mmask64) __U);
}
extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
{
return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
(__mmask32) __B);
}
extern __inline __mmask32
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_kunpackw_mask32 (__mmask16 __A, __mmask16 __B)
{
return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
(__mmask32) __B);
}
extern __inline __mmask64
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
@ -2463,13 +2514,6 @@ _mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
|
|||
__M);
|
||||
}
|
||||
#ifdef __OPTIMIZE__
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kshiftli_mask32 (__mmask32 __A, unsigned int __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kshiftlisi ((__mmask32) __A,
|
||||
(__mmask8) __B);
|
||||
}
|
||||
extern __inline __mmask64
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kshiftli_mask64 (__mmask64 __A, unsigned int __B)
|
||||
|
@ -2477,13 +2521,6 @@ _kshiftli_mask64 (__mmask64 __A, unsigned int __B)
|
|||
return (__mmask64) __builtin_ia32_kshiftlidi ((__mmask64) __A,
|
||||
(__mmask8) __B);
|
||||
}
|
||||
extern __inline __mmask32
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kshiftri_mask32 (__mmask32 __A, unsigned int __B)
|
||||
{
|
||||
return (__mmask32) __builtin_ia32_kshiftrisi ((__mmask32) __A,
|
||||
(__mmask8) __B);
|
||||
}
|
||||
extern __inline __mmask64
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kshiftri_mask64 (__mmask64 __A, unsigned int __B)
|
||||
|
@ -2557,7 +2594,7 @@ _mm512_maskz_dbsad_epu8 (__mmask32 __U, __m512i __A, __m512i __B,
|
|||
}
|
||||
extern __inline __m512i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_srli_epi16 (__m512i __A, const int __imm)
|
||||
_mm512_srli_epi16 (__m512i __A, const unsigned int __imm)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
|
||||
(__v32hi)
|
||||
|
@ -2567,7 +2604,7 @@ _mm512_srli_epi16 (__m512i __A, const int __imm)
|
|||
extern __inline __m512i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_mask_srli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
|
||||
const int __imm)
|
||||
const unsigned int __imm)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
|
||||
(__v32hi) __W,
|
||||
|
@ -2584,7 +2621,7 @@ _mm512_maskz_srli_epi16 (__mmask32 __U, __m512i __A, const int __imm)
|
|||
}
|
||||
extern __inline __m512i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_slli_epi16 (__m512i __A, const int __B)
|
||||
_mm512_slli_epi16 (__m512i __A, const unsigned int __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
|
||||
(__v32hi)
|
||||
|
@ -2594,7 +2631,7 @@ _mm512_slli_epi16 (__m512i __A, const int __B)
|
|||
extern __inline __m512i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
|
||||
const int __B)
|
||||
const unsigned int __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
|
||||
(__v32hi) __W,
|
||||
|
@ -2602,7 +2639,7 @@ _mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
|
|||
}
|
||||
extern __inline __m512i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_maskz_slli_epi16 (__mmask32 __U, __m512i __A, const int __B)
|
||||
_mm512_maskz_slli_epi16 (__mmask32 __U, __m512i __A, const unsigned int __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
|
||||
(__v32hi)
|
||||
|
@ -2673,7 +2710,7 @@ _mm512_maskz_shufflelo_epi16 (__mmask32 __U, __m512i __A,
|
|||
}
|
||||
extern __inline __m512i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_srai_epi16 (__m512i __A, const int __imm)
|
||||
_mm512_srai_epi16 (__m512i __A, const unsigned int __imm)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
|
||||
(__v32hi)
|
||||
|
@ -2683,7 +2720,7 @@ _mm512_srai_epi16 (__m512i __A, const int __imm)
|
|||
extern __inline __m512i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_mask_srai_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
|
||||
const int __imm)
|
||||
const unsigned int __imm)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
|
||||
(__v32hi) __W,
|
||||
|
@ -2691,7 +2728,7 @@ _mm512_mask_srai_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
|
|||
}
|
||||
extern __inline __m512i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_maskz_srai_epi16 (__mmask32 __U, __m512i __A, const int __imm)
|
||||
_mm512_maskz_srai_epi16 (__mmask32 __U, __m512i __A, const unsigned int __imm)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
|
||||
(__v32hi)
|
||||
|
@ -2795,9 +2832,7 @@ _mm512_bsrli_epi128 (__m512i __A, const int __N)
|
|||
return (__m512i) __builtin_ia32_psrldq512 (__A, __N * 8);
|
||||
}
|
||||
#else
|
||||
#define _kshiftli_mask32(X, Y) ((__mmask32) __builtin_ia32_kshiftlisi ((__mmask32)(X), (__mmask8)(Y)))
|
||||
#define _kshiftli_mask64(X, Y) ((__mmask64) __builtin_ia32_kshiftlidi ((__mmask64)(X), (__mmask8)(Y)))
|
||||
#define _kshiftri_mask32(X, Y) ((__mmask32) __builtin_ia32_kshiftrisi ((__mmask32)(X), (__mmask8)(Y)))
|
||||
#define _kshiftri_mask64(X, Y) ((__mmask64) __builtin_ia32_kshiftridi ((__mmask64)(X), (__mmask8)(Y)))
|
||||
#define _mm512_alignr_epi8(X, Y, N) ((__m512i) __builtin_ia32_palignr512 ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)((N) * 8)))
|
||||
#define _mm512_mask_alignr_epi8(W, U, X, Y, N) ((__m512i) __builtin_ia32_palignr512_mask ((__v8di)(__m512i)(X), (__v8di)(__m512i)(Y), (int)((N) * 8), (__v8di)(__m512i)(W), (__mmask64)(U)))
|
||||
|
@ -2805,21 +2840,21 @@ _mm512_bsrli_epi128 (__m512i __A, const int __N)
|
|||
#define _mm512_dbsad_epu8(X, Y, C) ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), (__v64qi)(__m512i) (Y), (int) (C), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)-1))
|
||||
#define _mm512_mask_dbsad_epu8(W, U, X, Y, C) ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), (__v64qi)(__m512i) (Y), (int) (C), (__v32hi)(__m512i)(W), (__mmask32)(U)))
|
||||
#define _mm512_maskz_dbsad_epu8(U, X, Y, C) ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), (__v64qi)(__m512i) (Y), (int) (C), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)(U)))
|
||||
#define _mm512_srli_epi16(A, B) ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1))
|
||||
#define _mm512_mask_srli_epi16(W, U, A, B) ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))
|
||||
#define _mm512_srli_epi16(A, B) ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), (unsigned int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1))
|
||||
#define _mm512_mask_srli_epi16(W, U, A, B) ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), (unsigned int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))
|
||||
#define _mm512_maskz_srli_epi16(U, A, B) ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U)))
|
||||
#define _mm512_slli_epi16(X, C) ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C), (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)-1))
|
||||
#define _mm512_mask_slli_epi16(W, U, X, C) ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C), (__v32hi)(__m512i)(W), (__mmask32)(U)))
|
||||
#define _mm512_maskz_slli_epi16(U, X, C) ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C), (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(U)))
|
||||
#define _mm512_slli_epi16(X, C) ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (unsigned int)(C), (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)-1))
|
||||
#define _mm512_mask_slli_epi16(W, U, X, C) ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (unsigned int)(C), (__v32hi)(__m512i)(W), (__mmask32)(U)))
|
||||
#define _mm512_maskz_slli_epi16(U, X, C) ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (unsigned int)(C), (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(U)))
|
||||
#define _mm512_shufflehi_epi16(A, B) ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)-1))
|
||||
#define _mm512_mask_shufflehi_epi16(W, U, A, B) ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))
|
||||
#define _mm512_maskz_shufflehi_epi16(U, A, B) ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)(U)))
|
||||
#define _mm512_shufflelo_epi16(A, B) ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)-1))
|
||||
#define _mm512_mask_shufflelo_epi16(W, U, A, B) ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))
|
||||
#define _mm512_maskz_shufflelo_epi16(U, A, B) ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i) _mm512_setzero_si512 (), (__mmask32)(U)))
|
||||
#define _mm512_srai_epi16(A, B) ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1))
|
||||
#define _mm512_mask_srai_epi16(W, U, A, B) ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))
|
||||
#define _mm512_maskz_srai_epi16(U, A, B) ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U)))
|
||||
#define _mm512_srai_epi16(A, B) ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), (unsigned int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1))
|
||||
#define _mm512_mask_srai_epi16(W, U, A, B) ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), (unsigned int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))
|
||||
#define _mm512_maskz_srai_epi16(U, A, B) ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), (unsigned int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U)))
|
||||
#define _mm512_mask_blend_epi16(__U, __A, __W) ((__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) (__A), (__v32hi) (__W), (__mmask32) (__U)))
|
||||
#define _mm512_mask_blend_epi8(__U, __A, __W) ((__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) (__A), (__v64qi) (__W), (__mmask64) (__U)))
|
||||
#define _mm512_cmp_epi16_mask(X, Y, P) ((__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi)(__m512i)(X), (__v32hi)(__m512i)(Y), (int)(P), (__mmask32)(-1)))
|
||||
|
@ -2833,8 +2868,8 @@ _mm512_bsrli_epi128 (__m512i __A, const int __N)
|
|||
#define _mm512_bslli_epi128(A, N) ((__m512i)__builtin_ia32_pslldq512 ((__m512i)(A), (int)(N) * 8))
|
||||
#define _mm512_bsrli_epi128(A, N) ((__m512i)__builtin_ia32_psrldq512 ((__m512i)(A), (int)(N) * 8))
|
||||
#endif
|
||||
#ifdef __DISABLE_AVX512BW__
|
||||
#undef __DISABLE_AVX512BW__
|
||||
#ifdef __DISABLE_AVX512BW_512__
|
||||
#undef __DISABLE_AVX512BW_512__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
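A minimal usage sketch, not part of the diff, for the 64-bit opmask helpers added in the hunks above (`_cvtu64_mask64`, `_knot_mask64`, `_kor_mask64`, `_kortestc_mask64_u8`, `_cvtmask64_u64`, `_mm512_maskz_mov_epi8`). The file name and variable names are illustrative; `_mm512_set1_epi8` is a standard AVX-512 intrinsic not shown in this hunk. It assumes a compiler and CPU with AVX-512BW enabled, e.g. `cc -O2 -mavx512bw example.c`:
/* Usage sketch (not part of the diff). */
#include <immintrin.h>
#include <stdio.h>
int main (void)
{
  __mmask64 lo = _cvtu64_mask64 (0x00000000FFFFFFFFull); /* low 32 bits set  */
  __mmask64 hi = _knot_mask64 (lo);                       /* high 32 bits set */
  __mmask64 all = _kor_mask64 (lo, hi);                   /* every bit set    */
  unsigned char or_is_ones = _kortestc_mask64_u8 (lo, hi); /* CF = 1 here     */
  __m512i bytes = _mm512_set1_epi8 (0x7f);
  __m512i low_half = _mm512_maskz_mov_epi8 (lo, bytes);   /* zero top lanes   */
  printf ("or_is_ones=%u all=%016llx\n", or_is_ones,
          (unsigned long long) _cvtmask64_u64 (all));
  (void) low_half;
  return 0;
}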
2
third_party/intel/avx512cdintrin.internal.h
vendored
@@ -6,7 +6,7 @@
#define _AVX512CDINTRIN_H_INCLUDED
#ifndef __AVX512CD__
#pragma GCC push_options
#pragma GCC target("avx512cd")
#pragma GCC target("avx512cd,evex512")
#define __DISABLE_AVX512CD__
#endif
typedef long long __v8di __attribute__ ((__vector_size__ (64)));
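The target pragma above now also requests `evex512`, following the GCC 14 split between 256-bit-only and 512-bit EVEX code generation. A sketch, not part of the diff, of how a consumer might guard a 512-bit AVX512CD call under that split; `_mm512_lzcnt_epi32` is a standard AVX512CD intrinsic not shown in this hunk, and the assumption is that GCC 14 defines `__EVEX512__` whenever 512-bit EVEX code is allowed:
#include <immintrin.h>
#if defined (__AVX512CD__) && defined (__EVEX512__)
/* Only compiled when 512-bit AVX512CD code generation is available,
   i.e. not under -mno-evex512. */
static inline __m512i
lzcnt_per_lane_512 (__m512i v)
{
  return _mm512_lzcnt_epi32 (v);
}
#endif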
644
third_party/intel/avx512dqintrin.internal.h
vendored
@@ -4,9 +4,9 @@
#endif
#ifndef _AVX512DQINTRIN_H_INCLUDED
#define _AVX512DQINTRIN_H_INCLUDED
#ifndef __AVX512DQ__
#if !defined (__AVX512DQ__) || defined (__EVEX512__)
#pragma GCC push_options
#pragma GCC target("avx512dq")
#pragma GCC target("avx512dq,no-evex512")
#define __DISABLE_AVX512DQ__
#endif
extern __inline unsigned char
@ -138,6 +138,330 @@ _kandn_mask8 (__mmask8 __A, __mmask8 __B)
|
|||
{
|
||||
return (__mmask8) __builtin_ia32_kandnqi ((__mmask8) __A, (__mmask8) __B);
|
||||
}
|
||||
#ifdef __OPTIMIZE__
|
||||
extern __inline __mmask8
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kshiftli_mask8 (__mmask8 __A, unsigned int __B)
|
||||
{
|
||||
return (__mmask8) __builtin_ia32_kshiftliqi ((__mmask8) __A, (__mmask8) __B);
|
||||
}
|
||||
extern __inline __mmask8
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kshiftri_mask8 (__mmask8 __A, unsigned int __B)
|
||||
{
|
||||
return (__mmask8) __builtin_ia32_kshiftriqi ((__mmask8) __A, (__mmask8) __B);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_sd (__m128d __A, __m128d __B, int __C)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df) _mm_avx512_setzero_pd (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_round_sd (__m128d __A, __m128d __B, int __C, const int __R)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df)
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) -1, __R);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_reduce_sd (__m128d __W, __mmask8 __U, __m128d __A,
|
||||
__m128d __B, int __C)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_reduce_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
|
||||
__m128d __B, int __C, const int __R)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df) __W,
|
||||
__U, __R);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_reduce_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df) _mm_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_reduce_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
|
||||
int __C, const int __R)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df)
|
||||
_mm_avx512_setzero_pd (),
|
||||
__U, __R);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_ss (__m128 __A, __m128 __B, int __C)
|
||||
{
|
||||
return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf) _mm_avx512_setzero_ps (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_round_ss (__m128 __A, __m128 __B, int __C, const int __R)
|
||||
{
|
||||
return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf)
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) -1, __R);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_reduce_ss (__m128 __W, __mmask8 __U, __m128 __A,
|
||||
__m128 __B, int __C)
|
||||
{
|
||||
return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_reduce_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
|
||||
__m128 __B, int __C, const int __R)
|
||||
{
|
||||
return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf) __W,
|
||||
__U, __R);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_reduce_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C)
|
||||
{
|
||||
return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf) _mm_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_reduce_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
|
||||
int __C, const int __R)
|
||||
{
|
||||
return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf)
|
||||
_mm_avx512_setzero_ps (),
|
||||
__U, __R);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_range_sd (__m128d __A, __m128d __B, int __C)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df)
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) -1,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_range_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, int __C)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df) __W,
|
||||
(__mmask8) __U,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_range_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df)
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) __U,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_range_ss (__m128 __A, __m128 __B, int __C)
|
||||
{
|
||||
return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf)
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) -1,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_range_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, int __C)
|
||||
{
|
||||
return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf) __W,
|
||||
(__mmask8) __U,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_range_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C)
|
||||
{
|
||||
return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf)
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) __U,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_range_round_sd (__m128d __A, __m128d __B, int __C, const int __R)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df)
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) -1, __R);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_range_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
|
||||
int __C, const int __R)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df) __W,
|
||||
(__mmask8) __U, __R);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_range_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C,
|
||||
const int __R)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df)
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) __U, __R);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_range_round_ss (__m128 __A, __m128 __B, int __C, const int __R)
|
||||
{
|
||||
return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf)
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) -1, __R);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_range_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
|
||||
int __C, const int __R)
|
||||
{
|
||||
return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf) __W,
|
||||
(__mmask8) __U, __R);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_range_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C,
|
||||
const int __R)
|
||||
{
|
||||
return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf)
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) __U, __R);
|
||||
}
|
||||
extern __inline __mmask8
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_fpclass_ss_mask (__m128 __A, const int __imm)
|
||||
{
|
||||
return (__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) __A, __imm,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __mmask8
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_fpclass_sd_mask (__m128d __A, const int __imm)
|
||||
{
|
||||
return (__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) __A, __imm,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __mmask8
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_fpclass_ss_mask (__mmask8 __U, __m128 __A, const int __imm)
|
||||
{
|
||||
return (__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) __A, __imm, __U);
|
||||
}
|
||||
extern __inline __mmask8
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_fpclass_sd_mask (__mmask8 __U, __m128d __A, const int __imm)
|
||||
{
|
||||
return (__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) __A, __imm, __U);
|
||||
}
|
||||
#else
|
||||
#define _kshiftli_mask8(X, Y) ((__mmask8) __builtin_ia32_kshiftliqi ((__mmask8)(X), (__mmask8)(Y)))
|
||||
#define _kshiftri_mask8(X, Y) ((__mmask8) __builtin_ia32_kshiftriqi ((__mmask8)(X), (__mmask8)(Y)))
|
||||
#define _mm_range_sd(A, B, C) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION))
|
||||
#define _mm_mask_range_sd(W, U, A, B, C) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
|
||||
#define _mm_maskz_range_sd(U, A, B, C) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
|
||||
#define _mm_range_ss(A, B, C) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION))
|
||||
#define _mm_mask_range_ss(W, U, A, B, C) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
|
||||
#define _mm_maskz_range_ss(U, A, B, C) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
|
||||
#define _mm_range_round_sd(A, B, C, R) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), (__mmask8) -1, (R)))
|
||||
#define _mm_mask_range_round_sd(W, U, A, B, C, R) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U), (R)))
|
||||
#define _mm_maskz_range_round_sd(U, A, B, C, R) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), (__mmask8)(U), (R)))
|
||||
#define _mm_range_round_ss(A, B, C, R) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (), (__mmask8) -1, (R)))
|
||||
#define _mm_mask_range_round_ss(W, U, A, B, C, R) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U), (R)))
|
||||
#define _mm_maskz_range_round_ss(U, A, B, C, R) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (), (__mmask8)(U), (R)))
|
||||
#define _mm_fpclass_ss_mask(X, C) ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X), (int) (C), (__mmask8) (-1)))
|
||||
#define _mm_fpclass_sd_mask(X, C) ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X), (int) (C), (__mmask8) (-1)))
|
||||
#define _mm_mask_fpclass_ss_mask(X, C, U) ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X), (int) (C), (__mmask8) (U)))
|
||||
#define _mm_mask_fpclass_sd_mask(X, C, U) ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X), (int) (C), (__mmask8) (U)))
|
||||
#define _mm_reduce_sd(A, B, C) ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), (__mmask8)-1))
|
||||
#define _mm_mask_reduce_sd(W, U, A, B, C) ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U)))
|
||||
#define _mm_maskz_reduce_sd(U, A, B, C) ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), (__mmask8)(U)))
|
||||
#define _mm_reduce_round_sd(A, B, C, R) ((__m128d) __builtin_ia32_reducesd_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__mmask8)(U), (int)(R)))
|
||||
#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))
|
||||
#define _mm_maskz_reduce_round_sd(U, A, B, C, R) ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), (__mmask8)(U), (int)(R)))
|
||||
#define _mm_reduce_ss(A, B, C) ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (), (__mmask8)-1))
|
||||
#define _mm_mask_reduce_ss(W, U, A, B, C) ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U)))
|
||||
#define _mm_maskz_reduce_ss(U, A, B, C) ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (), (__mmask8)(U)))
|
||||
#define _mm_reduce_round_ss(A, B, C, R) ((__m128) __builtin_ia32_reducess_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__mmask8)(U), (int)(R)))
|
||||
#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))
|
||||
#define _mm_maskz_reduce_round_ss(U, A, B, C, R) ((__m128) __builtin_ia32_reducesd_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (), (__mmask8)(U), (int)(R)))
|
||||
#endif
|
||||
#ifdef __DISABLE_AVX512DQ__
|
||||
#undef __DISABLE_AVX512DQ__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#if !defined (__AVX512DQ__) || !defined (__EVEX512__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512dq,evex512")
|
||||
#define __DISABLE_AVX512DQ_512__
|
||||
#endif
|
||||
extern __inline __m512d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_broadcast_f64x2 (__m128d __A)
|
||||
|
@ -939,18 +1263,6 @@ _mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A)
|
|||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
#ifdef __OPTIMIZE__
|
||||
extern __inline __mmask8
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kshiftli_mask8 (__mmask8 __A, unsigned int __B)
|
||||
{
|
||||
return (__mmask8) __builtin_ia32_kshiftliqi ((__mmask8) __A, (__mmask8) __B);
|
||||
}
|
||||
extern __inline __mmask8
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_kshiftri_mask8 (__mmask8 __A, unsigned int __B)
|
||||
{
|
||||
return (__mmask8) __builtin_ia32_kshiftriqi ((__mmask8) __A, (__mmask8) __B);
|
||||
}
|
||||
extern __inline __m512d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_range_pd (__m512d __A, __m512d __B, int __C)
|
||||
|
@ -1017,276 +1329,6 @@ _mm512_maskz_range_ps (__mmask16 __U, __m512 __A, __m512 __B, int __C)
|
|||
(__mmask16) __U,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_sd (__m128d __A, __m128d __B, int __C)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df) _mm_setzero_pd (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_round_sd (__m128d __A, __m128d __B, int __C, const int __R)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
(__mmask8) -1, __R);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_reduce_sd (__m128d __W, __mmask8 __U, __m128d __A,
|
||||
__m128d __B, int __C)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_reduce_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
|
||||
__m128d __B, int __C, const int __R)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df) __W,
|
||||
__U, __R);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_reduce_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df) _mm_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_reduce_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
|
||||
int __C, const int __R)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
__U, __R);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_ss (__m128 __A, __m128 __B, int __C)
|
||||
{
|
||||
return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf) _mm_setzero_ps (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_reduce_round_ss (__m128 __A, __m128 __B, int __C, const int __R)
|
||||
{
|
||||
return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
(__mmask8) -1, __R);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_reduce_ss (__m128 __W, __mmask8 __U, __m128 __A,
|
||||
__m128 __B, int __C)
|
||||
{
|
||||
return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf) __W,
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_reduce_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
|
||||
__m128 __B, int __C, const int __R)
|
||||
{
|
||||
return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf) __W,
|
||||
__U, __R);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_reduce_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C)
|
||||
{
|
||||
return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf) _mm_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_reduce_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
|
||||
int __C, const int __R)
|
||||
{
|
||||
return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
__U, __R);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_range_sd (__m128d __A, __m128d __B, int __C)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
(__mmask8) -1,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_range_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, int __C)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df) __W,
|
||||
(__mmask8) __U,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_range_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
(__mmask8) __U,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_range_ss (__m128 __A, __m128 __B, int __C)
|
||||
{
|
||||
return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
(__mmask8) -1,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_range_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, int __C)
|
||||
{
|
||||
return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf) __W,
|
||||
(__mmask8) __U,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_range_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C)
|
||||
{
|
||||
return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
(__mmask8) __U,
|
||||
_MM_FROUND_CUR_DIRECTION);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_range_round_sd (__m128d __A, __m128d __B, int __C, const int __R)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
(__mmask8) -1, __R);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_range_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
|
||||
int __C, const int __R)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df) __W,
|
||||
(__mmask8) __U, __R);
|
||||
}
|
||||
extern __inline __m128d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_range_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C,
|
||||
const int __R)
|
||||
{
|
||||
return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
(__mmask8) __U, __R);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_range_round_ss (__m128 __A, __m128 __B, int __C, const int __R)
|
||||
{
|
||||
return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
(__mmask8) -1, __R);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_range_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
|
||||
int __C, const int __R)
|
||||
{
|
||||
return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf) __W,
|
||||
(__mmask8) __U, __R);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_range_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C,
|
||||
const int __R)
|
||||
{
|
||||
return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
(__mmask8) __U, __R);
|
||||
}
|
||||
extern __inline __mmask8
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_fpclass_ss_mask (__m128 __A, const int __imm)
|
||||
{
|
||||
return (__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) __A, __imm,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __mmask8
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_fpclass_sd_mask (__m128d __A, const int __imm)
|
||||
{
|
||||
return (__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) __A, __imm,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __mmask8
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_fpclass_ss_mask (__mmask8 __U, __m128 __A, const int __imm)
|
||||
{
|
||||
return (__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) __A, __imm, __U);
|
||||
}
|
||||
extern __inline __mmask8
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_fpclass_sd_mask (__mmask8 __U, __m128d __A, const int __imm)
|
||||
{
|
||||
return (__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) __A, __imm, __U);
|
||||
}
|
||||
extern __inline __m512i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_cvtt_roundpd_epi64 (__m512d __A, const int __R)
|
||||
|
@ -2145,20 +2187,6 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
|
|||
(__mmask16) -1);
|
||||
}
|
||||
#else
|
||||
#define _kshiftli_mask8(X, Y) ((__mmask8) __builtin_ia32_kshiftliqi ((__mmask8)(X), (__mmask8)(Y)))
|
||||
#define _kshiftri_mask8(X, Y) ((__mmask8) __builtin_ia32_kshiftriqi ((__mmask8)(X), (__mmask8)(Y)))
|
||||
#define _mm_range_sd(A, B, C) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION))
|
||||
#define _mm_mask_range_sd(W, U, A, B, C) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
|
||||
#define _mm_maskz_range_sd(U, A, B, C) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
|
||||
#define _mm_range_ss(A, B, C) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION))
|
||||
#define _mm_mask_range_ss(W, U, A, B, C) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
|
||||
#define _mm_maskz_range_ss(U, A, B, C) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
|
||||
#define _mm_range_round_sd(A, B, C, R) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8) -1, (R)))
|
||||
#define _mm_mask_range_round_sd(W, U, A, B, C, R) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U), (R)))
|
||||
#define _mm_maskz_range_round_sd(U, A, B, C, R) ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)(U), (R)))
|
||||
#define _mm_range_round_ss(A, B, C, R) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8) -1, (R)))
|
||||
#define _mm_mask_range_round_ss(W, U, A, B, C, R) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U), (R)))
|
||||
#define _mm_maskz_range_round_ss(U, A, B, C, R) ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)(U), (R)))
|
||||
#define _mm512_cvtt_roundpd_epi64(A, B) ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di) _mm512_setzero_si512 (), -1, (B)))
|
||||
#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, B) ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di)(W), (U), (B)))
|
||||
#define _mm512_maskz_cvtt_roundpd_epi64(U, A, B) ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
|
||||
|
@ -2243,29 +2271,13 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm)
|
|||
#define _mm512_inserti32x8(X, Y, C) ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), (__v8si)(__m256i) (Y), (int) (C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)-1))
|
||||
#define _mm512_mask_inserti32x8(W, U, X, Y, C) ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), (__v8si)(__m256i) (Y), (int) (C), (__v16si)(__m512i)(W), (__mmask16)(U)))
|
||||
#define _mm512_maskz_inserti32x8(U, X, Y, C) ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), (__v8si)(__m256i) (Y), (int) (C), (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(U)))
|
||||
#define _mm_fpclass_ss_mask(X, C) ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X), (int) (C), (__mmask8) (-1)))
|
||||
#define _mm_fpclass_sd_mask(X, C) ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X), (int) (C), (__mmask8) (-1)))
|
||||
#define _mm_mask_fpclass_ss_mask(X, C, U) ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X), (int) (C), (__mmask8) (U)))
|
||||
#define _mm_mask_fpclass_sd_mask(X, C, U) ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X), (int) (C), (__mmask8) (U)))
|
||||
#define _mm512_mask_fpclass_pd_mask(u, X, C) ((__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) (__m512d) (X), (int) (C), (__mmask8)(u)))
|
||||
#define _mm512_mask_fpclass_ps_mask(u, x, c) ((__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) (__m512) (x), (int) (c),(__mmask16)(u)))
|
||||
#define _mm512_fpclass_pd_mask(X, C) ((__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) (__m512d) (X), (int) (C), (__mmask8)-1))
|
||||
#define _mm512_fpclass_ps_mask(x, c) ((__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) (__m512) (x), (int) (c),(__mmask16)-1))
|
||||
#define _mm_reduce_sd(A, B, C) ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)-1))
|
||||
#define _mm_mask_reduce_sd(W, U, A, B, C) ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U)))
|
||||
#define _mm_maskz_reduce_sd(U, A, B, C) ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)(U)))
|
||||
#define _mm_reduce_round_sd(A, B, C, R) ((__m128d) __builtin_ia32_reducesd_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__mmask8)(U), (int)(R)))
|
||||
#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)))
|
||||
#define _mm_maskz_reduce_round_sd(U, A, B, C, R) ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), (__mmask8)(U), (int)(R)))
|
||||
#define _mm_reduce_ss(A, B, C) ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)-1))
|
||||
#define _mm_mask_reduce_ss(W, U, A, B, C) ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U)))
|
||||
#define _mm_maskz_reduce_ss(U, A, B, C) ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)(U)))
|
||||
#define _mm_reduce_round_ss(A, B, C, R) ((__m128) __builtin_ia32_reducess_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__mmask8)(U), (int)(R)))
|
||||
#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)))
|
||||
#define _mm_maskz_reduce_round_ss(U, A, B, C, R) ((__m128) __builtin_ia32_reducesd_mask_round ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), (__mmask8)(U), (int)(R)))
|
||||
#endif
#ifdef __DISABLE_AVX512DQ__
#undef __DISABLE_AVX512DQ__
#ifdef __DISABLE_AVX512DQ_512__
#undef __DISABLE_AVX512DQ_512__
#pragma GCC pop_options
#endif
#endif
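A usage sketch, not part of the diff, for the scalar AVX-512DQ intrinsics declared in this file (`_mm_range_sd`, `_mm_reduce_sd`, `_mm_fpclass_sd_mask`). The immediates follow the VRANGESD/VREDUCESD/VFPCLASSSD encodings as I understand them; `_mm_set_sd` and `_mm_cvtsd_f64` are standard SSE2 intrinsics not shown in the diff. Assumes e.g. `cc -O2 -mavx512dq`:
#include <immintrin.h>
#include <stdio.h>
int main (void)
{
  __m128d a = _mm_set_sd (7.75);
  __m128d b = _mm_set_sd (-2.0);
  __m128d smaller  = _mm_range_sd (a, b, 0x04);      /* min, keep selected sign */
  __m128d fraction = _mm_reduce_sd (a, a, 0x01);     /* M=0, round down:
                                                        a - floor(a) = 0.75    */
  __mmask8 is_neg  = _mm_fpclass_sd_mask (b, 0x40);  /* bit 6: negative finite */
  printf ("min=%g frac=%g neg=%u\n",
          _mm_cvtsd_f64 (smaller), _mm_cvtsd_f64 (fraction),
          (unsigned) (is_neg & 1));
  return 0;
}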
20
third_party/intel/avx512erintrin.internal.h
vendored
@@ -6,7 +6,7 @@
#define _AVX512ERINTRIN_H_INCLUDED
#ifndef __AVX512ER__
#pragma GCC push_options
#pragma GCC target("avx512er")
#pragma GCC target("avx512er,evex512")
#define __DISABLE_AVX512ER__
#endif
typedef double __v8df __attribute__ ((__vector_size__ (64)));
@@ -20,9 +20,8 @@ extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_exp2a23_round_pd (__m512d __A, int __R)
{
__m512d __W;
return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A,
(__v8df) __W,
(__v8df) _mm512_undefined_pd (),
(__mmask8) -1, __R);
}
extern __inline __m512d
@@ -45,9 +44,8 @@ extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_exp2a23_round_ps (__m512 __A, int __R)
{
__m512 __W;
return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A,
(__v16sf) __W,
(__v16sf) _mm512_undefined_ps (),
(__mmask16) -1, __R);
}
extern __inline __m512
@@ -70,9 +68,8 @@ extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rcp28_round_pd (__m512d __A, int __R)
{
__m512d __W;
return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A,
(__v8df) __W,
(__v8df) _mm512_undefined_pd (),
(__mmask8) -1, __R);
}
extern __inline __m512d
@@ -95,9 +92,8 @@ extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rcp28_round_ps (__m512 __A, int __R)
{
__m512 __W;
return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A,
(__v16sf) __W,
(__v16sf) _mm512_undefined_ps (),
(__mmask16) -1, __R);
}
extern __inline __m512
@@ -180,9 +176,8 @@ extern __inline __m512d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rsqrt28_round_pd (__m512d __A, int __R)
{
__m512d __W;
return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A,
(__v8df) __W,
(__v8df) _mm512_undefined_pd (),
(__mmask8) -1, __R);
}
extern __inline __m512d
@@ -205,9 +200,8 @@ extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rsqrt28_round_ps (__m512 __A, int __R)
{
__m512 __W;
return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A,
(__v16sf) __W,
(__v16sf) _mm512_undefined_ps (),
(__mmask16) -1, __R);
}
extern __inline __m512
5797
third_party/intel/avx512fintrin.internal.h
vendored
File diff suppressed because it is too large
4345
third_party/intel/avx512fp16intrin.internal.h
vendored
File diff suppressed because it is too large
194
third_party/intel/avx512fp16vlintrin.internal.h
vendored
@@ -4,11 +4,33 @@
 #endif
 #ifndef __AVX512FP16VLINTRIN_H_INCLUDED
 #define __AVX512FP16VLINTRIN_H_INCLUDED
-#if !defined(__AVX512VL__) || !defined(__AVX512FP16__)
+#if !defined(__AVX512VL__) || !defined(__AVX512FP16__) || defined (__EVEX512__)
 #pragma GCC push_options
-#pragma GCC target("avx512fp16,avx512vl")
+#pragma GCC target("avx512fp16,avx512vl,no-evex512")
 #define __DISABLE_AVX512FP16VL__
 #endif
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_set1_ps (float __F)
+{
+  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
+}
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_set1_ps (float __A)
+{
+  return __extension__ (__m256){ __A, __A, __A, __A,
+                                 __A, __A, __A, __A };
+}
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avx512_and_si128 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v2du)__A & (__v2du)__B);
+}
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avx512_and_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v4du)__A & (__v4du)__B);
+}
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_castph_ps (__m128h __a)
@@ -87,10 +109,10 @@ _mm256_castph256_ph128 (__m256h __A)
 {
   union
   {
-    __m128h a[2];
-    __m256h v;
-  } u = { .v = __A };
-  return u.a[0];
+    __m128h __a[2];
+    __m256h __v;
+  } __u = { .__v = __A };
+  return __u.__a[0];
 }
 extern __inline __m256h
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -98,24 +120,24 @@ _mm256_castph128_ph256 (__m128h __A)
 {
   union
   {
-    __m128h a[2];
-    __m256h v;
-  } u;
-  u.a[0] = __A;
-  return u.v;
+    __m128h __a[2];
+    __m256h __v;
+  } __u;
+  __u.__a[0] = __A;
+  return __u.__v;
 }
 extern __inline __m256h
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_zextph128_ph256 (__m128h __A)
 {
-  return (__m256h) _mm256_insertf128_ps (_mm256_setzero_ps (),
+  return (__m256h) _mm256_avx512_insertf128_ps (_mm256_avx512_setzero_ps (),
                                          (__m128) __A, 0);
 }
 extern __inline __m256h
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_conj_pch (__m256h __A)
 {
-  return (__m256h) _mm256_xor_epi32 ((__m256i) __A, _mm256_set1_epi32 (1<<31));
+  return (__m256h) _mm256_xor_epi32 ((__m256i) __A, _mm256_avx512_set1_epi32 (1<<31));
 }
 extern __inline __m256h
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
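The two cast hunks above change nothing but the identifiers: the union members `a`/`v` and the local `u` move into the reserved `__`-prefixed namespace so the header cannot collide with user macros of those names. A standalone sketch of the same zero-cost cast, assuming the AVX512-FP16 vector types are available (the function name is illustrative):

#include <immintrin.h>
/* Reinterpret the low 128 bits of a 256-bit _Float16 vector; the union
   punning compiles to no instructions. */
static inline __m128h low_half_ph (__m256h __x)
{
  union { __m128h __half[2]; __m256h __whole; } __u = { .__whole = __x };
  return __u.__half[0];
}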
@ -133,14 +155,14 @@ _mm256_maskz_conj_pch (__mmask8 __U, __m256h __A)
|
|||
return (__m256h) __builtin_ia32_movaps256_mask ((__v8sf)
|
||||
_mm256_conj_pch (__A),
|
||||
(__v8sf)
|
||||
_mm256_setzero_ps (),
|
||||
_mm256_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128h
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_conj_pch (__m128h __A)
|
||||
{
|
||||
return (__m128h) _mm_xor_epi32 ((__m128i) __A, _mm_set1_epi32 (1<<31));
|
||||
return (__m128h) _mm_xor_epi32 ((__m128i) __A, _mm_avx512_set1_epi32 (1<<31));
|
||||
}
|
||||
extern __inline __m128h
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -155,7 +177,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_conj_pch (__mmask8 __U, __m128h __A)
|
||||
{
|
||||
return (__m128h) __builtin_ia32_movaps128_mask ((__v4sf) _mm_conj_pch (__A),
|
||||
(__v4sf) _mm_setzero_ps (),
|
||||
(__v4sf) _mm_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128h
|
||||
|
@ -398,15 +420,15 @@ extern __inline __m128h
|
|||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_abs_ph (__m128h __A)
|
||||
{
|
||||
return (__m128h) _mm_and_si128 ( _mm_set1_epi32 (0x7FFF7FFF),
|
||||
(__m128i) __A);
|
||||
return (__m128h) _mm_avx512_and_si128 (_mm_avx512_set1_epi32 (0x7FFF7FFF),
|
||||
(__m128i) __A);
|
||||
}
|
||||
extern __inline __m256h
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_abs_ph (__m256h __A)
|
||||
{
|
||||
return (__m256h) _mm256_and_si256 ( _mm256_set1_epi32 (0x7FFF7FFF),
|
||||
(__m256i) __A);
|
||||
return (__m256h) _mm256_avx512_and_si256 (_mm256_avx512_set1_epi32 (0x7FFF7FFF),
|
||||
(__m256i) __A);
|
||||
}
|
||||
#ifdef __OPTIMIZE
|
||||
extern __inline __mmask8
|
||||
|
@ -884,7 +906,7 @@ _mm_cvtph_epi32 (__m128h __A)
|
|||
return (__m128i)
|
||||
__builtin_ia32_vcvtph2dq128_mask (__A,
|
||||
(__v4si)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -900,7 +922,7 @@ _mm_maskz_cvtph_epi32 (__mmask8 __A, __m128h __B)
|
|||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vcvtph2dq128_mask (__B,
|
||||
(__v4si) _mm_setzero_si128 (),
|
||||
(__v4si) _mm_avx512_setzero_si128 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -910,7 +932,7 @@ _mm256_cvtph_epi32 (__m128h __A)
|
|||
return (__m256i)
|
||||
__builtin_ia32_vcvtph2dq256_mask (__A,
|
||||
(__v8si)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -927,7 +949,7 @@ _mm256_maskz_cvtph_epi32 (__mmask8 __A, __m128h __B)
|
|||
return (__m256i)
|
||||
__builtin_ia32_vcvtph2dq256_mask (__B,
|
||||
(__v8si)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -937,7 +959,7 @@ _mm_cvtph_epu32 (__m128h __A)
|
|||
return (__m128i)
|
||||
__builtin_ia32_vcvtph2udq128_mask (__A,
|
||||
(__v4si)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -954,7 +976,7 @@ _mm_maskz_cvtph_epu32 (__mmask8 __A, __m128h __B)
|
|||
return (__m128i)
|
||||
__builtin_ia32_vcvtph2udq128_mask (__B,
|
||||
(__v4si)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -964,7 +986,7 @@ _mm256_cvtph_epu32 (__m128h __A)
|
|||
return (__m256i)
|
||||
__builtin_ia32_vcvtph2udq256_mask (__A,
|
||||
(__v8si)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -980,7 +1002,7 @@ _mm256_maskz_cvtph_epu32 (__mmask8 __A, __m128h __B)
|
|||
{
|
||||
return (__m256i)
|
||||
__builtin_ia32_vcvtph2udq256_mask (__B,
|
||||
(__v8si) _mm256_setzero_si256 (),
|
||||
(__v8si) _mm256_avx512_setzero_si256 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -989,7 +1011,7 @@ _mm_cvttph_epi32 (__m128h __A)
|
|||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vcvttph2dq128_mask (__A,
|
||||
(__v4si) _mm_setzero_si128 (),
|
||||
(__v4si) _mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1006,7 +1028,7 @@ _mm_maskz_cvttph_epi32 (__mmask8 __A, __m128h __B)
|
|||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vcvttph2dq128_mask (__B,
|
||||
(__v4si) _mm_setzero_si128 (),
|
||||
(__v4si) _mm_avx512_setzero_si128 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1016,7 +1038,7 @@ _mm256_cvttph_epi32 (__m128h __A)
|
|||
return (__m256i)
|
||||
__builtin_ia32_vcvttph2dq256_mask (__A,
|
||||
(__v8si)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1035,7 +1057,7 @@ _mm256_maskz_cvttph_epi32 (__mmask8 __A, __m128h __B)
|
|||
return (__m256i)
|
||||
__builtin_ia32_vcvttph2dq256_mask (__B,
|
||||
(__v8si)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1045,7 +1067,7 @@ _mm_cvttph_epu32 (__m128h __A)
|
|||
return (__m128i)
|
||||
__builtin_ia32_vcvttph2udq128_mask (__A,
|
||||
(__v4si)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1064,7 +1086,7 @@ _mm_maskz_cvttph_epu32 (__mmask8 __A, __m128h __B)
|
|||
return (__m128i)
|
||||
__builtin_ia32_vcvttph2udq128_mask (__B,
|
||||
(__v4si)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1074,7 +1096,7 @@ _mm256_cvttph_epu32 (__m128h __A)
|
|||
return (__m256i)
|
||||
__builtin_ia32_vcvttph2udq256_mask (__A,
|
||||
(__v8si)
|
||||
_mm256_setzero_si256 (), (__mmask8) -1);
|
||||
_mm256_avx512_setzero_si256 (), (__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -1092,7 +1114,7 @@ _mm256_maskz_cvttph_epu32 (__mmask8 __A, __m128h __B)
|
|||
return (__m256i)
|
||||
__builtin_ia32_vcvttph2udq256_mask (__B,
|
||||
(__v8si)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128h
|
||||
|
@ -1191,7 +1213,7 @@ _mm_cvtph_epi64 (__m128h __A)
|
|||
{
|
||||
return
|
||||
__builtin_ia32_vcvtph2qq128_mask (__A,
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1205,7 +1227,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2qq128_mask (__B,
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1213,7 +1235,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_cvtph_epi64 (__m128h __A)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2qq256_mask (__A,
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1227,7 +1249,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2qq256_mask (__B,
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1235,7 +1257,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_cvtph_epu64 (__m128h __A)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2uqq128_mask (__A,
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1249,7 +1271,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2uqq128_mask (__B,
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1257,7 +1279,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_cvtph_epu64 (__m128h __A)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2uqq256_mask (__A,
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1271,7 +1293,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2uqq256_mask (__B,
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1279,7 +1301,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_cvttph_epi64 (__m128h __A)
|
||||
{
|
||||
return __builtin_ia32_vcvttph2qq128_mask (__A,
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1295,7 +1317,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B)
|
||||
{
|
||||
return __builtin_ia32_vcvttph2qq128_mask (__B,
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1303,7 +1325,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_cvttph_epi64 (__m128h __A)
|
||||
{
|
||||
return __builtin_ia32_vcvttph2qq256_mask (__A,
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1319,7 +1341,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B)
|
||||
{
|
||||
return __builtin_ia32_vcvttph2qq256_mask (__B,
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1327,7 +1349,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_cvttph_epu64 (__m128h __A)
|
||||
{
|
||||
return __builtin_ia32_vcvttph2uqq128_mask (__A,
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1343,7 +1365,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B)
|
||||
{
|
||||
return __builtin_ia32_vcvttph2uqq128_mask (__B,
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1351,7 +1373,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_cvttph_epu64 (__m128h __A)
|
||||
{
|
||||
return __builtin_ia32_vcvttph2uqq256_mask (__A,
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1367,7 +1389,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B)
|
||||
{
|
||||
return __builtin_ia32_vcvttph2uqq256_mask (__B,
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128h
|
||||
|
@ -1465,7 +1487,7 @@ _mm_cvtph_epi16 (__m128h __A)
|
|||
return (__m128i)
|
||||
__builtin_ia32_vcvtph2w128_mask (__A,
|
||||
(__v8hi)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1482,7 +1504,7 @@ _mm_maskz_cvtph_epi16 (__mmask8 __A, __m128h __B)
|
|||
return (__m128i)
|
||||
__builtin_ia32_vcvtph2w128_mask (__B,
|
||||
(__v8hi)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1492,7 +1514,7 @@ _mm256_cvtph_epi16 (__m256h __A)
|
|||
return (__m256i)
|
||||
__builtin_ia32_vcvtph2w256_mask (__A,
|
||||
(__v16hi)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask16) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1509,7 +1531,7 @@ _mm256_maskz_cvtph_epi16 (__mmask16 __A, __m256h __B)
|
|||
return (__m256i)
|
||||
__builtin_ia32_vcvtph2w256_mask (__B,
|
||||
(__v16hi)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1519,7 +1541,7 @@ _mm_cvtph_epu16 (__m128h __A)
|
|||
return (__m128i)
|
||||
__builtin_ia32_vcvtph2uw128_mask (__A,
|
||||
(__v8hi)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1536,7 +1558,7 @@ _mm_maskz_cvtph_epu16 (__mmask8 __A, __m128h __B)
|
|||
return (__m128i)
|
||||
__builtin_ia32_vcvtph2uw128_mask (__B,
|
||||
(__v8hi)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1546,7 +1568,7 @@ _mm256_cvtph_epu16 (__m256h __A)
|
|||
return (__m256i)
|
||||
__builtin_ia32_vcvtph2uw256_mask (__A,
|
||||
(__v16hi)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask16) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1563,7 +1585,7 @@ _mm256_maskz_cvtph_epu16 (__mmask16 __A, __m256h __B)
|
|||
return (__m256i)
|
||||
__builtin_ia32_vcvtph2uw256_mask (__B,
|
||||
(__v16hi)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1573,7 +1595,7 @@ _mm_cvttph_epi16 (__m128h __A)
|
|||
return (__m128i)
|
||||
__builtin_ia32_vcvttph2w128_mask (__A,
|
||||
(__v8hi)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1592,7 +1614,7 @@ _mm_maskz_cvttph_epi16 (__mmask8 __A, __m128h __B)
|
|||
return (__m128i)
|
||||
__builtin_ia32_vcvttph2w128_mask (__B,
|
||||
(__v8hi)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1602,7 +1624,7 @@ _mm256_cvttph_epi16 (__m256h __A)
|
|||
return (__m256i)
|
||||
__builtin_ia32_vcvttph2w256_mask (__A,
|
||||
(__v16hi)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask16) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1621,7 +1643,7 @@ _mm256_maskz_cvttph_epi16 (__mmask16 __A, __m256h __B)
|
|||
return (__m256i)
|
||||
__builtin_ia32_vcvttph2w256_mask (__B,
|
||||
(__v16hi)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1631,7 +1653,7 @@ _mm_cvttph_epu16 (__m128h __A)
|
|||
return (__m128i)
|
||||
__builtin_ia32_vcvttph2uw128_mask (__A,
|
||||
(__v8hi)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1650,7 +1672,7 @@ _mm_maskz_cvttph_epu16 (__mmask8 __A, __m128h __B)
|
|||
return (__m128i)
|
||||
__builtin_ia32_vcvttph2uw128_mask (__B,
|
||||
(__v8hi)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1660,7 +1682,7 @@ _mm256_cvttph_epu16 (__m256h __A)
|
|||
return (__m256i)
|
||||
__builtin_ia32_vcvttph2uw256_mask (__A,
|
||||
(__v16hi)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask16) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1678,7 +1700,7 @@ _mm256_maskz_cvttph_epu16 (__mmask16 __A, __m256h __B)
|
|||
{
|
||||
return (__m256i)
|
||||
__builtin_ia32_vcvttph2uw256_mask (__B,
|
||||
(__v16hi) _mm256_setzero_si256 (),
|
||||
(__v16hi) _mm256_avx512_setzero_si256 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128h
|
||||
|
@ -1778,7 +1800,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_cvtph_pd (__m128h __A)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2pd128_mask (__A,
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128d
|
||||
|
@ -1791,14 +1813,14 @@ extern __inline __m128d
|
|||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_cvtph_pd (__mmask8 __A, __m128h __B)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2pd128_mask (__B, _mm_setzero_pd (), __A);
|
||||
return __builtin_ia32_vcvtph2pd128_mask (__B, _mm_avx512_setzero_pd (), __A);
|
||||
}
|
||||
extern __inline __m256d
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_cvtph_pd (__m128h __A)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2pd256_mask (__A,
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -1812,7 +1834,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_cvtph_pd (__mmask8 __A, __m128h __B)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2pd256_mask (__B,
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -1820,7 +1842,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_cvtxph_ps (__m128h __A)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2psx128_mask (__A,
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -1833,14 +1855,14 @@ extern __inline __m128
|
|||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_maskz_cvtxph_ps (__mmask8 __A, __m128h __B)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2psx128_mask (__B, _mm_setzero_ps (), __A);
|
||||
return __builtin_ia32_vcvtph2psx128_mask (__B, _mm_avx512_setzero_ps (), __A);
|
||||
}
|
||||
extern __inline __m256
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_cvtxph_ps (__m128h __A)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2psx256_mask (__A,
|
||||
_mm256_setzero_ps (),
|
||||
_mm256_avx512_setzero_ps (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256
|
||||
|
@ -1854,7 +1876,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_cvtxph_ps (__mmask8 __A, __m128h __B)
|
||||
{
|
||||
return __builtin_ia32_vcvtph2psx256_mask (__B,
|
||||
_mm256_setzero_ps (),
|
||||
_mm256_avx512_setzero_ps (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128h
|
||||
|
@ -2667,7 +2689,7 @@ _mm256_maskz_fcmul_pch (__mmask8 __A, __m256h __B, __m256h __C)
|
|||
_mm256_setzero_ph (),
|
||||
__A);
|
||||
}
|
||||
#define _MM256_REDUCE_OP(op) __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); __m128h __T3 = (__T1 op __T2); __m128h __T4 = (__m128h) __builtin_shuffle (__T3, (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); __m128h __T5 = (__T3) op (__T4); __m128h __T6 = (__m128h) __builtin_shuffle (__T5, (__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); __m128h __T7 = __T5 op __T6; return __T7[0] op __T7[1]
|
||||
#define _MM256_REDUCE_OP(op) __m128h __T1 = (__m128h) _mm256_avx512_extractf128_pd ((__m256d) __A, 0); __m128h __T2 = (__m128h) _mm256_avx512_extractf128_pd ((__m256d) __A, 1); __m128h __T3 = (__T1 op __T2); __m128h __T4 = (__m128h) __builtin_shuffle (__T3, (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); __m128h __T5 = (__T3) op (__T4); __m128h __T6 = (__m128h) __builtin_shuffle (__T5, (__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); __m128h __T7 = __T5 op __T6; return __T7[0] op __T7[1]
|
||||
extern __inline _Float16
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_add_ph (__m256h __A)
|
||||
|
@ -2681,7 +2703,7 @@ _mm256_reduce_mul_ph (__m256h __A)
|
|||
_MM256_REDUCE_OP (*);
|
||||
}
|
||||
#undef _MM256_REDUCE_OP
|
||||
#define _MM256_REDUCE_OP(op) __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); __m128h __T3 = _mm_##op (__T1, __T2); __m128h __T4 = (__m128h) __builtin_shuffle (__T3, (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m128h __T5 = _mm_##op (__T3, __T4); __m128h __T6 = (__m128h) __builtin_shuffle (__T5, (__v8hi) { 4, 5 }); __m128h __T7 = _mm_##op (__T5, __T6); __m128h __T8 = (__m128h) __builtin_shuffle (__T7, (__v8hi) { 1, 0 }); __m128h __T9 = _mm_##op (__T7, __T8); return __T9[0]
|
||||
#define _MM256_REDUCE_OP(op) __m128h __T1 = (__m128h) _mm256_avx512_extractf128_pd ((__m256d) __A, 0); __m128h __T2 = (__m128h) _mm256_avx512_extractf128_pd ((__m256d) __A, 1); __m128h __T3 = _mm_##op (__T1, __T2); __m128h __T4 = (__m128h) __builtin_shuffle (__T3, (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); __m128h __T5 = _mm_##op (__T3, __T4); __m128h __T6 = (__m128h) __builtin_shuffle (__T5, (__v8hi) { 4, 5 }); __m128h __T7 = _mm_##op (__T5, __T6); __m128h __T8 = (__m128h) __builtin_shuffle (__T7, (__v8hi) { 1, 0 }); __m128h __T9 = _mm_##op (__T7, __T8); return __T9[0]
|
||||
extern __inline _Float16
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_reduce_min_ph (__m256h __A)
|
||||
|
@@ -2783,10 +2805,10 @@ _mm256_set1_pch (_Float16 _Complex __A)
 {
   union
   {
-    _Float16 _Complex a;
-    float b;
-  } u = { .a = __A };
-  return (__m256h) _mm256_set1_ps (u.b);
+    _Float16 _Complex __a;
+    float __b;
+  } __u = { .__a = __A };
+  return (__m256h) _mm256_avx512_set1_ps (__u.__b);
 }
 extern __inline __m128h
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -2794,10 +2816,10 @@ _mm_set1_pch (_Float16 _Complex __A)
 {
   union
   {
-    _Float16 _Complex a;
-    float b;
-  } u = { .a = __A };
-  return (__m128h) _mm_set1_ps (u.b);
+    _Float16 _Complex __a;
+    float __b;
+  } __u = { .__a = __A };
+  return (__m128h) _mm_avx512_set1_ps (__u.__b);
 }
 #define _mm_mul_pch(A, B) _mm_fmul_pch ((A), (B))
 #define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch ((W), (U), (A), (B))
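`_mm_set1_pch` and `_mm256_set1_pch` broadcast a `_Float16 _Complex` value by punning its 32 bits to a `float` and reusing the `float` broadcast; the hunks above again only rename the union members and route the broadcast through the new `_mm*_avx512_set1_ps` helpers. A caller-side sketch mirroring that pun, assuming compiler support for `_Float16 _Complex` (the wrapper name is illustrative):

#include <immintrin.h>
static inline __m128h broadcast_pch (_Float16 _Complex __z)
{
  union { _Float16 _Complex __c; float __f; } __u = { .__c = __z };
  return (__m128h) _mm_set1_ps (__u.__f);  /* replicate the 32-bit (re, im) pair */
}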
third_party/intel/avx512ifmaintrin.internal.h
vendored
@@ -4,9 +4,9 @@
 #endif
 #ifndef _AVX512IFMAINTRIN_H_INCLUDED
 #define _AVX512IFMAINTRIN_H_INCLUDED
-#ifndef __AVX512IFMA__
+#if !defined (__AVX512IFMA__) || !defined (__EVEX512__)
 #pragma GCC push_options
-#pragma GCC target("avx512ifma")
+#pragma GCC target("avx512ifma,evex512")
 #define __DISABLE_AVX512IFMA__
 #endif
 extern __inline __m512i
44
third_party/intel/avx512ifmavlintrin.internal.h
vendored
@ -4,47 +4,15 @@
|
|||
#endif
|
||||
#ifndef _AVX512IFMAVLINTRIN_H_INCLUDED
|
||||
#define _AVX512IFMAVLINTRIN_H_INCLUDED
|
||||
#if !defined(__AVX512VL__) || !defined(__AVX512IFMA__)
|
||||
#if !defined(__AVX512VL__) || !defined(__AVX512IFMA__) || defined (__EVEX512__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512ifma,avx512vl")
|
||||
#pragma GCC target("avx512ifma,avx512vl,no-evex512")
|
||||
#define __DISABLE_AVX512IFMAVL__
|
||||
#endif
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __X,
|
||||
(__v2di) __Y,
|
||||
(__v2di) __Z,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __X,
|
||||
(__v2di) __Y,
|
||||
(__v2di) __Z,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __X,
|
||||
(__v4di) __Y,
|
||||
(__v4di) __Z,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __X,
|
||||
(__v4di) __Y,
|
||||
(__v4di) __Z,
|
||||
(__mmask8) -1);
|
||||
}
|
||||
#define _mm_madd52lo_epu64(A, B, C) ((__m128i) __builtin_ia32_vpmadd52luq128 ((__v2di) (A), (__v2di) (B), (__v2di) (C)))
|
||||
#define _mm_madd52hi_epu64(A, B, C) ((__m128i) __builtin_ia32_vpmadd52huq128 ((__v2di) (A), (__v2di) (B), (__v2di) (C)))
|
||||
#define _mm256_madd52lo_epu64(A, B, C) ((__m256i) __builtin_ia32_vpmadd52luq256 ((__v4di) (A), (__v4di) (B), (__v4di) (C)))
|
||||
#define _mm256_madd52hi_epu64(A, B, C) ((__m256i) __builtin_ia32_vpmadd52huq256 ((__v4di) (A), (__v4di) (B), (__v4di) (C)))
|
||||
extern __inline __m128i
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
|
||||
|
|
2
third_party/intel/avx512pfintrin.internal.h
vendored
@@ -6,7 +6,7 @@
 #define _AVX512PFINTRIN_H_INCLUDED
 #ifndef __AVX512PF__
 #pragma GCC push_options
-#pragma GCC target("avx512pf")
+#pragma GCC target("avx512pf,evex512")
 #define __DISABLE_AVX512PF__
 #endif
 typedef long long __v8di __attribute__ ((__vector_size__ (64)));
17
third_party/intel/avx512vbmi2intrin.internal.h
vendored
@@ -4,9 +4,9 @@
 #endif
 #ifndef __AVX512VBMI2INTRIN_H_INCLUDED
 #define __AVX512VBMI2INTRIN_H_INCLUDED
-#if !defined(__AVX512VBMI2__)
+#if !defined(__AVX512VBMI2__) || !defined (__EVEX512__)
 #pragma GCC push_options
-#pragma GCC target("avx512vbmi2")
+#pragma GCC target("avx512vbmi2,evex512")
 #define __DISABLE_AVX512VBMI2__
 #endif
 #ifdef __OPTIMIZE__
@ -224,15 +224,6 @@ _mm512_maskz_shldv_epi64 (__mmask8 __A, __m512i __B, __m512i __C, __m512i __D)
|
|||
return (__m512i)__builtin_ia32_vpshldv_v8di_maskz ((__v8di)__B, (__v8di) __C,
|
||||
(__v8di) __D, (__mmask8)__A);
|
||||
}
|
||||
#ifdef __DISABLE_AVX512VBMI2__
|
||||
#undef __DISABLE_AVX512VBMI2__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#if !defined(__AVX512VBMI2__) || !defined(__AVX512BW__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512vbmi2,avx512bw")
|
||||
#define __DISABLE_AVX512VBMI2BW__
|
||||
#endif
|
||||
extern __inline __m512i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_mask_compress_epi8 (__m512i __A, __mmask64 __B, __m512i __C)
|
||||
|
@ -398,8 +389,8 @@ _mm512_maskz_shldv_epi16 (__mmask32 __A, __m512i __B, __m512i __C, __m512i __D)
|
|||
return (__m512i)__builtin_ia32_vpshldv_v32hi_maskz ((__v32hi)__B,
|
||||
(__v32hi) __C, (__v32hi) __D, (__mmask32)__A);
|
||||
}
|
||||
#ifdef __DISABLE_AVX512VBMI2BW__
|
||||
#undef __DISABLE_AVX512VBMI2BW__
|
||||
#ifdef __DISABLE_AVX512VBMI2__
|
||||
#undef __DISABLE_AVX512VBMI2__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
|
||||
|
|
89
third_party/intel/avx512vbmi2vlintrin.internal.h
vendored
@@ -4,9 +4,9 @@
 #endif
 #ifndef _AVX512VBMI2VLINTRIN_H_INCLUDED
 #define _AVX512VBMI2VLINTRIN_H_INCLUDED
-#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__)
+#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__) || defined (__EVEX512__)
 #pragma GCC push_options
-#pragma GCC target("avx512vbmi2,avx512vl")
+#pragma GCC target("avx512vbmi2,avx512vl,no-evex512")
 #define __DISABLE_AVX512VBMI2VL__
 #endif
 extern __inline __m128i
@ -21,7 +21,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_compress_epi8 (__mmask16 __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __B,
|
||||
(__v16qi) _mm_setzero_si128 (), (__mmask16) __A);
|
||||
(__v16qi) _mm_avx512_setzero_si128 (), (__mmask16) __A);
|
||||
}
|
||||
extern __inline void
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -42,7 +42,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_compress_epi16 (__mmask8 __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __B,
|
||||
(__v8hi) _mm_setzero_si128 (), (__mmask8) __A);
|
||||
(__v8hi) _mm_avx512_setzero_si128 (), (__mmask8) __A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -56,7 +56,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_compress_epi16 (__mmask16 __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __B,
|
||||
(__v16hi) _mm256_setzero_si256 (), (__mmask16) __A);
|
||||
(__v16hi) _mm256_avx512_setzero_si256 (), (__mmask16) __A);
|
||||
}
|
||||
extern __inline void
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -85,7 +85,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_expand_epi8 (__mmask16 __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_expandqi128_maskz ((__v16qi) __B,
|
||||
(__v16qi) _mm_setzero_si128 (), (__mmask16) __A);
|
||||
(__v16qi) _mm_avx512_setzero_si128 (), (__mmask16) __A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -99,7 +99,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_expandloadu_epi8 (__mmask16 __A, const void * __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_expandloadqi128_maskz ((const __v16qi *) __B,
|
||||
(__v16qi) _mm_setzero_si128 (), (__mmask16) __A);
|
||||
(__v16qi) _mm_avx512_setzero_si128 (), (__mmask16) __A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -114,7 +114,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_expand_epi16 (__mmask8 __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_expandhi128_maskz ((__v8hi) __B,
|
||||
(__v8hi) _mm_setzero_si128 (), (__mmask8) __A);
|
||||
(__v8hi) _mm_avx512_setzero_si128 (), (__mmask8) __A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -128,7 +128,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_expandloadu_epi16 (__mmask8 __A, const void * __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_expandloadhi128_maskz ((const __v8hi *) __B,
|
||||
(__v8hi) _mm_setzero_si128 (), (__mmask8) __A);
|
||||
(__v8hi) _mm_avx512_setzero_si128 (), (__mmask8) __A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -143,7 +143,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_expand_epi16 (__mmask16 __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_expandhi256_maskz ((__v16hi) __B,
|
||||
(__v16hi) _mm256_setzero_si256 (), (__mmask16) __A);
|
||||
(__v16hi) _mm256_avx512_setzero_si256 (), (__mmask16) __A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -157,7 +157,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_expandloadu_epi16 (__mmask16 __A, const void * __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_expandloadhi256_maskz ((const __v16hi *) __B,
|
||||
(__v16hi) _mm256_setzero_si256 (), (__mmask16) __A);
|
||||
(__v16hi) _mm256_avx512_setzero_si256 (), (__mmask16) __A);
|
||||
}
|
||||
#ifdef __OPTIMIZE__
|
||||
extern __inline __m256i
|
||||
|
@ -180,7 +180,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_shrdi_epi16 (__mmask16 __A, __m256i __B, __m256i __C, int __D)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpshrd_v16hi_mask ((__v16hi)__B,
|
||||
(__v16hi) __C, __D, (__v16hi) _mm256_setzero_si256 (), (__mmask16)__A);
|
||||
(__v16hi) __C, __D, (__v16hi) _mm256_avx512_setzero_si256 (), (__mmask16)__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -195,7 +195,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_shrdi_epi32 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpshrd_v8si_mask ((__v8si)__B, (__v8si) __C,
|
||||
__D, (__v8si) _mm256_setzero_si256 (), (__mmask8)__A);
|
||||
__D, (__v8si) _mm256_avx512_setzero_si256 (), (__mmask8)__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -216,7 +216,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_shrdi_epi64 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpshrd_v4di_mask ((__v4di)__B, (__v4di) __C,
|
||||
__D, (__v4di) _mm256_setzero_si256 (), (__mmask8)__A);
|
||||
__D, (__v4di) _mm256_avx512_setzero_si256 (), (__mmask8)__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -237,7 +237,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_shrdi_epi16 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpshrd_v8hi_mask ((__v8hi)__B, (__v8hi) __C,
|
||||
__D, (__v8hi) _mm_setzero_si128 (), (__mmask8)__A);
|
||||
__D, (__v8hi) _mm_avx512_setzero_si128 (), (__mmask8)__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -258,7 +258,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_shrdi_epi32 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpshrd_v4si_mask ((__v4si)__B, (__v4si) __C,
|
||||
__D, (__v4si) _mm_setzero_si128 (), (__mmask8)__A);
|
||||
__D, (__v4si) _mm_avx512_setzero_si128 (), (__mmask8)__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -279,7 +279,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_shrdi_epi64 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpshrd_v2di_mask ((__v2di)__B, (__v2di) __C,
|
||||
__D, (__v2di) _mm_setzero_si128 (), (__mmask8)__A);
|
||||
__D, (__v2di) _mm_avx512_setzero_si128 (), (__mmask8)__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -307,7 +307,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_shldi_epi16 (__mmask16 __A, __m256i __B, __m256i __C, int __D)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpshld_v16hi_mask ((__v16hi)__B,
|
||||
(__v16hi) __C, __D, (__v16hi) _mm256_setzero_si256 (), (__mmask16)__A);
|
||||
(__v16hi) __C, __D, (__v16hi) _mm256_avx512_setzero_si256 (), (__mmask16)__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -322,7 +322,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_shldi_epi32 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpshld_v8si_mask ((__v8si)__B, (__v8si) __C,
|
||||
__D, (__v8si) _mm256_setzero_si256 (), (__mmask8)__A);
|
||||
__D, (__v8si) _mm256_avx512_setzero_si256 (), (__mmask8)__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -343,7 +343,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_shldi_epi64 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
|
||||
{
|
||||
return (__m256i)__builtin_ia32_vpshld_v4di_mask ((__v4di)__B, (__v4di) __C,
|
||||
__D, (__v4di) _mm256_setzero_si256 (), (__mmask8)__A);
|
||||
__D, (__v4di) _mm256_avx512_setzero_si256 (), (__mmask8)__A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -364,7 +364,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_shldi_epi16 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpshld_v8hi_mask ((__v8hi)__B, (__v8hi) __C,
|
||||
__D, (__v8hi) _mm_setzero_si128 (), (__mmask8)__A);
|
||||
__D, (__v8hi) _mm_avx512_setzero_si128 (), (__mmask8)__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -385,7 +385,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_shldi_epi32 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpshld_v4si_mask ((__v4si)__B, (__v4si) __C,
|
||||
__D, (__v4si) _mm_setzero_si128 (), (__mmask8)__A);
|
||||
__D, (__v4si) _mm_avx512_setzero_si128 (), (__mmask8)__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -406,7 +406,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_shldi_epi64 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
|
||||
{
|
||||
return (__m128i)__builtin_ia32_vpshld_v2di_mask ((__v2di)__B, (__v2di) __C,
|
||||
__D, (__v2di) _mm_setzero_si128 (), (__mmask8)__A);
|
||||
__D, (__v2di) _mm_avx512_setzero_si128 (), (__mmask8)__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -417,40 +417,40 @@ _mm_shldi_epi64 (__m128i __A, __m128i __B, int __C)
|
|||
#else
|
||||
#define _mm256_shrdi_epi16(A, B, C) ((__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)(__m256i)(A), (__v16hi)(__m256i)(B),(int)(C)))
|
||||
#define _mm256_mask_shrdi_epi16(A, B, C, D, E) ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(C), (__v16hi)(__m256i)(D), (int)(E), (__v16hi)(__m256i)(A), (__mmask16)(B)))
|
||||
#define _mm256_maskz_shrdi_epi16(A, B, C, D) ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(B), (__v16hi)(__m256i)(C),(int)(D), (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(A)))
|
||||
#define _mm256_maskz_shrdi_epi16(A, B, C, D) ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(B), (__v16hi)(__m256i)(C),(int)(D), (__v16hi)(__m256i)_mm256_avx512_setzero_si256 (), (__mmask16)(A)))
|
||||
#define _mm256_shrdi_epi32(A, B, C) ((__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)(__m256i)(A), (__v8si)(__m256i)(B),(int)(C)))
|
||||
#define _mm256_mask_shrdi_epi32(A, B, C, D, E) ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(C), (__v8si)(__m256i)(D), (int)(E), (__v8si)(__m256i)(A), (__mmask8)(B)))
|
||||
#define _mm256_maskz_shrdi_epi32(A, B, C, D) ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(B), (__v8si)(__m256i)(C),(int)(D), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)))
|
||||
#define _mm256_maskz_shrdi_epi32(A, B, C, D) ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(B), (__v8si)(__m256i)(C),(int)(D), (__v8si)(__m256i)_mm256_avx512_setzero_si256 (), (__mmask8)(A)))
|
||||
#define _mm256_shrdi_epi64(A, B, C) ((__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B),(int)(C)))
|
||||
#define _mm256_mask_shrdi_epi64(A, B, C, D, E) ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(C), (__v4di)(__m256i)(D), (int)(E), (__v4di)(__m256i)(A), (__mmask8)(B)))
|
||||
#define _mm256_maskz_shrdi_epi64(A, B, C, D) ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(B), (__v4di)(__m256i)(C),(int)(D), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)))
|
||||
#define _mm256_maskz_shrdi_epi64(A, B, C, D) ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(B), (__v4di)(__m256i)(C),(int)(D), (__v4di)(__m256i)_mm256_avx512_setzero_si256 (), (__mmask8)(A)))
|
||||
#define _mm_shrdi_epi16(A, B, C) ((__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)(__m128i)(A), (__v8hi)(__m128i)(B),(int)(C)))
|
||||
#define _mm_mask_shrdi_epi16(A, B, C, D, E) ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(C), (__v8hi)(__m128i)(D), (int)(E), (__v8hi)(__m128i)(A), (__mmask8)(B)))
|
||||
#define _mm_maskz_shrdi_epi16(A, B, C, D) ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(B), (__v8hi)(__m128i)(C),(int)(D), (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
|
||||
#define _mm_maskz_shrdi_epi16(A, B, C, D) ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(B), (__v8hi)(__m128i)(C),(int)(D), (__v8hi)(__m128i)_mm_avx512_setzero_si128 (), (__mmask8)(A)))
|
||||
#define _mm_shrdi_epi32(A, B, C) ((__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)(__m128i)(A), (__v4si)(__m128i)(B),(int)(C)))
|
||||
#define _mm_mask_shrdi_epi32(A, B, C, D, E) ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(C), (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A), (__mmask8)(B)))
|
||||
#define _mm_maskz_shrdi_epi32(A, B, C, D) ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(B), (__v4si)(__m128i)(C),(int)(D), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
|
||||
#define _mm_maskz_shrdi_epi32(A, B, C, D) ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(B), (__v4si)(__m128i)(C),(int)(D), (__v4si)(__m128i)_mm_avx512_setzero_si128 (), (__mmask8)(A)))
|
||||
#define _mm_shrdi_epi64(A, B, C) ((__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)(__m128i)(A), (__v2di)(__m128i)(B),(int)(C)))
|
||||
#define _mm_mask_shrdi_epi64(A, B, C, D, E) ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(C), (__v2di)(__m128i)(D), (int)(E), (__v2di)(__m128i)(A), (__mmask8)(B)))
|
||||
#define _mm_maskz_shrdi_epi64(A, B, C, D) ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(B), (__v2di)(__m128i)(C),(int)(D), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
|
||||
#define _mm_maskz_shrdi_epi64(A, B, C, D) ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(B), (__v2di)(__m128i)(C),(int)(D), (__v2di)(__m128i)_mm_avx512_setzero_si128 (), (__mmask8)(A)))
|
||||
#define _mm256_shldi_epi16(A, B, C) ((__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)(__m256i)(A), (__v16hi)(__m256i)(B),(int)(C)))
|
||||
#define _mm256_mask_shldi_epi16(A, B, C, D, E) ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(C), (__v16hi)(__m256i)(D), (int)(E), (__v16hi)(__m256i)(A), (__mmask16)(B)))
|
||||
#define _mm256_maskz_shldi_epi16(A, B, C, D) ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(B), (__v16hi)(__m256i)(C),(int)(D), (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(A)))
|
||||
#define _mm256_maskz_shldi_epi16(A, B, C, D) ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(B), (__v16hi)(__m256i)(C),(int)(D), (__v16hi)(__m256i)_mm256_avx512_setzero_si256 (), (__mmask16)(A)))
|
||||
#define _mm256_shldi_epi32(A, B, C) ((__m256i) __builtin_ia32_vpshld_v8si ((__v8si)(__m256i)(A), (__v8si)(__m256i)(B),(int)(C)))
|
||||
#define _mm256_mask_shldi_epi32(A, B, C, D, E) ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(C), (__v8si)(__m256i)(D), (int)(E), (__v8si)(__m256i)(A), (__mmask8)(B)))
|
||||
#define _mm256_maskz_shldi_epi32(A, B, C, D) ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(B), (__v8si)(__m256i)(C),(int)(D), (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)))
|
||||
#define _mm256_maskz_shldi_epi32(A, B, C, D) ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(B), (__v8si)(__m256i)(C),(int)(D), (__v8si)(__m256i)_mm256_avx512_setzero_si256 (), (__mmask8)(A)))
|
||||
#define _mm256_shldi_epi64(A, B, C) ((__m256i) __builtin_ia32_vpshld_v4di ((__v4di)(__m256i)(A), (__v4di)(__m256i)(B),(int)(C)))
|
||||
#define _mm256_mask_shldi_epi64(A, B, C, D, E) ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(C), (__v4di)(__m256i)(D), (int)(E), (__v4di)(__m256i)(A), (__mmask8)(B)))
|
||||
#define _mm256_maskz_shldi_epi64(A, B, C, D) ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(B), (__v4di)(__m256i)(C),(int)(D), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)))
|
||||
#define _mm256_maskz_shldi_epi64(A, B, C, D) ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(B), (__v4di)(__m256i)(C),(int)(D), (__v4di)(__m256i)_mm256_avx512_setzero_si256 (), (__mmask8)(A)))
|
||||
#define _mm_shldi_epi16(A, B, C) ((__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)(__m128i)(A), (__v8hi)(__m128i)(B),(int)(C)))
|
||||
#define _mm_mask_shldi_epi16(A, B, C, D, E) ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(C), (__v8hi)(__m128i)(D), (int)(E), (__v8hi)(__m128i)(A), (__mmask8)(B)))
|
||||
#define _mm_maskz_shldi_epi16(A, B, C, D) ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(B), (__v8hi)(__m128i)(C),(int)(D), (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
|
||||
#define _mm_maskz_shldi_epi16(A, B, C, D) ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(B), (__v8hi)(__m128i)(C),(int)(D), (__v8hi)(__m128i)_mm_avx512_setzero_si128 (), (__mmask8)(A)))
|
||||
#define _mm_shldi_epi32(A, B, C) ((__m128i) __builtin_ia32_vpshld_v4si ((__v4si)(__m128i)(A), (__v4si)(__m128i)(B),(int)(C)))
|
||||
#define _mm_mask_shldi_epi32(A, B, C, D, E) ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(C), (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A), (__mmask8)(B)))
|
||||
#define _mm_maskz_shldi_epi32(A, B, C, D) ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(B), (__v4si)(__m128i)(C),(int)(D), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
|
||||
#define _mm_maskz_shldi_epi32(A, B, C, D) ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(B), (__v4si)(__m128i)(C),(int)(D), (__v4si)(__m128i)_mm_avx512_setzero_si128 (), (__mmask8)(A)))
|
||||
#define _mm_shldi_epi64(A, B, C) ((__m128i) __builtin_ia32_vpshld_v2di ((__v2di)(__m128i)(A), (__v2di)(__m128i)(B),(int)(C)))
|
||||
#define _mm_mask_shldi_epi64(A, B, C, D, E) ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(C), (__v2di)(__m128i)(D), (int)(E), (__v2di)(__m128i)(A), (__mmask8)(B)))
|
||||
#define _mm_maskz_shldi_epi64(A, B, C, D) ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(B), (__v2di)(__m128i)(C),(int)(D), (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
|
||||
#define _mm_maskz_shldi_epi64(A, B, C, D) ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(B), (__v2di)(__m128i)(C),(int)(D), (__v2di)(__m128i)_mm_avx512_setzero_si128 (), (__mmask8)(A)))
|
||||
#endif
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -704,15 +704,6 @@ _mm_maskz_shldv_epi64 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
|
|||
return (__m128i)__builtin_ia32_vpshldv_v2di_maskz ((__v2di)__B, (__v2di) __C,
|
||||
(__v2di) __D, (__mmask8)__A);
|
||||
}
|
||||
#ifdef __DISABLE_AVX512VBMI2VL__
|
||||
#undef __DISABLE_AVX512VBMI2VL__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__) || !defined(__AVX512BW__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avx512vbmi2,avx512vl,avx512bw")
|
||||
#define __DISABLE_AVX512VBMI2VLBW__
|
||||
#endif
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_mask_compress_epi8 (__m256i __A, __mmask32 __B, __m256i __C)
|
||||
|
@ -725,7 +716,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_compress_epi8 (__mmask32 __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __B,
|
||||
(__v32qi) _mm256_setzero_si256 (), (__mmask32) __A);
|
||||
(__v32qi) _mm256_avx512_setzero_si256 (), (__mmask32) __A);
|
||||
}
|
||||
extern __inline void
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -747,7 +738,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_expand_epi8 (__mmask32 __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_expandqi256_maskz ((__v32qi) __B,
|
||||
(__v32qi) _mm256_setzero_si256 (), (__mmask32) __A);
|
||||
(__v32qi) _mm256_avx512_setzero_si256 (), (__mmask32) __A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@ -761,10 +752,10 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_expandloadu_epi8 (__mmask32 __A, const void * __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_expandloadqi256_maskz ((const __v32qi *) __B,
|
||||
(__v32qi) _mm256_setzero_si256 (), (__mmask32) __A);
|
||||
(__v32qi) _mm256_avx512_setzero_si256 (), (__mmask32) __A);
|
||||
}
|
||||
#ifdef __DISABLE_AVX512VBMI2VLBW__
|
||||
#undef __DISABLE_AVX512VBMI2VLBW__
|
||||
#ifdef __DISABLE_AVX512VBMI2VL__
|
||||
#undef __DISABLE_AVX512VBMI2VL__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
|
||||
third_party/intel/avx512vbmiintrin.internal.h
vendored
@@ -4,9 +4,9 @@
 #endif
 #ifndef _AVX512VBMIINTRIN_H_INCLUDED
 #define _AVX512VBMIINTRIN_H_INCLUDED
-#ifndef __AVX512VBMI__
+#if !defined (__AVX512VBMI__) || !defined (__EVEX512__)
 #pragma GCC push_options
-#pragma GCC target("avx512vbmi")
+#pragma GCC target("avx512vbmi,evex512")
 #define __DISABLE_AVX512VBMI__
 #endif
 extern __inline __m512i
20
third_party/intel/avx512vbmivlintrin.internal.h
vendored
@@ -4,9 +4,9 @@
 #endif
 #ifndef _AVX512VBMIVLINTRIN_H_INCLUDED
 #define _AVX512VBMIVLINTRIN_H_INCLUDED
-#if !defined(__AVX512VL__) || !defined(__AVX512VBMI__)
+#if !defined(__AVX512VL__) || !defined(__AVX512VBMI__) || defined (__EVEX512__)
 #pragma GCC push_options
-#pragma GCC target("avx512vbmi,avx512vl")
+#pragma GCC target("avx512vbmi,avx512vl,no-evex512")
 #define __DISABLE_AVX512VBMIVL__
 #endif
 extern __inline __m256i
@@ -25,7 +25,7 @@ _mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
        (__v32qi) __Y,
        (__v32qi)
        _mm256_setzero_si256 (),
        _mm256_avx512_setzero_si256 (),
        (__mmask32) __M);
}
extern __inline __m256i
@@ -35,7 +35,7 @@ _mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
        (__v32qi) __Y,
        (__v32qi)
        _mm256_undefined_si256 (),
        _mm256_avx512_undefined_si256 (),
        (__mmask32) -1);
}
extern __inline __m128i
@@ -54,7 +54,7 @@ _mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
        (__v16qi) __Y,
        (__v16qi)
        _mm_setzero_si128 (),
        _mm_avx512_setzero_si128 (),
        (__mmask16) __M);
}
extern __inline __m128i
@@ -64,7 +64,7 @@ _mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
        (__v16qi) __Y,
        (__v16qi)
        _mm_undefined_si128 (),
        _mm_avx512_undefined_si128 (),
        (__mmask16) -1);
}
extern __inline __m256i
@@ -74,7 +74,7 @@ _mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
        (__v32qi) __A,
        (__v32qi)
        _mm256_undefined_si256 (),
        _mm256_avx512_undefined_si256 (),
        (__mmask32) -1);
}
extern __inline __m256i
@@ -85,7 +85,7 @@ _mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
        (__v32qi) __A,
        (__v32qi)
        _mm256_setzero_si256 (),
        _mm256_avx512_setzero_si256 (),
        (__mmask32) __M);
}
extern __inline __m256i
@@ -105,7 +105,7 @@ _mm_permutexvar_epi8 (__m128i __A, __m128i __B)
  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
        (__v16qi) __A,
        (__v16qi)
        _mm_undefined_si128 (),
        _mm_avx512_undefined_si128 (),
        (__mmask16) -1);
}
extern __inline __m128i
@@ -115,7 +115,7 @@ _mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
        (__v16qi) __A,
        (__v16qi)
        _mm_setzero_si128 (),
        _mm_avx512_setzero_si128 (),
        (__mmask16) __M);
}
extern __inline __m128i
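The hunks above are mechanical: every masked builtin that previously received _mm256_setzero_si256 (), _mm256_undefined_si256 (), or their 128-bit counterparts as the fallback operand now receives an _mm*_avx512_* spelling. The likely reason is that GCC 14 gives the AVX-512 headers private copies of these trivial helpers so they no longer depend on definitions compiled under the plain AVX target pragmas. A minimal sketch of what such a helper presumably looks like (the body is an assumption modeled on the ordinary setzero helper, not copied from this diff):

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_avx512_setzero_si256 (void)
{
  /* Assumed to be behaviorally identical to _mm256_setzero_si256,
     just defined inside the AVX-512 headers themselves. */
  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
}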
804
third_party/intel/avx512vlbwintrin.internal.h
vendored
File diff suppressed because it is too large
252
third_party/intel/avx512vldqintrin.internal.h
vendored
@@ -4,9 +4,9 @@
#endif
#ifndef _AVX512VLDQINTRIN_H_INCLUDED
#define _AVX512VLDQINTRIN_H_INCLUDED
#if !defined(__AVX512VL__) || !defined(__AVX512DQ__)
#if !defined(__AVX512VL__) || !defined(__AVX512DQ__) || defined (__EVEX512__)
#pragma GCC push_options
#pragma GCC target("avx512vl,avx512dq")
#pragma GCC target("avx512vl,avx512dq,no-evex512")
#define __DISABLE_AVX512VLDQ__
#endif
extern __inline __m256i
@ -15,7 +15,7 @@ _mm256_cvttpd_epi64 (__m256d __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -32,7 +32,7 @@ _mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -41,7 +41,7 @@ _mm_cvttpd_epi64 (__m128d __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -58,7 +58,7 @@ _mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -67,7 +67,7 @@ _mm256_cvttpd_epu64 (__m256d __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -84,7 +84,7 @@ _mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -93,7 +93,7 @@ _mm_cvttpd_epu64 (__m128d __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -110,7 +110,7 @@ _mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -119,7 +119,7 @@ _mm256_cvtpd_epi64 (__m256d __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -136,7 +136,7 @@ _mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -145,7 +145,7 @@ _mm_cvtpd_epi64 (__m128d __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -162,7 +162,7 @@ _mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -171,7 +171,7 @@ _mm256_cvtpd_epu64 (__m256d __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -188,7 +188,7 @@ _mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -197,7 +197,7 @@ _mm_cvtpd_epu64 (__m128d __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -214,7 +214,7 @@ _mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -223,7 +223,7 @@ _mm256_cvttps_epi64 (__m128 __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -240,7 +240,7 @@ _mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -249,7 +249,7 @@ _mm_cvttps_epi64 (__m128 __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -266,7 +266,7 @@ _mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -275,7 +275,7 @@ _mm256_cvttps_epu64 (__m128 __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -292,7 +292,7 @@ _mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -301,7 +301,7 @@ _mm_cvttps_epu64 (__m128 __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -318,7 +318,7 @@ _mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -327,7 +327,7 @@ _mm256_broadcast_f64x2 (__m128d __A)
|
|||
{
|
||||
return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df)
|
||||
__A,
|
||||
(__v4df)_mm256_undefined_pd(),
|
||||
(__v4df)_mm256_avx512_undefined_pd(),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -346,7 +346,7 @@ _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
|
|||
return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df)
|
||||
__A,
|
||||
(__v4df)
|
||||
_mm256_setzero_ps (),
|
||||
_mm256_avx512_setzero_ps (),
|
||||
__M);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -355,7 +355,7 @@ _mm256_broadcast_i64x2 (__m128i __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di)
|
||||
__A,
|
||||
(__v4di)_mm256_undefined_si256(),
|
||||
(__v4di)_mm256_avx512_undefined_si256(),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -374,7 +374,7 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
|
|||
return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di)
|
||||
__A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
__M);
|
||||
}
|
||||
extern __inline __m256
|
||||
|
@ -382,7 +382,7 @@ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_broadcast_f32x2 (__m128 __A)
|
||||
{
|
||||
return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
|
||||
(__v8sf)_mm256_undefined_ps(),
|
||||
(__v8sf)_mm256_avx512_undefined_ps(),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256
|
||||
|
@ -399,7 +399,7 @@ _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
|
|||
{
|
||||
return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
|
||||
(__v8sf)
|
||||
_mm256_setzero_ps (),
|
||||
_mm256_avx512_setzero_ps (),
|
||||
__M);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -408,7 +408,7 @@ _mm256_broadcast_i32x2 (__m128i __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si)
|
||||
__A,
|
||||
(__v8si)_mm256_undefined_si256(),
|
||||
(__v8si)_mm256_avx512_undefined_si256(),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -427,7 +427,7 @@ _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
|
|||
return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si)
|
||||
__A,
|
||||
(__v8si)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
__M);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -436,7 +436,7 @@ _mm_broadcast_i32x2 (__m128i __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si)
|
||||
__A,
|
||||
(__v4si)_mm_undefined_si128(),
|
||||
(__v4si)_mm_avx512_undefined_si128(),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -455,7 +455,7 @@ _mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
|
|||
return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si)
|
||||
__A,
|
||||
(__v4si)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
__M);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -481,7 +481,7 @@ _mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
|
|||
return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
|
||||
(__v4di) __B,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -507,7 +507,7 @@ _mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
|
|||
return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
|
||||
(__v2di) __B,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -527,7 +527,7 @@ _mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B)
|
|||
return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
|
||||
(__v4df) __B,
|
||||
(__v4df)
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128d
|
||||
|
@ -547,7 +547,7 @@ _mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B)
|
|||
return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
|
||||
(__v2df) __B,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256
|
||||
|
@ -567,7 +567,7 @@ _mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B)
|
|||
return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
|
||||
(__v8sf) __B,
|
||||
(__v8sf)
|
||||
_mm256_setzero_ps (),
|
||||
_mm256_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -586,7 +586,7 @@ _mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B)
|
|||
return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
|
||||
(__v4sf) __B,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -595,7 +595,7 @@ _mm256_cvtps_epi64 (__m128 __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -612,7 +612,7 @@ _mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -621,7 +621,7 @@ _mm_cvtps_epi64 (__m128 __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -638,7 +638,7 @@ _mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -647,7 +647,7 @@ _mm256_cvtps_epu64 (__m128 __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -664,7 +664,7 @@ _mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A)
|
|||
{
|
||||
return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -673,7 +673,7 @@ _mm_cvtps_epu64 (__m128 __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -690,7 +690,7 @@ _mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A)
|
|||
{
|
||||
return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -699,7 +699,7 @@ _mm256_cvtepi64_ps (__m256i __A)
|
|||
{
|
||||
return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -716,7 +716,7 @@ _mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A)
|
|||
{
|
||||
return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -725,7 +725,7 @@ _mm_cvtepi64_ps (__m128i __A)
|
|||
{
|
||||
return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -742,7 +742,7 @@ _mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A)
|
|||
{
|
||||
return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -751,7 +751,7 @@ _mm256_cvtepu64_ps (__m256i __A)
|
|||
{
|
||||
return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -768,7 +768,7 @@ _mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A)
|
|||
{
|
||||
return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -777,7 +777,7 @@ _mm_cvtepu64_ps (__m128i __A)
|
|||
{
|
||||
return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -794,7 +794,7 @@ _mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A)
|
|||
{
|
||||
return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -803,7 +803,7 @@ _mm256_cvtepi64_pd (__m256i __A)
|
|||
{
|
||||
return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
|
||||
(__v4df)
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -820,7 +820,7 @@ _mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A)
|
|||
{
|
||||
return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
|
||||
(__v4df)
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128d
|
||||
|
@ -829,7 +829,7 @@ _mm_cvtepi64_pd (__m128i __A)
|
|||
{
|
||||
return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128d
|
||||
|
@ -846,7 +846,7 @@ _mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A)
|
|||
{
|
||||
return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -855,7 +855,7 @@ _mm256_cvtepu64_pd (__m256i __A)
|
|||
{
|
||||
return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
|
||||
(__v4df)
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -872,7 +872,7 @@ _mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A)
|
|||
{
|
||||
return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
|
||||
(__v4df)
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -892,7 +892,7 @@ _mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B)
|
|||
return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
|
||||
(__v4df) __B,
|
||||
(__v4df)
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128d
|
||||
|
@ -911,7 +911,7 @@ _mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B)
|
|||
return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
|
||||
(__v2df) __B,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256
|
||||
|
@ -930,7 +930,7 @@ _mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B)
|
|||
return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
|
||||
(__v8sf) __B,
|
||||
(__v8sf)
|
||||
_mm256_setzero_ps (),
|
||||
_mm256_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -949,7 +949,7 @@ _mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B)
|
|||
return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
|
||||
(__v4sf) __B,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128d
|
||||
|
@ -958,7 +958,7 @@ _mm_cvtepu64_pd (__m128i __A)
|
|||
{
|
||||
return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128d
|
||||
|
@ -975,7 +975,7 @@ _mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A)
|
|||
{
|
||||
return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -995,7 +995,7 @@ _mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B)
|
|||
return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
|
||||
(__v4df) __B,
|
||||
(__v4df)
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128d
|
||||
|
@ -1014,7 +1014,7 @@ _mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B)
|
|||
return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
|
||||
(__v2df) __B,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256
|
||||
|
@ -1033,7 +1033,7 @@ _mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B)
|
|||
return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
|
||||
(__v8sf) __B,
|
||||
(__v8sf)
|
||||
_mm256_setzero_ps (),
|
||||
_mm256_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -1052,7 +1052,7 @@ _mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B)
|
|||
return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
|
||||
(__v4sf) __B,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -1071,7 +1071,7 @@ _mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B)
|
|||
return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
|
||||
(__v4df) __B,
|
||||
(__v4df)
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128d
|
||||
|
@ -1090,7 +1090,7 @@ _mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B)
|
|||
return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
|
||||
(__v2df) __B,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256
|
||||
|
@ -1109,7 +1109,7 @@ _mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B)
|
|||
return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
|
||||
(__v8sf) __B,
|
||||
(__v8sf)
|
||||
_mm256_setzero_ps (),
|
||||
_mm256_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -1128,7 +1128,7 @@ _mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B)
|
|||
return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
|
||||
(__v4sf) __B,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1187,7 +1187,7 @@ _mm256_extractf64x2_pd (__m256d __A, const int __imm)
|
|||
return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A,
|
||||
__imm,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128d
|
||||
|
@ -1209,7 +1209,7 @@ _mm256_maskz_extractf64x2_pd (__mmask8 __U, __m256d __A,
|
|||
return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A,
|
||||
__imm,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8)
|
||||
__U);
|
||||
}
|
||||
|
@ -1220,7 +1220,7 @@ _mm256_extracti64x2_epi64 (__m256i __A, const int __imm)
|
|||
return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A,
|
||||
__imm,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@ -1242,7 +1242,7 @@ _mm256_maskz_extracti64x2_epi64 (__mmask8 __U, __m256i __A,
|
|||
return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A,
|
||||
__imm,
|
||||
(__v2di)
|
||||
_mm_setzero_si128 (),
|
||||
_mm_avx512_setzero_si128 (),
|
||||
(__mmask8)
|
||||
__U);
|
||||
}
|
||||
|
@ -1252,7 +1252,7 @@ _mm256_reduce_pd (__m256d __A, int __B)
|
|||
{
|
||||
return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,
|
||||
(__v4df)
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -1269,7 +1269,7 @@ _mm256_maskz_reduce_pd (__mmask8 __U, __m256d __A, int __B)
|
|||
{
|
||||
return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,
|
||||
(__v4df)
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128d
|
||||
|
@ -1278,7 +1278,7 @@ _mm_reduce_pd (__m128d __A, int __B)
|
|||
{
|
||||
return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128d
|
||||
|
@ -1295,7 +1295,7 @@ _mm_maskz_reduce_pd (__mmask8 __U, __m128d __A, int __B)
|
|||
{
|
||||
return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256
|
||||
|
@ -1304,7 +1304,7 @@ _mm256_reduce_ps (__m256 __A, int __B)
|
|||
{
|
||||
return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,
|
||||
(__v8sf)
|
||||
_mm256_setzero_ps (),
|
||||
_mm256_avx512_setzero_ps (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256
|
||||
|
@ -1321,7 +1321,7 @@ _mm256_maskz_reduce_ps (__mmask8 __U, __m256 __A, int __B)
|
|||
{
|
||||
return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,
|
||||
(__v8sf)
|
||||
_mm256_setzero_ps (),
|
||||
_mm256_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -1330,7 +1330,7 @@ _mm_reduce_ps (__m128 __A, int __B)
|
|||
{
|
||||
return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -1347,7 +1347,7 @@ _mm_maskz_reduce_ps (__mmask8 __U, __m128 __A, int __B)
|
|||
{
|
||||
return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -1357,7 +1357,7 @@ _mm256_range_pd (__m256d __A, __m256d __B, int __C)
|
|||
return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A,
|
||||
(__v4df) __B, __C,
|
||||
(__v4df)
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -1377,7 +1377,7 @@ _mm256_maskz_range_pd (__mmask8 __U, __m256d __A, __m256d __B, int __C)
|
|||
return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A,
|
||||
(__v4df) __B, __C,
|
||||
(__v4df)
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128d
|
||||
|
@ -1387,7 +1387,7 @@ _mm_range_pd (__m128d __A, __m128d __B, int __C)
|
|||
return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128d
|
||||
|
@ -1407,7 +1407,7 @@ _mm_maskz_range_pd (__mmask8 __U, __m128d __A, __m128d __B, int __C)
|
|||
return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A,
|
||||
(__v2df) __B, __C,
|
||||
(__v2df)
|
||||
_mm_setzero_pd (),
|
||||
_mm_avx512_setzero_pd (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m256
|
||||
|
@ -1417,7 +1417,7 @@ _mm256_range_ps (__m256 __A, __m256 __B, int __C)
|
|||
return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A,
|
||||
(__v8sf) __B, __C,
|
||||
(__v8sf)
|
||||
_mm256_setzero_ps (),
|
||||
_mm256_avx512_setzero_ps (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256
|
||||
|
@ -1437,7 +1437,7 @@ _mm256_maskz_range_ps (__mmask8 __U, __m256 __A, __m256 __B, int __C)
|
|||
return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A,
|
||||
(__v8sf) __B, __C,
|
||||
(__v8sf)
|
||||
_mm256_setzero_ps (),
|
||||
_mm256_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -1447,7 +1447,7 @@ _mm_range_ps (__m128 __A, __m128 __B, int __C)
|
|||
return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m128
|
||||
|
@ -1467,7 +1467,7 @@ _mm_maskz_range_ps (__mmask8 __U, __m128 __A, __m128 __B, int __C)
|
|||
return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A,
|
||||
(__v4sf) __B, __C,
|
||||
(__v4sf)
|
||||
_mm_setzero_ps (),
|
||||
_mm_avx512_setzero_ps (),
|
||||
(__mmask8) __U);
|
||||
}
|
||||
extern __inline __mmask8
|
||||
|
@ -1539,7 +1539,7 @@ _mm256_inserti64x2 (__m256i __A, __m128i __B, const int __imm)
|
|||
(__v2di) __B,
|
||||
__imm,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256i
|
||||
|
@ -1563,7 +1563,7 @@ _mm256_maskz_inserti64x2 (__mmask8 __U, __m256i __A, __m128i __B,
|
|||
(__v2di) __B,
|
||||
__imm,
|
||||
(__v4di)
|
||||
_mm256_setzero_si256 (),
|
||||
_mm256_avx512_setzero_si256 (),
|
||||
(__mmask8)
|
||||
__U);
|
||||
}
|
||||
|
@ -1575,7 +1575,7 @@ _mm256_insertf64x2 (__m256d __A, __m128d __B, const int __imm)
|
|||
(__v2df) __B,
|
||||
__imm,
|
||||
(__v4df)
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8) -1);
|
||||
}
|
||||
extern __inline __m256d
|
||||
|
@ -1599,47 +1599,47 @@ _mm256_maskz_insertf64x2 (__mmask8 __U, __m256d __A, __m128d __B,
|
|||
(__v2df) __B,
|
||||
__imm,
|
||||
(__v4df)
|
||||
_mm256_setzero_pd (),
|
||||
_mm256_avx512_setzero_pd (),
|
||||
(__mmask8)
|
||||
__U);
|
||||
}
|
||||
#else
|
||||
#define _mm256_insertf64x2(X, Y, C) ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)-1))
|
||||
#define _mm256_insertf64x2(X, Y, C) ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v4df)(__m256d)_mm256_avx512_setzero_pd(), (__mmask8)-1))
|
||||
#define _mm256_mask_insertf64x2(W, U, X, Y, C) ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v4df)(__m256d)(W), (__mmask8)(U)))
|
||||
#define _mm256_maskz_insertf64x2(U, X, Y, C) ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v4df)(__m256d)_mm256_setzero_pd(), (__mmask8)(U)))
|
||||
#define _mm256_inserti64x2(X, Y, C) ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)-1))
|
||||
#define _mm256_maskz_insertf64x2(U, X, Y, C) ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X), (__v2df)(__m128d) (Y), (int) (C), (__v4df)(__m256d)_mm256_avx512_setzero_pd(), (__mmask8)(U)))
|
||||
#define _mm256_inserti64x2(X, Y, C) ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v4di)(__m256i)_mm256_avx512_setzero_si256 (), (__mmask8)-1))
|
||||
#define _mm256_mask_inserti64x2(W, U, X, Y, C) ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v4di)(__m256i)(W), (__mmask8)(U)))
|
||||
#define _mm256_maskz_inserti64x2(U, X, Y, C) ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(U)))
|
||||
#define _mm256_extractf64x2_pd(X, C) ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X), (int) (C), (__v2df)(__m128d) _mm_setzero_pd(), (__mmask8)-1))
|
||||
#define _mm256_maskz_inserti64x2(U, X, Y, C) ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X), (__v2di)(__m128i) (Y), (int) (C), (__v4di)(__m256i)_mm256_avx512_setzero_si256 (), (__mmask8)(U)))
|
||||
#define _mm256_extractf64x2_pd(X, C) ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X), (int) (C), (__v2df)(__m128d) _mm_avx512_setzero_pd(), (__mmask8)-1))
|
||||
#define _mm256_mask_extractf64x2_pd(W, U, X, C) ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X), (int) (C), (__v2df)(__m128d) (W), (__mmask8) (U)))
|
||||
#define _mm256_maskz_extractf64x2_pd(U, X, C) ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X), (int) (C), (__v2df)(__m128d) _mm_setzero_pd(), (__mmask8) (U)))
|
||||
#define _mm256_extracti64x2_epi64(X, C) ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X), (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8)-1))
|
||||
#define _mm256_maskz_extractf64x2_pd(U, X, C) ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X), (int) (C), (__v2df)(__m128d) _mm_avx512_setzero_pd(), (__mmask8) (U)))
|
||||
#define _mm256_extracti64x2_epi64(X, C) ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X), (int) (C), (__v2di)(__m128i) _mm_avx512_setzero_si128 (), (__mmask8)-1))
|
||||
#define _mm256_mask_extracti64x2_epi64(W, U, X, C) ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X), (int) (C), (__v2di)(__m128i) (W), (__mmask8) (U)))
|
||||
#define _mm256_maskz_extracti64x2_epi64(U, X, C) ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X), (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8) (U)))
|
||||
#define _mm256_reduce_pd(A, B) ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), (__mmask8)-1))
|
||||
#define _mm256_maskz_extracti64x2_epi64(U, X, C) ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X), (int) (C), (__v2di)(__m128i) _mm_avx512_setzero_si128 (), (__mmask8) (U)))
|
||||
#define _mm256_reduce_pd(A, B) ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_avx512_setzero_pd(), (__mmask8)-1))
|
||||
#define _mm256_mask_reduce_pd(W, U, A, B) ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U)))
|
||||
#define _mm256_maskz_reduce_pd(U, A, B) ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_setzero_pd(), (__mmask8)(U)))
|
||||
#define _mm_reduce_pd(A, B) ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)-1))
|
||||
#define _mm256_maskz_reduce_pd(U, A, B) ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), (int)(B), (__v4df)_mm256_avx512_setzero_pd(), (__mmask8)(U)))
|
||||
#define _mm_reduce_pd(A, B) ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)_mm_avx512_setzero_pd(), (__mmask8)-1))
|
||||
#define _mm_mask_reduce_pd(W, U, A, B) ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U)))
|
||||
#define _mm_maskz_reduce_pd(U, A, B) ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)(U)))
|
||||
#define _mm256_reduce_ps(A, B) ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1))
|
||||
#define _mm_maskz_reduce_pd(U, A, B) ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), (int)(B), (__v2df)_mm_avx512_setzero_pd(), (__mmask8)(U)))
|
||||
#define _mm256_reduce_ps(A, B) ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_avx512_setzero_ps(), (__mmask8)-1))
|
||||
#define _mm256_mask_reduce_ps(W, U, A, B) ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U)))
|
||||
#define _mm256_maskz_reduce_ps(U, A, B) ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)))
|
||||
#define _mm_reduce_ps(A, B) ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)-1))
|
||||
#define _mm256_maskz_reduce_ps(U, A, B) ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), (int)(B), (__v8sf)_mm256_avx512_setzero_ps(), (__mmask8)(U)))
|
||||
#define _mm_reduce_ps(A, B) ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)_mm_avx512_setzero_ps(), (__mmask8)-1))
|
||||
#define _mm_mask_reduce_ps(W, U, A, B) ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U)))
|
||||
#define _mm_maskz_reduce_ps(U, A, B) ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))
|
||||
#define _mm256_range_pd(A, B, C) ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)_mm256_setzero_pd(), (__mmask8)-1))
|
||||
#define _mm256_maskz_range_pd(U, A, B, C) ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)_mm256_setzero_pd(), (__mmask8)(U)))
|
||||
#define _mm_range_pd(A, B, C) ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)_mm_setzero_pd(), (__mmask8)-1))
|
||||
#define _mm256_range_ps(A, B, C) ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1))
|
||||
#define _mm_maskz_reduce_ps(U, A, B) ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), (int)(B), (__v4sf)_mm_avx512_setzero_ps(), (__mmask8)(U)))
|
||||
#define _mm256_range_pd(A, B, C) ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)_mm256_avx512_setzero_pd(), (__mmask8)-1))
|
||||
#define _mm256_maskz_range_pd(U, A, B, C) ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)_mm256_avx512_setzero_pd(), (__mmask8)(U)))
|
||||
#define _mm_range_pd(A, B, C) ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)_mm_avx512_setzero_pd(), (__mmask8)-1))
|
||||
#define _mm256_range_ps(A, B, C) ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)_mm256_avx512_setzero_ps(), (__mmask8)-1))
|
||||
#define _mm256_mask_range_ps(W, U, A, B, C) ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)(__m256)(W), (__mmask8)(U)))
|
||||
#define _mm256_maskz_range_ps(U, A, B, C) ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)))
|
||||
#define _mm_range_ps(A, B, C) ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)_mm_setzero_ps(), (__mmask8)-1))
|
||||
#define _mm256_maskz_range_ps(U, A, B, C) ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), (__v8sf)(__m256)(B), (int)(C), (__v8sf)_mm256_avx512_setzero_ps(), (__mmask8)(U)))
|
||||
#define _mm_range_ps(A, B, C) ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)_mm_avx512_setzero_ps(), (__mmask8)-1))
|
||||
#define _mm_mask_range_ps(W, U, A, B, C) ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U)))
|
||||
#define _mm_maskz_range_ps(U, A, B, C) ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))
|
||||
#define _mm_maskz_range_ps(U, A, B, C) ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), (int)(C), (__v4sf)_mm_avx512_setzero_ps(), (__mmask8)(U)))
|
||||
#define _mm256_mask_range_pd(W, U, A, B, C) ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), (__v4df)(__m256d)(B), (int)(C), (__v4df)(__m256d)(W), (__mmask8)(U)))
|
||||
#define _mm_mask_range_pd(W, U, A, B, C) ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U)))
|
||||
#define _mm_maskz_range_pd(U, A, B, C) ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)_mm_setzero_pd(), (__mmask8)(U)))
|
||||
#define _mm_maskz_range_pd(U, A, B, C) ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), (int)(C), (__v2df)_mm_avx512_setzero_pd(), (__mmask8)(U)))
|
||||
#define _mm256_mask_fpclass_pd_mask(u, X, C) ((__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) (__m256d) (X), (int) (C),(__mmask8)(u)))
|
||||
#define _mm256_mask_fpclass_ps_mask(u, X, C) ((__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) (__m256) (X), (int) (C),(__mmask8)(u)))
|
||||
#define _mm_mask_fpclass_pd_mask(u, X, C) ((__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) (__m128d) (X), (int) (C),(__mmask8)(u)))
|
||||
|
1869
third_party/intel/avx512vlintrin.internal.h
vendored
File diff suppressed because it is too large
@@ -4,9 +4,9 @@
#endif
#ifndef __AVX512VNNIINTRIN_H_INCLUDED
#define __AVX512VNNIINTRIN_H_INCLUDED
#if !defined(__AVX512VNNI__)
#if !defined(__AVX512VNNI__) || !defined (__EVEX512__)
#pragma GCC push_options
#pragma GCC target("avx512vnni")
#pragma GCC target("avx512vnni,evex512")
#define __DISABLE_AVX512VNNI__
#endif
extern __inline __m512i
@@ -4,9 +4,9 @@
#endif
#ifndef _AVX512VNNIVLINTRIN_H_INCLUDED
#define _AVX512VNNIVLINTRIN_H_INCLUDED
#if !defined(__AVX512VL__) || !defined(__AVX512VNNI__)
#if !defined(__AVX512VL__) || !defined(__AVX512VNNI__) || defined (__EVEX512__)
#pragma GCC push_options
#pragma GCC target("avx512vnni,avx512vl")
#pragma GCC target("avx512vnni,avx512vl,no-evex512")
#define __DISABLE_AVX512VNNIVL__
#endif
#define _mm256_dpbusd_epi32(A, B, C) ((__m256i) __builtin_ia32_vpdpbusd_v8si ((__v8si) (A), (__v8si) (B), (__v8si) (C)))
@@ -4,9 +4,9 @@
#endif
#ifndef _AVX512VP2INTERSECTINTRIN_H_INCLUDED
#define _AVX512VP2INTERSECTINTRIN_H_INCLUDED
#if !defined(__AVX512VP2INTERSECT__)
#if !defined(__AVX512VP2INTERSECT__) || !defined (__EVEX512__)
#pragma GCC push_options
#pragma GCC target("avx512vp2intersect")
#pragma GCC target("avx512vp2intersect,evex512")
#define __DISABLE_AVX512VP2INTERSECT__
#endif
extern __inline void
@@ -4,9 +4,9 @@
#endif
#ifndef _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED
#define _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED
#if !defined(__AVX512VP2INTERSECT__) || !defined(__AVX512VL__)
#if !defined(__AVX512VP2INTERSECT__) || !defined(__AVX512VL__) || defined (__EVEX512__)
#pragma GCC push_options
#pragma GCC target("avx512vp2intersect,avx512vl")
#pragma GCC target("avx512vp2intersect,avx512vl,no-evex512")
#define __DISABLE_AVX512VP2INTERSECTVL__
#endif
extern __inline void
@@ -4,9 +4,9 @@
#endif
#ifndef _AVX512VPOPCNTDQINTRIN_H_INCLUDED
#define _AVX512VPOPCNTDQINTRIN_H_INCLUDED
#ifndef __AVX512VPOPCNTDQ__
#if !defined (__AVX512VPOPCNTDQ__) || !defined (__EVEX512__)
#pragma GCC push_options
#pragma GCC target("avx512vpopcntdq")
#pragma GCC target("avx512vpopcntdq,evex512")
#define __DISABLE_AVX512VPOPCNTDQ__
#endif
extern __inline __m512i
@@ -4,9 +4,9 @@
#endif
#ifndef _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED
#define _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED
#if !defined(__AVX512VPOPCNTDQ__) || !defined(__AVX512VL__)
#if !defined(__AVX512VPOPCNTDQ__) || !defined(__AVX512VL__) || defined (__EVEX512__)
#pragma GCC push_options
#pragma GCC target("avx512vpopcntdq,avx512vl")
#pragma GCC target("avx512vpopcntdq,avx512vl,no-evex512")
#define __DISABLE_AVX512VPOPCNTDQVL__
#endif
extern __inline __m128i
@@ -29,7 +29,7 @@ _mm_maskz_popcnt_epi32 (__mmask16 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_vpopcountd_v4si_mask ((__v4si) __A,
        (__v4si)
        _mm_setzero_si128 (),
        _mm_avx512_setzero_si128 (),
        (__mmask16) __U);
}
extern __inline __m256i
@@ -52,7 +52,7 @@ _mm256_maskz_popcnt_epi32 (__mmask16 __U, __m256i __A)
{
  return (__m256i) __builtin_ia32_vpopcountd_v8si_mask ((__v8si) __A,
        (__v8si)
        _mm256_setzero_si256 (),
        _mm256_avx512_setzero_si256 (),
        (__mmask16) __U);
}
extern __inline __m128i
@@ -75,7 +75,7 @@ _mm_maskz_popcnt_epi64 (__mmask8 __U, __m128i __A)
{
  return (__m128i) __builtin_ia32_vpopcountq_v2di_mask ((__v2di) __A,
        (__v2di)
        _mm_setzero_si128 (),
        _mm_avx512_setzero_si128 (),
        (__mmask8) __U);
}
extern __inline __m256i
@@ -98,7 +98,7 @@ _mm256_maskz_popcnt_epi64 (__mmask8 __U, __m256i __A)
{
  return (__m256i) __builtin_ia32_vpopcountq_v4di_mask ((__v4di) __A,
        (__v4di)
        _mm256_setzero_si256 (),
        _mm256_avx512_setzero_si256 (),
        (__mmask8) __U);
}
#ifdef __DISABLE_AVX512VPOPCNTDQVL__
49
third_party/intel/avxifmaintrin.internal.h
vendored
Normal file
@@ -0,0 +1,49 @@
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVXIFMAINTRIN_H_INCLUDED
#define _AVXIFMAINTRIN_H_INCLUDED
#ifndef __AVXIFMA__
#pragma GCC push_options
#pragma GCC target("avxifma")
#define __DISABLE_AVXIFMA__
#endif
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
{
  return (__m128i) __builtin_ia32_vpmadd52luq128 ((__v2di) __X,
        (__v2di) __Y,
        (__v2di) __Z);
}
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
{
  return (__m128i) __builtin_ia32_vpmadd52huq128 ((__v2di) __X,
        (__v2di) __Y,
        (__v2di) __Z);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
{
  return (__m256i) __builtin_ia32_vpmadd52luq256 ((__v4di) __X,
        (__v4di) __Y,
        (__v4di) __Z);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
{
  return (__m256i) __builtin_ia32_vpmadd52huq256 ((__v4di) __X,
        (__v4di) __Y,
        (__v4di) __Z);
}
#ifdef __DISABLE_AVXIFMA__
#undef __DISABLE_AVXIFMA__
#pragma GCC pop_options
#endif
#endif
#endif
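The new avxifmaintrin.internal.h above exposes the VEX-encoded 52-bit integer fused multiply-add (AVX-IFMA) on 128- and 256-bit vectors, without requiring AVX-512. A hedged usage sketch (the wrapper below is an example, not part of the header; it assumes an AVX-IFMA-capable toolchain and CPU):

#include <immintrin.h>
/* Example only: acc[i] += low 52 bits of (a[i] * b[i]), where a and b
   carry unsigned 52-bit values in each 64-bit lane. */
__attribute__ ((target ("avxifma")))
static __m256i
mac52lo (__m256i acc, __m256i a, __m256i b)
{
  return _mm256_madd52lo_avx_epu64 (acc, a, b);
}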
9
third_party/intel/avxintrin.internal.h
vendored
@@ -872,19 +872,28 @@ _mm256_movemask_ps (__m256 __A)
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_pd (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m256d __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_ps (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m256 __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_undefined_si256 (void)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Winit-self"
  __m256i __Y = __Y;
#pragma GCC diagnostic pop
  return __Y;
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
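The avxintrin.internal.h hunk above wraps the intentional self-initialization idiom (__m256d __Y = __Y;) in diagnostic push/ignored/pop pragmas so that translation units built with -Winit-self no longer warn from inside the header. A hedged example of how callers typically rely on these helpers (not taken from the tree): an undefined vector avoids a needless zeroing instruction when the lanes in question are never read.

#include <immintrin.h>
/* Example only: widen a 128-bit vector; the upper lanes stay unspecified
   because the caller never reads them. */
__attribute__ ((target ("avx")))
static __m256
widen_low (__m128 lo)
{
  return _mm256_insertf128_ps (_mm256_undefined_ps (), lo, 0);
}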
101
third_party/intel/avxneconvertintrin.internal.h
vendored
Normal file
@ -0,0 +1,101 @@
|
|||
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#ifndef _IMMINTRIN_H_INCLUDED
|
||||
#error "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
#ifndef _AVXNECONVERTINTRIN_H_INCLUDED
|
||||
#define _AVXNECONVERTINTRIN_H_INCLUDED
|
||||
#ifndef __AVXNECONVERT__
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target ("avxneconvert")
|
||||
#define __DISABLE_AVXNECONVERT__
|
||||
#endif
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_bcstnebf16_ps (const void *__P)
|
||||
{
|
||||
return (__m128) __builtin_ia32_vbcstnebf162ps128 ((const __bf16 *) __P);
|
||||
}
|
||||
extern __inline __m256
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_bcstnebf16_ps (const void *__P)
|
||||
{
|
||||
return (__m256) __builtin_ia32_vbcstnebf162ps256 ((const __bf16 *) __P);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_bcstnesh_ps (const void *__P)
|
||||
{
|
||||
return (__m128) __builtin_ia32_vbcstnesh2ps128 ((const _Float16 *) __P);
|
||||
}
|
||||
extern __inline __m256
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_bcstnesh_ps (const void *__P)
|
||||
{
|
||||
return (__m256) __builtin_ia32_vbcstnesh2ps256 ((const _Float16 *) __P);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtneebf16_ps (const __m128bh *__A)
|
||||
{
|
||||
return (__m128) __builtin_ia32_vcvtneebf162ps128 ((const __v8bf *) __A);
|
||||
}
|
||||
extern __inline __m256
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_cvtneebf16_ps (const __m256bh *__A)
|
||||
{
|
||||
return (__m256) __builtin_ia32_vcvtneebf162ps256 ((const __v16bf *) __A);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtneeph_ps (const __m128h *__A)
|
||||
{
|
||||
return (__m128) __builtin_ia32_vcvtneeph2ps128 ((const __v8hf *) __A);
|
||||
}
|
||||
extern __inline __m256
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_cvtneeph_ps (const __m256h *__A)
|
||||
{
|
||||
return (__m256) __builtin_ia32_vcvtneeph2ps256 ((const __v16hf *) __A);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtneobf16_ps (const __m128bh *__A)
|
||||
{
|
||||
return (__m128) __builtin_ia32_vcvtneobf162ps128 ((const __v8bf *) __A);
|
||||
}
|
||||
extern __inline __m256
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_cvtneobf16_ps (const __m256bh *__A)
|
||||
{
|
||||
return (__m256) __builtin_ia32_vcvtneobf162ps256 ((const __v16bf *) __A);
|
||||
}
|
||||
extern __inline __m128
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtneoph_ps (const __m128h *__A)
|
||||
{
|
||||
return (__m128) __builtin_ia32_vcvtneoph2ps128 ((const __v8hf *) __A);
|
||||
}
|
||||
extern __inline __m256
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_cvtneoph_ps (const __m256h *__A)
|
||||
{
|
||||
return (__m256) __builtin_ia32_vcvtneoph2ps256 ((const __v16hf *) __A);
|
||||
}
|
||||
extern __inline __m128bh
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_cvtneps_avx_pbh (__m128 __A)
|
||||
{
|
||||
return (__m128bh) __builtin_ia32_cvtneps2bf16_v4sf (__A);
|
||||
}
|
||||
extern __inline __m128bh
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_cvtneps_avx_pbh (__m256 __A)
|
||||
{
|
||||
return (__m128bh) __builtin_ia32_cvtneps2bf16_v8sf (__A);
|
||||
}
|
||||
#ifdef __DISABLE_AVXNECONVERT__
|
||||
#undef __DISABLE_AVXNECONVERT__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
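The new avxneconvertintrin.internal.h above provides VEX-encoded bf16/fp16-to-fp32 conversion loads (AVX-NE-CONVERT). A hedged usage sketch (example wrapper, not part of the header): broadcast a single bf16 scalar from memory into every fp32 lane.

#include <immintrin.h>
/* Example only; p points at one __bf16 value in memory. */
__attribute__ ((target ("avxneconvert")))
static __m256
splat_bf16 (const void *p)
{
  return _mm256_bcstnebf16_ps (p);
}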
101
third_party/intel/avxvnniint16intrin.internal.h
vendored
Normal file
@ -0,0 +1,101 @@
|
|||
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#if !defined _IMMINTRIN_H_INCLUDED
|
||||
#error "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
#ifndef _AVXVNNIINT16INTRIN_H_INCLUDED
|
||||
#define _AVXVNNIINT16INTRIN_H_INCLUDED
|
||||
#if !defined(__AVXVNNIINT16__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avxvnniint16")
|
||||
#define __DISABLE_AVXVNNIINT16__
|
||||
#endif
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_dpwsud_avx_epi32 (__m128i __W, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vpdpwsud128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_dpwsuds_avx_epi32 (__m128i __W, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vpdpwsuds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_dpwusd_avx_epi32 (__m128i __W, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vpdpwusd128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_dpwusds_avx_epi32 (__m128i __W, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vpdpwusds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_dpwuud_avx_epi32 (__m128i __W, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vpdpwuud128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_dpwuuds_avx_epi32 (__m128i __W, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vpdpwuuds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_dpwsud_avx_epi32 (__m256i __W, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)
|
||||
__builtin_ia32_vpdpwsud256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_dpwsuds_avx_epi32 (__m256i __W, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)
|
||||
__builtin_ia32_vpdpwsuds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_dpwusd_avx_epi32 (__m256i __W, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)
|
||||
__builtin_ia32_vpdpwusd256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_dpwusds_avx_epi32 (__m256i __W, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)
|
||||
__builtin_ia32_vpdpwusds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_dpwuud_avx_epi32 (__m256i __W, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)
|
||||
__builtin_ia32_vpdpwuud256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_dpwuuds_avx_epi32 (__m256i __W, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)
|
||||
__builtin_ia32_vpdpwuuds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
|
||||
}
|
||||
#ifdef __DISABLE_AVXVNNIINT16__
|
||||
#undef __DISABLE_AVXVNNIINT16__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
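Reviewer note: a minimal sketch of the mixed-sign 16-bit dot-product intrinsics defined above (hypothetical names; assumes <immintrin.h> and -mavxvnniint16; operand roles follow the s/u letters in the mnemonic):
  #include <immintrin.h>

  /* acc lane i += pairwise dot product of signed words with unsigned words
     (VPDPWSUD form). */
  __m128i dot16_accumulate (__m128i acc, __m128i signed_words, __m128i unsigned_words)
  {
    return _mm_dpwsud_avx_epi32 (acc, signed_words, unsigned_words);
  }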
101
third_party/intel/avxvnniint8intrin.internal.h
vendored
Normal file
@@ -0,0 +1,101 @@
|
|||
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#if !defined _IMMINTRIN_H_INCLUDED
|
||||
#error "Never use <avxvnniint8vlintrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
#ifndef _AVXVNNIINT8INTRIN_H_INCLUDED
|
||||
#define _AVXVNNIINT8INTRIN_H_INCLUDED
|
||||
#if !defined(__AVXVNNIINT8__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("avxvnniint8")
|
||||
#define __DISABLE_AVXVNNIINT8__
|
||||
#endif
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_dpbssd_epi32 (__m128i __W, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vpdpbssd128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_dpbssds_epi32 (__m128i __W, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vpdpbssds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_dpbsud_epi32 (__m128i __W, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vpdpbsud128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_dpbsuds_epi32 (__m128i __W, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vpdpbsuds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_dpbuud_epi32 (__m128i __W, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vpdpbuud128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_dpbuuds_epi32 (__m128i __W, __m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i)
|
||||
__builtin_ia32_vpdpbuuds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_dpbssd_epi32 (__m256i __W, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)
|
||||
__builtin_ia32_vpdpbssd256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_dpbssds_epi32 (__m256i __W, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)
|
||||
__builtin_ia32_vpdpbssds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_dpbsud_epi32 (__m256i __W, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)
|
||||
__builtin_ia32_vpdpbsud256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_dpbsuds_epi32 (__m256i __W, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)
|
||||
__builtin_ia32_vpdpbsuds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_dpbuud_epi32 (__m256i __W, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)
|
||||
__builtin_ia32_vpdpbuud256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_dpbuuds_epi32 (__m256i __W, __m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i)
|
||||
__builtin_ia32_vpdpbuuds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
|
||||
}
|
||||
#ifdef __DISABLE_AVXVNNIINT8__
|
||||
#undef __DISABLE_AVXVNNIINT8__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
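Reviewer note: the 8-bit counterparts above follow the same pattern; a minimal sketch (hypothetical names; assumes <immintrin.h> and -mavxvnniint8):
  #include <immintrin.h>

  /* acc lane i += four signed-byte by signed-byte products (VPDPBSSD form). */
  __m256i dot8_accumulate (__m256i acc, __m256i a, __m256i b)
  {
    return _mm256_dpbssd_epi32 (acc, a, b);
  }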
6
third_party/intel/bmmintrin.internal.h
vendored
Normal file
@@ -0,0 +1,6 @@
|
|||
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#ifndef _BMMINTRIN_H_INCLUDED
|
||||
#define _BMMINTRIN_H_INCLUDED
|
||||
# error "SSE5 instruction set removed from compiler"
|
||||
#endif
|
||||
#endif
|
55
third_party/intel/cmpccxaddintrin.internal.h
vendored
Normal file
@@ -0,0 +1,55 @@
|
|||
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#ifndef _X86GPRINTRIN_H_INCLUDED
|
||||
#error "Never use <cmpccxaddintrin.h> directly; include <x86gprintrin.h> instead."
|
||||
#endif
|
||||
#ifndef _CMPCCXADDINTRIN_H_INCLUDED
|
||||
#define _CMPCCXADDINTRIN_H_INCLUDED
|
||||
#ifdef __x86_64__
|
||||
#ifndef __CMPCCXADD__
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("cmpccxadd")
|
||||
#define __DISABLE_CMPCCXADD__
|
||||
#endif
|
||||
typedef enum {
|
||||
_CMPCCX_O,
|
||||
_CMPCCX_NO,
|
||||
_CMPCCX_B,
|
||||
_CMPCCX_NB,
|
||||
_CMPCCX_Z,
|
||||
_CMPCCX_NZ,
|
||||
_CMPCCX_BE,
|
||||
_CMPCCX_NBE,
|
||||
_CMPCCX_S,
|
||||
_CMPCCX_NS,
|
||||
_CMPCCX_P,
|
||||
_CMPCCX_NP,
|
||||
_CMPCCX_L,
|
||||
_CMPCCX_NL,
|
||||
_CMPCCX_LE,
|
||||
_CMPCCX_NLE,
|
||||
} _CMPCCX_ENUM;
|
||||
#ifdef __OPTIMIZE__
|
||||
extern __inline int
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_cmpccxadd_epi32 (int *__A, int __B, int __C, const _CMPCCX_ENUM __D)
|
||||
{
|
||||
return __builtin_ia32_cmpccxadd (__A, __B, __C, __D);
|
||||
}
|
||||
extern __inline long long
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_cmpccxadd_epi64 (long long *__A, long long __B, long long __C,
|
||||
const _CMPCCX_ENUM __D)
|
||||
{
|
||||
return __builtin_ia32_cmpccxadd64 (__A, __B, __C, __D);
|
||||
}
|
||||
#else
|
||||
#define _cmpccxadd_epi32(A,B,C,D) __builtin_ia32_cmpccxadd ((int *) (A), (int) (B), (int) (C), (_CMPCCX_ENUM) (D))
|
||||
#define _cmpccxadd_epi64(A,B,C,D) __builtin_ia32_cmpccxadd64 ((long long *) (A), (long long) (B), (long long) (C), (_CMPCCX_ENUM) (D))
|
||||
#endif
|
||||
#ifdef __DISABLE_CMPCCXADD__
|
||||
#undef __DISABLE_CMPCCXADD__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
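Reviewer note: a hedged sketch of the new CMPccXADD intrinsics above (hypothetical names; assumes <x86gprintrin.h> and -mcmpccxadd; the exact compare direction and return value follow the ISA definition of CMPccXADD):
  #include <x86gprintrin.h>

  /* Atomically add delta to *counter when the Z condition holds for the
     comparison against expected; the prior memory value is returned. */
  int add_if_equal (int *counter, int expected, int delta)
  {
    return _cmpccxadd_epi32 (counter, expected, delta, _CMPCCX_Z);
  }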
44
third_party/intel/cpuid.internal.h
vendored
@@ -1,9 +1,6 @@
|
|||
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#ifndef _CPUID_H_INCLUDED
|
||||
#define _CPUID_H_INCLUDED
|
||||
#define bit_AVXVNNI (1 << 4)
|
||||
#define bit_AVX512BF16 (1 << 5)
|
||||
#define bit_HRESET (1 << 22)
|
||||
#define bit_SSE3 (1 << 0)
|
||||
#define bit_PCLMUL (1 << 1)
|
||||
#define bit_LZCNT (1 << 5)
|
||||
|
@@ -70,34 +67,54 @@
|
|||
#define bit_SHSTK (1 << 7)
|
||||
#define bit_GFNI (1 << 8)
|
||||
#define bit_VAES (1 << 9)
|
||||
#define bit_AVX512VNNI (1 << 11)
|
||||
#define bit_VPCLMULQDQ (1 << 10)
|
||||
#define bit_AVX512VNNI (1 << 11)
|
||||
#define bit_AVX512BITALG (1 << 12)
|
||||
#define bit_AVX512VPOPCNTDQ (1 << 14)
|
||||
#define bit_RDPID (1 << 22)
|
||||
#define bit_KL (1 << 23)
|
||||
#define bit_CLDEMOTE (1 << 25)
|
||||
#define bit_MOVDIRI (1 << 27)
|
||||
#define bit_MOVDIR64B (1 << 28)
|
||||
#define bit_ENQCMD (1 << 29)
|
||||
#define bit_CLDEMOTE (1 << 25)
|
||||
#define bit_KL (1 << 23)
|
||||
#define bit_AVX5124VNNIW (1 << 2)
|
||||
#define bit_AVX5124FMAPS (1 << 3)
|
||||
#define bit_AVX512VP2INTERSECT (1 << 8)
|
||||
#define bit_AVX512FP16 (1 << 23)
|
||||
#define bit_IBT (1 << 20)
|
||||
#define bit_UINTR (1 << 5)
|
||||
#define bit_PCONFIG (1 << 18)
|
||||
#define bit_AVX512VP2INTERSECT (1 << 8)
|
||||
#define bit_SERIALIZE (1 << 14)
|
||||
#define bit_TSXLDTRK (1 << 16)
|
||||
#define bit_PCONFIG (1 << 18)
|
||||
#define bit_IBT (1 << 20)
|
||||
#define bit_AMX_BF16 (1 << 22)
|
||||
#define bit_AVX512FP16 (1 << 23)
|
||||
#define bit_AMX_TILE (1 << 24)
|
||||
#define bit_AMX_INT8 (1 << 25)
|
||||
#define bit_SHA512 (1 << 0)
|
||||
#define bit_SM3 (1 << 1)
|
||||
#define bit_SM4 (1 << 2)
|
||||
#define bit_RAOINT (1 << 3)
|
||||
#define bit_AVXVNNI (1 << 4)
|
||||
#define bit_AVX512BF16 (1 << 5)
|
||||
#define bit_CMPCCXADD (1 << 7)
|
||||
#define bit_AMX_COMPLEX (1 << 8)
|
||||
#define bit_AMX_FP16 (1 << 21)
|
||||
#define bit_HRESET (1 << 22)
|
||||
#define bit_AVXIFMA (1 << 23)
|
||||
#define bit_AVXVNNIINT8 (1 << 4)
|
||||
#define bit_AVXNECONVERT (1 << 5)
|
||||
#define bit_AVXVNNIINT16 (1 << 10)
|
||||
#define bit_PREFETCHI (1 << 14)
|
||||
#define bit_USER_MSR (1 << 15)
|
||||
#define bit_AVX10 (1 << 19)
|
||||
#define bit_APX_F (1 << 21)
|
||||
#define bit_XSAVEOPT (1 << 0)
|
||||
#define bit_XSAVEC (1 << 1)
|
||||
#define bit_XSAVES (1 << 3)
|
||||
#define bit_PTWRITE (1 << 4)
|
||||
#define bit_AESKLE ( 1<<0 )
|
||||
#define bit_WIDEKL ( 1<<2 )
|
||||
#define bit_AVX10_256 (1 << 17)
|
||||
#define bit_AVX10_512 (1 << 18)
|
||||
#define signature_AMD_ebx 0x68747541
|
||||
#define signature_AMD_ecx 0x444d4163
|
||||
#define signature_AMD_edx 0x69746e65
|
||||
|
@@ -137,6 +154,9 @@
|
|||
#define signature_VORTEX_ebx 0x74726f56
|
||||
#define signature_VORTEX_ecx 0x436f5320
|
||||
#define signature_VORTEX_edx 0x36387865
|
||||
#define signature_SHANGHAI_ebx 0x68532020
|
||||
#define signature_SHANGHAI_ecx 0x20206961
|
||||
#define signature_SHANGHAI_edx 0x68676e61
|
||||
#ifndef __x86_64__
|
||||
#define __cpuid(level, a, b, c, d) do { if (__builtin_constant_p (level) && (level) != 1) __asm__ __volatile__ ("cpuid\n\t" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "0" (level)); else __asm__ __volatile__ ("cpuid\n\t" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "0" (level), "1" (0), "2" (0)); } while (0)
|
||||
#else
|
||||
|
@@ -175,7 +195,7 @@ __get_cpuid_max (unsigned int __ext, unsigned int *__sig)
|
|||
: "=&r" (__eax), "=&r" (__ebx)
|
||||
: "i" (0x00200000));
|
||||
#endif
|
||||
if (!((__eax ^ __ebx) & 0x00200000))
|
||||
if (__builtin_expect (!((__eax ^ __ebx) & 0x00200000), 0))
|
||||
return 0;
|
||||
#endif
|
||||
__cpuid (__ext, __eax, __ebx, __ecx, __edx);
|
||||
|
@@ -202,7 +222,7 @@ __get_cpuid_count (unsigned int __leaf, unsigned int __subleaf,
|
|||
{
|
||||
unsigned int __ext = __leaf & 0x80000000;
|
||||
unsigned int __maxlevel = __get_cpuid_max (__ext, 0);
|
||||
if (__maxlevel == 0 || __maxlevel < __leaf)
|
||||
if (__builtin_expect (__maxlevel == 0, 0) || __maxlevel < __leaf)
|
||||
return 0;
|
||||
__cpuid_count (__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
|
||||
return 1;
|
||||
|
|
6
third_party/intel/emmintrin.internal.h
vendored
@@ -50,7 +50,10 @@ _mm_setr_pd (double __W, double __X)
|
|||
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_undefined_pd (void)
|
||||
{
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Winit-self"
|
||||
__m128d __Y = __Y;
|
||||
#pragma GCC diagnostic pop
|
||||
return __Y;
|
||||
}
|
||||
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@@ -605,7 +608,10 @@ _mm_move_epi64 (__m128i __A)
|
|||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_undefined_si128 (void)
|
||||
{
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Winit-self"
|
||||
__m128i __Y = __Y;
|
||||
#pragma GCC diagnostic pop
|
||||
return __Y;
|
||||
}
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
|
82
third_party/intel/gfniintrin.internal.h
vendored
@@ -94,7 +94,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm_maskz_gf2p8mul_epi8 (__mmask16 __A, __m128i __B, __m128i __C)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi_mask ((__v16qi) __B,
|
||||
(__v16qi) __C, (__v16qi) _mm_setzero_si128 (), __A);
|
||||
(__v16qi) __C, (__v16qi) _mm_avx512_setzero_si128 (), __A);
|
||||
}
|
||||
#ifdef __OPTIMIZE__
|
||||
extern __inline __m128i
|
||||
|
@@ -115,7 +115,7 @@ _mm_maskz_gf2p8affineinv_epi64_epi8 (__mmask16 __A, __m128i __B, __m128i __C,
|
|||
{
|
||||
return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask ((__v16qi) __B,
|
||||
(__v16qi) __C, __D,
|
||||
(__v16qi) _mm_setzero_si128 (),
|
||||
(__v16qi) _mm_avx512_setzero_si128 (),
|
||||
__A);
|
||||
}
|
||||
extern __inline __m128i
|
||||
|
@@ -132,13 +132,13 @@ _mm_maskz_gf2p8affine_epi64_epi8 (__mmask16 __A, __m128i __B, __m128i __C,
|
|||
const int __D)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask ((__v16qi) __B,
|
||||
(__v16qi) __C, __D, (__v16qi) _mm_setzero_si128 (), __A);
|
||||
(__v16qi) __C, __D, (__v16qi) _mm_avx512_setzero_si128 (), __A);
|
||||
}
|
||||
#else
|
||||
#define _mm_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask( (__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D), (int)(E), (__v16qi)(__m128i)(A), (__mmask16)(B)))
|
||||
#define _mm_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask( (__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), (int)(D), (__v16qi)(__m128i) _mm_setzero_si128 (), (__mmask16)(A)))
|
||||
#define _mm_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask( (__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), (int)(D), (__v16qi)(__m128i) _mm_avx512_setzero_si128 (), (__mmask16)(A)))
|
||||
#define _mm_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D), (int)(E), (__v16qi)(__m128i)(A), (__mmask16)(B)))
|
||||
#define _mm_maskz_gf2p8affine_epi64_epi8(A, B, C, D) ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), (int)(D), (__v16qi)(__m128i) _mm_setzero_si128 (), (__mmask16)(A)))
|
||||
#define _mm_maskz_gf2p8affine_epi64_epi8(A, B, C, D) ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), (int)(D), (__v16qi)(__m128i) _mm_avx512_setzero_si128 (), (__mmask16)(A)))
|
||||
#endif
|
||||
#ifdef __DISABLE_GFNIAVX512VL__
|
||||
#undef __DISABLE_GFNIAVX512VL__
|
||||
|
@@ -163,7 +163,7 @@ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
|||
_mm256_maskz_gf2p8mul_epi8 (__mmask32 __A, __m256i __B, __m256i __C)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi_mask ((__v32qi) __B,
|
||||
(__v32qi) __C, (__v32qi) _mm256_setzero_si256 (), __A);
|
||||
(__v32qi) __C, (__v32qi) _mm256_avx512_setzero_si256 (), __A);
|
||||
}
|
||||
#ifdef __OPTIMIZE__
|
||||
extern __inline __m256i
|
||||
|
@@ -184,7 +184,7 @@ _mm256_maskz_gf2p8affineinv_epi64_epi8 (__mmask32 __A, __m256i __B,
|
|||
{
|
||||
return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask ((__v32qi) __B,
|
||||
(__v32qi) __C, __D,
|
||||
(__v32qi) _mm256_setzero_si256 (), __A);
|
||||
(__v32qi) _mm256_avx512_setzero_si256 (), __A);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@@ -203,21 +203,56 @@ _mm256_maskz_gf2p8affine_epi64_epi8 (__mmask32 __A, __m256i __B,
|
|||
__m256i __C, const int __D)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask ((__v32qi) __B,
|
||||
(__v32qi) __C, __D, (__v32qi)_mm256_setzero_si256 (), __A);
|
||||
(__v32qi) __C, __D, (__v32qi)_mm256_avx512_setzero_si256 (), __A);
|
||||
}
|
||||
#else
|
||||
#define _mm256_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask( (__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E), (__v32qi)(__m256i)(A), (__mmask32)(B)))
|
||||
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask( (__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D), (__v32qi)(__m256i) _mm256_setzero_si256 (), (__mmask32)(A)))
|
||||
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask( (__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D), (__v32qi)(__m256i) _mm256_avx512_setzero_si256 (), (__mmask32)(A)))
|
||||
#define _mm256_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E), (__v32qi)(__m256i)(A), (__mmask32)(B)))
|
||||
#define _mm256_maskz_gf2p8affine_epi64_epi8(A, B, C, D) ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D), (__v32qi)(__m256i) _mm256_setzero_si256 (), (__mmask32)(A)))
|
||||
#define _mm256_maskz_gf2p8affine_epi64_epi8(A, B, C, D) ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D), (__v32qi)(__m256i) _mm256_avx512_setzero_si256 (), (__mmask32)(A)))
|
||||
#endif
|
||||
#ifdef __DISABLE_GFNIAVX512VLBW__
|
||||
#undef __DISABLE_GFNIAVX512VLBW__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#if !defined(__GFNI__) || !defined(__AVX512F__) || !defined(__AVX512BW__)
|
||||
#if !defined(__GFNI__) || !defined(__EVEX512__) || !defined(__AVX512F__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("gfni,avx512f,avx512bw")
|
||||
#pragma GCC target("gfni,avx512f,evex512")
|
||||
#define __DISABLE_GFNIAVX512F__
|
||||
#endif
|
||||
extern __inline __m512i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_gf2p8mul_epi8 (__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi ((__v64qi) __A,
|
||||
(__v64qi) __B);
|
||||
}
|
||||
#ifdef __OPTIMIZE__
|
||||
extern __inline __m512i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_gf2p8affineinv_epi64_epi8 (__m512i __A, __m512i __B, const int __C)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi ((__v64qi) __A,
|
||||
(__v64qi) __B, __C);
|
||||
}
|
||||
extern __inline __m512i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_gf2p8affine_epi64_epi8 (__m512i __A, __m512i __B, const int __C)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi) __A,
|
||||
(__v64qi) __B, __C);
|
||||
}
|
||||
#else
|
||||
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, C) ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi ( (__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C)))
|
||||
#define _mm512_gf2p8affine_epi64_epi8(A, B, C) ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C)))
|
||||
#endif
|
||||
#ifdef __DISABLE_GFNIAVX512F__
|
||||
#undef __DISABLE_GFNIAVX512F__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#if !defined(__GFNI__) || !defined(__EVEX512__) || !defined(__AVX512BW__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("gfni,avx512bw,evex512")
|
||||
#define __DISABLE_GFNIAVX512FBW__
|
||||
#endif
|
||||
extern __inline __m512i
|
||||
|
@@ -235,13 +270,6 @@ _mm512_maskz_gf2p8mul_epi8 (__mmask64 __A, __m512i __B, __m512i __C)
|
|||
return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi_mask ((__v64qi) __B,
|
||||
(__v64qi) __C, (__v64qi) _mm512_setzero_si512 (), __A);
|
||||
}
|
||||
extern __inline __m512i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_gf2p8mul_epi8 (__m512i __A, __m512i __B)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi ((__v64qi) __A,
|
||||
(__v64qi) __B);
|
||||
}
|
||||
#ifdef __OPTIMIZE__
|
||||
extern __inline __m512i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
@@ -265,13 +293,6 @@ _mm512_maskz_gf2p8affineinv_epi64_epi8 (__mmask64 __A, __m512i __B,
|
|||
}
|
||||
extern __inline __m512i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_gf2p8affineinv_epi64_epi8 (__m512i __A, __m512i __B, const int __C)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi ((__v64qi) __A,
|
||||
(__v64qi) __B, __C);
|
||||
}
|
||||
extern __inline __m512i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_mask_gf2p8affine_epi64_epi8 (__m512i __A, __mmask64 __B, __m512i __C,
|
||||
__m512i __D, const int __E)
|
||||
{
|
||||
|
@@ -286,20 +307,11 @@ _mm512_maskz_gf2p8affine_epi64_epi8 (__mmask64 __A, __m512i __B, __m512i __C,
|
|||
return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask ((__v64qi) __B,
|
||||
(__v64qi) __C, __D, (__v64qi) _mm512_setzero_si512 (), __A);
|
||||
}
|
||||
extern __inline __m512i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm512_gf2p8affine_epi64_epi8 (__m512i __A, __m512i __B, const int __C)
|
||||
{
|
||||
return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi) __A,
|
||||
(__v64qi) __B, __C);
|
||||
}
|
||||
#else
|
||||
#define _mm512_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask( (__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E), (__v64qi)(__m512i)(A), (__mmask64)(B)))
|
||||
#define _mm512_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask( (__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D), (__v64qi)(__m512i) _mm512_setzero_si512 (), (__mmask64)(A)))
|
||||
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, C) ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi ( (__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C)))
|
||||
#define _mm512_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask((__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E), (__v64qi)(__m512i)(A), (__mmask64)(B)))
|
||||
#define _mm512_maskz_gf2p8affine_epi64_epi8(A, B, C, D) ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask((__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D), (__v64qi)(__m512i) _mm512_setzero_si512 (), (__mmask64)(A)))
|
||||
#define _mm512_gf2p8affine_epi64_epi8(A, B, C) ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C)))
|
||||
#endif
|
||||
#ifdef __DISABLE_GFNIAVX512FBW__
|
||||
#undef __DISABLE_GFNIAVX512FBW__
|
||||
|
|
12
third_party/intel/immintrin.internal.h
vendored
@@ -11,6 +11,9 @@
|
|||
#include "third_party/intel/wmmintrin.internal.h"
|
||||
#include "third_party/intel/avxintrin.internal.h"
|
||||
#include "third_party/intel/avxvnniintrin.internal.h"
|
||||
#include "third_party/intel/avxifmaintrin.internal.h"
|
||||
#include "third_party/intel/avxvnniint8intrin.internal.h"
|
||||
#include "third_party/intel/avxvnniint16intrin.internal.h"
|
||||
#include "third_party/intel/avx2intrin.internal.h"
|
||||
#include "third_party/intel/avx512fintrin.internal.h"
|
||||
#include "third_party/intel/avx512erintrin.internal.h"
|
||||
|
@@ -34,13 +37,15 @@
|
|||
#include "third_party/intel/avx512vnnivlintrin.internal.h"
|
||||
#include "third_party/intel/avx512vpopcntdqvlintrin.internal.h"
|
||||
#include "third_party/intel/avx512bitalgintrin.internal.h"
|
||||
#include "third_party/intel/avx512bitalgvlintrin.internal.h"
|
||||
#include "third_party/intel/avx512vp2intersectintrin.internal.h"
|
||||
#include "third_party/intel/avx512vp2intersectvlintrin.internal.h"
|
||||
#ifdef __SSE2__
|
||||
#include "third_party/intel/avx512fp16intrin.internal.h"
|
||||
#include "third_party/intel/avx512fp16vlintrin.internal.h"
|
||||
#endif
|
||||
#include "third_party/intel/shaintrin.internal.h"
|
||||
#include "third_party/intel/sm3intrin.internal.h"
|
||||
#include "third_party/intel/sha512intrin.internal.h"
|
||||
#include "third_party/intel/sm4intrin.internal.h"
|
||||
#include "third_party/intel/fmaintrin.internal.h"
|
||||
#include "third_party/intel/f16cintrin.internal.h"
|
||||
#include "third_party/intel/rtmintrin.internal.h"
|
||||
|
@@ -49,10 +54,13 @@
|
|||
#include "third_party/intel/vpclmulqdqintrin.internal.h"
|
||||
#include "third_party/intel/avx512bf16vlintrin.internal.h"
|
||||
#include "third_party/intel/avx512bf16intrin.internal.h"
|
||||
#include "third_party/intel/avxneconvertintrin.internal.h"
|
||||
#include "third_party/intel/amxtileintrin.internal.h"
|
||||
#include "third_party/intel/amxint8intrin.internal.h"
|
||||
#include "third_party/intel/amxbf16intrin.internal.h"
|
||||
#include "third_party/intel/amxcomplexintrin.internal.h"
|
||||
#include "third_party/intel/prfchwintrin.internal.h"
|
||||
#include "third_party/intel/keylockerintrin.internal.h"
|
||||
#include "third_party/intel/amxfp16intrin.internal.h"
|
||||
#endif
|
||||
#endif
|
||||
|
|
24
third_party/intel/mm_malloc.internal.h
vendored
@@ -3,23 +3,27 @@
|
|||
#define _MM_MALLOC_H_INCLUDED
|
||||
#include "libc/mem/mem.h"
|
||||
#ifndef __cplusplus
|
||||
extern int _mm_posix_memalign(void **, size_t, size_t)
|
||||
extern int posix_memalign (void **, size_t, size_t);
|
||||
#else
|
||||
extern "C" int _mm_posix_memalign(void **, size_t, size_t) throw()
|
||||
extern "C" int posix_memalign (void **, size_t, size_t) throw ();
|
||||
#endif
|
||||
__asm__("posix_memalign");
|
||||
static __inline void *_mm_malloc(size_t __size, size_t __alignment) {
|
||||
static __inline void *
|
||||
_mm_malloc (size_t __size, size_t __alignment)
|
||||
{
|
||||
void *__ptr;
|
||||
if (__alignment == 1) return malloc(__size);
|
||||
if (__alignment == 2 || (sizeof(void *) == 8 && __alignment == 4))
|
||||
__alignment = sizeof(void *);
|
||||
if (_mm_posix_memalign(&__ptr, __alignment, __size) == 0)
|
||||
if (__alignment == 1)
|
||||
return malloc (__size);
|
||||
if (__alignment == 2 || (sizeof (void *) == 8 && __alignment == 4))
|
||||
__alignment = sizeof (void *);
|
||||
if (posix_memalign (&__ptr, __alignment, __size) == 0)
|
||||
return __ptr;
|
||||
else
|
||||
return NULL;
|
||||
}
|
||||
static __inline void _mm_free(void *__ptr) {
|
||||
free(__ptr);
|
||||
static __inline void
|
||||
_mm_free (void *__ptr)
|
||||
{
|
||||
free (__ptr);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
|
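Reviewer note: the rewritten _mm_malloc/_mm_free now route through plain posix_memalign; a minimal usage sketch (hypothetical names; assumes <immintrin.h>):
  #include <stddef.h>
  #include <immintrin.h>

  /* 64-byte-aligned scratch buffer, released with _mm_free. */
  void *make_scratch (size_t n)
  {
    return _mm_malloc (n, 64);
  }

  void drop_scratch (void *p)
  {
    _mm_free (p);
  }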
31
third_party/intel/prfchiintrin.internal.h
vendored
Normal file
@@ -0,0 +1,31 @@
|
|||
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#if !defined _X86GPRINTRIN_H_INCLUDED
|
||||
# error "Never use <prfchiintrin.h> directly; include <x86gprintrin.h> instead."
|
||||
#endif
|
||||
#ifndef _PRFCHIINTRIN_H_INCLUDED
|
||||
#define _PRFCHIINTRIN_H_INCLUDED
|
||||
#ifdef __x86_64__
|
||||
#ifndef __PREFETCHI__
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("prefetchi")
|
||||
#define __DISABLE_PREFETCHI__
|
||||
#endif
|
||||
extern __inline void
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_m_prefetchit0 (void* __P)
|
||||
{
|
||||
__builtin_ia32_prefetchi (__P, 3);
|
||||
}
|
||||
extern __inline void
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_m_prefetchit1 (void* __P)
|
||||
{
|
||||
__builtin_ia32_prefetchi (__P, 2);
|
||||
}
|
||||
#ifdef __DISABLE_PREFETCHI__
|
||||
#undef __DISABLE_PREFETCHI__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
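Reviewer note: a minimal sketch of the instruction-prefetch helpers above (hypothetical names; assumes <x86gprintrin.h> and -mprefetchi; the pointer should address code, not data):
  #include <x86gprintrin.h>

  /* Warm the instruction cache for a function that is about to be called
     (PREFETCHIT0, i.e. the T0 locality hint). */
  void warm_icache (void (*fn) (void))
  {
    _m_prefetchit0 ((void *) fn);
  }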
67
third_party/intel/raointintrin.internal.h
vendored
Normal file
@@ -0,0 +1,67 @@
|
|||
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#ifndef _X86GPRINTRIN_H_INCLUDED
|
||||
#error "Never use <raointintrin.h> directly; include <x86gprintrin.h> instead."
|
||||
#endif
|
||||
#ifndef __RAOINTINTRIN_H_INCLUDED
|
||||
#define __RAOINTINTRIN_H_INCLUDED
|
||||
#ifndef __RAOINT__
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("raoint")
|
||||
#define __DISABLE_RAOINT__
|
||||
#endif
|
||||
extern __inline void
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_aadd_i32 (int *__A, int __B)
|
||||
{
|
||||
__builtin_ia32_aadd32 ((int *)__A, __B);
|
||||
}
|
||||
extern __inline void
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_aand_i32 (int *__A, int __B)
|
||||
{
|
||||
__builtin_ia32_aand32 ((int *)__A, __B);
|
||||
}
|
||||
extern __inline void
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_aor_i32 (int *__A, int __B)
|
||||
{
|
||||
__builtin_ia32_aor32 ((int *)__A, __B);
|
||||
}
|
||||
extern __inline void
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_axor_i32 (int *__A, int __B)
|
||||
{
|
||||
__builtin_ia32_axor32 ((int *)__A, __B);
|
||||
}
|
||||
#ifdef __x86_64__
|
||||
extern __inline void
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_aadd_i64 (long long *__A, long long __B)
|
||||
{
|
||||
__builtin_ia32_aadd64 ((long long *)__A, __B);
|
||||
}
|
||||
extern __inline void
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_aand_i64 (long long *__A, long long __B)
|
||||
{
|
||||
__builtin_ia32_aand64 ((long long *)__A, __B);
|
||||
}
|
||||
extern __inline void
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_aor_i64 (long long *__A, long long __B)
|
||||
{
|
||||
__builtin_ia32_aor64 ((long long *)__A, __B);
|
||||
}
|
||||
extern __inline void
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_axor_i64 (long long *__A, long long __B)
|
||||
{
|
||||
__builtin_ia32_axor64 ((long long *)__A, __B);
|
||||
}
|
||||
#endif
|
||||
#ifdef __DISABLE_RAOINT__
|
||||
#undef __DISABLE_RAOINT__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
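Reviewer note: a minimal sketch of the remote-atomic RAO-INT helpers above (hypothetical names; assumes <x86gprintrin.h> and -mraoint):
  #include <x86gprintrin.h>

  /* Fire-and-forget atomic increment of a shared counter (AADD);
     no result is returned to the caller. */
  void count_event (int *counter)
  {
    _aadd_i32 (counter, 1);
  }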
36
third_party/intel/sha512intrin.internal.h
vendored
Normal file
@@ -0,0 +1,36 @@
|
|||
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#ifndef _IMMINTRIN_H_INCLUDED
|
||||
#error "Never use <sha512intrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
#ifndef _SHA512INTRIN_H_INCLUDED
|
||||
#define _SHA512INTRIN_H_INCLUDED
|
||||
#ifndef __SHA512__
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("sha512")
|
||||
#define __DISABLE_SHA512__
|
||||
#endif
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_sha512msg1_epi64 (__m256i __A, __m128i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vsha512msg1 ((__v4di) __A, (__v2di) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_sha512msg2_epi64 (__m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vsha512msg2 ((__v4di) __A, (__v4di) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_sha512rnds2_epi64 (__m256i __A, __m256i __B, __m128i __C)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vsha512rnds2 ((__v4di) __A, (__v4di) __B,
|
||||
(__v2di) __C);
|
||||
}
|
||||
#ifdef __DISABLE_SHA512__
|
||||
#undef __DISABLE_SHA512__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
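Reviewer note: a fragment-level sketch of how the SHA-512 intrinsics above compose in a message-schedule step (hypothetical names; assumes <immintrin.h> and -msha512; operand layout per the VSHA512MSG1/VSHA512MSG2 definitions, not a complete implementation):
  #include <immintrin.h>

  __m256i sha512_msg_step (__m256i w0, __m128i w1_lo, __m256i w2)
  {
    __m256i t = _mm256_sha512msg1_epi64 (w0, w1_lo);
    return _mm256_sha512msg2_epi64 (t, w2);
  }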
42
third_party/intel/sm3intrin.internal.h
vendored
Normal file
@@ -0,0 +1,42 @@
|
|||
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#ifndef _IMMINTRIN_H_INCLUDED
|
||||
#error "Never use <sm3intrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
#ifndef _SM3INTRIN_H_INCLUDED
|
||||
#define _SM3INTRIN_H_INCLUDED
|
||||
#ifndef __SM3__
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("sm3")
|
||||
#define __DISABLE_SM3__
|
||||
#endif
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_sm3msg1_epi32 (__m128i __A, __m128i __B, __m128i __C)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vsm3msg1 ((__v4si) __A, (__v4si) __B,
|
||||
(__v4si) __C);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_sm3msg2_epi32 (__m128i __A, __m128i __B, __m128i __C)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vsm3msg2 ((__v4si) __A, (__v4si) __B,
|
||||
(__v4si) __C);
|
||||
}
|
||||
#ifdef __OPTIMIZE__
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_sm3rnds2_epi32 (__m128i __A, __m128i __B, __m128i __C, const int __D)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vsm3rnds2 ((__v4si) __A, (__v4si) __B,
|
||||
(__v4si) __C, __D);
|
||||
}
|
||||
#else
|
||||
#define _mm_sm3rnds2_epi32(A, B, C, D) ((__m128i) __builtin_ia32_vsm3rnds2 ((__v4si) (A), (__v4si) (B), (__v4si) (C), (int) (D)))
|
||||
#endif
|
||||
#ifdef __DISABLE_SM3__
|
||||
#undef __DISABLE_SM3__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
41
third_party/intel/sm4intrin.internal.h
vendored
Normal file
@@ -0,0 +1,41 @@
|
|||
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#ifndef _IMMINTRIN_H_INCLUDED
|
||||
#error "Never use <sm4intrin.h> directly; include <immintrin.h> instead."
|
||||
#endif
|
||||
#ifndef _SM4INTRIN_H_INCLUDED
|
||||
#define _SM4INTRIN_H_INCLUDED
|
||||
#ifndef __SM4__
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("sm4")
|
||||
#define __DISABLE_SM4__
|
||||
#endif
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_sm4key4_epi32 (__m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vsm4key4128 ((__v4si) __A, (__v4si) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_sm4key4_epi32 (__m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vsm4key4256 ((__v8si) __A, (__v8si) __B);
|
||||
}
|
||||
extern __inline __m128i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_sm4rnds4_epi32 (__m128i __A, __m128i __B)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_vsm4rnds4128 ((__v4si) __A, (__v4si) __B);
|
||||
}
|
||||
extern __inline __m256i
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm256_sm4rnds4_epi32 (__m256i __A, __m256i __B)
|
||||
{
|
||||
return (__m256i) __builtin_ia32_vsm4rnds4256 ((__v8si) __A, (__v8si) __B);
|
||||
}
|
||||
#ifdef __DISABLE_SM4__
|
||||
#undef __DISABLE_SM4__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
8
third_party/intel/smmintrin.internal.h
vendored
@@ -224,12 +224,12 @@ _mm_insert_ps (__m128 __D, __m128 __S, const int __N)
|
|||
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_extract_ps (__m128 __X, const int __N)
|
||||
{
|
||||
union { int i; float f; } __tmp;
|
||||
__tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N);
|
||||
return __tmp.i;
|
||||
union { int __i; float __f; } __tmp;
|
||||
__tmp.__f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N);
|
||||
return __tmp.__i;
|
||||
}
|
||||
#else
|
||||
#define _mm_extract_ps(X, N) (__extension__ ({ union { int i; float f; } __tmp; __tmp.f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X), (int)(N)); __tmp.i; }))
|
||||
#define _mm_extract_ps(X, N) (__extension__ ({ union { int __i; float __f; } __tmp; __tmp.__f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X), (int)(N)); __tmp.__i; }))
|
||||
#endif
|
||||
#define _MM_EXTRACT_FLOAT(D, S, N) { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); }
|
||||
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps (), (X), _MM_MK_INSERTPS_NDX ((N), 0, 0x0e))
|
||||
|
|
15
third_party/intel/upgrade.sh
vendored
@@ -7,6 +7,8 @@ FILES='
|
|||
adxintrin
|
||||
ammintrin
|
||||
amxbf16intrin
|
||||
amxcomplexintrin
|
||||
amxfp16intrin
|
||||
amxint8intrin
|
||||
amxtileintrin
|
||||
avx2intrin
|
||||
|
@@ -15,6 +17,7 @@ avx5124vnniwintrin
|
|||
avx512bf16intrin
|
||||
avx512bf16vlintrin
|
||||
avx512bitalgintrin
|
||||
avx512bitalgvlintrin
|
||||
avx512bwintrin
|
||||
avx512cdintrin
|
||||
avx512dqintrin
|
||||
|
@@ -38,15 +41,21 @@ avx512vp2intersectintrin
|
|||
avx512vp2intersectvlintrin
|
||||
avx512vpopcntdqintrin
|
||||
avx512vpopcntdqvlintrin
|
||||
avxifmaintrin
|
||||
avxvnniint8intrin
|
||||
avxvnniint16intrin
|
||||
avxintrin
|
||||
avxneconvertintrin
|
||||
avxvnniintrin
|
||||
bmi2intrin
|
||||
bmmintrin
|
||||
bmiintrin
|
||||
cetintrin
|
||||
cldemoteintrin
|
||||
clflushoptintrin
|
||||
clwbintrin
|
||||
clzerointrin
|
||||
cmpccxaddintrin
|
||||
cpuid
|
||||
emmintrin
|
||||
enqcmdintrin
|
||||
|
@@ -72,17 +81,23 @@ pconfigintrin
|
|||
pkuintrin
|
||||
pmmintrin
|
||||
popcntintrin
|
||||
prfchiintrin
|
||||
prfchwintrin
|
||||
raointintrin
|
||||
rdseedintrin
|
||||
rtmintrin
|
||||
serializeintrin
|
||||
sgxintrin
|
||||
sha512intrin
|
||||
shaintrin
|
||||
sm3intrin
|
||||
sm4intrin
|
||||
smmintrin
|
||||
tbmintrin
|
||||
tmmintrin
|
||||
tsxldtrkintrin
|
||||
uintrintrin
|
||||
usermsrintrin
|
||||
vaesintrin
|
||||
vpclmulqdqintrin
|
||||
waitpkgintrin
|
||||
|
|
31
third_party/intel/usermsrintrin.internal.h
vendored
Normal file
@@ -0,0 +1,31 @@
|
|||
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#if !defined _X86GPRINTRIN_H_INCLUDED
|
||||
#error "Never use <usermsrintrin.h> directly; include <x86gprintrin.h> instead."
|
||||
#endif
|
||||
#ifndef _USER_MSRINTRIN_H_INCLUDED
|
||||
#define _USER_MSRINTRIN_H_INCLUDED
|
||||
#ifdef __x86_64__
|
||||
#ifndef __USER_MSR__
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("usermsr")
|
||||
#define __DISABLE_USER_MSR__
|
||||
#endif
|
||||
extern __inline unsigned long long
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_urdmsr (unsigned long long __A)
|
||||
{
|
||||
return (unsigned long long) __builtin_ia32_urdmsr (__A);
|
||||
}
|
||||
extern __inline void
|
||||
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_uwrmsr (unsigned long long __A, unsigned long long __B)
|
||||
{
|
||||
__builtin_ia32_uwrmsr (__A, __B);
|
||||
}
|
||||
#ifdef __DISABLE_USER_MSR__
|
||||
#undef __DISABLE_USER_MSR__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
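Reviewer note: a minimal sketch of the ring-3 MSR helpers above (hypothetical names; assumes <x86gprintrin.h>, -musermsr, and an MSR that the OS has marked user-accessible):
  #include <x86gprintrin.h>

  unsigned long long swap_msr (unsigned long long msr, unsigned long long value)
  {
    unsigned long long old = _urdmsr (msr);   /* URDMSR */
    _uwrmsr (msr, value);                     /* UWRMSR */
    return old;
  }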
8
third_party/intel/vaesintrin.internal.h
vendored
@@ -1,9 +1,9 @@
|
|||
#if defined(__x86_64__) && !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
#ifndef __VAESINTRIN_H_INCLUDED
|
||||
#define __VAESINTRIN_H_INCLUDED
|
||||
#if !defined(__VAES__) || !defined(__AVX__)
|
||||
#if !defined(__VAES__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("vaes,avx")
|
||||
#pragma GCC target("vaes")
|
||||
#define __DISABLE_VAES__
|
||||
#endif
|
||||
extern __inline __m256i
|
||||
|
@@ -36,9 +36,9 @@ _mm256_aesenclast_epi128 (__m256i __A, __m256i __B)
|
|||
#undef __DISABLE_VAES__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#if !defined(__VAES__) || !defined(__AVX512F__)
|
||||
#if !defined(__VAES__) || !defined(__AVX512F__) || !defined(__EVEX512__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("vaes,avx512f")
|
||||
#pragma GCC target("vaes,avx512f,evex512")
|
||||
#define __DISABLE_VAESF__
|
||||
#endif
|
||||
extern __inline __m512i
|
||||
|
|
|
third_party/intel/vpclmulqdqintrin.internal.h
vendored
@@ -4,9 +4,9 @@
|
|||
#endif
|
||||
#ifndef _VPCLMULQDQINTRIN_H_INCLUDED
|
||||
#define _VPCLMULQDQINTRIN_H_INCLUDED
|
||||
#if !defined(__VPCLMULQDQ__) || !defined(__AVX512F__)
|
||||
#if !defined(__VPCLMULQDQ__) || !defined(__AVX512F__) || !defined(__EVEX512__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("vpclmulqdq,avx512f")
|
||||
#pragma GCC target("vpclmulqdq,avx512f,evex512")
|
||||
#define __DISABLE_VPCLMULQDQF__
|
||||
#endif
|
||||
#ifdef __OPTIMIZE__
|
||||
|
@@ -24,9 +24,9 @@ _mm512_clmulepi64_epi128 (__m512i __A, __m512i __B, const int __C)
|
|||
#undef __DISABLE_VPCLMULQDQF__
|
||||
#pragma GCC pop_options
|
||||
#endif
|
||||
#if !defined(__VPCLMULQDQ__) || !defined(__AVX__)
|
||||
#if !defined(__VPCLMULQDQ__)
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target("vpclmulqdq,avx")
|
||||
#pragma GCC target("vpclmulqdq")
|
||||
#define __DISABLE_VPCLMULQDQ__
|
||||
#endif
|
||||
#ifdef __OPTIMIZE__
|
||||
|
|
25
third_party/intel/wmmintrin.internal.h
vendored
@@ -7,27 +7,10 @@
|
|||
#pragma GCC target("aes,sse2")
|
||||
#define __DISABLE_AES__
|
||||
#endif
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_aesdec_si128 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_aesdec128 ((__v2di)__X, (__v2di)__Y);
|
||||
}
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_aesdeclast_si128 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_aesdeclast128 ((__v2di)__X,
|
||||
(__v2di)__Y);
|
||||
}
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_aesenc_si128 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_aesenc128 ((__v2di)__X, (__v2di)__Y);
|
||||
}
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_aesenclast_si128 (__m128i __X, __m128i __Y)
|
||||
{
|
||||
return (__m128i) __builtin_ia32_aesenclast128 ((__v2di)__X, (__v2di)__Y);
|
||||
}
|
||||
#define _mm_aesdec_si128(X, Y) (__m128i) __builtin_ia32_aesdec128 ((__v2di) (X), (__v2di) (Y))
|
||||
#define _mm_aesdeclast_si128(X, Y) (__m128i) __builtin_ia32_aesdeclast128 ((__v2di) (X), (__v2di) (Y))
|
||||
#define _mm_aesenc_si128(X, Y) (__m128i) __builtin_ia32_aesenc128 ((__v2di) (X), (__v2di) (Y))
|
||||
#define _mm_aesenclast_si128(X, Y) (__m128i) __builtin_ia32_aesenclast128 ((__v2di) (X), (__v2di) (Y))
|
||||
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_aesimc_si128 (__m128i __X)
|
||||
{
|
||||
|
|
4
third_party/intel/x86gprintrin.internal.h
vendored
@@ -16,6 +16,7 @@
|
|||
#include "third_party/intel/clflushoptintrin.internal.h"
|
||||
#include "third_party/intel/clwbintrin.internal.h"
|
||||
#include "third_party/intel/clzerointrin.internal.h"
|
||||
#include "third_party/intel/cmpccxaddintrin.internal.h"
|
||||
#include "third_party/intel/enqcmdintrin.internal.h"
|
||||
#include "third_party/intel/fxsrintrin.internal.h"
|
||||
#include "third_party/intel/lzcntintrin.internal.h"
|
||||
|
@@ -26,6 +27,8 @@
|
|||
#include "third_party/intel/pconfigintrin.internal.h"
|
||||
#include "third_party/intel/popcntintrin.internal.h"
|
||||
#include "third_party/intel/pkuintrin.internal.h"
|
||||
#include "third_party/intel/prfchiintrin.internal.h"
|
||||
#include "third_party/intel/raointintrin.internal.h"
|
||||
#include "third_party/intel/rdseedintrin.internal.h"
|
||||
#include "third_party/intel/rtmintrin.internal.h"
|
||||
#include "third_party/intel/serializeintrin.internal.h"
|
||||
|
@@ -41,6 +44,7 @@
|
|||
#include "third_party/intel/xsavesintrin.internal.h"
|
||||
#include "third_party/intel/xtestintrin.internal.h"
|
||||
#include "third_party/intel/hresetintrin.internal.h"
|
||||
#include "third_party/intel/usermsrintrin.internal.h"
|
||||
extern __inline void
|
||||
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_wbinvd (void)
|
||||
|
|
10
third_party/intel/xmmintrin.internal.h
vendored
@@ -5,6 +5,8 @@
|
|||
#include "third_party/intel/mm_malloc.internal.h"
|
||||
enum _mm_hint
|
||||
{
|
||||
_MM_HINT_IT0 = 19,
|
||||
_MM_HINT_IT1 = 18,
|
||||
_MM_HINT_ET0 = 7,
|
||||
_MM_HINT_ET1 = 6,
|
||||
_MM_HINT_T0 = 3,
|
||||
|
@@ -16,10 +18,11 @@ enum _mm_hint
|
|||
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_prefetch (const void *__P, enum _mm_hint __I)
|
||||
{
|
||||
__builtin_prefetch (__P, (__I & 0x4) >> 2, __I & 0x3);
|
||||
__builtin_ia32_prefetch (__P, (__I & 0x4) >> 2,
|
||||
__I & 0x3, (__I & 0x10) >> 4);
|
||||
}
|
||||
#else
|
||||
#define _mm_prefetch(P, I) __builtin_prefetch ((P), ((I & 0x4) >> 2), (I & 0x3))
|
||||
#define _mm_prefetch(P, I) __builtin_ia32_prefetch ((P), ((I) & 0x4) >> 2, ((I) & 0x3), ((I) & 0x10) >> 4)
|
||||
#endif
|
||||
#ifndef __SSE__
|
||||
#pragma GCC push_options
|
||||
|
@@ -55,7 +58,10 @@ typedef float __v4sf __attribute__ ((__vector_size__ (16)));
|
|||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
_mm_undefined_ps (void)
|
||||
{
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Winit-self"
|
||||
__m128 __Y = __Y;
|
||||
#pragma GCC diagnostic pop
|
||||
return __Y;
|
||||
}
|
||||
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
|
||||
|
|
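Reviewer note: with the enum additions above, _mm_prefetch can now express instruction-prefetch hints as well; a minimal sketch (hypothetical names; assumes <xmmintrin.h>; _MM_HINT_IT0 maps to PREFETCHIT0 only on targets that support it):
  #include <xmmintrin.h>

  void touch_code_and_data (const void *code, const void *data)
  {
    _mm_prefetch (code, _MM_HINT_IT0);   /* instruction prefetch, T0 locality */
    _mm_prefetch (data, _MM_HINT_T0);    /* ordinary data prefetch */
  }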